1. BeautifulSoup

🍯  A package that parses a document's structure so you can pull out only the content you need

        ➡️  It turns XML or HTML into a soup object that makes extraction easy.

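BeautifulSoup ships in the bs4 package; if bs4 and requests are not installed yet, both can be installed with pip:

pip install beautifulsoup4 requests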
from bs4 import BeautifulSoup as bs  # import BeautifulSoup from the bs4 library
import requests
import pprint

# load the local html file
with open('./sample.html', 'r', encoding='utf-8') as file:
    html = file.read()

# html.parser : parses the html code into a BeautifulSoup object that is easy to work with
soup = bs(html, 'html.parser')  
# first argument : the content to parse
# second argument : specifies that it should be parsed as HTML

print(type(soup))  # <class 'bs4.BeautifulSoup'>
print(soup)  # (prints the whole html)

print(soup.find('title').text)  # This is title
print(soup.find('div').text)  # Short for Division; used to divide the layout.
print(soup.find('h1').text.strip())  # This is heading1 text.
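The outputs in the comments above assume a local sample.html roughly like the one below. This is only a reconstruction from the printed results (the actual file used in the lecture may contain more), written out in Python so the snippet stays self-contained:

# assumed content of sample.html, reconstructed from the outputs above
sample_html = '''<html>
<head><title>This is title</title></head>
<body>
<div>Short for Division; used to divide the layout.</div>
<h1>
    This is heading1 text.
</h1>
</body>
</html>'''

with open('./sample.html', 'w', encoding='utf-8') as f:
    f.write(sample_html)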

 


 

1) find()

🥑  A method that fetches only the first of the matching tags; it returns a single value (a bs4.element.Tag object)
       ➡️  Typically used when only one such tag exists; if there are several matches, only the first one is returned

 

# how to read an attribute value (same for find and find_all)
find('tag_name')['attr_name']
find('tag_name').attrs['attr_name']
find('tag_name').get('attr_name')
find_all('tag_name', attrs={'attr_name': 'value'})
# Wikipedia 'Daegu Metropolitan City' (대구광역시) page
url = 'https://ko.wikipedia.org/wiki/%EB%8C%80%EA%B5%AC%EA%B4%91%EC%97%AD%EC%8B%9C'
resp = requests.get(url)
soup = bs(resp.text, 'html.parser')

first_img = soup.find(name='img')  # the first img tag that appears on the page
print(type(first_img))  # <class 'bs4.element.Tag'>
print(first_img)  
# <img alt="" aria-hidden="true" class="mw-logo-icon" height="50" 
# src="/static/images/icons/wikipedia.png" width="50"/>

target_img = soup.find(name='img', attrs={'alt': 'Daedongyeojido (Gyujanggak) 17-02.jpg'})
print(target_img)
# <img alt="Daedongyeojido (Gyujanggak) 17-02.jpg" class="mw-file-element" 
# data-file-height="3005" data-file-width="4000" decoding="async" height="376" 
# src="//upload.wikimedia.org/wikipedia/commons/thumb/c/c5/
# Daedongyeojido_%28Gyujanggak%29_17-02.jpg/500px-Daedongyeojido_%28Gyujanggak%29_17-02.
# jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/c/c5/Daedongyeojido_
# %28Gyujanggak%29_17-02.jpg/750px-Daedongyeojido_%28Gyujanggak%29_17-02.jpg 1.5x, 
# //upload.wikimedia.org/wikipedia/commons/thumb/c/c5/Daedongyeojido_%28Gyujanggak%
# 29_17-02.jpg/1000px-Daedongyeojido_%28Gyujanggak%29_17-02.jpg 2x" width="500"/>
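The three attribute-access forms listed above are interchangeable on a Tag object; for example, each of the following reads the src value of the image just fetched:

# three equivalent ways to read the src attribute of target_img
print(target_img['src'])
print(target_img.attrs['src'])
print(target_img.get('src'))
# all three print the //upload.wikimedia.org/... URL shown in the tag above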


 

2) find_all()

🥑  A method that fetches every matching tag; the fetched tags are all stored in a list

# fetch the news-box headlines from the Naver Sports page
url = 'https://sports.news.naver.com/index.nhn'
response = requests.get(url)
soup = bs(response.text, 'html.parser')

today_list = soup.find('ul', {'class': 'today_list'})
print(today_list)

today_list_title = today_list.find_all('strong', {'class': 'title'})
pprint.pprint(today_list_title)  # returned as a list

for title in today_list_title:
    print(title.text.strip())

1. today_list result
2. today_list_title result
3. Loop output
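Since find_all() returns a list-like ResultSet, the usual list operations apply to it; a short sketch using the result above (assuming at least one headline was found):

print(len(today_list_title))             # number of headlines found
print(today_list_title[0].text.strip())  # only the first headline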

 


 

3) find_all usage examples

a. Parse the HTML of the Daum News site

url = 'https://news.daum.net/'
response = requests.get(url)
soup = bs(response.text, 'html.parser')


 

b. Print the number of a tags

   👾  The HTML 'a' tag defines a hyperlink that connects to other content

print('1. number of a tags')
print(len(soup.find_all('a')))  # 124

 

c. Print only the first 20 a tags

print('2. first 20 a tags')
for news in soup.find_all('a')[:20]:
    print(news.text.strip())

Execution result


 

d. Print the links of the first 5 a tags

print('3. links of the first 5 a tags')
for i in soup.find_all('a')[:5]:
    print(i.attrs['href'])
    print(i.get('href'))
    # -> only one of the two is needed
print("=" * 20)

Execution result

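Note that i.attrs['href'] raises a KeyError when an a tag has no href attribute, while i.get('href') simply returns None. A slightly safer version of the same loop, as a sketch:

print('3-1. links of the first 5 a tags (skipping anchors without href)')
for i in soup.find_all('a')[:5]:
    href = i.get('href')  # None if the tag has no href attribute
    if href is not None:
        print(href)
print("=" * 20)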

 

e. Print tags with a specific class attribute

print('4. tags with a specific class attribute')
print(soup.find_all('div', {'class': 'item_issue'}))
print("=" * 20)

Execution result


f. Save the links to a text file

print('5. save the links to a text file')
file = open('../output_02/links.txt', 'w')  # create the file for writing

for i in soup.find_all('div', {'class': 'item_issue'}):
    file.write(i.find('a').get('href') + '\n')
file.close()

Execution result

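The same file can also be written with a with block, which closes the file automatically, and with an explicit encoding. A sketch that assumes, as in the loop above, that every item_issue div contains an a tag with an href:

with open('../output_02/links.txt', 'w', encoding='utf-8') as f:
    for i in soup.find_all('div', {'class': 'item_issue'}):
        f.write(i.find('a').get('href') + '\n')  # first link in each issue box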
[ Content reference: IT academy lecture ]
