Python (연습문제, 크롤링 select, 크롤링 image, 크롤링 chart)

Link

« 2025/06 »
일	월	화	수	목	금	토
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30

Recent Comments

Recent Posts

Archives

Today

Total

Tags more

관리 메뉴

Learn & Record

Python (연습문제, 크롤링 select, 크롤링 image, 크롤링 chart) 본문

Dev/Python

Python (연습문제, 크롤링 select, 크롤링 image, 크롤링 chart)

Walker_ 2024. 3. 8. 17:22

1. 연습문제

import requests
from bs4 import BeautifulSoup as bs

# BeautifulSoup 실습 : find_all() 메소드 이용하기

# 다음 뉴스에서 제목, 링크, 뉴스 본문 추출하기

# 다음 뉴스에서 제목, 링크, 뉴스 본문 추출해서 파일로 저장하기
# 1) 이전 예제를 활용할 것
# 2) 링크를 추출한 for 안에서 제목을 추출
# 3) 뉴스 본문은 링크를 이용
# 4) 링크를 타고 뉴스 본문을 들고 와야 되니
# for문 안에서 requests, BeautifulSoup가 실행이 되어야 함
# 5) 제목, 링크, 뉴스 순으로 csv 저장

# Collect the headline articles from the Daum news front page.
url = 'https://news.daum.net/'
response = requests.get(url)
soup = bs(response.text, 'html.parser')
# print(soup.prettify())

# Every headline card is a <div class="item_issue">.
tags = soup.find_all('div', {'class' : 'item_issue'})
# print(len(tags)) # 20

for tag in tags:
    # Look the anchors up once; the second <a> supplies both the title text and the link.
    anchors = tag.find_all('a')
    if len(anchors) < 2:
        # No headline anchor in this card, so there is nothing to print or follow.
        continue
    title: str = anchors[1].text.strip()
    link: str = anchors[1].get('href')

    print(f'title: {title}, link : {link}')

    if link:
        # Use separate names so the front-page `response` / `soup` are not overwritten.
        article_response = requests.get(link)
        article_soup = bs(article_response.text, 'html.parser')
        article_body = article_soup.find('div', {'class' : 'article_view'})
        # find() returns None when the article page has no such container.
        if article_body is not None:
            print(article_body.text.strip())

    print('-' * 20)

2. 크롤링 select

- find가 태그 이름과 속성으로 원하는 태그를 찾는 메서드라면, select는 CSS selector로 조건에 맞는 tag 객체를 모두 찾아 리스트로 반환하는 메서드

- select_one()은 CSS selector에 일치하는 태그를 하나만 반환하며, 일치하는 태그가 여러 개인 경우에는 맨 앞의 것만 가져오고, 하나도 없으면 None을 반환

# The select-family methods accept CSS selectors.
# '.' -> class attribute, '#' -> id attribute
# class : may be repeated on many tags within a single HTML document
# id : should be used only once per HTML document (recommended practice)

# Exercise 1. From the Korean Wikipedia article on Daegu, fetch Symbols > City bird > Eagle.
url = 'https://ko.wikipedia.org/wiki/%EB%8C%80%EA%B5%AC%EA%B4%91%EC%97%AD%EC%8B%9C'
resp = requests.get(url)
soup = bs(resp.text, 'html.parser')

# select_one() returns only the first <ul> found under an element with class mw-parser-output.
tag_symbol = soup.select_one('.mw-parser-output ul')
print(tag_symbol)

# Text of the first <a> inside that list.
text_symbol = tag_symbol.find('a').text
print(text_symbol) # output recorded by the author: 독수리 (eagle)

# Exercise 2. Daum > News > IT > "Today's series": fetch the title and press name of the first entry.
url = 'https://news.daum.net/digital#1'
resp = requests.get(url)
soup = bs(resp.text, 'html.parser')

# First <li> under the element with class list_todayseries.
tag_series = soup.select_one('.list_todayseries li')
print(tag_series)

print()
# The title is the text of the element with class link_txt inside that entry.
tag_series_title = tag_series.select_one('.link_txt').text
print(f'제목: {tag_series_title}')

# The press (publisher) name is the text of the element with class txt_info.
tag_series_press = tag_series.select_one('.txt_info').text
print(f'신문사 : {tag_series_press}')

# Exercise 3. Using the select_one() method
# Hollys Coffee : store search

url = 'https://www.hollys.co.kr/store/korea/korStore2.do'
resp = requests.get(url)
soup = bs(resp.text, 'html.parser')
print(soup.prettify())

# Grab the store table.
stores = soup.select_one('table.tb_store')
# print(stores)

# First row of the table body, addressed by its full CSS path.
first_store = stores.select_one('#contents > div.content > fieldset > fieldset > div.tableType01 > table > tbody > tr:nth-child(1)')
# print(first_store)

# td:nth-child(n) -> the <td> that is the n-th child of its row.
# NOTE(review): despite the "second_" prefix, both values below are read from the FIRST row
# (first_store); nth-child(2) is presumably the store-name column — confirm against the page.
second_store_name = first_store.select_one('td:nth-child(2)')
print(second_store_name.text)

# nth-child(4) is presumably the address column — confirm against the page.
second_store_address = first_store.select_one('td:nth-child(4)')
print(second_store_address.text)

# Exercise 4. Practising the select() method

# select() returns every tag that matches the CSS selector, collected in a list.

# Daum > News > IT > "Today's series": print the title and press name of each entry.

url = 'https://news.daum.net/digital#1'
resp = requests.get(url)
soup = bs(resp.text, 'html.parser')

tag_series = soup.select('.list_todayseries li')
print(tag_series)

for tag in tag_series:
    print()
    # Locate the node first, then read its text, one field at a time.
    title_node = tag.select_one('.link_txt')
    tag_series_title = title_node.text
    print(f'제목 : {tag_series_title}')

    press_node = tag.select_one('.txt_info')
    tag_series_press = press_node.text
    print(f'신문사 : {tag_series_press}')

# Exercise 5. Practising the select() method

# Crawling exchange rates from Naver Finance
# https://finance.naver.com/marketindex

url = 'https://finance.naver.com/marketindex'
response = requests.get(url)
soup = bs(response.text, 'html.parser')

# <span> elements inside the heading (a.head > h3) of each #exchangeList entry.
nations = soup.select('#exchangeList > li > a.head > h3 > span')
print(nations)

exchange_rates = soup.select('#exchangeList > li > a.head > div > span.value') # the span.value elements holding the rates
print(exchange_rates)

# 6. BeautifulSoup 실습 : select() 메소드
# 크롤링을 이용한 환률 계산기 : 다른 나라의 통화를 원으로 계산

def get_exchange_rate(menu : int) -> float:
    """Fetch the exchange rate for a menu choice from Naver Finance.

    Args:
        menu: 1-based position of the currency in the page's ``#exchangeList``.

    Returns:
        The listed rate as a float, with thousands separators removed.

    Raises:
        IndexError: If ``menu`` is not between 1 and the number of listed rates.
    """
    url = 'https://finance.naver.com/marketindex'
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    exchange_rates = soup.select('#exchangeList > li > a.head > div > span.value')
    # Reject 0 and negative choices explicitly: exchange_rates[menu - 1] would otherwise
    # wrap around and silently return a rate from the end of the list.
    if not 1 <= menu <= len(exchange_rates):
        raise IndexError(f'menu must be between 1 and {len(exchange_rates)}, got {menu}')
    # Strip the thousands separator before converting, e.g. '1,350.50' -> 1350.5.
    return float(exchange_rates[menu - 1].text.replace(',', ''))

# Show the country menu in a single call; sep='\n' puts each entry on its own line.
print('=== 메뉴 ===', '1. 미국', '2. 일본', '3. 유럽', '4. 중국', '=============', sep='\n')
menu = int(input('선택 >> '))
unit = ['달러', '엔', '유로', '위안']
selected_unit = unit[menu - 1]
money = float(input(f'금액 입력 (단위 : {selected_unit}) >> '))

# Naver quotes the yen rate per 100 yen, so the entered amount is scaled down for Japan.
money = money / 100 if menu == 2 else money
print(get_exchange_rate(menu) * money, '원')

# 7. 실습 : select() 메소드 이용하기

# 크롤링을 이용한 환률 계산기 : 다른 나라의 통화를 원으로 계산 > 원을 다른 나라 통화로 계산으로 변경

# 함수는 그대로 사용
def get_exchange_rate(menu : int) -> float:
    """Fetch the exchange rate for a menu choice from Naver Finance.

    Args:
        menu: 1-based position of the currency in the page's ``#exchangeList``.

    Returns:
        The listed rate as a float, with thousands separators removed.

    Raises:
        IndexError: If ``menu`` is not between 1 and the number of listed rates.
    """
    url = 'https://finance.naver.com/marketindex'
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    exchange_rates = soup.select('#exchangeList > li > a.head > div > span.value')
    # Reject 0 and negative choices explicitly: exchange_rates[menu - 1] would otherwise
    # wrap around and silently return a rate from the end of the list.
    if not 1 <= menu <= len(exchange_rates):
        raise IndexError(f'menu must be between 1 and {len(exchange_rates)}, got {menu}')
    # Strip the thousands separator before converting, e.g. '1,350.50' -> 1350.5.
    return float(exchange_rates[menu - 1].text.replace(',', ''))

# Step 1: amount in won to convert.
money = int(input('step 01) 환률 계산할 금액을 입력해주세요. (단위 :원) >> '))
unit = ['달러', '엔', '유로', '위안']
for menu_line in ('=== 메뉴 ===', '1. 미국', '2. 일본', '3. 유럽', '4. 중국', '============='):
    print(menu_line)
menu = int(input('선택 >> '))

# Divide by the quoted rate; Naver quotes yen per 100 yen, so Japan is scaled back up by 100.
trans_money = money / get_exchange_rate(menu)
if menu == 2:
    trans_money = trans_money * 100
print(f'step 03) {money}원은 {unit[menu-1]}로 환전하면 {trans_money}{unit[menu-1]}입니다.')

# 8. Exercise : select() method
import pprint  # needed by the final pprint.pprint call; imported here so this snippet runs on its own

url = 'https://finance.naver.com/marketindex'
response = requests.get(url)
soup = bs(response.text, 'html.parser')

# Use the data held in the page's <select> element.

# Step 1) Read the exchange-rate options and store each one as a dict with the keys
# '나라' (country), '단위' (unit), '환률' (rate) and '원화비율' (KRW ratio).
datas = soup.select_one('#select_to') # the tag whose id is select_to

dict_list: list[dict] = []
for data in datas.select('option'): # iterate over the <option> tags only
    words = data.text.split(" ")  # split once and reuse for both fields
    # An option starting with '남아프리카' has a two-word country name; every other
    # option is treated as a one-word country followed by the unit.
    name_len = 2 if words[0] == '남아프리카' else 1
    new_dict: dict = {
        '나라': ' '.join(words[:name_len]),
        '단위': ' '.join(words[name_len:]),
        '환률': data.get('value'),
        '원화비율': data.get('label'),
    }
    dict_list.append(new_dict)

pprint.pprint(dict_list)

3.크롤링 Image

import requests
import os.path

# Download an image by URL from the official Python website.

url = 'https://www.python.org/static/img/python-logo.png'
response = requests.get(url)

# 1. Derive the file name from the last path component of the URL.
image_file_name = os.path.basename(url)
print(image_file_name) # python-logo.png

# 2. Save the raw bytes (response.content) to disk.
# Create the target directory first: open() does not create missing folders and
# would raise FileNotFoundError when ./output_image does not exist yet.
os.makedirs('./output_image', exist_ok=True)
with open(f'./output_image/{image_file_name}', 'wb') as image_file:
    image_file.write(response.content)
    print('이미지 파일로 저장하였습니다.')

# Exercise 2 : save an image from Hollys Coffee by locating its <img> tag
import os
from urllib.parse import urljoin

url = 'https://www.hollys.co.kr/menu/espresso.do'
response = requests.get(url)
soup = bs(response.text, 'html.parser')
# print(soup.prettify())

# 1) Locate the coffee image tag.
image_tag = soup.select_one('#menuView1_877 > img')
print('HTML 요소: ',image_tag)
print()

# 2) Read the image path from the tag's src attribute.
# img_source = image_tag.get('src')
img_source = image_tag.attrs['src']
print('이미지 파일 경로: ', img_source)
print()

# 3) Resolve src against the page URL. urljoin handles protocol-relative ('//host/..'),
# absolute and page-relative paths, whereas prefixing 'https:' only works for the first.
response = requests.get(urljoin(url, img_source))
# open() does not create missing folders, so make sure the output directory exists.
os.makedirs('./output_image', exist_ok=True)
with open('./output_image/download_image.png', 'wb') as image_file:
    image_file.write(response.content)
    print('이미지 파일로 저장하였습니다.')

4. 크롤링 chart

import pprint

import requests
from bs4 import BeautifulSoup as bs
from requests import Response


# 멜론차트 가져오기
# user-agent를 확인해서 bot의 접근을 막음.
# 1) user-agent를 변경해서 결과 값을 가지고 올 것
# 2) 구현 후에 이 부분을 함수화 할 것

def requests_get(url: str) -> Response:
    """GET ``url`` with browser-like request headers.

    Sends a desktop Chrome User-Agent string and an empty Referer header, and
    returns whatever ``requests.get`` returns.
    """
    browser_user_agent: str = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/122.0.0.0 Safari/537.36'
    )
    return requests.get(
        url,
        headers={'User-Agent': browser_user_agent, 'Referer': ''},
    )


import os
import re

url = 'https://www.melon.com/chart/'
response = requests_get(url)
soup = bs(response.text, 'html.parser')
# print(soup)
# The author notes the HTML document contains only one <table>.
chart_rows = soup.select('table tbody tr')
pprint.pprint(chart_rows[0])

# NOTE(review): the exercise asks for rank / song title / artist to be saved as
# melon_chart.csv; this snippet only collects the rows in dict_list and downloads
# the album images — it does not write that CSV file.

# open() does not create missing folders, so make sure the image directory exists.
image_dir = './output_image/melon'
os.makedirs(image_dir, exist_ok=True)

print('제목 / 아티스트 / 앨범')
dict_list = list()
for rank, item in enumerate(chart_rows, start=1):
    # Look each field up once and reuse it for the console output, the record and the file name.
    title = item.select_one('div.rank01').text.strip()
    artist = item.select_one('div.rank02 a').text.strip()
    album = item.select_one('div.rank03').text.strip()
    image_url = item.select_one('td:nth-child(4) img').get('src')

    print(f'{rank}')
    print(f'곡제목: {title}')
    print(f'가수: {artist}')
    print(f'앨범: {album}')
    print(f'앨범이미지: {image_url}')

    new_data = dict()
    new_data['순위'] = rank
    new_data['곡제목'] = title
    new_data['가수'] = artist
    new_data['앨범'] = album
    dict_list.append(new_data)

    # Titles and artist names can contain characters that are path separators or are
    # not allowed in file names (/ \ : * ? " < > |); replace them so open() cannot fail
    # or write outside image_dir.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', f'{rank}_{title}_{artist}')
    response = requests_get(image_url)
    with open(f'{image_dir}/{safe_name}.jpg', 'wb') as image_file:
        image_file.write(response.content)

공부 과정을 정리한 것이라 내용이 부족할 수 있습니다.

부족한 내용은 추가 자료들로 보충해주시면 좋을 것 같습니다.

읽어주셔서 감사합니다 :)

'Dev > Python' 카테고리의 다른 글

Python (pandas, 날짜자동생성, DataFrame 데이터프레임, 연산, 파일읽기, 파일쓰기, 데이터확인, 데이터선택, 무료웹호스팅, 결측데이터, 데이터가공) (0)	2024.03.12
Python (넘파이 Numpy, 배열 생성, 배열의 연산, 배열 인덱싱과 슬라이싱, 판다스 Pandas) (0)	2024.03.11
Pyhton (공공데이터활용, 2개 숫자 문자열 결합 후 정렬, URL-HTML, beautifulsoup4, requests 사용법, 크롤링 robot, HTML 태그, HTML-beautifulsoup4 활용, beautifulsoup4 실습) (0)	2024.03.07
Python (공공데이터활용(에어코리아), 공공데이터활용(근접측정소), XML, 공공데이터활용(인천공항)) (2)	2024.03.06
Python (공공데이터활용, 서버-데이터 수신 방식, JSON, openweathermap, 공공데이터포털, JSON과 API, API 이용하기, 공공데이터 활용하기) (2)	2024.03.05

'Dev/Python' Related Articles

Learn & Record

Python (연습문제, 크롤링 select, 크롤링 image, 크롤링 chart) 본문

Python (연습문제, 크롤링 select, 크롤링 image, 크롤링 chart)

'Dev > Python' 카테고리의 다른 글

티스토리툴바

'Dev > Python' 카테고리의 다른 글