read word

Example Python code for web crawling with Selenium and BeautifulSoup

cella

2020

Branchable

0

12736

from datetime import datetime
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By

def get_news_content(driver, publisher):
    """Scrape writer, publication date and body text from the loaded article.

    The article page must already be open in ``driver``. Only the page
    layouts of '조선일보' and 'MBC뉴스' are handled; the XPaths below are tied
    to the markup of those two sites.

    Args:
        driver: Selenium WebDriver currently showing the article page.
        publisher: Publisher name used to pick the matching page layout.

    Returns:
        A ``(writer, date, content)`` tuple. ``writer`` and ``content`` are
        strings and ``date`` is a naive ``datetime``. For any other publisher
        the defaults ``('', None, '')`` are returned.

    Raises:
        ValueError: If the date text does not match the expected format.
        Errors raised by ``driver.find_element`` (e.g. when the writer or
        date element is missing) propagate to the caller.
    """
    writer, date, content = '', None, ''

    if publisher == '조선일보':
        byline_words = driver.find_element(By.XPATH, '//div[@id="fusion-app"]/div[1]/div[2]/div/section/article/div[1]/div/a').text.split()
        # Keep only the first word of the byline; tolerate an empty byline.
        writer = byline_words[0] if byline_words else ''

        # The first three characters are dropped before parsing
        # (presumably a label in front of the timestamp -- verify on the site).
        date_str = driver.find_element(By.XPATH, '//div[@id="fusion-app"]/div[1]/div[2]/div/section/article/div[2]/span').text[3:]
        date = datetime.strptime(date_str, '%Y.%m.%d %H:%M')  # like 2021.10.16 14:48

        # Fetch every paragraph in one call instead of probing p[1], p[2], ...
        # until a lookup fails: no exception is needed to end the loop and
        # unrelated errors are no longer silently swallowed.
        paragraphs = driver.find_elements(By.XPATH, '//*[@id="fusion-app"]/div[1]/div[2]/div/section/article/section/p')
        content = ''.join(paragraph.text for paragraph in paragraphs)
    elif publisher == 'MBC뉴스':
        writer = driver.find_element(By.XPATH, '//*[@id="content"]/div/section[1]/article/div[1]/div[1]/div/span[2]/a').text

        # Same three-character prefix handling as above.
        date_str = driver.find_element(By.XPATH, '//*[@id="content"]/div/section[1]/article/div[1]/div[3]/div[1]/span[1]').text[3:]
        date = datetime.strptime(date_str, '%Y-%m-%d %H:%M')  # like 2021-10-16 14:48

        content = driver.find_element(By.XPATH, '//*[@id="content"]/div/section[1]/article/div[2]/div[5]').text
        # The previous ``content.split('', '')`` always raised TypeError
        # (maxsplit must be an int). Strip line breaks instead so the body is
        # one string, matching the separator-less join used for 조선일보.
        content = content.replace('\n', '')

    return writer, date, content

def get_news_heads(driver, url):
    """Crawl a Google News listing page and print details of each article.

    Loads ``url`` in ``driver``, parses the rendered HTML with BeautifulSoup,
    and for every article entry found prints its publisher, title and link,
    then opens the link and prints what ``get_news_content`` extracts.

    Args:
        driver: Selenium WebDriver used for all page loads.
        url: Address of the Google News listing page.

    Returns:
        None. Results are written to standard output.
    """
    from urllib.parse import urljoin  # stdlib; imported here to keep the block self-contained

    # The implicit wait is a session-wide setting, so one call covers the
    # article page loads below as well.
    driver.implicitly_wait(3)
    driver.get(url)
    # Parse a snapshot of the page source; navigating the driver to the
    # individual articles afterwards does not affect the parsed listing.
    soup = BeautifulSoup(driver.page_source, 'html5lib')

    for div in soup.select('main > div > div > div'):
        article = div.select_one('div > article')
        if article is None:
            article = div.select_one('article')
        if article is None:
            continue

        title_link = article.select_one('h3 > a')
        if title_link is None:
            # Entry without a headline anchor: nothing to follow.
            continue
        href = title_link.get('href')
        if not href:
            continue
        title = title_link.text
        # urljoin resolves relative ('./articles/...'), root-relative and
        # absolute hrefs alike, unlike plain string concatenation.
        news_url = urljoin('https://news.google.com/', str(href))

        publisher_link = article.select_one('div > div > a')
        # An unknown publisher simply yields empty content downstream.
        publisher = publisher_link.get_text() if publisher_link is not None else ''

        print('---------------------------------')
        print(publisher, title, news_url)

        driver.get(news_url)
        writer, date, content = get_news_content(driver, publisher)

        print('================================================')
        print(publisher,', writer : ', writer)
        print(' date :', date)
        print(' content :', content)
        print('================================================')

# Configure a headless Chrome session with a fixed window size.
options = Options()
options.add_argument("--headless")
options.add_argument("window-size=1400,1500")

# Google News topic page to crawl (the query string requests hl=ko, gl=KR).
url = 'https://news.google.com/topics/CAAqIQgKIhtDQkFTRGdvSUwyMHZNRFp4WkRNU0FtdHZLQUFQAQ?hl=ko&gl=KR&ceid=KR%3Ako'

driver = webdriver.Chrome(options=options)

try:
    get_news_heads(driver, url)
finally:
    # Always shut the browser down, even if crawling raised.
    driver.quit()