from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
def get_news_content(driver, publisher):
    """Scrape byline, publication time and body text from the loaded article.

    The page currently open in ``driver`` is read with XPaths that are
    hard-wired to the article layouts of two publishers ('조선일보' and
    'MBC뉴스'). Any other publisher is left unparsed.

    Args:
        driver: Selenium WebDriver that has already navigated to the article.
        publisher: Publisher name used to pick the layout-specific XPaths.

    Returns:
        A ``(writer, date, content)`` tuple. ``writer`` and ``content`` are
        strings and ``date`` is a ``datetime``. For an unsupported publisher
        the defaults ``('', None, '')`` are returned.

    Raises:
        ValueError: If the scraped date text does not match the expected
            format (raised by ``datetime.strptime``).
        Errors raised by ``driver.find_element`` for the byline, date and
        MBC body lookups propagate unchanged.
    """
    writer, date, content = '', None, ''

    if publisher == '조선일보':
        # Only the first whitespace-separated token of the byline is kept.
        writer = driver.find_element(
            By.XPATH,
            '//div[@id="fusion-app"]/div[1]/div[2]/div/section/article/div[1]/div/a',
        ).text.split()[0]
        # The first three characters are dropped before parsing
        # (presumably a label in front of the timestamp -- TODO confirm).
        date_str = driver.find_element(
            By.XPATH,
            '//div[@id="fusion-app"]/div[1]/div[2]/div/section/article/div[2]/span',
        ).text[3:]
        date = datetime.strptime(date_str, '%Y.%m.%d %H:%M')  # like 2021.10.16 14:48
        # Fetch every paragraph in one query instead of probing p[1], p[2], ...
        # until a lookup fails: the old loop relied on a bare ``except`` that
        # also hid unrelated failures and silently truncated the text.
        paragraphs = driver.find_elements(
            By.XPATH,
            '//*[@id="fusion-app"]/div[1]/div[2]/div/section/article/section/p',
        )
        content = ''.join(paragraph.text for paragraph in paragraphs)
    elif publisher == 'MBC뉴스':
        writer = driver.find_element(
            By.XPATH,
            '//*[@id="content"]/div/section[1]/article/div[1]/div[1]/div/span[2]/a',
        ).text
        # Same three-character prefix strip as above.
        date_str = driver.find_element(
            By.XPATH,
            '//*[@id="content"]/div/section[1]/article/div[1]/div[3]/div[1]/span[1]',
        ).text[3:]
        date = datetime.strptime(date_str, '%Y-%m-%d %H:%M')  # like 2021-10-16 14:48
        body = driver.find_element(
            By.XPATH,
            '//*[@id="content"]/div/section[1]/article/div[2]/div[5]',
        ).text
        # The previous ``content.split('', '')`` always raised TypeError
        # (maxsplit must be an int). Dropping the line breaks keeps
        # ``content`` a single string, like the other publisher branch.
        content = ''.join(body.splitlines())

    return writer, date, content
def get_news_heads(driver, url):
    """Print each headline on a Google News listing page with its article details.

    The listing at ``url`` is loaded in ``driver`` and parsed with
    BeautifulSoup. For every article entry found, the publisher, title and
    article URL are printed, the article page is opened in the same driver,
    and the values returned by ``get_news_content`` are printed.

    Args:
        driver: Selenium WebDriver used for the listing and every article
            page. It is left on the last article visited.
        url: Address of the Google News listing page.

    Returns:
        None. All results are written to standard output.

    Raises:
        Anything raised by ``driver.get`` or ``get_news_content`` propagates
        and stops the remaining articles from being processed.
    """
    # One call is enough: the implicit wait is a session-level timeout, so it
    # does not need to be re-applied before each navigation.
    driver.implicitly_wait(3)
    driver.get(url)

    # The whole listing is parsed up front, so navigating the driver away
    # inside the loop does not invalidate the entries still to be visited.
    soup = BeautifulSoup(driver.page_source, 'html5lib')

    for div in soup.select('main > div > div > div'):
        article = div.select_one('div > article')
        if article is None:
            article = div.select_one('article')
        if article is None:
            continue

        title_link = article.select_one('h3 > a')
        publisher_link = article.select_one('div > div > a')
        # Skip entries lacking a title or publisher link instead of crashing
        # with AttributeError on None.
        if title_link is None or publisher_link is None:
            continue

        href = title_link.get('href')
        if not href:
            # Without an href there is no article page to open.
            continue

        title = title_link.text
        # urljoin resolves './x', '/x' and absolute hrefs correctly; plain
        # concatenation produced malformed URLs for relative forms.
        news_url = urljoin('https://news.google.com/', str(href))
        publisher = publisher_link.get_text()

        print('---------------------------------')
        print(publisher, title, news_url)

        driver.get(news_url)
        writer, date, content = get_news_content(driver, publisher)

        print('================================================')
        print(publisher,', writer : ', writer)
        print(' date :', date)
        print(' content :', content)
        print('================================================')
# Headless Chrome with a fixed window size.
options = Options()
options.add_argument("--headless")
options.add_argument("window-size=1400,1500")

# Google News topic page that is scraped.
url = 'https://news.google.com/topics/CAAqIQgKIhtDQkFTRGdvSUwyMHZNRFp4WkRNU0FtdHZLQUFQAQ?hl=ko&gl=KR&ceid=KR%3Ako'

driver = webdriver.Chrome(options=options)
try:
    get_news_heads(driver, url)
finally:
    # Always shut the browser down, even when scraping fails part-way.
    driver.quit()
|