https://jhleed.tistory.com/195
$ cat /proc/version Linux version 4.14.171-136.231.amzn2.x86_64 (mockbuild@ip-10-0-1-138) (gcc version 7.3.1 20180712 (Red Hat 7.3.1-6) (GCC)) #1 SMP Thu Feb 27 20:22:48 UTC 2020
$ sudo vi /etc/yum.repos.d/google-chrome.repo [google-chrome] name=google-chrome baseurl=https://dl.google.com/linux/chrome/rpm/stable/x86_64 enabled=1 gpgcheck=1 gpgkey=https://dl.google.com/linux/linux_signing_key.pub
$ sudo yum install google-chrome-stable $ google-chrome --version Google Chrome 97.0.4692.71
$ cd /tmp/ $ sudo wget https://chromedriver.storage.googleapis.com/97.0.4692.71/chromedriver_linux64.zip // <= version must match $ sudo unzip chromedriver_linux64.zip $ sudo mv chromedriver /usr/bin/chromedriver $ chromedriver --version
$ pip3 install selenium $ vi test.py
from selenium.webdriver.chrome.options import Options from selenium import webdriver
# Smoke test: load NAVER in headless Chrome and print the hidden site title.
url = 'https://naver.com/'

options = Options()
options.add_argument("--headless")
options.add_argument("window-size=1400,1500")

driver = webdriver.Chrome(options=options)
try:
    driver.get(url)
    # The 'blind' class holds visually-hidden text (e.g. the site name).
    element = driver.find_element_by_class_name('blind')
    print(element.text)
finally:
    # Always release the browser process, even if the lookup fails.
    driver.quit()
$ python3 test.py NAVER whale
$ pip3 install bs4 // <= used for parsing $ pip3 install requests $ pip3 install lxml $ pip3 install html5lib
$ vi getnews.py
from datetime import datetime import requests from bs4 import BeautifulSoup from selenium.webdriver.chrome.options import Options from selenium import webdriver
def get_news_info(driver, news_url, publisher):
    """Scrape writer, body text and publish date from one article page.

    Args:
        driver: a live selenium WebDriver. The caller owns its lifetime --
            this function must NOT quit it (see NOTE below).
        news_url: absolute URL of the article page.
        publisher: publisher name; only '조선일보' is currently supported,
            any other publisher yields empty results.

    Returns:
        (writer, article, date) -- writer/article are strings (empty when
        unsupported), date is a datetime or None.
    """
    date, article, writer = None, '', ''
    if publisher == '조선일보':
        from selenium.common.exceptions import NoSuchElementException

        driver.implicitly_wait(3)
        driver.get(news_url)
        writer = driver.find_element_by_xpath(
            '//*[@id="fusion-app"]/div[1]/div[2]/div/section/article/div[1]/div/a'
        ).text.split()[0]
        # date like 2021.10.16 14:48 -- [3:] strips the 3-char label prefix
        # in front of the timestamp.
        date_time_obj = driver.find_element_by_xpath(
            '//*[@id="fusion-app"]/div[1]/div[2]/div/section/article/div[2]/span'
        ).text[3:]
        date = datetime.strptime(date_time_obj, '%Y.%m.%d %H:%M')
        # Collect body paragraphs p[1], p[2], ... until the XPath stops
        # matching. Catch only the expected "not found" error instead of a
        # bare except, so Ctrl-C and real bugs are not swallowed.
        paragraphs = []
        i = 1
        while True:
            try:
                paragraphs.append(driver.find_element_by_xpath(
                    '//*[@id="fusion-app"]/div[1]/div[2]/div/section/article/section/p[{0}]'.format(i)
                ).text)
                i += 1
            except NoSuchElementException:
                break
        article = ''.join(paragraphs)
        # NOTE(fix): the original called driver.quit() here, which killed the
        # shared driver that get_news_heads() reuses for every headline --
        # every call after the first then failed. Cleanup belongs to the
        # caller's try/finally.
    print('================================================')
    print(publisher, ', writer : ', writer)
    print(' date :', date)
    print(' article :', article)
    print('================================================')
    return writer, article, date
def get_news_heads(driver):
    """Walk the Google News topic page and scrape each listed article.

    Reads the module-level ``url`` (the topic page), extracts every headline
    link plus its publisher, and hands each article off to get_news_info().
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    headline_links = soup.select('div > article > h3 > a')
    for idx, link in enumerate(headline_links):
        # nth-child must be rewritten as nth-of-type for this selector.
        publisher_nodes = soup.select('main > div.lBwEZb.BL5WZb.GndZbb > div:nth-of-type({0}) > div > div > article > div > div > a'.format(idx+1))
        if not publisher_nodes:
            # No publisher entry for this slot -- skip the headline.
            continue

        publisher = publisher_nodes[0].getText()
        title = link.text
        news_url = 'https://news.google.com' + str(link.get('href'))

        print('---------------------------------')
        print('IDX : ', idx, publisher, title , news_url)

        get_news_info(driver, news_url, publisher)
# --- script entry: scrape the Korean headline topic on Google News ---
# `url` is read as a global by get_news_heads(); keep the name as-is.
url = 'https://news.google.com/topics/CAAqIQgKIhtDQkFTRGdvSUwyMHZNRFp4WkRNU0FtdHZLQUFQAQ?hl=ko&gl=KR&ceid=KR%3Ako'  # google news page

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("window-size=1400,1500")

driver = webdriver.Chrome(options=chrome_options)
try:
    get_news_heads(driver)
finally:
    # Guarantee the browser process is torn down on any exit path.
    driver.quit()
|