https://jhleed.tistory.com/195
$ cat /proc/version Linux version 4.14.171-136.231.amzn2.x86_64 (mockbuild@ip-10-0-1-138) (gcc version 7.3.1 20180712 (Red Hat 7.3.1-6) (GCC)) #1 SMP Thu Feb 27 20:22:48 UTC 2020
$ sudo vi /etc/yum.repos.d/google-chrome.repo [google-chrome] name=google-chrome baseurl=https://dl.google.com/linux/chrome/rpm/stable/x86_64 enabled=1 gpgcheck=1 gpgkey=https://dl.google.com/linux/linux_signing_key.pub
$ sudo yum install google-chrome-stable $ google-chrome --version Google Chrome 97.0.4692.71
$ cd /tmp/ $ sudo wget https://chromedriver.storage.googleapis.com/97.0.4692.71/chromedriver_linux64.zip // <= version must match $ sudo unzip chromedriver_linux64.zip $ sudo mv chromedriver /usr/bin/chromedriver $ chromedriver --version
$ pip3 install selenium $ vi test.py
from selenium.webdriver.chrome.options import Options from selenium import webdriver
# Smoke test: load NAVER in headless Chrome and print the hidden site title.
url = 'https://naver.com/'

options = Options()
options.add_argument("--headless")
options.add_argument("window-size=1400,1500")

driver = webdriver.Chrome(options=options)
try:
    driver.get(url)
    # The 'blind' class holds visually-hidden text (e.g. the site name).
    element = driver.find_element_by_class_name('blind')
    print(element.text)
finally:
    # Always release the browser process, even if the lookup fails.
    driver.quit()
$ python3 test.py NAVER whale
$ pip3 install bs4 // <= used for parsing $ pip3 install requests $ pip3 install lxml $ pip3 install html5lib
$ vi getnews.py
from datetime import datetime import requests from bs4 import BeautifulSoup from selenium.webdriver.chrome.options import Options from selenium import webdriver
def get_news_info(driver, news_url, publisher):
    """Scrape writer, body text and publish date from one article page.

    Args:
        driver: a live selenium WebDriver. The caller owns its lifetime --
            this function must NOT quit it (see NOTE below).
        news_url: absolute URL of the article page.
        publisher: publisher name; only '조선일보' is currently supported,
            any other publisher yields empty results.

    Returns:
        (writer, article, date) -- writer/article are strings (empty when
        unsupported), date is a datetime or None.
    """
    date, article, writer = None, '', ''
    if publisher == '조선일보':
        from selenium.common.exceptions import NoSuchElementException

        driver.implicitly_wait(3)
        driver.get(news_url)
        writer = driver.find_element_by_xpath(
            '//*[@id="fusion-app"]/div[1]/div[2]/div/section/article/div[1]/div/a'
        ).text.split()[0]
        # date like 2021.10.16 14:48 -- [3:] strips the 3-char label prefix
        # in front of the timestamp.
        date_time_obj = driver.find_element_by_xpath(
            '//*[@id="fusion-app"]/div[1]/div[2]/div/section/article/div[2]/span'
        ).text[3:]
        date = datetime.strptime(date_time_obj, '%Y.%m.%d %H:%M')
        # Collect body paragraphs p[1], p[2], ... until the XPath stops
        # matching. Catch only the expected "not found" error instead of a
        # bare except, so Ctrl-C and real bugs are not swallowed.
        paragraphs = []
        i = 1
        while True:
            try:
                paragraphs.append(driver.find_element_by_xpath(
                    '//*[@id="fusion-app"]/div[1]/div[2]/div/section/article/section/p[{0}]'.format(i)
                ).text)
                i += 1
            except NoSuchElementException:
                break
        article = ''.join(paragraphs)
        # NOTE(fix): the original called driver.quit() here, which killed the
        # shared driver that get_news_heads() reuses for every headline --
        # every call after the first then failed. Cleanup belongs to the
        # caller's try/finally.
    print('================================================')
    print(publisher, ', writer : ', writer)
    print(' date :', date)
    print(' article :', article)
    print('================================================')
    return writer, article, date
def get_news_heads(driver):
    """Walk the Google News topic page and scrape each listed article.

    Reads the module-level ``url`` (the topic page), extracts every headline
    link plus its publisher, and hands each article off to get_news_info().
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    headline_links = soup.select('div > article > h3 > a')
    for idx, link in enumerate(headline_links):
        # nth-child must be rewritten as nth-of-type for this selector.
        publisher_nodes = soup.select('main > div.lBwEZb.BL5WZb.GndZbb > div:nth-of-type({0}) > div > div > article > div > div > a'.format(idx+1))
        if not publisher_nodes:
            # No publisher entry for this slot -- skip the headline.
            continue

        publisher = publisher_nodes[0].getText()
        title = link.text
        news_url = 'https://news.google.com' + str(link.get('href'))

        print('---------------------------------')
        print('IDX : ', idx, publisher, title , news_url)

        get_news_info(driver, news_url, publisher)
# --- script entry: scrape the Korean headline topic on Google News ---
# `url` is read as a global by get_news_heads(); keep the name as-is.
url = 'https://news.google.com/topics/CAAqIQgKIhtDQkFTRGdvSUwyMHZNRFp4WkRNU0FtdHZLQUFQAQ?hl=ko&gl=KR&ceid=KR%3Ako'  # google news page

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("window-size=1400,1500")

driver = webdriver.Chrome(options=chrome_options)
try:
    get_news_heads(driver)
finally:
    # Guarantee the browser process is torn down on any exit path.
    driver.quit()
|