[Python] スクレイピング

Selenium

Webブラウザーのオートメーションを実現、支援するためのツール及びライブラリ群。Python等から使用できます。

https://www.selenium.dev/documentation/en/ (公式)

https://readthedocs.org/projects/selenium-python/downloads/pdf/latest/ (非公式 – 詳細あり)

簡単な例

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import presence_of_element_located

# This example requires Selenium WebDriver 3.13 or newer.
# Searches Google for "cheese" and prints the text of the first result heading.
with webdriver.Firefox() as driver:
    wait = WebDriverWait(driver, 10)
    driver.get("https://google.com/ncr")

    # Type the query and submit it with the RETURN key in one send.
    search_box = driver.find_element(By.NAME, "q")
    search_box.send_keys("cheese" + Keys.RETURN)

    # Block (up to the wait's timeout) until a result heading is in the DOM.
    result_locator = (By.CSS_SELECTOR, "h3>div")
    first_result = wait.until(presence_of_element_located(result_locator))
    print(first_result.get_attribute("textContent"))

Webドライバーの設定(Chromeの場合)

from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service

# Location of the chromedriver binary on this machine.
executable_path = '/path/to/chromedriver'

# Simple assignment.
# The driver path is handed over through a Service object; passing it
# positionally to Chrome() is not accepted by current Selenium 4 releases.
driver = Chrome(service=Service(executable_path))

# Or use the context manager (an alternative to the assignment above):
# the browser session is ended automatically when the block exits.
with Chrome(service=Service(executable_path)) as driver:
    # your code inside this indent
    pass  # placeholder so the block is syntactically valid

Webサイトのナビゲーション

# Load the given URL in the browser controlled by `driver`.
driver.get("https://selenium.dev")

# Get current URL (the value is not stored here; assign it to use it).
driver.current_url

# Move one step back in the browser history.
driver.back()

# Move one step forward in the browser history.
driver.forward()

# Reload the current page.
driver.refresh()

# Title of the current page (the value is not stored here; assign it to use it).
driver.title

# Quitting the browser at the end of a session
driver.quit()

例外処理

try:
    # WebDriver code here...
    pass  # placeholder: a comment alone is not a valid block body
finally:
    # Runs whether or not the code above raised, so the browser
    # session is always ended.
    driver.quit()

例外処理(コンテキストマネージャを使用した場合)

with webdriver.Firefox() as driver:
    # WebDriver code here...
    pass  # placeholder: a comment alone is not a valid block body

# WebDriver will automatically quit after indentation

要素の特定

# Locate the outer element first, then search only inside it.
cheese = driver.find_element(By.ID, "cheese")
cheddar = cheese.find_element(By.ID, "cheddar")

# The same lookup in a single call, using a CSS selector.
# Uses find_element(By...) like the rest of this file; the legacy
# find_element_by_* helpers are not available in current Selenium 4 releases.
cheddar = driver.find_element(By.CSS_SELECTOR, "#cheese #cheddar")

# find_elements (plural) collects every match instead of only the first.
mucho_cheese = driver.find_elements(By.CSS_SELECTOR, "#cheese li")

# Relative locators: find an element by its position relative to another one.
# Imported once here; every example below needs it.
from selenium.webdriver.support.relative_locator import with_tag_name

# above: the <input> located above the password field
passwordField = driver.find_element(By.ID, "password")
emailAddressField = driver.find_element(with_tag_name("input").above(passwordField))

# below: the <input> located below the e-mail field
emailAddressField = driver.find_element(By.ID, "email")
passwordField = driver.find_element(with_tag_name("input").below(emailAddressField))

# to_left_of: the <button> located to the left of the submit button
submitButton = driver.find_element(By.ID, "submit")
cancelButton = driver.find_element(with_tag_name("button").to_left_of(submitButton))

# to_right_of: the <button> located to the right of the cancel button
cancelButton = driver.find_element(By.ID, "cancel")
submitButton = driver.find_element(with_tag_name("button").to_right_of(cancelButton))

# near: the <input> located close to the e-mail label
emailAddressLabel = driver.find_element(By.ID, "lbl-email")
emailAddressField = driver.find_element(with_tag_name("input").near(emailAddressLabel))

アクションの実行

# ActionChains is not imported anywhere else in this file, so bring it in here.
from selenium.webdriver import ActionChains

# Type text into the form field whose name attribute is "name".
name = "Charles"
driver.find_element(By.NAME, "name").send_keys(name)

# Drag one element onto another. The chain only queues the action;
# perform() is what actually sends it to the browser.
source = driver.find_element(By.ID, "source")
target = driver.find_element(By.ID, "target")
ActionChains(driver).drag_and_drop(source, target).perform()

# Click the form's submit button.
driver.find_element(By.CSS_SELECTOR, "input[type='submit']").click()