mirror of
https://github.com/jaypyles/Scraperr.git
synced 2025-12-14 11:46:17 +00:00
feat: add in media downloading (#62)
* feat: add in media downloading
* fix: build issue
This commit is contained in:
@@ -1,13 +1,19 @@
|
||||
import time
|
||||
from typing import cast
|
||||
|
||||
from selenium import webdriver
|
||||
from seleniumwire import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
|
||||
from api.backend.utils import LOG
|
||||
|
||||
def scrape_content(driver: webdriver.Chrome, pages: set[tuple[str, str]]):
|
||||
from api.backend.job.scraping.collect_media import collect_media as collect_media_utils
|
||||
|
||||
|
||||
def scrape_content(
|
||||
driver: webdriver.Chrome, pages: set[tuple[str, str]], collect_media: bool
|
||||
):
|
||||
_ = WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.TAG_NAME, "body"))
|
||||
)
|
||||
@@ -27,4 +33,9 @@ def scrape_content(driver: webdriver.Chrome, pages: set[tuple[str, str]]):
|
||||
last_height = new_height
|
||||
|
||||
pages.add((driver.page_source, driver.current_url))
|
||||
|
||||
if collect_media:
|
||||
LOG.info("Collecting media")
|
||||
collect_media_utils(driver)
|
||||
|
||||
return driver.page_source
|
||||
|
||||
Reference in New Issue
Block a user