mirror of https://github.com/jaypyles/Scraperr.git, synced 2025-12-15 12:16:37 +00:00
wip: multi page scraping and worker
@@ -1,10 +1,11 @@
# STL
import logging
from typing import Any, Optional

# PDM
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver
from seleniumwire import webdriver
from lxml.etree import _Element  # type: ignore [reportPrivateImport]
from fake_useragent import UserAgent
from webdriver_manager.chrome import ChromeDriverManager
@@ -13,6 +14,7 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.service import Service
from urllib.parse import urlparse, urljoin

# LOCAL
from api.backend.models import Element, CapturedElement
@@ -23,6 +25,14 @@ LOG = logging.getLogger(__name__)
class HtmlElement(_Element): ...


def is_same_domain(url: str, original_url: str) -> bool:
    parsed_url = urlparse(url)
    parsed_original_url = urlparse(original_url)
    LOG.info(f"PARSED: {parsed_url.netloc}")
    LOG.info(f"PARSED_ORIGINAL: {parsed_original_url.netloc}")
    return parsed_url.netloc == parsed_original_url.netloc or parsed_url.netloc == ""


def clean_xpath(xpath: str) -> str:
    parts = xpath.split("/")
    clean_parts: list[str] = []
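For context, is_same_domain compares only the netloc of a candidate link against the original URL, and an empty netloc (a relative link) counts as same-domain. A minimal sketch of that behaviour, not part of the commit, using made-up URLs:

from urllib.parse import urlparse

original = "https://example.com/start"

for candidate in ("https://example.com/page/2", "/page/3", "https://other.org/"):
    parsed = urlparse(candidate)
    # Same comparison as is_same_domain: matching netloc, or no netloc at all.
    same = parsed.netloc == urlparse(original).netloc or parsed.netloc == ""
    print(candidate, same)  # True, True (relative link), False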
@@ -41,10 +51,25 @@ def sxpath(context: _Element, xpath: str) -> list[HtmlElement]:
    return context.xpath(xpath)  # type: ignore [reportReturnType]


async def make_site_request(url: str) -> str:
    """Make basic `GET` request to site using Selenium."""
    ua = UserAgent()
def interceptor(headers: dict[str, Any]):
    def _interceptor(request: Any):
        for key, val in headers.items():
            if request.headers.get(key):
                del request.headers[key]

            request.headers[key] = val

        if "sec-ch-ua" in request.headers:
            original_value = request.headers["sec-ch-ua"]
            del request.headers["sec-ch-ua"]
            modified_value = original_value.replace("HeadlessChrome", "Chrome")
            request.headers["sec-ch-ua"] = modified_value

    return _interceptor


def create_driver():
    ua = UserAgent()
    chrome_options = ChromeOptions()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
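The interceptor above is written for selenium-wire, whose drivers expose a request_interceptor hook that plain Selenium lacks; that is presumably why the import switches from selenium to seleniumwire. A rough usage sketch, not part of the commit, assuming the create_driver and interceptor helpers added in this file; the header and URL are placeholders:

# create_driver() and interceptor(...) are the module-level helpers added above.
driver = create_driver()

# interceptor(...) returns a closure that deletes and re-sets each supplied
# header on every outgoing request; selenium-wire calls it via this attribute.
driver.request_interceptor = interceptor({"Referer": "https://example.com"})
driver.get("https://example.com")
driver.quit()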
@@ -52,23 +77,65 @@
    chrome_options.add_argument(f"user-agent={ua.random}")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(options=chrome_options, service=service)
    return webdriver.Chrome(options=chrome_options, service=service)


async def make_site_request(
    url: str,
    headers: Optional[dict[str, Any]],
    multi_page_scrape: bool = False,
    visited_urls: set[str] = set(),
    pages: set[tuple[str, str]] = set(),
    original_url: str = "",
) -> None:
    """Make basic `GET` request to site using Selenium."""
    # Check if URL has already been visited
    if url in visited_urls:
        return

    driver = create_driver()

    if headers:
        driver.request_interceptor = interceptor(headers)

    try:
        driver.get(url)
        visited_urls.add(url)
        _ = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        page_source = driver.page_source
        LOG.debug(f"Page source for url: {url}\n{page_source}")
        pages.add((page_source, url))
    finally:
        driver.quit()

    LOG.debug(f"Page source for url: {url}\n{page_source}")
    return page_source
    if not multi_page_scrape:
        return

    soup = BeautifulSoup(page_source, "html.parser")

    for a_tag in soup.find_all("a"):
        link = a_tag.get("href")

        if link:
            if not urlparse(link).netloc:
                base_url = "{0.scheme}://{0.netloc}".format(urlparse(original_url))
                link = urljoin(base_url, link)

            if link not in visited_urls and is_same_domain(link, original_url):
                await make_site_request(
                    link,
                    headers=headers,
                    multi_page_scrape=multi_page_scrape,
                    visited_urls=visited_urls,
                    pages=pages,
                    original_url=original_url,
                )


async def collect_scraped_elements(page: str, xpaths: list[Element]):
    soup = BeautifulSoup(page, "lxml")
async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element]):
    soup = BeautifulSoup(page[0], "lxml")
    root = etree.HTML(str(soup))

    elements: dict[str, list[CapturedElement]] = dict()
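The crawl in the hunk above resolves relative hrefs against the scheme and netloc of the original URL before recursing into them. A small illustration of that urljoin step, not part of the commit, with invented links:

from urllib.parse import urljoin, urlparse

original_url = "https://example.com/blog/post-1"
# Same base_url construction as in make_site_request: scheme + netloc only.
base_url = "{0.scheme}://{0.netloc}".format(urlparse(original_url))  # "https://example.com"

print(urljoin(base_url, "/about"))       # https://example.com/about
print(urljoin(base_url, "page-2.html"))  # https://example.com/page-2.html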
@@ -86,11 +153,30 @@ async def collect_scraped_elements(page: str, xpaths: list[Element]):
        elements[elem.name] = [captured_element]

    return elements
    return {page[1]: elements}


async def scrape(url: str, xpaths: list[Element]):
    page = await make_site_request(url)
    elements = await collect_scraped_elements(page, xpaths)
async def scrape(
    url: str,
    xpaths: list[Element],
    headers: Optional[dict[str, Any]],
    multi_page_scrape: bool = False,
):
    visited_urls: set[str] = set()
    pages: set[tuple[str, str]] = set()

    _ = await make_site_request(
        url,
        headers,
        multi_page_scrape=multi_page_scrape,
        visited_urls=visited_urls,
        pages=pages,
        original_url=url,
    )

    elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()

    for page in pages:
        elements.append(await collect_scraped_elements(page, xpaths))

    return elements
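For reference, a hedged sketch of how the reworked scrape entry point might be driven after this change; it is not part of the commit. The module path, the Element fields, the target URL, and the XPath are assumptions for illustration; the result shape follows the return values above, one {page_url: {name: [CapturedElement, ...]}} dict per crawled page:

import asyncio

from api.backend.models import Element
from api.backend.scraping import scrape  # assumed module path for the file changed here

async def main():
    # Hypothetical field definition; Element(name=..., xpath=...) is inferred
    # from how elem.name and the xpaths list are consumed in the diff.
    xpaths = [Element(name="title", xpath="//h1")]

    results = await scrape(
        "https://example.com",
        xpaths,
        headers=None,
        multi_page_scrape=True,
    )

    # Each entry maps a crawled page URL to its captured elements by name.
    for page_result in results:
        for page_url, captured in page_result.items():
            print(page_url, {name: len(items) for name, items in captured.items()})

asyncio.run(main())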