wip: multi page scraping and worker

This commit is contained in:
Jayden
2024-07-21 00:22:33 -05:00
parent 2b2de523d4
commit 14b229e07e
14 changed files with 2521 additions and 1672 deletions


@@ -1,10 +1,11 @@
# STL
import logging
from typing import Any, Optional
# PDM
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver
from seleniumwire import webdriver
from lxml.etree import _Element # type: ignore [reportPrivateImport]
from fake_useragent import UserAgent
from webdriver_manager.chrome import ChromeDriverManager
@@ -13,6 +14,7 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.service import Service
from urllib.parse import urlparse, urljoin
# LOCAL
from api.backend.models import Element, CapturedElement
@@ -23,6 +25,14 @@ LOG = logging.getLogger(__name__)
class HtmlElement(_Element): ...

def is_same_domain(url: str, original_url: str) -> bool:
    parsed_url = urlparse(url)
    parsed_original_url = urlparse(original_url)
    LOG.info(f"PARSED: {parsed_url.netloc}")
    LOG.info(f"PARSED_ORIGINAL: {parsed_original_url.netloc}")
    return parsed_url.netloc == parsed_original_url.netloc or parsed_url.netloc == ""
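A relative link parses to an empty netloc, so it is treated as same-domain. A quick illustrative check (example URLs are hypothetical, not from the diff):

# Illustrative only; example URLs are made up.
assert is_same_domain("https://example.com/page/2", "https://example.com/")
assert is_same_domain("/page/2", "https://example.com/")  # relative href -> empty netloc
assert not is_same_domain("https://other.org/", "https://example.com/")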
def clean_xpath(xpath: str) -> str:
    parts = xpath.split("/")
    clean_parts: list[str] = []
@@ -41,10 +51,25 @@ def sxpath(context: _Element, xpath: str) -> list[HtmlElement]:
    return context.xpath(xpath)  # type: ignore [reportReturnType]

async def make_site_request(url: str) -> str:
    """Make basic `GET` request to site using Selenium."""
    ua = UserAgent()
def interceptor(headers: dict[str, Any]):
    def _interceptor(request: Any):
        for key, val in headers.items():
            if request.headers.get(key):
                del request.headers[key]
            request.headers[key] = val

        if "sec-ch-ua" in request.headers:
            original_value = request.headers["sec-ch-ua"]
            del request.headers["sec-ch-ua"]
            modified_value = original_value.replace("HeadlessChrome", "Chrome")
            request.headers["sec-ch-ua"] = modified_value

    return _interceptor
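The returned closure is meant to be assigned to a selenium-wire driver's `request_interceptor` (as `make_site_request` does below). A minimal sketch of its effect on headers, using a hypothetical stand-in for the request object:

# Hypothetical stand-in; a real selenium-wire request exposes a headers mapping
# supporting the same get/del/set operations used above.
class _FakeRequest:
    def __init__(self) -> None:
        self.headers = {"Accept-Language": "fr-FR", "sec-ch-ua": '"HeadlessChrome";v="125"'}

req = _FakeRequest()
interceptor({"Accept-Language": "en-US"})(req)
# req.headers -> {"Accept-Language": "en-US", "sec-ch-ua": '"Chrome";v="125"'}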
def create_driver():
    ua = UserAgent()
    chrome_options = ChromeOptions()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
@@ -52,23 +77,65 @@ async def make_site_request(url: str) -> str:
    chrome_options.add_argument(f"user-agent={ua.random}")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(options=chrome_options, service=service)
    return webdriver.Chrome(options=chrome_options, service=service)
async def make_site_request(
    url: str,
    headers: Optional[dict[str, Any]],
    multi_page_scrape: bool = False,
    visited_urls: set[str] = set(),
    pages: set[tuple[str, str]] = set(),
    original_url: str = "",
) -> None:
    """Make basic `GET` request to site using Selenium."""
    # Check if URL has already been visited
    if url in visited_urls:
        return

    driver = create_driver()

    if headers:
        driver.request_interceptor = interceptor(headers)

    try:
        driver.get(url)
        visited_urls.add(url)
        _ = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        page_source = driver.page_source
        LOG.debug(f"Page source for url: {url}\n{page_source}")
        pages.add((page_source, url))
    finally:
        driver.quit()

    LOG.debug(f"Page source for url: {url}\n{page_source}")
    return page_source

    if not multi_page_scrape:
        return

    soup = BeautifulSoup(page_source, "html.parser")

    for a_tag in soup.find_all("a"):
        link = a_tag.get("href")
        if link:
            if not urlparse(link).netloc:
                base_url = "{0.scheme}://{0.netloc}".format(urlparse(original_url))
                link = urljoin(base_url, link)
            if link not in visited_urls and is_same_domain(link, original_url):
                await make_site_request(
                    link,
                    headers=headers,
                    multi_page_scrape=multi_page_scrape,
                    visited_urls=visited_urls,
                    pages=pages,
                    original_url=original_url,
                )
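For context, the normalization step above absolutizes relative hrefs before the same-domain check; a small illustration with hypothetical URLs:

# Illustrative only; mirrors the base_url/urljoin logic above.
base_url = "{0.scheme}://{0.netloc}".format(urlparse("https://example.com/blog/post-1"))
urljoin(base_url, "/blog/post-2")  # -> "https://example.com/blog/post-2"
urljoin(base_url, "page-2.html")   # -> "https://example.com/page-2.html"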
async def collect_scraped_elements(page: str, xpaths: list[Element]):
    soup = BeautifulSoup(page, "lxml")

async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element]):
    soup = BeautifulSoup(page[0], "lxml")
    root = etree.HTML(str(soup))
    elements: dict[str, list[CapturedElement]] = dict()
@@ -86,11 +153,30 @@ async def collect_scraped_elements(page: str, xpaths: list[Element]):
            elements[elem.name] = [captured_element]

    return elements
    return {page[1]: elements}
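With this change `collect_scraped_elements` keys its result by the page URL instead of returning the bare element mapping. Roughly, the returned value looks like this (shape only, values hypothetical):

# Illustrative shape of the new return value:
# {
#     "https://example.com/page-1": {
#         "title": [CapturedElement(...)],  # one entry per node matched by the element's xpath
#     },
# }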
async def scrape(url: str, xpaths: list[Element]):
    page = await make_site_request(url)
    elements = await collect_scraped_elements(page, xpaths)

async def scrape(
    url: str,
    xpaths: list[Element],
    headers: Optional[dict[str, Any]],
    multi_page_scrape: bool = False,
):
    visited_urls: set[str] = set()
    pages: set[tuple[str, str]] = set()

    _ = await make_site_request(
        url,
        headers,
        multi_page_scrape=multi_page_scrape,
        visited_urls=visited_urls,
        pages=pages,
        original_url=url,
    )

    elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()
    for page in pages:
        elements.append(await collect_scraped_elements(page, xpaths))

    return elements
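A minimal usage sketch under the new signature; the URL, header, and `Element(...)` arguments are assumptions based on how the fields are read above (`elem.name`, `elem.xpath`), not values taken from the diff:

import asyncio

from api.backend.models import Element

results = asyncio.run(
    scrape(
        "https://example.com",                  # hypothetical start URL
        [Element(name="title", xpath="//h1")],  # hypothetical target element
        headers={"Accept-Language": "en-US"},
        multi_page_scrape=True,
    )
)
# results: one {url: {element_name: [CapturedElement, ...]}} entry per page crawled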