Fix/UI and backend fixes (#67)

* chore: wip

* chore: wip

* chore: wip

* fix: cypress test

* chore: cleanup code
Jayden Pyles authored on 2025-05-11 17:33:29 -05:00, committed by GitHub
parent 48d3bf9214
commit 031572325f
27 changed files with 672 additions and 503 deletions


@@ -1,28 +1,20 @@
 import logging
-from typing import Any, Optional
+import random
+from typing import Any, Optional, cast
 from bs4 import BeautifulSoup, Tag
 from lxml import etree
-from seleniumwire import webdriver
+from lxml.etree import _Element
-from fake_useragent import UserAgent
-from selenium.webdriver.chrome.options import Options as ChromeOptions
+from camoufox import AsyncCamoufox
+from playwright.async_api import Page
 from urllib.parse import urlparse, urljoin
 from api.backend.models import Element, CapturedElement
-from api.backend.job.site_mapping.site_mapping import (
-    handle_site_mapping,
-)
-from selenium.webdriver.chrome.service import Service
-from webdriver_manager.chrome import ChromeDriverManager

 LOG = logging.getLogger(__name__)

+from api.backend.job.scraping.scraping_utils import scrape_content
+from api.backend.job.site_mapping.site_mapping import handle_site_mapping
+
+
+class HtmlElement(_Element): ...


 def is_same_domain(url: str, original_url: str) -> bool:
     parsed_url = urlparse(url)
     parsed_original_url = urlparse(original_url)
@@ -31,68 +23,15 @@ def is_same_domain(url: str, original_url: str) -> bool:
 def clean_xpath(xpath: str) -> str:
     parts = xpath.split("/")
-    clean_parts: list[str] = []
-    for part in parts:
-        if part == "":
-            clean_parts.append("/")
-        else:
-            clean_parts.append(part)
-    clean_xpath = "//".join(clean_parts).replace("////", "//")
-    clean_xpath = clean_xpath.replace("'", "\\'")
+    clean_parts = ["/" if part == "" else part for part in parts]
+    clean_xpath = "//".join(clean_parts).replace("////", "//").replace("'", "\\'")
     LOG.info(f"Cleaned xpath: {clean_xpath}")
     return clean_xpath


+def sxpath(context: _Element, xpath: str) -> list[HtmlElement]:
+    return context.xpath(xpath)  # pyright: ignore [reportReturnType]


-def interceptor(headers: dict[str, Any]):
-    def _interceptor(request: Any):
-        for key, val in headers.items():
-            if request.headers.get(key):
-                del request.headers[key]
-            request.headers[key] = val
-        if "sec-ch-ua" in request.headers:
-            original_value = request.headers["sec-ch-ua"]
-            del request.headers["sec-ch-ua"]
-            modified_value = original_value.replace("HeadlessChrome", "Chrome")
-            request.headers["sec-ch-ua"] = modified_value
-    return _interceptor


-def create_driver(proxies: Optional[list[str]] = []):
-    ua = UserAgent()
-    chrome_options = ChromeOptions()
-    chrome_options.add_argument("--headless")
-    chrome_options.add_argument("--no-sandbox")
-    chrome_options.add_argument("--disable-dev-shm-usage")
-    chrome_options.add_argument(f"user-agent={ua.random}")
-
-    sw_options = {}
-    if proxies:
-        selected_proxy = random.choice(proxies)
-        LOG.info(f"Using proxy: {selected_proxy}")
-
-        sw_options = {
-            "proxy": {
-                "https": f"https://{selected_proxy}",
-                "http": f"http://{selected_proxy}",
-                "no_proxy": "localhost,127.0.0.1",
-            }
-        }
-
-    service = Service(ChromeDriverManager().install())
-    driver = webdriver.Chrome(
-        service=service,
-        options=chrome_options,
-        seleniumwire_options=sw_options,
-    )
-
-    return driver


-def sxpath(context: etree._Element, xpath: str):
-    return context.xpath(xpath)


 async def make_site_request(
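As a side note (not part of the commit): the new typed sxpath helper above is a thin wrapper over lxml's xpath(), with HtmlElement acting only as a typing shim for _Element. A minimal sketch of exercising it, assuming the module above is importable as-is:

    from lxml import etree

    # Parse a small document and query it through the typed helper.
    root = etree.HTML("<html><body><h1>Hello</h1><h1>World</h1></body></html>")
    for el in sxpath(root, "//h1"):
        # itertext() yields the text nodes beneath each matched element.
        print("".join(el.itertext()))  # "Hello", then "World"
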
@@ -102,91 +41,105 @@ async def make_site_request(
     visited_urls: set[str] = set(),
     pages: set[tuple[str, str]] = set(),
     original_url: str = "",
-    proxies: Optional[list[str]] = [],
+    proxies: Optional[list[str]] = None,
     site_map: Optional[dict[str, Any]] = None,
     collect_media: bool = False,
-) -> None:
-    """Make basic `GET` request to site using Selenium."""
-    # Check if URL has already been visited
+):
     if url in visited_urls:
         return

-    driver = create_driver(proxies)
-    driver.implicitly_wait(10)
+    proxy = None
+    if proxies:
+        proxy = random.choice(proxies)
+        LOG.info(f"Using proxy: {proxy}")

-    if headers:
-        driver.request_interceptor = interceptor(headers)
+    async with AsyncCamoufox(headless=True, proxy=proxy) as browser:
+        page: Page = await browser.new_page()
+
+        if headers:
+            await page.set_extra_http_headers(headers)

-    try:
-        LOG.info(f"Visiting URL: {url}")
-        driver.get(url)
-        final_url = driver.current_url
-        visited_urls.add(url)
-        visited_urls.add(final_url)
+        try:
+            await page.goto(url, timeout=60000)
+            await page.wait_for_load_state("networkidle", timeout=10000)

-        page_source = scrape_content(driver, pages, collect_media)
+            final_url = page.url

-        if site_map:
-            LOG.info("Site map: %s", site_map)
-            _ = await handle_site_mapping(
-                site_map,
-                driver,
-                pages,
-            )
-    finally:
-        driver.quit()
+            visited_urls.add(url)
+            visited_urls.add(final_url)
+
+            html_content = await scrape_content(page, pages, collect_media)
+            html_content = await page.content()
+
+            pages.add((html_content, final_url))
+
+            if site_map:
+                await handle_site_mapping(
+                    site_map, page, pages, collect_media=collect_media
+                )
+        finally:
+            await page.close()
+            await browser.close()

     if not multi_page_scrape:
         return

-    soup = BeautifulSoup(page_source, "html.parser")
+    soup = BeautifulSoup(html_content, "html.parser")

     for a_tag in soup.find_all("a"):
         if not isinstance(a_tag, Tag):
             continue

-        link = str(a_tag.get("href", ""))
+        link = cast(str, a_tag.get("href", ""))

-        if link:
-            if not urlparse(link).netloc:
-                base_url = "{0.scheme}://{0.netloc}".format(urlparse(final_url))
-                link = urljoin(base_url, link)
-
-        if not link:
-            continue
-
-        if link not in visited_urls and is_same_domain(link, original_url):
-            await make_site_request(
-                link,
-                headers=headers,
-                multi_page_scrape=multi_page_scrape,
-                visited_urls=visited_urls,
-                pages=pages,
-                original_url=original_url,
-            )
+        if not urlparse(link).netloc:
+            base_url = "{0.scheme}://{0.netloc}".format(urlparse(final_url))
+            link = urljoin(base_url, link)
+
+        if link not in visited_urls and is_same_domain(link, original_url):
+            await make_site_request(
+                link,
+                headers=headers,
+                multi_page_scrape=multi_page_scrape,
+                visited_urls=visited_urls,
+                pages=pages,
+                original_url=original_url,
+                proxies=proxies,
+                site_map=site_map,
+                collect_media=collect_media,
+            )


 async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element]):
     soup = BeautifulSoup(page[0], "lxml")
     root = etree.HTML(str(soup))

-    elements: dict[str, list[CapturedElement]] = dict()
+    elements: dict[str, list[CapturedElement]] = {}

     for elem in xpaths:
         el = sxpath(root, elem.xpath)

-        for e in el:
-            if isinstance(e, etree._Element):  # type: ignore
-                text = "\t".join(str(t) for t in e.itertext())
-            else:
-                text = str(e)
+        for e in el:  # type: ignore
+            text = (
+                "\t".join(str(t) for t in e.itertext())
+                if isinstance(e, etree._Element)
+                else str(e)  # type: ignore
+            )

             captured_element = CapturedElement(
                 xpath=elem.xpath, text=text, name=elem.name
             )

             if elem.name in elements:
                 elements[elem.name].append(captured_element)
-                continue
-
-            elements[elem.name] = [captured_element]
+            else:
+                elements[elem.name] = [captured_element]

     return {page[1]: elements}
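For reference, a hypothetical call to collect_scraped_elements on a single scraped page (not from the commit). It assumes Element can be constructed from just name and xpath; the real api.backend.models definition may require more fields:

    import asyncio

    from api.backend.models import Element

    html = "<html><body><h1>Post title</h1></body></html>"
    page = (html, "https://example.com/post")       # (content, url), as stored in `pages`
    xpaths = [Element(name="title", xpath="//h1")]   # hypothetical field set

    # Expected shape: {"https://example.com/post": {"title": [CapturedElement(...)]}}
    print(asyncio.run(collect_scraped_elements(page, xpaths)))
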
@@ -194,18 +147,18 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
 async def scrape(
     url: str,
     xpaths: list[Element],
-    headers: Optional[dict[str, Any]],
+    headers: Optional[dict[str, Any]] = None,
     multi_page_scrape: bool = False,
-    proxies: Optional[list[str]] = [],
+    proxies: Optional[list[str]] = None,
     site_map: Optional[dict[str, Any]] = None,
     collect_media: bool = False,
 ):
     visited_urls: set[str] = set()
     pages: set[tuple[str, str]] = set()

-    _ = await make_site_request(
+    await make_site_request(
         url,
-        headers,
+        headers=headers,
         multi_page_scrape=multi_page_scrape,
         visited_urls=visited_urls,
         pages=pages,
@@ -215,7 +168,7 @@ async def scrape(
         collect_media=collect_media,
     )

-    elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()
+    elements: list[dict[str, dict[str, list[CapturedElement]]]] = []

     for page in pages:
         elements.append(await collect_scraped_elements(page, xpaths))
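Finally, a minimal sketch of driving the refactored async scrape() entry point with its new keyword defaults (headers and proxies may now be omitted). Not part of the commit; it assumes scrape() ultimately returns the collected elements (the return statement falls outside the hunks shown) and reuses the hypothetical Element construction from the previous sketch:

    import asyncio

    from api.backend.models import Element

    async def main():
        results = await scrape(
            url="https://example.com",
            xpaths=[Element(name="title", xpath="//h1")],  # hypothetical field set
            multi_page_scrape=False,
            collect_media=False,
        )
        print(results)

    asyncio.run(main())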