feat: add media viewer + other fixes (#79)

* feat: add media viewer + other fixes

* chore: remove logging [skip ci]

* chore: remove logging [skip ci]

* feat: add unit test for media

* feat: add unit test for media

* feat: add unit test for media [skip ci]

* feat: add unit test for media [skip ci]

* feat: add unit test for media [skip ci]

* feat: add unit test for media [skip ci]

* chore: update docs [skip ci]
Author: Jayden Pyles
Date: 2025-05-17 16:31:34 -05:00
Committed by: GitHub
Parent: f815a58efc
Commit: 263e46ba4d
38 changed files with 1047 additions and 84 deletions


@@ -1,5 +1,4 @@
 import logging
-from pickle import FALSE
 import random
 from typing import Any, Optional, cast
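
The dropped import looks like an editor auto-import slip: pickle's FALSE is a protocol-0 opcode string, not the boolean, so nothing in the scraper could have used it meaningfully. A quick check (a sketch for illustration, not part of the diff):

import pickle

# pickle.FALSE is opcode bytes used by the pickle protocol,
# not a boolean constant, which is why dropping the import is safe.
print(pickle.FALSE)  # b'I00\n' on CPython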
@@ -40,6 +39,7 @@ def sxpath(context: etree._Element, xpath: str):
 async def make_site_request(
+    id: str,
     url: str,
     headers: Optional[dict[str, Any]],
     multi_page_scrape: bool = False,
@@ -71,14 +71,14 @@ async def make_site_request(
     try:
         await page.goto(url, timeout=60000)
-        await page.wait_for_load_state("networkidle", timeout=10000)
+        await page.wait_for_load_state("networkidle")
         final_url = page.url
         visited_urls.add(url)
         visited_urls.add(final_url)
-        html_content = await scrape_content(page, pages, collect_media)
+        html_content = await scrape_content(id, page, pages, collect_media)
         html_content = await page.content()
         pages.add((html_content, final_url))
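
The job id is now threaded into scrape_content, presumably so media collected from a page can be grouped per job. The full definition is outside this hunk; below is a minimal sketch of what the new call site implies, where every name other than scrape_content (including the helper) is an assumption:

from playwright.async_api import Page

async def save_media_for_job(job_id: str, page: Page) -> None:
    """Hypothetical helper: download page media into a per-job location."""
    ...

# Hypothetical signature implied by the call site above; the real
# scrape_content in this PR may differ.
async def scrape_content(
    id: str,
    page: Page,
    pages: set[tuple[str, str]],
    collect_media: bool,
) -> str:
    html = await page.content()
    if collect_media:
        # key downloaded media to the job `id`
        await save_media_for_job(id, page)
    return html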
@@ -112,6 +112,7 @@ async def make_site_request(
         if link not in visited_urls and is_same_domain(link, original_url):
             await make_site_request(
+                id,
                 link,
                 headers=headers,
                 multi_page_scrape=multi_page_scrape,
@@ -136,11 +137,20 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
         for e in el:  # type: ignore
             text = (
-                "\t".join(str(t) for t in e.itertext())
+                " ".join(str(t) for t in e.itertext())
                 if isinstance(e, etree._Element)
                 else str(e)  # type: ignore
             )
+            text = text.strip()
+            text = text.replace("\n", " ")
+            text = text.replace("\t", " ")
+            text = text.replace("\r", " ")
+            text = text.replace("\f", " ")
+            text = text.replace("\v", " ")
+            text = text.replace("\b", " ")
+            text = text.replace("\a", " ")
             captured_element = CapturedElement(
                 xpath=elem.xpath, text=text, name=elem.name
             )
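
The new chain of replace calls strips the text and then collapses newlines, tabs, and other control characters into single spaces. For comparison only (this is not the PR's code), the same cleanup can be written as one regex substitution:

import re

def normalize_text(text: str) -> str:
    # Replace \n, \t, \r, \f, \v, \b (backspace) and \a (bell) with spaces,
    # matching the effect of the chained str.replace calls above.
    return re.sub(r"[\n\t\r\f\v\b\a]", " ", text.strip())

# normalize_text("a\tb\nc")  -> "a b c"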
@@ -154,6 +164,7 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
 async def scrape(
+    id: str,
     url: str,
     xpaths: list[Element],
     headers: Optional[dict[str, Any]] = None,
@@ -167,6 +178,7 @@ async def scrape(
     pages: set[tuple[str, str]] = set()
     await make_site_request(
+        id,
     url,
         headers=headers,
         multi_page_scrape=multi_page_scrape,
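
Taken together, these hunks thread the job id from scrape() through make_site_request() down to scrape_content(). A hedged sketch of calling the updated scrape() follows; the import paths and the Element constructor fields are assumptions, only the parameter order comes from the hunks above:

import asyncio

from api.backend.scraping import scrape  # hypothetical module path
from api.backend.models import Element   # hypothetical module path

async def main() -> None:
    results = await scrape(
        "job-123",                # new leading job id added in this PR
        "https://example.com",
        xpaths=[Element(name="title", xpath="//title")],
        headers=None,
        multi_page_scrape=False,
    )
    print(results)

asyncio.run(main())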