mirror of
https://github.com/jaypyles/Scraperr.git
synced 2025-12-11 18:26:08 +00:00
feat: add media viewer + other fixes (#79)
* feat: add media viewer + other fixes * chore: remove logging [skip ci] * chore: remove logging [skip ci] * feat: add unit test for media * feat: add unit test for media * feat: add unit test for media [skip ci] * feat: add unit test for media [skip ci] * feat: add unit test for media [skip ci] * feat: add unit test for media [skip ci] * chore: update docs [skip ci]
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
import re
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from typing import Dict, List
|
||||
|
||||
import aiohttp
|
||||
@@ -9,12 +10,12 @@ from playwright.async_api import Page
|
||||
from api.backend.utils import LOG
|
||||
|
||||
|
||||
async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
|
||||
async def collect_media(id: str, page: Page) -> dict[str, list[dict[str, str]]]:
|
||||
media_types = {
|
||||
"images": "img",
|
||||
"videos": "video",
|
||||
"audio": "audio",
|
||||
"pdfs": 'a[href$=".pdf"]',
|
||||
"pdfs": 'a[href$=".pdf"], a[href*=".pdf#page="]',
|
||||
"documents": 'a[href$=".doc"], a[href$=".docx"], a[href$=".txt"], a[href$=".rtf"]',
|
||||
"presentations": 'a[href$=".ppt"], a[href$=".pptx"]',
|
||||
"spreadsheets": 'a[href$=".xls"], a[href$=".xlsx"], a[href$=".csv"]',
|
||||
@@ -48,6 +49,11 @@ async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
|
||||
root_domain = f"{root_url.scheme}://{root_url.netloc}"
|
||||
url = f"{root_domain}{url}"
|
||||
|
||||
if url and re.match(r"^[\w\-]+/", url):
|
||||
root_url = urlparse(page.url)
|
||||
root_domain = f"{root_url.scheme}://{root_url.netloc}"
|
||||
url = urljoin(root_domain + "/", url)
|
||||
|
||||
if url and url.startswith(("http://", "https://")):
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
@@ -67,15 +73,20 @@ async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
|
||||
}.get(media_type, "")
|
||||
filename += ext
|
||||
|
||||
file_path = media_dir / filename
|
||||
if not os.path.exists(media_dir / id):
|
||||
os.makedirs(media_dir / id, exist_ok=True)
|
||||
|
||||
file_path = media_dir / id / filename
|
||||
|
||||
async with session.get(url) as response:
|
||||
response.raise_for_status()
|
||||
|
||||
with open(file_path, "wb") as f:
|
||||
while True:
|
||||
chunk = await response.content.read(8192)
|
||||
if not chunk:
|
||||
break
|
||||
|
||||
f.write(chunk)
|
||||
|
||||
urls.append({"url": url, "local_path": str(file_path)})
|
||||
|
||||
Reference in New Issue
Block a user