feat: add in media downloading (#62)

* feat: add in media downloading

* fix: build issue
Author: Jayden Pyles
Date: 2025-05-10 15:14:54 -05:00
Committed by: GitHub
Parent: a58212b214
Commit: 8cd30599fa
13 changed files with 136 additions and 8 deletions

View File

@@ -12,3 +12,4 @@ class JobOptions(BaseModel):
     custom_headers: dict[str, Any] = {}
     proxies: list[str] = []
     site_map: Optional[SiteMap] = None
+    collect_media: bool = False
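
The new flag defaults to off, so existing jobs are unchanged. A minimal sketch of setting it on the pydantic model (the import path is an assumption, and it presumes the fields above line 12 of the model, not shown in this hunk, also have defaults):

    from api.backend.models import JobOptions  # import path assumed, not confirmed by the diff

    options = JobOptions(collect_media=True)   # custom_headers/proxies/site_map keep their defaults
    print(options.collect_media)               # True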

View File

@@ -0,0 +1,91 @@
import os
import requests
from pathlib import Path
from selenium.webdriver.common.by import By
from seleniumwire import webdriver
from urllib.parse import urlparse

from api.backend.utils import LOG


def collect_media(driver: webdriver.Chrome):
    """Find media elements on the current page and download them under ./media."""
    media_types = {
        "images": "img",
        "videos": "video",
        "audio": "audio",
        "pdfs": 'a[href$=".pdf"]',
        "documents": 'a[href$=".doc"], a[href$=".docx"], a[href$=".txt"], a[href$=".rtf"]',
        "presentations": 'a[href$=".ppt"], a[href$=".pptx"]',
        "spreadsheets": 'a[href$=".xls"], a[href$=".xlsx"], a[href$=".csv"]',
    }

    base_dir = Path("media")
    base_dir.mkdir(exist_ok=True)

    media_urls = {}

    for media_type, selector in media_types.items():
        elements = driver.find_elements(By.CSS_SELECTOR, selector)
        urls: list[dict[str, str]] = []

        media_dir = base_dir / media_type
        media_dir.mkdir(exist_ok=True)

        for element in elements:
            # Images and videos carry their source in src (or data-src when lazy-loaded);
            # the remaining types are links, so the URL lives in href.
            if media_type == "images":
                url = element.get_attribute("src")
            elif media_type == "videos":
                url = element.get_attribute("src") or element.get_attribute("data-src")
            else:
                url = element.get_attribute("href")

            if url and url.startswith(("http://", "https://")):
                try:
                    filename = os.path.basename(urlparse(url).path)

                    # Fall back to a generated name with a default extension when the
                    # URL path does not end in a usable filename.
                    if not filename:
                        filename = f"{media_type}_{len(urls)}"
                        if media_type == "images":
                            filename += ".jpg"
                        elif media_type == "videos":
                            filename += ".mp4"
                        elif media_type == "audio":
                            filename += ".mp3"
                        elif media_type == "pdfs":
                            filename += ".pdf"
                        elif media_type == "documents":
                            filename += ".doc"
                        elif media_type == "presentations":
                            filename += ".ppt"
                        elif media_type == "spreadsheets":
                            filename += ".xls"

                    response = requests.get(url, stream=True)
                    response.raise_for_status()

                    # Save the file
                    file_path = media_dir / filename
                    with open(file_path, "wb") as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:
                                f.write(chunk)

                    urls.append({"url": url, "local_path": str(file_path)})
                    LOG.info(f"Downloaded {filename} to {file_path}")

                except Exception as e:
                    LOG.error(f"Error downloading {url}: {str(e)}")
                    continue

        media_urls[media_type] = urls

    # Write a plain-text summary of everything that was downloaded.
    with open(base_dir / "download_summary.txt", "w") as f:
        for media_type, downloads in media_urls.items():
            if downloads:
                f.write(f"\n=== {media_type.upper()} ===\n")
                for download in downloads:
                    f.write(f"URL: {download['url']}\n")
                    f.write(f"Saved to: {download['local_path']}\n\n")

    return media_urls
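
A rough usage sketch for the new helper, assuming a selenium-wire Chrome driver that has already loaded the target page (the URL here is only an example):

    from seleniumwire import webdriver
    from api.backend.job.scraping.collect_media import collect_media

    driver = webdriver.Chrome()
    try:
        driver.get("https://example.com")
        media = collect_media(driver)
        # e.g. {"images": [{"url": "...", "local_path": "media/images/logo.png"}], ...}
        print({media_type: len(items) for media_type, items in media.items()})
    finally:
        driver.quit()

Files land under media/<media type>/ relative to the working directory, and media/download_summary.txt lists each downloaded URL alongside the path it was saved to.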

View File

@@ -1,13 +1,19 @@
 import time
 from typing import cast
 
-from selenium import webdriver
+from seleniumwire import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait
 
+from api.backend.utils import LOG
+from api.backend.job.scraping.collect_media import collect_media as collect_media_utils
 
 
-def scrape_content(driver: webdriver.Chrome, pages: set[tuple[str, str]]):
+def scrape_content(
+    driver: webdriver.Chrome, pages: set[tuple[str, str]], collect_media: bool
+):
     _ = WebDriverWait(driver, 10).until(
         EC.presence_of_element_located((By.TAG_NAME, "body"))
     )
@@ -27,4 +33,9 @@ def scrape_content(driver: webdriver.Chrome, pages: set[tuple[str, str]]):
         last_height = new_height
 
     pages.add((driver.page_source, driver.current_url))
+
+    if collect_media:
+        LOG.info("Collecting media")
+        collect_media_utils(driver)
+
     return driver.page_source
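
With the extra parameter, callers decide per job whether media is downloaded. A small sketch of the updated call, assuming the driver has already navigated to the page (as before):

    pages: set[tuple[str, str]] = set()
    driver.get("https://example.com")                          # example URL
    html = scrape_content(driver, pages, collect_media=True)

When the flag is False the function behaves exactly as it did before this commit.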

View File

@@ -104,6 +104,7 @@ async def make_site_request(
     original_url: str = "",
     proxies: Optional[list[str]] = [],
     site_map: Optional[dict[str, Any]] = None,
+    collect_media: bool = False,
 ) -> None:
     """Make basic `GET` request to site using Selenium."""
     # Check if URL has already been visited
@@ -124,7 +125,7 @@ async def make_site_request(
     visited_urls.add(url)
     visited_urls.add(final_url)
 
-    page_source = scrape_content(driver, pages)
+    page_source = scrape_content(driver, pages, collect_media)
 
     if site_map:
         LOG.info("Site map: %s", site_map)
@@ -197,6 +198,7 @@ async def scrape(
     multi_page_scrape: bool = False,
     proxies: Optional[list[str]] = [],
     site_map: Optional[dict[str, Any]] = None,
+    collect_media: bool = False,
 ):
     visited_urls: set[str] = set()
     pages: set[tuple[str, str]] = set()
@@ -210,6 +212,7 @@ async def scrape(
         original_url=url,
         proxies=proxies,
         site_map=site_map,
+        collect_media=collect_media,
     )
 
     elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()

View File

@@ -27,6 +27,7 @@ async def process_job():
job["job_options"]["multi_page_scrape"],
job["job_options"]["proxies"],
job["job_options"]["site_map"],
job["job_options"]["collect_media"],
)
LOG.info(
f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"