Mirror of https://github.com/jaypyles/Scraperr.git, synced 2025-12-05 07:15:34 +00:00
feat: add in media downloading (#62)
* feat: add in media downloading
* fix: build issue
@@ -12,3 +12,4 @@ class JobOptions(BaseModel):
     custom_headers: dict[str, Any] = {}
     proxies: list[str] = []
     site_map: Optional[SiteMap] = None
+    collect_media: bool = False
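The new collect_media flag defaults to False, so existing jobs are unaffected. Below is a minimal, self-contained sketch that mirrors the JobOptions fields shown in the hunk above; the SiteMap stub and the values are illustrative only and are not part of this commit.

# Self-contained sketch mirroring the JobOptions fields above.
# SiteMap is stubbed here only so the example runs on its own.
from typing import Any, Optional
from pydantic import BaseModel

class SiteMap(BaseModel):  # stand-in for the real SiteMap model
    pass

class JobOptions(BaseModel):
    custom_headers: dict[str, Any] = {}
    proxies: list[str] = []
    site_map: Optional[SiteMap] = None
    collect_media: bool = False  # new field added by this commit

options = JobOptions(collect_media=True)
print(options)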
api/backend/job/scraping/collect_media.py (new file, 91 lines)
@@ -0,0 +1,91 @@
import os
import requests
from pathlib import Path
from selenium.webdriver.common.by import By
from seleniumwire import webdriver
from urllib.parse import urlparse

from api.backend.utils import LOG


def collect_media(driver: webdriver.Chrome):
    # CSS selectors for each media category to collect from the current page.
    media_types = {
        "images": "img",
        "videos": "video",
        "audio": "audio",
        "pdfs": 'a[href$=".pdf"]',
        "documents": 'a[href$=".doc"], a[href$=".docx"], a[href$=".txt"], a[href$=".rtf"]',
        "presentations": 'a[href$=".ppt"], a[href$=".pptx"]',
        "spreadsheets": 'a[href$=".xls"], a[href$=".xlsx"], a[href$=".csv"]',
    }

    base_dir = Path("media")
    base_dir.mkdir(exist_ok=True)

    media_urls = {}

    for media_type, selector in media_types.items():
        elements = driver.find_elements(By.CSS_SELECTOR, selector)
        urls: list[dict[str, str]] = []

        media_dir = base_dir / media_type
        media_dir.mkdir(exist_ok=True)

        for element in elements:
            # Pull the source URL from the attribute appropriate to the element type.
            if media_type == "images":
                url = element.get_attribute("src")
            elif media_type == "videos":
                url = element.get_attribute("src") or element.get_attribute("data-src")
            else:
                url = element.get_attribute("href")

            if url and url.startswith(("http://", "https://")):
                try:
                    filename = os.path.basename(urlparse(url).path)

                    # Fall back to a generated name with a type-based extension
                    # when the URL path has no usable filename.
                    if not filename:
                        filename = f"{media_type}_{len(urls)}"

                        if media_type == "images":
                            filename += ".jpg"
                        elif media_type == "videos":
                            filename += ".mp4"
                        elif media_type == "audio":
                            filename += ".mp3"
                        elif media_type == "pdfs":
                            filename += ".pdf"
                        elif media_type == "documents":
                            filename += ".doc"
                        elif media_type == "presentations":
                            filename += ".ppt"
                        elif media_type == "spreadsheets":
                            filename += ".xls"

                    response = requests.get(url, stream=True)
                    response.raise_for_status()

                    # Save the file
                    file_path = media_dir / filename
                    with open(file_path, "wb") as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:
                                f.write(chunk)

                    urls.append({"url": url, "local_path": str(file_path)})
                    LOG.info(f"Downloaded {filename} to {file_path}")

                except Exception as e:
                    LOG.error(f"Error downloading {url}: {str(e)}")
                    continue

        media_urls[media_type] = urls

    # Write a plain-text summary of everything that was downloaded.
    with open(base_dir / "download_summary.txt", "w") as f:
        for media_type, downloads in media_urls.items():
            if downloads:
                f.write(f"\n=== {media_type.upper()} ===\n")
                for download in downloads:
                    f.write(f"URL: {download['url']}\n")
                    f.write(f"Saved to: {download['local_path']}\n\n")

    return media_urls
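A minimal sketch of calling the new helper directly against a seleniumwire Chrome driver; the driver options and target URL are illustrative, not part of this commit. Downloads land under ./media/<type>/ and a download_summary.txt is written next to them.

# Hypothetical standalone usage of collect_media(); assumes chromedriver is
# available and the new module is importable at the path shown in the diff.
from seleniumwire import webdriver
from api.backend.job.scraping.collect_media import collect_media

options = webdriver.ChromeOptions()
options.add_argument("--headless=new")  # illustrative flag, not from the commit

driver = webdriver.Chrome(options=options)
try:
    driver.get("https://example.com")
    media = collect_media(driver)
    for media_type, downloads in media.items():
        print(media_type, len(downloads), "file(s) downloaded")
finally:
    driver.quit()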
@@ -1,13 +1,19 @@
 import time
 from typing import cast

-from selenium import webdriver
+from seleniumwire import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.support.ui import WebDriverWait

 from api.backend.utils import LOG
+from api.backend.job.scraping.collect_media import collect_media as collect_media_utils


-def scrape_content(driver: webdriver.Chrome, pages: set[tuple[str, str]]):
+def scrape_content(
+    driver: webdriver.Chrome, pages: set[tuple[str, str]], collect_media: bool
+):
     _ = WebDriverWait(driver, 10).until(
         EC.presence_of_element_located((By.TAG_NAME, "body"))
     )
@@ -27,4 +33,9 @@ def scrape_content(driver: webdriver.Chrome, pages: set[tuple[str, str]]):
         last_height = new_height

     pages.add((driver.page_source, driver.current_url))
+
+    if collect_media:
+        LOG.info("Collecting media")
+        collect_media_utils(driver)
+
     return driver.page_source
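With the extra parameter, callers opt into media collection per scrape. A hedged sketch of the updated call site; the module path and driver setup are assumptions for illustration only.

# Illustrative call of the updated scrape_content(); the import path below
# is assumed, not confirmed by this diff.
from seleniumwire import webdriver
from api.backend.job.scraping.scraping_utils import scrape_content  # assumed path

driver = webdriver.Chrome()
driver.get("https://example.com")

pages: set[tuple[str, str]] = set()
page_source = scrape_content(driver, pages, collect_media=True)
driver.quit()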
@@ -104,6 +104,7 @@ async def make_site_request(
     original_url: str = "",
     proxies: Optional[list[str]] = [],
     site_map: Optional[dict[str, Any]] = None,
+    collect_media: bool = False,
 ) -> None:
     """Make basic `GET` request to site using Selenium."""
     # Check if URL has already been visited
@@ -124,7 +125,7 @@ async def make_site_request(
     visited_urls.add(url)
     visited_urls.add(final_url)

-    page_source = scrape_content(driver, pages)
+    page_source = scrape_content(driver, pages, collect_media)

     if site_map:
         LOG.info("Site map: %s", site_map)
@@ -197,6 +198,7 @@ async def scrape(
     multi_page_scrape: bool = False,
     proxies: Optional[list[str]] = [],
     site_map: Optional[dict[str, Any]] = None,
+    collect_media: bool = False,
 ):
     visited_urls: set[str] = set()
     pages: set[tuple[str, str]] = set()
@@ -210,6 +212,7 @@ async def scrape(
             original_url=url,
             proxies=proxies,
             site_map=site_map,
+            collect_media=collect_media,
         )

     elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()
@@ -27,6 +27,7 @@ async def process_job():
         job["job_options"]["multi_page_scrape"],
         job["job_options"]["proxies"],
         job["job_options"]["site_map"],
+        job["job_options"]["collect_media"],
     )
     LOG.info(
         f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"
    )
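The worker reads the flag straight from the job's job_options, so a queued job only needs to carry the extra key. A hedged sketch of what that options payload might look like; the keys mirror the fields read in process_job() and defined on JobOptions above, while the values are made up for the example.

# Illustrative job_options payload; not a literal request body from the repo.
job_options = {
    "multi_page_scrape": False,
    "custom_headers": {},
    "proxies": [],
    "site_map": None,
    "collect_media": True,  # triggers collect_media() during the scrape
}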