diff --git a/.github/actions/run-cypress-tests/action.yaml b/.github/actions/run-cypress-tests/action.yaml
index a4d845b..e657886 100644
--- a/.github/actions/run-cypress-tests/action.yaml
+++ b/.github/actions/run-cypress-tests/action.yaml
@@ -15,7 +15,7 @@ runs:
     - name: Setup Docker project
       shell: bash
-      run: make build up-dev
+      run: make build-ci up-ci

    - name: Install dependencies
      shell: bash
diff --git a/.gitignore b/.gitignore
index f74c7d3..8ac86da 100644
--- a/.gitignore
+++ b/.gitignore
@@ -188,6 +188,18 @@ postgres_data
 .vscode
 ollama
 data
-media
+
+media/images
+media/videos
+media/audio
+media/pdfs
+media/spreadsheets
+media/presentations
+media/documents
+media/recordings
+media/download_summary.txt
+
 cypress/screenshots
-cypress/videos
\ No newline at end of file
+cypress/videos
+
+docker-compose.dev.local.yml
\ No newline at end of file
diff --git a/Makefile b/Makefile
index b33f273..94bc27d 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 .DEFAULT_GOAL := help

-COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.yml
+COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.local.yml
 COMPOSE_PROD = docker compose -f docker-compose.yml

 .PHONY: help deps build pull up up-dev down setup deploy
@@ -53,5 +53,11 @@ setup:
 deploy:
 	ansible-playbook -i ./ansible/inventory.yaml ./ansible/deploy_site.yaml -v

+build-ci:
+	docker compose -f docker-compose.yml -f docker-compose.dev.yml build
+
+up-ci:
+	docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d --force-recreate
+
 cypress-start:
 	DISPLAY=:0 npx cypress open
\ No newline at end of file
diff --git a/README.md b/README.md
index 03c7580..ed586f0 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@

 ## 📋 Overview

-Scraperr enables you to extract data from websites with precision using XPath selectors. This self-hosted application provides a clean interface to manage scraping jobs, view results, and export data.
+Scrape websites without writing a single line of code.

 > 📚 **[Check out the docs](https://scraperr-docs.pages.dev)** for a comprehensive quickstart guide and detailed information.
@@ -29,7 +29,7 @@ Scraperr enables you to extract data from websites with precision using XPath se
 - **Custom Headers**: Add JSON headers to your scraping requests
 - **Media Downloads**: Automatically download images, videos, and other media
 - **Results Visualization**: View scraped data in a structured table format
-- **Data Export**: Export your results in various formats
+- **Data Export**: Export your results in Markdown and CSV formats
 - **Notification Channels**: Send completion notifications through various channels

 ## 🚀 Getting Started
diff --git a/api/backend/constants.py b/api/backend/constants.py
index d009219..0939855 100644
--- a/api/backend/constants.py
+++ b/api/backend/constants.py
@@ -4,3 +4,13 @@ import os
 DATABASE_PATH = "data/database.db"
 RECORDINGS_DIR = Path("media/recordings")
 RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true"
+MEDIA_DIR = Path("media")
+MEDIA_TYPES = [
+    "audio",
+    "documents",
+    "images",
+    "pdfs",
+    "presentations",
+    "spreadsheets",
+    "videos",
+]
diff --git a/api/backend/job/scraping/collect_media.py b/api/backend/job/scraping/collect_media.py
index 6cab8f3..e62f587 100644
--- a/api/backend/job/scraping/collect_media.py
+++ b/api/backend/job/scraping/collect_media.py
@@ -1,6 +1,7 @@
 import os
 from pathlib import Path
-from urllib.parse import urlparse
+import re
+from urllib.parse import urljoin, urlparse
 from typing import Dict, List

 import aiohttp
@@ -9,12 +10,12 @@
 from playwright.async_api import Page

 from api.backend.utils import LOG


-async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]:
+async def collect_media(id: str, page: Page) -> dict[str, list[dict[str, str]]]:
     media_types = {
         "images": "img",
         "videos": "video",
         "audio": "audio",
-        "pdfs": 'a[href$=".pdf"]',
+        "pdfs": 'a[href$=".pdf"], a[href*=".pdf#page="]',
         "documents": 'a[href$=".doc"], a[href$=".docx"], a[href$=".txt"], a[href$=".rtf"]',
         "presentations": 'a[href$=".ppt"], a[href$=".pptx"]',
         "spreadsheets": 'a[href$=".xls"], a[href$=".xlsx"], a[href$=".csv"]',
@@ -48,6 +49,11 @@
             root_domain = f"{root_url.scheme}://{root_url.netloc}"
             url = f"{root_domain}{url}"

+        if url and re.match(r"^[\w\-]+/", url):
+            root_url = urlparse(page.url)
+            root_domain = f"{root_url.scheme}://{root_url.netloc}"
+            url = urljoin(root_domain + "/", url)
+
         if url and url.startswith(("http://", "https://")):
             try:
                 parsed = urlparse(url)
@@ -67,15 +73,20 @@
                 }.get(media_type, "")
                 filename += ext

-                file_path = media_dir / filename
+                if not os.path.exists(media_dir / id):
+                    os.makedirs(media_dir / id, exist_ok=True)
+
+                file_path = media_dir / id / f"{filename}"

                 async with session.get(url) as response:
                     response.raise_for_status()
+
                     with open(file_path, "wb") as f:
                         while True:
                             chunk = await response.content.read(8192)
                             if not chunk:
                                 break
+
                             f.write(chunk)

                 urls.append({"url": url, "local_path": str(file_path)})
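Two things change in `collect_media` above: bare relative URLs (e.g. `catalogue/cover.jpg`) are now resolved against the page origin via `urljoin`, and downloads are written into a per-job subdirectory keyed by the new `id` argument. A minimal standalone sketch of that path and URL logic for reference — the helper names `normalize_media_url` and `media_path` are mine, not the PR's:

```python
import re
from pathlib import Path
from typing import Optional
from urllib.parse import urljoin, urlparse

MEDIA_DIR = Path("media")  # mirrors api/backend/constants.py

def normalize_media_url(page_url: str, url: str) -> Optional[str]:
    # Host-absolute paths ("/img/a.png") get the page origin prefixed;
    # bare relative paths ("catalogue/a.png") are resolved with urljoin,
    # matching the new re.match(r"^[\w\-]+/", url) branch above.
    origin = urlparse(page_url)
    root = f"{origin.scheme}://{origin.netloc}"
    if url.startswith("/"):
        url = f"{root}{url}"
    elif re.match(r"^[\w\-]+/", url):
        url = urljoin(root + "/", url)
    return url if url.startswith(("http://", "https://")) else None

def media_path(media_type: str, job_id: str, filename: str) -> Path:
    # Downloads now land in media/<type>/<job-id>/<filename>, the layout
    # that the media endpoints added later in this diff read back.
    target = MEDIA_DIR / media_type / job_id
    target.mkdir(parents=True, exist_ok=True)
    return target / filename

print(normalize_media_url("https://books.toscrape.com/", "catalogue/page-2.html"))
# -> https://books.toscrape.com/catalogue/page-2.html
print(media_path("images", "123e4567", "cover.jpg"))
# -> media/images/123e4567/cover.jpg
```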
diff --git a/api/backend/job/scraping/scraping_utils.py b/api/backend/job/scraping/scraping_utils.py
index bd71043..bffceaf 100644
--- a/api/backend/job/scraping/scraping_utils.py
+++ b/api/backend/job/scraping/scraping_utils.py
@@ -8,7 +8,7 @@ from api.backend.job.scraping.collect_media import collect_media as collect_medi


 async def scrape_content(
-    page: Page, pages: Set[Tuple[str, str]], collect_media: bool
+    id: str, page: Page, pages: Set[Tuple[str, str]], collect_media: bool
 ) -> str:
     last_height = await page.evaluate("document.body.scrollHeight")
@@ -27,6 +27,6 @@ async def scrape_content(

     if collect_media:
         LOG.info("Collecting media")
-        await collect_media_utils(page)
+        await collect_media_utils(id, page)

     return html
diff --git a/api/backend/routers/job_router.py b/api/backend/routers/job_router.py
index 13c1be5..2a13b90 100644
--- a/api/backend/routers/job_router.py
+++ b/api/backend/routers/job_router.py
@@ -42,7 +42,7 @@ from api.backend.job.cron_scheduling.cron_scheduling import (
 from api.backend.job.utils.clean_job_format import clean_job_format
 from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results

-from api.backend.constants import RECORDINGS_DIR
+from api.backend.constants import MEDIA_DIR, MEDIA_TYPES, RECORDINGS_DIR

 LOG = logging.getLogger(__name__)

@@ -244,3 +244,30 @@ async def get_recording(id: str):
     return FileResponse(
         path, headers={"Content-Type": "video/mp4", "Accept-Ranges": "bytes"}
     )
+
+
+@job_router.get("/get-media")
+async def get_media(id: str):
+    try:
+        files: dict[str, list[str]] = {}
+
+        for media_type in MEDIA_TYPES:
+            path = MEDIA_DIR / media_type / f"{id}"
+
+            files[media_type] = [file.name for file in path.glob("*")]
+
+        return JSONResponse(content={"files": files})
+    except Exception as e:
+        LOG.error(f"Exception occurred: {e}")
+        traceback.print_exc()
+        return JSONResponse(content={"error": str(e)}, status_code=500)
+
+
+@job_router.get("/media")
+async def get_media_file(id: str, type: str, file: str):
+    path = MEDIA_DIR / type / f"{id}" / file
+
+    if not path.exists():
+        return JSONResponse(content={"error": "Media file not found."}, status_code=404)
+
+    return FileResponse(path)
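The two endpoints above expose the per-job layout written by `collect_media`: `/get-media` lists filenames grouped by media type, and `/media` streams one file. A quick client sketch, assuming a local instance on `http://localhost:8000` with `job_router` mounted under `/api` (both assumptions; the Cypress specs elsewhere in this diff intercept `/api/...` routes):

```python
import requests

API = "http://localhost:8000/api"  # assumed base URL
job_id = "123e4567"                # hypothetical job id

# /get-media returns {"files": {"images": [...], "videos": [...], ...}},
# one list per entry in MEDIA_TYPES.
listing = requests.get(f"{API}/get-media", params={"id": job_id}).json()

for media_type, names in listing["files"].items():
    for name in names:
        # /media streams a single file given its type, job id, and filename.
        resp = requests.get(
            f"{API}/media",
            params={"id": job_id, "type": media_type, "file": name},
        )
        resp.raise_for_status()
        print(media_type, name, len(resp.content), "bytes")
```

Note that `get_media_file` builds the filesystem path directly from the `type` and `file` query parameters; a hardened deployment would also validate `type` against `MEDIA_TYPES` and reject path separators in `file`.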
diff --git a/api/backend/scraping.py b/api/backend/scraping.py
index 6024e28..21b7699 100644
--- a/api/backend/scraping.py
+++ b/api/backend/scraping.py
@@ -1,5 +1,4 @@
 import logging
-from pickle import FALSE
 import random

 from typing import Any, Optional, cast
@@ -40,6 +39,7 @@ def sxpath(context: etree._Element, xpath: str):


 async def make_site_request(
+    id: str,
     url: str,
     headers: Optional[dict[str, Any]],
     multi_page_scrape: bool = False,
@@ -71,14 +71,14 @@
     try:
         await page.goto(url, timeout=60000)
-        await page.wait_for_load_state("networkidle", timeout=10000)
+        await page.wait_for_load_state("networkidle")

         final_url = page.url

         visited_urls.add(url)
         visited_urls.add(final_url)

-        html_content = await scrape_content(page, pages, collect_media)
+        html_content = await scrape_content(id, page, pages, collect_media)

         html_content = await page.content()
         pages.add((html_content, final_url))
@@ -112,6 +112,7 @@
             if link not in visited_urls and is_same_domain(link, original_url):
                 await make_site_request(
+                    id,
                     link,
                     headers=headers,
                     multi_page_scrape=multi_page_scrape,
@@ -136,11 +137,20 @@
     for e in el:  # type: ignore
         text = (
-            "\t".join(str(t) for t in e.itertext())
+            " ".join(str(t) for t in e.itertext())
             if isinstance(e, etree._Element)
             else str(e)  # type: ignore
         )

+        text = text.strip()
+        text = text.replace("\n", " ")
+        text = text.replace("\t", " ")
+        text = text.replace("\r", " ")
+        text = text.replace("\f", " ")
+        text = text.replace("\v", " ")
+        text = text.replace("\b", " ")
+        text = text.replace("\a", " ")
+
         captured_element = CapturedElement(
             xpath=elem.xpath, text=text, name=elem.name
         )
@@ -154,6 +164,7 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])


 async def scrape(
+    id: str,
     url: str,
     xpaths: list[Element],
     headers: Optional[dict[str, Any]] = None,
@@ -167,6 +178,7 @@
     pages: set[tuple[str, str]] = set()

     await make_site_request(
+        id,
         url,
         headers=headers,
         multi_page_scrape=multi_page_scrape,
diff --git a/api/backend/worker/job_worker.py b/api/backend/worker/job_worker.py
index e550a39..e8a130e 100644
--- a/api/backend/worker/job_worker.py
+++ b/api/backend/worker/job_worker.py
@@ -76,6 +76,7 @@ async def process_job():
         proxies = []

     scraped = await scrape(
+        job["id"],
         job["url"],
         [Element(**j) for j in job["elements"]],
         job["job_options"]["custom_headers"],
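The `collect_scraped_elements` hunk above normalizes control characters one `str.replace` at a time (the csv-table component later in this diff applies the same idea client-side). For reference, an equivalent single-pass version using `str.translate` — the `clean_text` helper name is mine, not the PR's:

```python
# Map each control character the diff handles (\n \t \r \f \v \b \a) to a
# space; str.translate applies the whole table in one scan, after the same
# leading strip() the diff performs.
_CONTROL_TO_SPACE = {ord(c): " " for c in "\n\t\r\f\v\b\a"}

def clean_text(text: str) -> str:
    return text.strip().translate(_CONTROL_TO_SPACE)

assert clean_text("  a\tb\nc\r ") == "a b c"
```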
diff --git a/cypress/e2e/submit-job.cy.ts b/cypress/e2e/submit-job.cy.ts
index 4d4ccf1..e36824b 100644
--- a/cypress/e2e/submit-job.cy.ts
+++ b/cypress/e2e/submit-job.cy.ts
@@ -30,5 +30,59 @@ describe.only("Job", () => {
       "exist"
     );
     cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
+
+    cy.get("tbody tr")
+      .first()
+      .within(() => {
+        cy.get('input[type="checkbox"]').click();
+      });
+
+    cy.get("[data-testid='DeleteIcon']").click();
+
+    cy.contains("div", "https://example.com", { timeout: 10000 }).should(
+      "not.exist"
+    );
+  });
+
+  it("should create a job with advanced options (media)", () => {
+    cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob");
+
+    cy.visit("/");
+
+    cy.get("button").contains("Advanced Job Options").click();
+
+    cy.get('[data-cy="collect-media-checkbox"]').click();
+    cy.get("body").type("{esc}");
+
+    cy.get('[data-cy="url-input"]').type("https://books.toscrape.com");
+    cy.get('[data-cy="name-field"]').type("example");
+    cy.get('[data-cy="xpath-field"]').type("//body");
+    cy.get('[data-cy="add-button"]').click();
+
+    cy.get("button").contains("Submit").click();
+
+    cy.get("li").contains("Jobs").click();
+
+    cy.contains("div", "https://books.toscrape.com", { timeout: 10000 }).should(
+      "exist"
+    );
+
+    cy.contains("div", "Completed", { timeout: 20000 }).should("exist");
+    cy.get("li").contains("Media").click();
+
+    cy.get("div[id='select-job']").click();
+    cy.get("li[role='option']").click();
+
+    cy.get("[data-testid='media-grid']", { timeout: 10000 }).should("exist");
+
+    cy.get("li").contains("Jobs").click();
+
+    cy.get("tbody tr")
+      .first()
+      .within(() => {
+        cy.get('input[type="checkbox"]').click();
+      });
+
+    cy.get("[data-testid='DeleteIcon']").click();
   });
 });
diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml
index 9306d8c..c44c1f2 100644
--- a/docker-compose.dev.yml
+++ b/docker-compose.dev.yml
@@ -1,6 +1,9 @@
 version: "3"
 services:
   scraperr:
+    build:
+      context: .
+      dockerfile: docker/frontend/Dockerfile
     command: ["npm", "run", "dev"]
     volumes:
       - "$PWD/src:/app/src"
@@ -10,6 +13,9 @@
       - "$PWD/package-lock.json:/app/package-lock.json"
       - "$PWD/tsconfig.json:/app/tsconfig.json"
   scraperr_api:
+    build:
+      context: .
+      dockerfile: docker/api/Dockerfile
     environment:
       - LOG_LEVEL=INFO
     volumes:
diff --git a/docker-compose.yml b/docker-compose.yml
index 7625d5c..e5d5e2e 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,9 +1,6 @@
 services:
   scraperr:
     image: jpyles0524/scraperr:latest
-    build:
-      context: .
-      dockerfile: docker/frontend/Dockerfile
     container_name: scraperr
     command: ["npm", "run", "start"]
     environment:
@@ -16,9 +13,6 @@
   scraperr_api:
     init: True
     image: jpyles0524/scraperr_api:latest
-    build:
-      context: .
-      dockerfile: docker/api/Dockerfile
     environment:
       - LOG_LEVEL=INFO
     container_name: scraperr_api
diff --git a/docs/main_page.png b/docs/main_page.png
index ea94c17..1e351a3 100644
Binary files a/docs/main_page.png and b/docs/main_page.png differ
diff --git a/src/components/ai/JobSelector.tsx b/src/components/ai/JobSelector.tsx
index 316d251..c2feb4c 100644
--- a/src/components/ai/JobSelector.tsx
+++ b/src/components/ai/JobSelector.tsx
@@ -1,15 +1,20 @@
-import React, { useState, Dispatch } from "react";
+import React, { useState, Dispatch, useEffect } from "react";
 import { Job } from "../../types";
 import Box from "@mui/material/Box";
 import InputLabel from "@mui/material/InputLabel";
 import FormControl from "@mui/material/FormControl";
 import Select from "@mui/material/Select";
 import Popover from "@mui/material/Popover";
-import { Typography, MenuItem, useTheme } from "@mui/material";
+import {
+  Typography,
+  MenuItem,
+  useTheme,
+  ClickAwayListener,
+} from "@mui/material";
 import { SxProps } from "@mui/material";

 interface Props {
-  sxProps: SxProps;
+  sxProps?: SxProps;
   setSelectedJob:
     | Dispatch>
     | ((job: Job) => void);
@@ -44,6 +49,12 @@

   const open = Boolean(anchorEl);

+  useEffect(() => {
+    if (!open) {
+      setAnchorEl(null);
+    }
+  }, [open]);
+
   return (
@@ -80,57 +91,63 @@
         ) : null}
-
-      {popoverJob && (
-
+
-
-          {popoverJob.url}
-
-
-          {popoverJob.time_created
-            ? new Date(popoverJob.time_created).toLocaleString()
-            : "Unknown"}
-
-
-      )}
-
+
+            {popoverJob.url}
+
+
+            {popoverJob.time_created
+              ? new Date(popoverJob.time_created).toLocaleString()
+              : "Unknown"}
+
+
+          )}
+
+
+      )}
   );
 };
diff --git a/src/components/common/advanced-job-options/dialog/advanced-job-options-dialog.tsx b/src/components/common/advanced-job-options/dialog/advanced-job-options-dialog.tsx
index 588bcc0..6cd7148 100644
--- a/src/components/common/advanced-job-options/dialog/advanced-job-options-dialog.tsx
+++ b/src/components/common/advanced-job-options/dialog/advanced-job-options-dialog.tsx
@@ -140,6 +140,7 @@ export const AdvancedJobOptionsDialog = ({
               }
               label={
+
diff --git a/src/components/common/csv-table/csv-table.tsx b/src/components/common/csv-table/csv-table.tsx
index c5ef4ed..d6fd934 100644
--- a/src/components/common/csv-table/csv-table.tsx
+++ b/src/components/common/csv-table/csv-table.tsx
@@ -131,8 +131,9 @@ export const CsvTable: React.FC = ({ csv, className }) => {
                     {row.text
                       ? row.text
-                          .replace(/(\r\n|\n|\r)/g, " ")
-                          .replace(/\t/g, " ")
+                          .replace(/[\n\t\r]+/g, " ")
+                          .replace(/\s+/g, " ")
+                          .trim()
                       : "No text available"}
diff --git a/src/components/common/media-viewer/audio/audio-viewer.tsx b/src/components/common/media-viewer/audio/audio-viewer.tsx
new file mode 100644
index 0000000..930768e
--- /dev/null
+++ b/src/components/common/media-viewer/audio/audio-viewer.tsx
@@ -0,0 +1,40 @@
+
+import { Box, Typography } from "@mui/material";
+
+interface AudioViewerProps {
+  mediaUrl: string;
+  selectedMedia: string;
+  onError: () => void;
+}
+
+export const AudioViewer = ({
+  mediaUrl,
+  selectedMedia,
+  onError,
+}: AudioViewerProps) => {
+  return (
+
+      {selectedMedia}
+
+
+  );
+};
diff --git a/src/components/common/media-viewer/audio/index.ts b/src/components/common/media-viewer/audio/index.ts
new file mode 100644
index 0000000..349d620
--- /dev/null
+++ b/src/components/common/media-viewer/audio/index.ts
@@ -0,0 +1 @@
+export * from "./audio-viewer";
diff --git a/src/components/common/media-viewer/image/image-viewer.tsx b/src/components/common/media-viewer/image/image-viewer.tsx
new file mode 100644
index 0000000..10decf0
--- /dev/null
+++ b/src/components/common/media-viewer/image/image-viewer.tsx
@@ -0,0 +1,36 @@
+import { Box, useTheme } from "@mui/material";
+
+export const ImageViewer = ({
+  mediaUrl,
+  selectedMedia,
+}: {
+  mediaUrl: string;
+  selectedMedia: string;
+}) => {
+  const theme = useTheme();
+  return (
+
+      {selectedMedia}
+
+  );
+};
diff --git a/src/components/common/media-viewer/image/index.ts b/src/components/common/media-viewer/image/index.ts
new file mode 100644
index 0000000..44b51d7
--- /dev/null
+++ b/src/components/common/media-viewer/image/index.ts
@@ -0,0 +1 @@
+export * from "./image-viewer";
diff --git a/src/components/common/media-viewer/index.ts b/src/components/common/media-viewer/index.ts
new file mode 100644
index 0000000..bb46356
--- /dev/null
+++ b/src/components/common/media-viewer/index.ts
@@ -0,0 +1 @@
+export * from "./media-viewer";
diff --git a/src/components/common/media-viewer/media-viewer.tsx b/src/components/common/media-viewer/media-viewer.tsx
new file mode 100644
index 0000000..70e2407
--- /dev/null
+++ b/src/components/common/media-viewer/media-viewer.tsx
@@ -0,0 +1,75 @@
+import { Box, Typography } from "@mui/material";
+import { ImageViewer } from "./image";
+import { VideoViewer } from "./video";
+import { AudioViewer } from "./audio";
+import { PDFViewer } from "./pdf-viewer";
+
+interface MediaViewerProps {
+  selectedMedia: string;
+  activeTab: string;
+  getMediaUrl: (fileName: string) => string;
+  onError: (error: string) => void;
+}
+
+export const MediaViewer = ({
+  selectedMedia,
+  activeTab,
+  getMediaUrl,
+  onError,
+}: MediaViewerProps) => {
+  if (!selectedMedia) {
+    return (
+
+
+          Select a file to view
+
+
+    );
+  }
+
+  const mediaUrl = getMediaUrl(selectedMedia);
+
+  switch (activeTab) {
+    case "images":
+      return ;
+    case "videos":
+      return (
+         onError("Error loading video")}
+        />
+      );
+    case "audio":
+      return (
+         onError("Error loading audio")}
+        />
+      );
+    case "pdfs":
+      return ;
+    default:
+      return (
+
+
+            {selectedMedia} - Download this file to view it
+
+
+      );
+  }
+};
diff --git a/src/components/common/media-viewer/pdf-viewer/index.ts b/src/components/common/media-viewer/pdf-viewer/index.ts
new file mode 100644
index 0000000..e1ce37f
--- /dev/null
+++ b/src/components/common/media-viewer/pdf-viewer/index.ts
@@ -0,0 +1 @@
+export * from "./pdf-viewer";
diff --git a/src/components/common/media-viewer/pdf-viewer/pdf-viewer.tsx b/src/components/common/media-viewer/pdf-viewer/pdf-viewer.tsx
new file mode 100644
index 0000000..63318dd
--- /dev/null
+++ b/src/components/common/media-viewer/pdf-viewer/pdf-viewer.tsx
@@ -0,0 +1,33 @@
+import { Box, useTheme } from "@mui/material";
+
+interface PDFViewerProps {
+  mediaUrl: string;
+  selectedMedia: string;
+}
+
+export const PDFViewer = ({ mediaUrl, selectedMedia }: PDFViewerProps) => {
+  const theme = useTheme();
+
+  return (
+