diff --git a/api/backend/app.py b/api/backend/app.py
index b4ca401..3009070 100644
--- a/api/backend/app.py
+++ b/api/backend/app.py
@@ -1,31 +1,32 @@
 # STL
-from functools import partial
 import uuid
 import logging
-from io import StringIO
+from io import BytesIO
+from openpyxl import Workbook
 
 # PDM
-import pandas as pd
-from fastapi import BackgroundTasks, FastAPI
+from fastapi import BackgroundTasks, FastAPI, HTTPException
 from fastapi.encoders import jsonable_encoder
 from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
 from fastapi.staticfiles import StaticFiles
 from fastapi.middleware.cors import CORSMiddleware
+import docker
+
+client = docker.from_env()
 
 # LOCAL
-from api.backend.job import query, insert, delete_jobs, update_job
+from api.backend.job import query, insert, delete_jobs
 from api.backend.models import (
     DownloadJob,
     SubmitScrapeJob,
     DeleteScrapeJobs,
     RetrieveScrapeJobs,
 )
-from api.backend.scraping import scrape
 from api.backend.auth.auth_router import auth_router
-from seleniumwire.thirdparty.mitmproxy.master import traceback
+import traceback
 
 logging.basicConfig(
-    level=logging.INFO,
+    level=logging.DEBUG,
     format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
     handlers=[logging.StreamHandler()],
 )
@@ -81,6 +82,13 @@ async def retrieve_scrape_jobs(retrieve: RetrieveScrapeJobs):
         return JSONResponse(content={"error": str(e)}, status_code=500)
 
 
+def clean_text(text: str):
+    text = text.replace("\r\n", "\n")  # Normalize newlines
+    text = text.replace("\n", "\\n")  # Escape newlines
+    text = text.replace('"', '\\"')  # Escape double quotes
+    return text
+
+
 @app.post("/api/download")
 async def download(download_job: DownloadJob):
     LOG.info(f"Downloading job with ids: {download_job.ids}")
@@ -93,25 +101,54 @@ async def download(download_job: DownloadJob):
             for url, elements in res.items():
                 for element_name, values in elements.items():
                     for value in values:
+                        text = clean_text(value.get("text", ""))
                         flattened_results.append(
                             {
                                 "id": result.get("id", None),
                                 "url": url,
                                 "element_name": element_name,
                                 "xpath": value.get("xpath", ""),
-                                "text": value.get("text", ""),
+                                "text": text,
                                 "user": result.get("user", ""),
                                 "time_created": result.get("time_created", ""),
                             }
                         )
 
-        df = pd.DataFrame(flattened_results)
+        # Create an Excel workbook and sheet
+        workbook = Workbook()
+        sheet = workbook.active
+        assert sheet
+        sheet.title = "Results"
 
-        csv_buffer = StringIO()
-        df.to_csv(csv_buffer, index=False)
-        _ = csv_buffer.seek(0)
-        response = StreamingResponse(csv_buffer, media_type="text/csv")
-        response.headers["Content-Disposition"] = "attachment; filename=export.csv"
+        # Write the header
+        headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"]
+        sheet.append(headers)
+
+        # Write the rows
+        for row in flattened_results:
+            sheet.append(
+                [
+                    row["id"],
+                    row["url"],
+                    row["element_name"],
+                    row["xpath"],
+                    row["text"],
+                    row["user"],
+                    row["time_created"],
+                ]
+            )
+
+        # Save the workbook to a BytesIO buffer
+        excel_buffer = BytesIO()
+        workbook.save(excel_buffer)
+        _ = excel_buffer.seek(0)
+
+        # Create the response
+        response = StreamingResponse(
+            excel_buffer,
+            media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        )
+        response.headers["Content-Disposition"] = "attachment; filename=export.xlsx"
 
         return response
     except Exception as e:
@@ -128,3 +165,22 @@ async def delete(delete_scrape_jobs: DeleteScrapeJobs):
         if result
         else JSONResponse({"error": "Jobs not deleted."})
     )
+
+
+@app.get("/api/logs")
+async def get_own_logs():
+    container_id = "scraperr"
+    try:
+        container = client.containers.get(container_id)
+        log_stream = container.logs(stream=True, follow=True)
+
+        def log_generator():
+            try:
+                for log in log_stream:
+                    yield f"data: {log.decode('utf-8')}\n\n"
+            except Exception as e:
+                yield f"data: {str(e)}\n\n"
+
+        return StreamingResponse(log_generator(), media_type="text/event-stream")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
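A note on the new `/api/logs` endpoint: `container.logs(stream=True, follow=True)` yields raw byte chunks, and a single chunk can contain several newlines; a literal newline inside an SSE `data:` payload terminates the event early, so multi-line chunks may reach the browser as truncated messages. A minimal line-safe framing sketch (a hypothetical alternative to the diff's `log_generator`, not part of this change):

```python
from typing import Iterator


def sse_frames(log_stream: Iterator[bytes]) -> Iterator[str]:
    """Emit one SSE event per log line, buffering partial lines.

    Hypothetical helper: splitting on newlines keeps an embedded newline
    in a docker log chunk from ending an SSE event prematurely.
    """
    buffer = b""
    for chunk in log_stream:
        buffer += chunk
        while b"\n" in buffer:
            line, buffer = buffer.split(b"\n", 1)
            yield f"data: {line.decode('utf-8', errors='replace')}\n\n"
    if buffer:  # flush a trailing partial line
        yield f"data: {buffer.decode('utf-8', errors='replace')}\n\n"
```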
diff --git a/api/backend/scraping.py b/api/backend/scraping.py
index c4777d3..ab7d970 100644
--- a/api/backend/scraping.py
+++ b/api/backend/scraping.py
@@ -1,8 +1,7 @@
-# STL
 import logging
 from typing import Any, Optional
+import time
 
-# PDM
 from bs4 import BeautifulSoup
 from lxml import etree
 from seleniumwire import webdriver
@@ -16,7 +15,6 @@ from selenium.webdriver.chrome.options import Options as ChromeOptions
 from selenium.webdriver.chrome.service import Service
 from urllib.parse import urlparse, urljoin
 
-# LOCAL
 from api.backend.models import Element, CapturedElement
 
 LOG = logging.getLogger(__name__)
@@ -42,7 +40,6 @@ def clean_xpath(xpath: str) -> str:
         else:
             clean_parts.append(part)
     clean_xpath = "//".join(clean_parts).replace("////", "//")
-    clean_xpath = clean_xpath.replace("'", "\\'")
     return clean_xpath
 
 
@@ -56,9 +53,7 @@ def interceptor(headers: dict[str, Any]):
         for key, val in headers.items():
             if request.headers.get(key):
                 del request.headers[key]
-
             request.headers[key] = val
-
         if "sec-ch-ua" in request.headers:
             original_value = request.headers["sec-ch-ua"]
             del request.headers["sec-ch-ua"]
@@ -93,20 +88,28 @@ async def make_site_request(
     if url in visited_urls:
         return
 
+    LOG.info(f"Visited URLs: {visited_urls}")
+
     driver = create_driver()
+    driver.implicitly_wait(10)
 
     if headers:
         driver.request_interceptor = interceptor(headers)
 
     try:
+        LOG.info(f"Visiting URL: {url}")
         driver.get(url)
+        final_url = driver.current_url
+        LOG.info(f"Final URL: {final_url}")
         visited_urls.add(url)
+        visited_urls.add(final_url)
 
         _ = WebDriverWait(driver, 10).until(
             EC.presence_of_element_located((By.TAG_NAME, "body"))
         )
+        time.sleep(5)
 
         page_source = driver.page_source
         LOG.debug(f"Page source for url: {url}\n{page_source}")
 
-        pages.add((page_source, url))
+        pages.add((page_source, final_url))
     finally:
         driver.quit()
@@ -117,10 +120,11 @@ async def make_site_request(
 
     for a_tag in soup.find_all("a"):
         link = a_tag.get("href")
+        LOG.info(f"Found Link: {link}")
 
         if link:
             if not urlparse(link).netloc:
-                base_url = "{0.scheme}://{0.netloc}".format(urlparse(original_url))
+                base_url = "{0.scheme}://{0.netloc}".format(urlparse(final_url))
                 link = urljoin(base_url, link)
 
             if link not in visited_urls and is_same_domain(link, original_url):
diff --git a/docker-compose.yml b/docker-compose.yml
index 4d5ba2f..62e127e 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -7,6 +7,8 @@ services:
     container_name: scraperr
     env_file:
       - ./.env
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
     labels:
       - "traefik.enable=true"
       - "traefik.http.routers.frontend.rule=Host(`${HOSTNAME}`)"
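The new `time.sleep(5)` after the body-presence wait gives JavaScript-heavy pages time to render, but it costs a flat five seconds on every request. If that becomes a bottleneck, a condition-based wait is a drop-in alternative; a sketch (assumes waiting on `document.readyState` is enough, which can still under-wait for content that loads after the document settles):

```python
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.support.ui import WebDriverWait


def wait_for_page_load(driver: WebDriver, timeout: int = 10) -> None:
    # Block until the browser reports the document fully loaded,
    # rather than sleeping unconditionally.
    _ = WebDriverWait(driver, timeout).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )
```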
diff --git a/pdm.lock b/pdm.lock
index bd831e0..f016ec8 100644
--- a/pdm.lock
+++ b/pdm.lock
@@ -250,6 +250,17 @@ version = "2.6.1"
 requires_python = ">=3.8"
 summary = "DNS toolkit"
 
+[[package]]
+name = "docker"
+version = "7.1.0"
+requires_python = ">=3.8"
+summary = "A Python library for the Docker Engine API."
+dependencies = [
+    "pywin32>=304; sys_platform == \"win32\"",
+    "requests>=2.26.0",
+    "urllib3>=1.26.0",
+]
+
 [[package]]
 name = "ecdsa"
 version = "0.19.0"
@@ -901,6 +912,11 @@ name = "pytz"
 version = "2024.1"
 summary = "World timezone definitions, modern and historical"
 
+[[package]]
+name = "pywin32"
+version = "306"
+summary = "Python for Window Extensions"
+
 [[package]]
 name = "pyyaml"
 version = "6.0.1"
@@ -1276,7 +1292,7 @@ dependencies = [
 lock_version = "4.2"
 cross_platform = true
 groups = ["default", "dev"]
-content_hash = "sha256:1bc0c4f11b837a58a72074fdc63d90cb5d0b404b4632e89b942c177a643fc0d3"
+content_hash = "sha256:5a345aa8e39fb3285ff36fa9319d6601295c787ec3f5529415aeb89cee4afcfd"
 
 [metadata.files]
 "aiohttp 3.9.5" = [
@@ -1749,6 +1765,10 @@ content_hash = "sha256:1bc0c4f11b837a58a72074fdc63d90cb5d0b404b4632e89b942c177a6
     {url = "https://files.pythonhosted.org/packages/37/7d/c871f55054e403fdfd6b8f65fd6d1c4e147ed100d3e9f9ba1fe695403939/dnspython-2.6.1.tar.gz", hash = "sha256:e8f0f9c23a7b7cb99ded64e6c3a6f3e701d78f50c55e002b839dea7225cff7cc"},
     {url = "https://files.pythonhosted.org/packages/87/a1/8c5287991ddb8d3e4662f71356d9656d91ab3a36618c3dd11b280df0d255/dnspython-2.6.1-py3-none-any.whl", hash = "sha256:5ef3b9680161f6fa89daf8ad451b5f1a33b18ae8a1c6778cdf4b43f08c0a6e50"},
 ]
+"docker 7.1.0" = [
+    {url = "https://files.pythonhosted.org/packages/91/9b/4a2ea29aeba62471211598dac5d96825bb49348fa07e906ea930394a83ce/docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c"},
+    {url = "https://files.pythonhosted.org/packages/e3/26/57c6fb270950d476074c087527a558ccb6f4436657314bfb6cdf484114c4/docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0"},
+]
 "ecdsa 0.19.0" = [
     {url = "https://files.pythonhosted.org/packages/00/e7/ed3243b30d1bec41675b6394a1daae46349dc2b855cb83be846a5a918238/ecdsa-0.19.0-py2.py3-none-any.whl", hash = "sha256:2cea9b88407fdac7bbeca0833b189e4c9c53f2ef1e1eaa29f6224dbc809b707a"},
     {url = "https://files.pythonhosted.org/packages/5e/d0/ec8ac1de7accdcf18cfe468653ef00afd2f609faf67c423efbd02491051b/ecdsa-0.19.0.tar.gz", hash = "sha256:60eaad1199659900dd0af521ed462b793bbdf867432b3948e87416ae4caf6bf8"},
@@ -2685,6 +2705,22 @@ content_hash = "sha256:1bc0c4f11b837a58a72074fdc63d90cb5d0b404b4632e89b942c177a6
     {url = "https://files.pythonhosted.org/packages/90/26/9f1f00a5d021fff16dee3de13d43e5e978f3d58928e129c3a62cf7eb9738/pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
     {url = "https://files.pythonhosted.org/packages/9c/3d/a121f284241f08268b21359bd425f7d4825cffc5ac5cd0e1b3d82ffd2b10/pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
 ]
+"pywin32 306" = [
+    {url = "https://files.pythonhosted.org/packages/08/dc/28c668097edfaf4eac4617ef7adf081b9cf50d254672fcf399a70f5efc41/pywin32-306-cp310-cp310-win32.whl", hash = "sha256:06d3420a5155ba65f0b72f2699b5bacf3109f36acbe8923765c22938a69dfc8d"},
+    {url = "https://files.pythonhosted.org/packages/0e/57/c3ec32b498f24a2392404d1f0fd29f47a3f7339d7d579df7a0560cff337c/pywin32-306-cp38-cp38-win32.whl", hash = "sha256:e4c092e2589b5cf0d365849e73e02c391c1349958c5ac3e9d5ccb9a28e017b3a"},
+    {url = "https://files.pythonhosted.org/packages/14/91/17e016d5923e178346aabda3dfec6629d1a26efe587d19667542105cf0a6/pywin32-306-cp312-cp312-win32.whl", hash = "sha256:383229d515657f4e3ed1343da8be101000562bf514591ff383ae940cad65458b"},
"https://files.pythonhosted.org/packages/1c/43/e3444dc9a12f8365d9603c2145d16bf0a2f8180f343cf87be47f5579e547/pywin32-306-cp312-cp312-win_arm64.whl", hash = "sha256:5821ec52f6d321aa59e2db7e0a35b997de60c201943557d108af9d4ae1ec7040"}, + {url = "https://files.pythonhosted.org/packages/1c/f7/24d8ed4fd9c43b90354df7764f81f0dd5e623f9a50f1538f90fe085d6dff/pywin32-306-cp39-cp39-win_amd64.whl", hash = "sha256:39b61c15272833b5c329a2989999dcae836b1eed650252ab1b7bfbe1d59f30f4"}, + {url = "https://files.pythonhosted.org/packages/28/19/6b8f416ff02132c404042f251eb90a41d15abe677481fcff22077e943c6f/pywin32-306-cp37-cp37m-win32.whl", hash = "sha256:1c73ea9a0d2283d889001998059f5eaaba3b6238f767c9cf2833b13e6a685f65"}, + {url = "https://files.pythonhosted.org/packages/7e/7f/419c4fcadcaa374a0ae41cbdf6c3a81452892dd6c523aea629d17e49146e/pywin32-306-cp39-cp39-win32.whl", hash = "sha256:e25fd5b485b55ac9c057f67d94bc203f3f6595078d1fb3b458c9c28b7153a802"}, + {url = "https://files.pythonhosted.org/packages/7e/9e/ad6b1ae2a5ad1066dc509350e0fbf74d8d50251a51e420a2a8feaa0cecbd/pywin32-306-cp311-cp311-win_amd64.whl", hash = "sha256:a7639f51c184c0272e93f244eb24dafca9b1855707d94c192d4a0b4c01e1100e"}, + {url = "https://files.pythonhosted.org/packages/80/e6/08192cb5728a6ffdb70ea990d9a1351b320d31a751bb463e652d9e05e7aa/pywin32-306-cp37-cp37m-win_amd64.whl", hash = "sha256:72c5f621542d7bdd4fdb716227be0dd3f8565c11b280be6315b06ace35487d36"}, + {url = "https://files.pythonhosted.org/packages/83/1c/25b79fc3ec99b19b0a0730cc47356f7e2959863bf9f3cd314332bddb4f68/pywin32-306-cp312-cp312-win_amd64.whl", hash = "sha256:37257794c1ad39ee9be652da0462dc2e394c8159dfd913a8a4e8eb6fd346da0e"}, + {url = "https://files.pythonhosted.org/packages/8b/1e/fc18ad83ca553e01b97aa8393ff10e33c1fb57801db05488b83282ee9913/pywin32-306-cp311-cp311-win32.whl", hash = "sha256:e65028133d15b64d2ed8f06dd9fbc268352478d4f9289e69c190ecd6818b6407"}, + {url = "https://files.pythonhosted.org/packages/91/20/f744bff1da8f43388498503634378dbbefbe493e65675f2cc52f7185c2c2/pywin32-306-cp311-cp311-win_arm64.whl", hash = "sha256:70dba0c913d19f942a2db25217d9a1b726c278f483a919f1abfed79c9cf64d3a"}, + {url = "https://files.pythonhosted.org/packages/d3/d6/891894edec688e72c2e308b3243fad98b4066e1839fd2fe78f04129a9d31/pywin32-306-cp310-cp310-win_amd64.whl", hash = "sha256:84f4471dbca1887ea3803d8848a1616429ac94a4a8d05f4bc9c5dcfd42ca99c8"}, + {url = "https://files.pythonhosted.org/packages/fa/80/a6b22e031590cc5f4fcbd5bf4bcf63a9dabce9d59065f53add99a8caaec5/pywin32-306-cp38-cp38-win_amd64.whl", hash = "sha256:e8ac1ae3601bee6ca9f7cb4b5363bf1c0badb935ef243c4733ff9a393b1690c0"}, +] "pyyaml 6.0.1" = [ {url = "https://files.pythonhosted.org/packages/02/74/b2320ebe006b6a521cf929c78f12a220b9db319b38165023623ed195654b/PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {url = "https://files.pythonhosted.org/packages/03/5c/c4671451b2f1d76ebe352c0945d4cd13500adb5d05f5a51ee296d80152f7/PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, diff --git a/pyproject.toml b/pyproject.toml index 9c3afc4..a19bf19 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ dependencies = [ "selenium-wire>=5.1.0", "blinker<1.8.0", "setuptools>=71.0.4", + "docker>=7.1.0", ] requires-python = ">=3.10" readme = "README.md" diff --git a/src/components/JobTable.tsx b/src/components/JobTable.tsx index f602aac..e17e0d6 100644 --- a/src/components/JobTable.tsx +++ 
diff --git a/src/components/JobTable.tsx b/src/components/JobTable.tsx
index f602aac..e17e0d6 100644
--- a/src/components/JobTable.tsx
+++ b/src/components/JobTable.tsx
@@ -73,7 +73,7 @@ const JobTable: React.FC<Props> = ({ jobs, fetchJobs }) => {
     const a = document.createElement("a");
     a.style.display = "none";
     a.href = url;
-    a.download = `job_${ids.splice(0, 1)}.csv`;
+    a.download = `job_${ids.splice(0, 1)}.xlsx`;
     document.body.appendChild(a);
     a.click();
     window.URL.revokeObjectURL(url);
@@ -134,6 +134,8 @@ const JobTable: React.FC<Props> = ({ jobs, fetchJobs }) => {
       return job.url.toLowerCase().includes(searchQuery.toLowerCase());
     } else if (searchMode === "id") {
       return job.id.toLowerCase().includes(searchQuery.toLowerCase());
+    } else if (searchMode === "status") {
+      return job.status.toLowerCase().includes(searchQuery.toLowerCase());
     }
     return true;
   });
@@ -154,7 +156,7 @@ const JobTable: React.FC<Props> = ({ jobs, fetchJobs }) => {
         overflow="auto"
       >
@@ -212,11 +214,12 @@ const JobTable: React.FC<Props> = ({ jobs, fetchJobs }) => {
         >
           <TableCell>URL</TableCell>
           <TableCell>ID</TableCell>
+          <TableCell>Status</TableCell>
-
+
diff --git a/src/components/NavDrawer.tsx b/src/components/NavDrawer.tsx
index 1ce9e35..213c3e8 100644
--- a/src/components/NavDrawer.tsx
+++ b/src/components/NavDrawer.tsx
@@ -20,6 +20,7 @@ import {
 import HomeIcon from "@mui/icons-material/Home";
 import HttpIcon from "@mui/icons-material/Http";
 import ExpandMoreIcon from "@mui/icons-material/ExpandMore";
+import TerminalIcon from "@mui/icons-material/Terminal";
 import { useRouter } from "next/router";
 import { useTheme } from "@mui/material/styles";
@@ -76,6 +77,15 @@ const NavDrawer: React.FC<Props> = ({ toggleTheme, isDarkMode }) => {
+
+        <ListItem button onClick={() => router.push("/logs")}>
+          <ListItemIcon>
+            <TerminalIcon />
+          </ListItemIcon>
+          <ListItemText primary="Logs" />
+        </ListItem>
+
diff --git a/src/components/submit/ElementTable.tsx b/src/components/submit/ElementTable.tsx
--- a/src/components/submit/ElementTable.tsx
+++ b/src/components/submit/ElementTable.tsx
@@ ... @@ export const ElementTable = ({ rows, setRows, submittedURL }: Props) => {
+  const theme = useTheme();
   const [newRow, setNewRow] = useState({
     name: "",
     xpath: "",
@@ -72,10 +74,18 @@ export const ElementTable = ({ rows, setRows, submittedURL }: Props) => {
           aria-label="add"
           size="small"
           onClick={handleAddRow}
-          sx={{ height: "40px", width: "40px" }}
+          sx={{
+            height: "40px",
+            width: "40px",
+          }}
           disabled={!(newRow.xpath.length > 0 && newRow.name.length > 0)}
         >
-
+
diff --git a/src/components/submit/JobSubmitter.tsx b/src/components/submit/JobSubmitter.tsx
index dd65687..c073b1b 100644
--- a/src/components/submit/JobSubmitter.tsx
+++ b/src/components/submit/JobSubmitter.tsx
@@ -169,7 +169,7 @@ export const JobSubmitter = ({ stateProps }: Props) => {
           {loading ? <CircularProgress /> : "Submit"}
-
+
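With the frontend now saving `.xlsx`, the `/api/download` response can be spot-checked end to end. A sketch, assuming a local instance on `http://localhost:8000`, a hypothetical job id, and no additional auth headers:

```python
from io import BytesIO

import requests
from openpyxl import load_workbook

# Hypothetical values: adjust the base URL and job id for your instance.
resp = requests.post(
    "http://localhost:8000/api/download",
    json={"ids": ["example-job-id"]},
    timeout=30,
)
resp.raise_for_status()

workbook = load_workbook(BytesIO(resp.content))
sheet = workbook["Results"]  # sheet title set by the download endpoint
for row in sheet.iter_rows(values_only=True):
    print(row)
```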
diff --git a/src/pages/_app.tsx b/src/pages/_app.tsx
--- a/src/pages/_app.tsx
+++ b/src/pages/_app.tsx
@@ ... @@ const App: React.FC<AppProps> = ({ Component, pageProps }) => {
       setIsDarkMode(savedTheme === "dark");
     } else {
       const prefersDarkMode = window.matchMedia(
-        "(prefers-color-scheme: dark)",
+        "(prefers-color-scheme: dark)"
       ).matches;
       setIsDarkMode(prefersDarkMode);
     }
   }, []);
@@ -46,8 +46,9 @@ const App: React.FC<AppProps> = ({ Component, pageProps }) => {
         component="main"
         sx={{
           flexGrow: 1,
-          bgcolor: "background.default",
           p: 3,
+          bgcolor: "background.default",
+          minHeight: "100vh",
         }}
       >
         <Component {...pageProps} />
diff --git a/src/pages/jobs.tsx b/src/pages/jobs.tsx
index 485dbbb..f93d554 100644
--- a/src/pages/jobs.tsx
+++ b/src/pages/jobs.tsx
@@ -28,6 +28,11 @@ const Jobs = () => {
     }
   }, [user]);
 
+  useEffect(() => {
+    const intervalId = setInterval(fetchJobs, 5000);
+    return () => clearInterval(intervalId);
+  }, []);
+
   return (
     <>
       {user ? (
diff --git a/src/pages/logs.tsx b/src/pages/logs.tsx
new file mode 100644
index 0000000..22f3f82
--- /dev/null
+++ b/src/pages/logs.tsx
@@ -0,0 +1,49 @@
+import { Container } from "@mui/material";
+import { useEffect, useState } from "react";
+
+const Logs = () => {
+  const [logs, setLogs] = useState("");
+
+  useEffect(() => {
+    const eventSource = new EventSource("/api/logs");
+
+    eventSource.onmessage = (event) => {
+      setLogs((prevLogs) => prevLogs + event.data + "\n");
+    };
+
+    eventSource.onerror = () => {
+      eventSource.close();
+    };
+
+    return () => {
+      eventSource.close();
+    };
+  }, []);
+
+  return (
+    <Container>
+      <pre>
+        {logs}
+      </pre>
+    </Container>
+  );
+};
+
+export default Logs;