diff --git a/api/backend/amazon.py b/api/backend/amazon.py
index 42b49ec..6780db7 100644
--- a/api/backend/amazon.py
+++ b/api/backend/amazon.py
@@ -1,8 +1,49 @@
+# STL
+import os
+import logging
+from typing import Any
+
 # PDM
 import boto3
+from mypy_boto3_dynamodb.service_resource import Table, DynamoDBServiceResource
+
+LOG = logging.getLogger(__name__)
 
 
-def test_dyanmo():
-    dynamodb = boto3.resource("dynamodb", region_name="us-west-2")
-    table = dynamodb.Table("scrape")
-    print(table)
+def connect_to_dynamo() -> Table:
+    region_name = os.getenv("AWS_REGION")
+    dynamodb: DynamoDBServiceResource = boto3.resource(
+        "dynamodb", region_name=region_name
+    )
+    return dynamodb.Table("scrape")
+
+
+def insert(table: Table, item: dict[str, Any]) -> None:
+    response = table.put_item(Item=item)
+    LOG.info(f"Inserted item: {item} (response: {response})")
+
+
+def query(table: Table, index_name: str, key_condition: Any) -> list[Any]:
+    # Query a secondary index (e.g. the "user" GSI used by the API layer).
+    try:
+        response = table.query(
+            IndexName=index_name, KeyConditionExpression=key_condition
+        )
+        items = response.get("Items", [])
+        for item in items:
+            LOG.info(f"Queried item: {item}")
+        return items
+    except Exception as e:
+        LOG.error(f"Failed to query table: {e}")
+        raise
+
+
+def query_by_id(table: Table, key_condition: Any) -> list[Any]:
+    # Query the table's primary index (partition key "id").
+    try:
+        response = table.query(KeyConditionExpression=key_condition)
+        items = response.get("Items", [])
+        for item in items:
+            LOG.info(f"Queried item: {item}")
+        return items
+    except Exception as e:
+        LOG.error(f"Failed to query table: {e}")
+        raise
diff --git a/api/backend/app.py b/api/backend/app.py
index d539101..748f51b 100644
--- a/api/backend/app.py
+++ b/api/backend/app.py
@@ -1,18 +1,28 @@
 # STL
+import uuid
 import logging
+from io import StringIO
 
 # PDM
+import pandas as pd
 from fastapi import FastAPI
 from fastapi.encoders import jsonable_encoder
-from fastapi.responses import FileResponse, JSONResponse
+from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
 from fastapi.staticfiles import StaticFiles
 from fastapi.middleware.cors import CORSMiddleware
+from boto3.dynamodb.conditions import Key
 
 # LOCAL
-from api.backend.amazon import test_dyanmo
-from api.backend.models import SubmitScrapeJob
+from api.backend.amazon import query, insert, query_by_id, connect_to_dynamo
+from api.backend.models import DownloadJob, SubmitScrapeJob, RetrieveScrapeJobs
 from api.backend.scraping import scrape
 
+logging.basicConfig(
+    level=logging.INFO,
+    format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
+    handlers=[logging.StreamHandler()],
+)
+
 LOG = logging.getLogger(__name__)
 
 app = FastAPI(title="api")
@@ -33,19 +43,53 @@ def read_root():
     return FileResponse("./dist/index.html")
 
 
-@app.get("/api/endpoint")
-async def test_endpoint():
-    test_dyanmo()
-    return "Hello World!"
-
-
 @app.post("/api/submit-scrape-job")
 async def submit_scrape_job(job: SubmitScrapeJob):
+    LOG.info(f"Received job: {job}")
     try:
         scraped = await scrape(job.url, job.elements)
-        print(scraped)
+
+        LOG.info(
+            f"Scraped result for url: {job.url}, with elements: {job.elements}\n{scraped}"
+        )
+
         json_scraped = jsonable_encoder(scraped)
-        print(json_scraped)
+        table = connect_to_dynamo()
+        job.result = json_scraped
+        job.id = uuid.uuid4().hex
+        insert(table, jsonable_encoder(job))
         return JSONResponse(content=json_scraped)
     except Exception as e:
         return JSONResponse(content={"error": str(e)}, status_code=500)
+
+
+@app.post("/api/retrieve-scrape-jobs")
+async def retrieve_scrape_jobs(retrieve: RetrieveScrapeJobs):
+    LOG.info(f"Retrieving jobs for account: {retrieve.user}")
+    try:
+        table = connect_to_dynamo()
+        results = query(table, "user", Key("user").eq(retrieve.user))
+        return JSONResponse(content=results)
+    except Exception as e:
+        LOG.error(f"Exception occurred: {e}")
+        return JSONResponse(content={"error": str(e)}, status_code=500)
+
+
+@app.post("/api/download")
+async def download(download_job: DownloadJob):
+    LOG.info(f"Downloading job with id: {download_job.id}")
+    try:
+        table = connect_to_dynamo()
+        results = query_by_id(table, Key("id").eq(download_job.id))
+
+        df = pd.DataFrame(results)
+
+        csv_buffer = StringIO()
+        df.to_csv(csv_buffer, index=False)
+        _ = csv_buffer.seek(0)
+        response = StreamingResponse(csv_buffer, media_type="text/csv")
+        response.headers["Content-Disposition"] = "attachment; filename=export.csv"
+        return response
+
+    except Exception as e:
+        LOG.error(f"Exception occurred: {e}")
+        return JSONResponse(content={"error": str(e)}, status_code=500)
diff --git a/api/backend/models.py b/api/backend/models.py
index 6d93c75..ba3b0fc 100644
--- a/api/backend/models.py
+++ b/api/backend/models.py
@@ -1,3 +1,6 @@
+# STL
+from typing import Any, Optional
+
 # PDM
 import pydantic
 
@@ -15,5 +18,17 @@ class CapturedElement(pydantic.BaseModel):
 
 class SubmitScrapeJob(pydantic.BaseModel):
+    id: Optional[str] = None
     url: str
     elements: list[Element]
+    user: str
+    time_created: str
+    result: Optional[dict[str, Any]] = None
+
+
+class RetrieveScrapeJobs(pydantic.BaseModel):
+    user: str
+
+
+class DownloadJob(pydantic.BaseModel):
+    id: str
diff --git a/api/backend/scraping.py b/api/backend/scraping.py
index c88903f..51b813c 100644
--- a/api/backend/scraping.py
+++ b/api/backend/scraping.py
@@ -1,3 +1,6 @@
+# STL
+import logging
+
 # PDM
 from bs4 import BeautifulSoup
 from lxml import etree
@@ -14,10 +17,26 @@ from selenium.webdriver.chrome.service import Service
 
 # LOCAL
 from api.backend.models import Element, CapturedElement
 
+LOG = logging.getLogger(__name__)
+
 
 class HtmlElement(_Element): ...
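 
 
+# XPaths arrive verbatim from the frontend form; clean_xpath() below rewrites
+# every separator onto the descendant axis ("//") so an absolute path such as
+# "/div/span" still matches elements anywhere in the Selenium-rendered DOM.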
+def clean_xpath(xpath: str) -> str:
+    parts = [part for part in xpath.split("/") if part]
+    cleaned = "//".join(parts)
+    if not xpath.startswith("."):
+        cleaned = "//" + cleaned
+    # Do not backslash-escape single quotes here: XPath 1.0 has no backslash
+    # escapes, and lxml rejects expressions that contain them.
+    return cleaned
+
+
 def sxpath(context: _Element, xpath: str) -> list[HtmlElement]:
     return context.xpath(xpath)  # type: ignore [reportReturnType]
 
@@ -44,7 +63,7 @@ async def make_site_request(url: str) -> str:
     finally:
         driver.quit()
 
-    print(page_source)
+    LOG.debug(f"Page source for url: {url}\n{page_source}")
 
     return page_source
 
@@ -55,7 +74,7 @@ async def collect_scraped_elements(page: str, xpaths: list[Element]):
     elements: dict[str, list[CapturedElement]] = dict()
 
     for elem in xpaths:
-        el = sxpath(root, elem.xpath)
+        el = sxpath(root, clean_xpath(elem.xpath))
         text = ["".join(str(e) for e in e.itertext()) for e in el]
         captured_element = CapturedElement(
             xpath=elem.xpath, text=",".join(text), name=elem.name
         )
diff --git a/ipython.py b/ipython.py
index 976d988..11a477a 100644
--- a/ipython.py
+++ b/ipython.py
@@ -1,17 +1,37 @@
 # STL
-import asyncio
+import os
 
-# LOCAL
-from api.backend.scraping import scrape
+# PDM
+import boto3
+from dotenv import load_dotenv
+
+# Load environment variables from .env file
+load_dotenv()
 
 
-async def main():
-    url = "https://darksouls3.wiki.fextralife.com/Dark+Souls+3"
-    xpaths = [".//h3[@class='bonfire']", ".//div[@class='comment']"]
-    scraped = await scrape(url, xpaths)
+def test_insert_and_delete():
+    # Get environment variables
+    region_name = os.getenv("AWS_REGION")
 
+    # Initialize DynamoDB resource
+    dynamodb = boto3.resource("dynamodb", region_name=region_name)
+    table = dynamodb.Table("scrape")
 
-    print(scraped)
+    # Item to insert
+    item = {
+        "id": "123",  # Replace with the appropriate id value
+        "attribute1": "value1",
+        "attribute2": "value2",
+        # Add more attributes as needed
+    }
+
+    # Insert the item
+    table.put_item(Item=item)
+    print(f"Inserted item: {item}")
+
+    # Delete the item
+    table.delete_item(Key={"id": "123"})  # Replace with the appropriate id value
+    print(f"Deleted item with id: {item['id']}")
 
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    test_insert_and_delete()
diff --git a/pdm.lock b/pdm.lock
index 0f9c5c2..1427f44 100644
--- a/pdm.lock
+++ b/pdm.lock
@@ -5,7 +5,7 @@ groups = ["default", "dev"]
 strategy = ["cross_platform", "inherit_metadata"]
 lock_version = "4.4.2"
 
-content_hash = "sha256:86d834de010a22751311e6b9553fd963dd2073e49bbf17872f5fff0630f99164"
+content_hash = "sha256:a12cdcf1cdd6f91260a7d7126be4581a6820caf91ffc26386abfe9a6b3fbc9d9"
 
 [[package]]
 name = "aiohttp"
@@ -466,6 +466,17 @@ files = [
     {file = "email_validator-2.1.1.tar.gz", hash = "sha256:200a70680ba08904be6d1eef729205cc0d687634399a5924d842533efb824b84"},
 ]
 
+[[package]]
+name = "et-xmlfile"
+version = "1.1.0"
+requires_python = ">=3.6"
+summary = "An implementation of lxml.xmlfile for the standard library"
+groups = ["default"]
+files = [
+    {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"},
+    {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"},
+]
+
 [[package]]
 name = "exceptiongroup"
 version = "1.2.1"
@@ -1199,6 +1210,65 @@ files = [
     {file = "mypy_boto3_sqs-1.34.121.tar.gz", hash = "sha256:bdbc623235ffc8127cb8753f49323f74a919df552247b0b2caaf85cf9bb495b8"},
 ]
 
+[[package]]
+name = "numpy"
+version = "2.0.0"
+requires_python = ">=3.9"
+summary = "Fundamental package for array computing in Python"
+groups = ["default"]
+marker = "python_version <= \"3.11\" or python_version >= \"3.12\""
+files = [
+    {file = "numpy-2.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:04494f6ec467ccb5369d1808570ae55f6ed9b5809d7f035059000a37b8d7e86f"},
+    {file = "numpy-2.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2635dbd200c2d6faf2ef9a0d04f0ecc6b13b3cad54f7c67c61155138835515d2"},
+    {file = "numpy-2.0.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:0a43f0974d501842866cc83471bdb0116ba0dffdbaac33ec05e6afed5b615238"},
+    {file = "numpy-2.0.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:8d83bb187fb647643bd56e1ae43f273c7f4dbcdf94550d7938cfc32566756514"},
+    {file = "numpy-2.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79e843d186c8fb1b102bef3e2bc35ef81160ffef3194646a7fdd6a73c6b97196"},
+    {file = "numpy-2.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d7696c615765091cc5093f76fd1fa069870304beaccfd58b5dcc69e55ef49c1"},
+    {file = "numpy-2.0.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b4c76e3d4c56f145d41b7b6751255feefae92edbc9a61e1758a98204200f30fc"},
+    {file = "numpy-2.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:acd3a644e4807e73b4e1867b769fbf1ce8c5d80e7caaef0d90dcdc640dfc9787"},
+    {file = "numpy-2.0.0-cp310-cp310-win32.whl", hash = "sha256:cee6cc0584f71adefe2c908856ccc98702baf95ff80092e4ca46061538a2ba98"},
+    {file = "numpy-2.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:ed08d2703b5972ec736451b818c2eb9da80d66c3e84aed1deeb0c345fefe461b"},
+    {file = "numpy-2.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ad0c86f3455fbd0de6c31a3056eb822fc939f81b1618f10ff3406971893b62a5"},
+    {file = "numpy-2.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e7f387600d424f91576af20518334df3d97bc76a300a755f9a8d6e4f5cadd289"},
+    {file = "numpy-2.0.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:34f003cb88b1ba38cb9a9a4a3161c1604973d7f9d5552c38bc2f04f829536609"},
+    {file = "numpy-2.0.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:b6f6a8f45d0313db07d6d1d37bd0b112f887e1369758a5419c0370ba915b3871"},
+    {file = "numpy-2.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f64641b42b2429f56ee08b4f427a4d2daf916ec59686061de751a55aafa22e4"},
+    {file = "numpy-2.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a7039a136017eaa92c1848152827e1424701532ca8e8967fe480fe1569dae581"},
+    {file = "numpy-2.0.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:46e161722e0f619749d1cd892167039015b2c2817296104487cd03ed4a955995"},
+    {file = "numpy-2.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0e50842b2295ba8414c8c1d9d957083d5dfe9e16828b37de883f51fc53c4016f"},
+    {file = "numpy-2.0.0-cp311-cp311-win32.whl", hash = "sha256:2ce46fd0b8a0c947ae047d222f7136fc4d55538741373107574271bc00e20e8f"},
+    {file = "numpy-2.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:fbd6acc766814ea6443628f4e6751d0da6593dae29c08c0b2606164db026970c"},
+    {file = "numpy-2.0.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:354f373279768fa5a584bac997de6a6c9bc535c482592d7a813bb0c09be6c76f"},
+    {file = "numpy-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4d2f62e55a4cd9c58c1d9a1c9edaedcd857a73cb6fda875bf79093f9d9086f85"},
+    {file = "numpy-2.0.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:1e72728e7501a450288fc8e1f9ebc73d90cfd4671ebbd631f3e7857c39bd16f2"},
"numpy-2.0.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:84554fc53daa8f6abf8e8a66e076aff6ece62de68523d9f665f32d2fc50fd66e"}, + {file = "numpy-2.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c73aafd1afca80afecb22718f8700b40ac7cab927b8abab3c3e337d70e10e5a2"}, + {file = "numpy-2.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49d9f7d256fbc804391a7f72d4a617302b1afac1112fac19b6c6cec63fe7fe8a"}, + {file = "numpy-2.0.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:0ec84b9ba0654f3b962802edc91424331f423dcf5d5f926676e0150789cb3d95"}, + {file = "numpy-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:feff59f27338135776f6d4e2ec7aeeac5d5f7a08a83e80869121ef8164b74af9"}, + {file = "numpy-2.0.0-cp312-cp312-win32.whl", hash = "sha256:c5a59996dc61835133b56a32ebe4ef3740ea5bc19b3983ac60cc32be5a665d54"}, + {file = "numpy-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:a356364941fb0593bb899a1076b92dfa2029f6f5b8ba88a14fd0984aaf76d0df"}, + {file = "numpy-2.0.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:9416a5c2e92ace094e9f0082c5fd473502c91651fb896bc17690d6fc475128d6"}, + {file = "numpy-2.0.0-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:17067d097ed036636fa79f6a869ac26df7db1ba22039d962422506640314933a"}, + {file = "numpy-2.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38ecb5b0582cd125f67a629072fed6f83562d9dd04d7e03256c9829bdec027ad"}, + {file = "numpy-2.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:cef04d068f5fb0518a77857953193b6bb94809a806bd0a14983a8f12ada060c9"}, + {file = "numpy-2.0.0.tar.gz", hash = "sha256:cf5d1c9e6837f8af9f92b6bd3e86d513cdc11f60fd62185cc49ec7d1aba34864"}, +] + +[[package]] +name = "openpyxl" +version = "3.1.5" +requires_python = ">=3.8" +summary = "A Python library to read/write Excel 2010 xlsx/xlsm files" +groups = ["default"] +dependencies = [ + "et-xmlfile", +] +files = [ + {file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"}, + {file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"}, +] + [[package]] name = "orjson" version = "3.10.3" @@ -1261,6 +1331,45 @@ files = [ {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, ] +[[package]] +name = "pandas" +version = "2.2.2" +requires_python = ">=3.9" +summary = "Powerful data structures for data analysis, time series, and statistics" +groups = ["default"] +dependencies = [ + "numpy>=1.22.4; python_version < \"3.11\"", + "numpy>=1.23.2; python_version == \"3.11\"", + "numpy>=1.26.0; python_version >= \"3.12\"", + "python-dateutil>=2.8.2", + "pytz>=2020.1", + "tzdata>=2022.7", +] +files = [ + {file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"}, + {file = "pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"}, + {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"}, + {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"}, + {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = 
"sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"}, + {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8e5a0b00e1e56a842f922e7fae8ae4077aee4af0acb5ae3622bd4b4c30aedf99"}, + {file = "pandas-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:ddf818e4e6c7c6f4f7c8a12709696d193976b591cc7dc50588d3d1a6b5dc8772"}, + {file = "pandas-2.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:696039430f7a562b74fa45f540aca068ea85fa34c244d0deee539cb6d70aa288"}, + {file = "pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8e90497254aacacbc4ea6ae5e7a8cd75629d6ad2b30025a4a8b09aa4faf55151"}, + {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58b84b91b0b9f4bafac2a0ac55002280c094dfc6402402332c0913a59654ab2b"}, + {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2123dc9ad6a814bcdea0f099885276b31b24f7edf40f6cdbc0912672e22eee"}, + {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2925720037f06e89af896c70bca73459d7e6a4be96f9de79e2d440bd499fe0db"}, + {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0cace394b6ea70c01ca1595f839cf193df35d1575986e484ad35c4aeae7266c1"}, + {file = "pandas-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:873d13d177501a28b2756375d59816c365e42ed8417b41665f346289adc68d24"}, + {file = "pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9dfde2a0ddef507a631dc9dc4af6a9489d5e2e740e226ad426a05cabfbd7c8ef"}, + {file = "pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e9b79011ff7a0f4b1d6da6a61aa1aa604fb312d6647de5bad20013682d1429ce"}, + {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cb51fe389360f3b5a4d57dbd2848a5f033350336ca3b340d1c53a1fad33bcad"}, + {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eee3a87076c0756de40b05c5e9a6069c035ba43e8dd71c379e68cab2c20f16ad"}, + {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3e374f59e440d4ab45ca2fffde54b81ac3834cf5ae2cdfa69c90bc03bde04d76"}, + {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"}, + {file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"}, + {file = "pandas-2.2.2.tar.gz", hash = "sha256:9e79019aba43cb4fda9e4d983f8e88ca0373adbb697ae9c6c43093218de28b54"}, +] + [[package]] name = "parse" version = "1.20.2" @@ -1540,6 +1649,16 @@ files = [ {file = "python_multipart-0.0.9.tar.gz", hash = "sha256:03f54688c663f1b7977105f021043b0793151e4cb1c1a9d4a11fc13d622c4026"}, ] +[[package]] +name = "pytz" +version = "2024.1" +summary = "World timezone definitions, modern and historical" +groups = ["default"] +files = [ + {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, + {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, +] + [[package]] name = "pyyaml" version = "6.0.1" @@ -1852,6 +1971,17 @@ files = [ {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, ] +[[package]] +name = "tzdata" +version = "2024.1" +requires_python = ">=2" +summary = "Provider of IANA time zone data" +groups = ["default"] +files = [ + {file = 
"tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252"}, + {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"}, +] + [[package]] name = "ujson" version = "5.10.0" @@ -2185,6 +2315,17 @@ files = [ {file = "wsproto-1.2.0.tar.gz", hash = "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065"}, ] +[[package]] +name = "xlsxwriter" +version = "3.2.0" +requires_python = ">=3.6" +summary = "A Python module for creating Excel XLSX files." +groups = ["default"] +files = [ + {file = "XlsxWriter-3.2.0-py3-none-any.whl", hash = "sha256:ecfd5405b3e0e228219bcaf24c2ca0915e012ca9464a14048021d21a995d490e"}, + {file = "XlsxWriter-3.2.0.tar.gz", hash = "sha256:9977d0c661a72866a61f9f7a809e25ebbb0fb7036baa3b9fe74afcfca6b3cb8c"}, +] + [[package]] name = "yarl" version = "1.9.4" diff --git a/pyproject.toml b/pyproject.toml index 75ca1be..1270742 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,9 @@ dependencies = [ "selenium>=4.22.0", "webdriver-manager>=4.0.1", "pydantic>=2.8.2", + "pandas>=2.2.2", + "openpyxl>=3.1.5", + "xlsxwriter>=3.2.0", ] requires-python = ">=3.10" readme = "README.md" @@ -40,9 +43,10 @@ ignore = [] defineConstant = { DEBUG = true } stubPath = "" -reportUnknownMemberType=false +reportUnknownMemberType= false reportMissingImports = true reportMissingTypeStubs = false +reportAny = false pythonVersion = "3.9" pythonPlatform = "Linux" diff --git a/src/components/JobTable.tsx b/src/components/JobTable.tsx new file mode 100644 index 0000000..86ccba7 --- /dev/null +++ b/src/components/JobTable.tsx @@ -0,0 +1,124 @@ +import React, { useState } from "react"; +import { + TextField, + Table, + TableBody, + TableCell, + TableHead, + TableRow, + Button, +} from "@mui/material"; +import { useRouter } from "next/router"; + +interface Job { + id: string; + url: string; + elements: Object[]; + result: Object; + time_created: Date; +} + +interface JobTableProps { + jobs: Job[]; +} + +const JobTable: React.FC = ({ jobs }) => { + const router = useRouter(); + const handleDownload = async (id: string) => { + console.log(id); + const response = await fetch("/api/download", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ id: id }), + }); + + if (response.ok) { + const blob = await response.blob(); + const url = window.URL.createObjectURL(blob); + const a = document.createElement("a"); + a.style.display = "none"; + a.href = url; + a.download = `job_${id}.csv`; + document.body.appendChild(a); + a.click(); + window.URL.revokeObjectURL(url); + } else { + console.error("Failed to download the file."); + } + }; + + const handleNavigate = (elements: Object[], url: string) => { + router.push({ + pathname: "/", + query: { + elements: JSON.stringify(elements), + url: url, + }, + }); + }; + + return ( + <> + + + + id + url + elements + result + time_created + + + + {jobs.map((row, index) => ( + + + + + + + + + + + + + + + + + + + + + + + + ))} + +
+    </>
+  );
+};
+
+export default JobTable;
diff --git a/src/components/NavBar.tsx b/src/components/NavBar.tsx
deleted file mode 100644
index cbc81fe..0000000
--- a/src/components/NavBar.tsx
+++ /dev/null
@@ -1,34 +0,0 @@
-import React from "react";
-import { useAuth } from "../useAuth";
-import { LogoutOptions, RedirectLoginOptions } from "@auth0/auth0-react";
-
-const NavBar: React.FC = () => {
-  const { loginWithRedirect, logout, user, isAuthenticated } = useAuth();
-
-  const handleLogout = () => {
-    const logoutOptions: LogoutOptions = {};
-    logout(logoutOptions);
-  };
-
-  const handleLogin = () => {
-    const loginOptions: RedirectLoginOptions = {
-      authorizationParams: { redirect_uri: "http://localhost" },
-    };
-    loginWithRedirect(loginOptions);
-  };
-
-  return (
-  );
-};
-
-export default NavBar;
diff --git a/src/components/NavDrawer.tsx b/src/components/NavDrawer.tsx
new file mode 100644
index 0000000..66c1911
--- /dev/null
+++ b/src/components/NavDrawer.tsx
@@ -0,0 +1,115 @@
+import React, { useState } from "react";
+import { useAuth } from "../useAuth";
+import { LogoutOptions, RedirectLoginOptions } from "@auth0/auth0-react";
+import {
+  Box,
+  Drawer,
+  List,
+  ListItem,
+  ListItemIcon,
+  ListItemButton,
+  ListItemText,
+  AppBar,
+  Toolbar,
+  IconButton,
+  Typography,
+  Button,
+} from "@mui/material";
+import HomeIcon from "@mui/icons-material/Home";
+import HttpIcon from "@mui/icons-material/Http";
+import MenuIcon from "@mui/icons-material/Menu";
+import { useRouter } from "next/router";
+
+const NavDrawer: React.FC = () => {
+  const router = useRouter();
+  const { loginWithRedirect, logout, user, isAuthenticated } = useAuth();
+  const [open, setOpen] = useState(false);
+
+  const handleLogout = () => {
+    const logoutOptions: LogoutOptions = {};
+    logout(logoutOptions);
+  };
+
+  const handleLogin = () => {
+    const loginOptions: RedirectLoginOptions = {
+      authorizationParams: { redirect_uri: "http://localhost" },
+    };
+    loginWithRedirect(loginOptions);
+  };
+
+  const toggleDrawer =
+    (open: boolean) => (event: React.KeyboardEvent | React.MouseEvent) => {
+      if (
+        event.type === "keydown" &&
+        ((event as React.KeyboardEvent).key === "Tab" ||
+          (event as React.KeyboardEvent).key === "Shift")
+      ) {
+        return;
+      }
+      setOpen(open);
+    };
+
+  const DrawerList = (
+    <Box sx={{ width: 250 }} role="presentation" onClick={toggleDrawer(false)}>
+      <List>
+        <ListItem disablePadding>
+          <ListItemButton onClick={() => router.push("/")}>
+            <ListItemIcon>
+              <HomeIcon />
+            </ListItemIcon>
+            <ListItemText primary="Home" />
+          </ListItemButton>
+        </ListItem>
+        <ListItem disablePadding>
+          <ListItemButton onClick={() => router.push("/jobs")}>
+            <ListItemIcon>
+              <HttpIcon />
+            </ListItemIcon>
+            <ListItemText primary="Previous Jobs" />
+          </ListItemButton>
+        </ListItem>
+      </List>
+    </Box>
+  );
+
+  return (
+    <>
+      <AppBar position="static">
+        <Toolbar>
+          <IconButton
+            edge="start"
+            color="inherit"
+            aria-label="menu"
+            onClick={toggleDrawer(true)}
+          >
+            <MenuIcon />
+          </IconButton>
+          {isAuthenticated ? (
+            <>
+              <Typography sx={{ flexGrow: 1 }}>
+                Welcome, {user?.name}
+              </Typography>
+              <Button color="inherit" onClick={handleLogout}>
+                Logout
+              </Button>
+            </>
+          ) : (
+            <Button color="inherit" onClick={handleLogin}>
+              Login
+            </Button>
+          )}
+        </Toolbar>
+      </AppBar>
+      <Drawer open={open} onClose={toggleDrawer(false)}>
+        {DrawerList}
+      </Drawer>
+    </>
+  );
+};
+
+export default NavDrawer;
diff --git a/src/pages/_app.tsx b/src/pages/_app.tsx
index 36700c7..77b6ad9 100644
--- a/src/pages/_app.tsx
+++ b/src/pages/_app.tsx
@@ -5,6 +5,7 @@ import React from "react";
 import type { AppProps } from "next/app";
 import Head from "next/head";
 import { Auth0Provider } from "@auth0/auth0-react";
+import NavDrawer from "../components/NavDrawer";
 
 const domain = process.env.NEXT_PUBLIC_AUTH0_ISSUER_BASE_URL || "";
 const clientId = process.env.NEXT_PUBLIC_AUTH0_CLIENT_ID || "";
@@ -26,6 +27,7 @@ const App: React.FC<AppProps> = ({ Component, pageProps }) => {
       cacheLocation="localstorage"
       useRefreshTokens={true}
     >
+      <NavDrawer />
       <Component {...pageProps} />
diff --git a/src/pages/index.tsx b/src/pages/index.tsx
index 6660091..a999c08 100644
--- a/src/pages/index.tsx
+++ b/src/pages/index.tsx
@@ -1,5 +1,4 @@
-import React, { useState } from "react";
-import NavBar from "../components/NavBar";
+import React, { useState, useEffect } from "react";
 import {
   Typography,
   TextField,
@@ -13,6 +12,8 @@ import {
   Box,
 } from "@mui/material";
 import AddIcon from "@mui/icons-material/Add";
+import { useAuth0 } from "@auth0/auth0-react";
+import { useRouter } from "next/router";
 
 interface Element {
   name: string;
@@ -31,7 +32,12 @@ type Result = {
 };
 
 const Home = () => {
-  const [url, setUrl] = useState("");
+  const { user } = useAuth0();
+  const router = useRouter();
+
+  const { elements, url } = router.query;
+
+  const [submittedURL, setUrl] = useState("");
   const [rows, setRows] = useState([]);
   const [results, setResults] = useState(null);
   const [newRow, setNewRow] = useState({
@@ -40,8 +46,17 @@ const Home = () => {
     url: "",
   });
 
+  useEffect(() => {
+    if (elements) {
+      setRows(JSON.parse(elements as string));
+    }
+    if (url) {
+      setUrl(url as string);
+    }
+  }, [elements, url]);
+
   const handleAddRow = () => {
-    newRow.url = url;
+    newRow.url = submittedURL;
     setRows([...rows, newRow]);
     setNewRow({ name: "", xpath: "", url: "" });
   };
@@ -50,7 +65,12 @@ const Home = () => {
     fetch("/api/submit-scrape-job", {
       method: "POST",
       headers: { "content-type": "application/json" },
-      body: JSON.stringify({ url: url, elements: rows }),
+      body: JSON.stringify({
+        url: submittedURL,
+        elements: rows,
+        user: user?.name,
+        time_created: new Date().toISOString(),
+      }),
     })
       .then((response) => response.json())
      .then((data) => setResults(data));
@@ -58,7 +78,6 @@ const Home = () => {
   return (
     <>
-      <NavBar />
       <Typography variant="h4">Web Scraper</Typography>
@@ -97,9 +116,10 @@ const Home = () => {
             startIcon={<AddIcon />}
             onClick={handleAddRow}
           >
-            Add Row
+            Add Elements
           </Button>
+
+          <Typography variant="h6">Elements</Typography>
diff --git a/src/pages/jobs.tsx b/src/pages/jobs.tsx
new file mode 100644
index 0000000..315c5d4
--- /dev/null
+++ b/src/pages/jobs.tsx
@@ -0,0 +1,26 @@
+import { useAuth0 } from "@auth0/auth0-react";
+import React, { useEffect, useState } from "react";
+import JobTable from "../components/JobTable";
+
+const Jobs = () => {
+  const { user } = useAuth0();
+  const [jobs, setJobs] = useState([]);
+
+  useEffect(() => {
+    fetch("/api/retrieve-scrape-jobs", {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({ user: user?.name }),
+    })
+      .then((response) => response.json())
+      .then((data) => setJobs(data));
+  }, [user]);
+
+  return (
+    <>
+      <JobTable jobs={jobs} />
+    </>
+  );
+};
+
+export default Jobs;
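+
+// NOTE: the rows passed to <JobTable> are assumed to mirror the backend's
+// SubmitScrapeJob model (api/backend/models.py), i.e. objects shaped like
+//   { id, url, elements: [{ name, xpath, url }], user, time_created, result }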