wip: add job rerunning

This commit is contained in:
Jayden Pyles
2024-07-06 16:56:56 -05:00
parent 70bdd01d9d
commit 8808b493e6
13 changed files with 607 additions and 70 deletions

View File

@@ -1,8 +1,49 @@
# STL
import os
import logging
from typing import Any
# PDM # PDM
import boto3 import boto3
from mypy_boto3_dynamodb.service_resource import Table, DynamoDBServiceResource
LOG = logging.getLogger(__name__)
def connect_to_dynamo() -> Table:
    """Return a handle to the "scrape" DynamoDB table.

    The AWS region is taken from the AWS_REGION environment variable
    (None falls back to boto3's own configuration chain).
    """
    resource: DynamoDBServiceResource = boto3.resource(
        "dynamodb", region_name=os.getenv("AWS_REGION")
    )
    return resource.Table("scrape")
def insert(table: Table, item: dict[str, Any]) -> None:
    """Put *item* into *table* and log DynamoDB's response.

    Args:
        table: DynamoDB table handle (see ``connect_to_dynamo``).
        item: The item attributes to store.
    """
    response = table.put_item(Item=item)
    # %-style args defer formatting until the record is actually emitted
    LOG.info("Inserted item: %s", response)
def query(table: Table, index_name: str, key_condition: Any) -> list[Any]:
    """Query *table* through a secondary index.

    Args:
        table: DynamoDB table handle.
        index_name: Name of the secondary index to query against.
        key_condition: A boto3 ``Key`` condition expression.

    Returns:
        The matching items; empty list when nothing matches.

    Raises:
        Re-raises whatever DynamoDB raised, after logging it with traceback.
    """
    try:
        response = table.query(
            IndexName=index_name, KeyConditionExpression=key_condition
        )
        items = response.get("Items", [])
        for item in items:
            # lazy %-args: formatting is skipped when INFO is disabled
            LOG.info("Queried item: %s", item)
        return items
    except Exception:
        # exception() logs the full traceback, not just the message
        LOG.exception("Failed to query table")
        raise
def query_by_id(table: Table, key_condition: Any) -> list[Any]:
    """Query *table* by its primary key condition.

    Args:
        table: DynamoDB table handle.
        key_condition: A boto3 ``Key`` condition expression on the table key.

    Returns:
        The matching items; empty list when nothing matches.

    Raises:
        Re-raises whatever DynamoDB raised, after logging it with traceback.
    """
    try:
        response = table.query(KeyConditionExpression=key_condition)
        items = response.get("Items", [])
        for item in items:
            # lazy %-args: formatting is skipped when INFO is disabled
            LOG.info("Queried item: %s", item)
        return items
    except Exception:
        # exception() logs the full traceback, not just the message
        LOG.exception("Failed to query table")
        raise

View File

@@ -1,18 +1,28 @@
# STL # STL
import uuid
import logging import logging
from io import StringIO
# PDM # PDM
from fastapi import FastAPI import pandas as pd
from fastapi import FastAPI, HTTPException
from fastapi.encoders import jsonable_encoder from fastapi.encoders import jsonable_encoder
from fastapi.responses import FileResponse, JSONResponse from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from boto3.dynamodb.conditions import Key
# LOCAL # LOCAL
from api.backend.amazon import test_dyanmo from api.backend.amazon import query, insert, query_by_id, connect_to_dynamo
from api.backend.models import SubmitScrapeJob from api.backend.models import DownloadJob, SubmitScrapeJob, RetrieveScrapeJobs
from api.backend.scraping import scrape from api.backend.scraping import scrape
logging.basicConfig(
level=logging.INFO,
format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
handlers=[logging.StreamHandler()],
)
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)
app = FastAPI(title="api") app = FastAPI(title="api")
@@ -33,19 +43,53 @@ def read_root():
return FileResponse("./dist/index.html") return FileResponse("./dist/index.html")
@app.get("/api/endpoint")
async def test_endpoint():
test_dyanmo()
return "Hello World!"
@app.post("/api/submit-scrape-job") @app.post("/api/submit-scrape-job")
async def submit_scrape_job(job: SubmitScrapeJob): async def submit_scrape_job(job: SubmitScrapeJob):
LOG.info(f"Recieved job: {job}")
try: try:
scraped = await scrape(job.url, job.elements) scraped = await scrape(job.url, job.elements)
print(scraped)
LOG.info(
f"Scraped result for url: {job.url}, with elements: {job.elements}\n{scraped}"
)
json_scraped = jsonable_encoder(scraped) json_scraped = jsonable_encoder(scraped)
print(json_scraped) table = connect_to_dynamo()
job.result = json_scraped
job.id = uuid.uuid4().hex
insert(table, jsonable_encoder(job))
return JSONResponse(content=json_scraped) return JSONResponse(content=json_scraped)
except Exception as e: except Exception as e:
return JSONResponse(content={"error": str(e)}, status_code=500) return JSONResponse(content={"error": str(e)}, status_code=500)
@app.post("/api/retrieve-scrape-jobs")
async def retrieve_scrape_jobs(retrieve: RetrieveScrapeJobs):
LOG.info(f"Retrieving jobs for account: {retrieve.user}")
try:
table = connect_to_dynamo()
results = query(table, "user", Key("user").eq(retrieve.user))
return JSONResponse(content=results)
except Exception as e:
LOG.error(f"Exception occurred: {e}")
return JSONResponse(content={"error": str(e)}, status_code=500)
@app.post("/api/download")
async def download(download_job: DownloadJob):
LOG.info(f"Downloading job with id: {download_job.id}")
try:
table = connect_to_dynamo()
results = query_by_id(table, Key("id").eq(download_job.id))
df = pd.DataFrame(results)
csv_buffer = StringIO()
df.to_csv(csv_buffer, index=False)
_ = csv_buffer.seek(0)
response = StreamingResponse(csv_buffer, media_type="text/csv")
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
return response
except Exception as e:
LOG.error(f"Exception occurred: {e}")

View File

@@ -1,3 +1,6 @@
# STL
from typing import Any, Optional
# PDM # PDM
import pydantic import pydantic
@@ -15,5 +18,17 @@ class CapturedElement(pydantic.BaseModel):
class SubmitScrapeJob(pydantic.BaseModel):
    """Payload for submitting (or re-running) a scrape job."""

    # Assigned server-side (uuid4 hex) when the job is persisted
    id: Optional[str] = None
    url: str
    elements: list[Element]
    # Account name of the submitting user; also the query index for retrieval
    user: str
    # Creation timestamp string supplied by the client
    time_created: str
    # JSON-encoded scrape output, filled in after the job runs
    result: Optional[dict[str, Any]] = None
class RetrieveScrapeJobs(pydantic.BaseModel):
    """Request body for listing all scrape jobs owned by *user*."""

    user: str
class DownloadJob(pydantic.BaseModel):
    """Request body for downloading a single job's results by job id."""

    id: str

View File

@@ -1,3 +1,6 @@
# STL
import logging
# PDM # PDM
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from lxml import etree from lxml import etree
@@ -14,10 +17,26 @@ from selenium.webdriver.chrome.service import Service
# LOCAL # LOCAL
from api.backend.models import Element, CapturedElement from api.backend.models import Element, CapturedElement
LOG = logging.getLogger(__name__)
class HtmlElement(_Element): ... class HtmlElement(_Element): ...
def clean_xpath(xpath: str) -> str:
parts = xpath.split("/")
clean_parts: list[str] = []
for part in parts:
if part == "":
clean_parts.append("/")
else:
clean_parts.append(part)
clean_xpath = "//".join(clean_parts).replace("////", "//")
clean_xpath = clean_xpath.replace("'", "\\'")
return clean_xpath
def sxpath(context: _Element, xpath: str) -> list[HtmlElement]:
    """Evaluate *xpath* against *context* and return the matched elements."""
    return context.xpath(xpath)  # type: ignore [reportReturnType]
@@ -44,7 +63,7 @@ async def make_site_request(url: str) -> str:
finally: finally:
driver.quit() driver.quit()
print(page_source) LOG.debug(f"Page source for url: {url}\n{page_source}")
return page_source return page_source
@@ -55,7 +74,7 @@ async def collect_scraped_elements(page: str, xpaths: list[Element]):
elements: dict[str, list[CapturedElement]] = dict() elements: dict[str, list[CapturedElement]] = dict()
for elem in xpaths: for elem in xpaths:
el = sxpath(root, elem.xpath) el = sxpath(root, clean_xpath(elem.xpath))
text = ["".join(str(e) for e in e.itertext()) for e in el] text = ["".join(str(e) for e in e.itertext()) for e in el]
captured_element = CapturedElement( captured_element = CapturedElement(
xpath=elem.xpath, text=",".join(text), name=elem.name xpath=elem.xpath, text=",".join(text), name=elem.name

View File

@@ -1,17 +1,37 @@
# STL # STL
import asyncio import os
# LOCAL # PDM
from api.backend.scraping import scrape import boto3
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
def test_insert_and_delete():
    """Round-trip smoke test against the live "scrape" DynamoDB table.

    NOTE(review): this talks to real AWS (region from AWS_REGION) — it is an
    integration check, not a unit test.
    """
    region_name = os.getenv("AWS_REGION")

    dynamodb = boto3.resource("dynamodb", region_name=region_name)
    table = dynamodb.Table("scrape")

    item = {
        "id": "123",  # placeholder key; replace with an appropriate id value
        "attribute1": "value1",
        "attribute2": "value2",
    }

    table.put_item(Item=item)
    print(f"Inserted item: {item}")

    # Delete by the key we actually inserted instead of repeating the literal.
    table.delete_item(Key={"id": item["id"]})
    print(f"Deleted item with id: {item['id']}")


if __name__ == "__main__":
    test_insert_and_delete()

143
pdm.lock generated
View File

@@ -5,7 +5,7 @@
groups = ["default", "dev"] groups = ["default", "dev"]
strategy = ["cross_platform", "inherit_metadata"] strategy = ["cross_platform", "inherit_metadata"]
lock_version = "4.4.2" lock_version = "4.4.2"
content_hash = "sha256:86d834de010a22751311e6b9553fd963dd2073e49bbf17872f5fff0630f99164" content_hash = "sha256:a12cdcf1cdd6f91260a7d7126be4581a6820caf91ffc26386abfe9a6b3fbc9d9"
[[package]] [[package]]
name = "aiohttp" name = "aiohttp"
@@ -466,6 +466,17 @@ files = [
{file = "email_validator-2.1.1.tar.gz", hash = "sha256:200a70680ba08904be6d1eef729205cc0d687634399a5924d842533efb824b84"}, {file = "email_validator-2.1.1.tar.gz", hash = "sha256:200a70680ba08904be6d1eef729205cc0d687634399a5924d842533efb824b84"},
] ]
[[package]]
name = "et-xmlfile"
version = "1.1.0"
requires_python = ">=3.6"
summary = "An implementation of lxml.xmlfile for the standard library"
groups = ["default"]
files = [
{file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"},
{file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"},
]
[[package]] [[package]]
name = "exceptiongroup" name = "exceptiongroup"
version = "1.2.1" version = "1.2.1"
@@ -1199,6 +1210,65 @@ files = [
{file = "mypy_boto3_sqs-1.34.121.tar.gz", hash = "sha256:bdbc623235ffc8127cb8753f49323f74a919df552247b0b2caaf85cf9bb495b8"}, {file = "mypy_boto3_sqs-1.34.121.tar.gz", hash = "sha256:bdbc623235ffc8127cb8753f49323f74a919df552247b0b2caaf85cf9bb495b8"},
] ]
[[package]]
name = "numpy"
version = "2.0.0"
requires_python = ">=3.9"
summary = "Fundamental package for array computing in Python"
groups = ["default"]
marker = "python_version <= \"3.11\" or python_version >= \"3.12\""
files = [
{file = "numpy-2.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:04494f6ec467ccb5369d1808570ae55f6ed9b5809d7f035059000a37b8d7e86f"},
{file = "numpy-2.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2635dbd200c2d6faf2ef9a0d04f0ecc6b13b3cad54f7c67c61155138835515d2"},
{file = "numpy-2.0.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:0a43f0974d501842866cc83471bdb0116ba0dffdbaac33ec05e6afed5b615238"},
{file = "numpy-2.0.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:8d83bb187fb647643bd56e1ae43f273c7f4dbcdf94550d7938cfc32566756514"},
{file = "numpy-2.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79e843d186c8fb1b102bef3e2bc35ef81160ffef3194646a7fdd6a73c6b97196"},
{file = "numpy-2.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d7696c615765091cc5093f76fd1fa069870304beaccfd58b5dcc69e55ef49c1"},
{file = "numpy-2.0.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b4c76e3d4c56f145d41b7b6751255feefae92edbc9a61e1758a98204200f30fc"},
{file = "numpy-2.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:acd3a644e4807e73b4e1867b769fbf1ce8c5d80e7caaef0d90dcdc640dfc9787"},
{file = "numpy-2.0.0-cp310-cp310-win32.whl", hash = "sha256:cee6cc0584f71adefe2c908856ccc98702baf95ff80092e4ca46061538a2ba98"},
{file = "numpy-2.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:ed08d2703b5972ec736451b818c2eb9da80d66c3e84aed1deeb0c345fefe461b"},
{file = "numpy-2.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ad0c86f3455fbd0de6c31a3056eb822fc939f81b1618f10ff3406971893b62a5"},
{file = "numpy-2.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e7f387600d424f91576af20518334df3d97bc76a300a755f9a8d6e4f5cadd289"},
{file = "numpy-2.0.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:34f003cb88b1ba38cb9a9a4a3161c1604973d7f9d5552c38bc2f04f829536609"},
{file = "numpy-2.0.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:b6f6a8f45d0313db07d6d1d37bd0b112f887e1369758a5419c0370ba915b3871"},
{file = "numpy-2.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f64641b42b2429f56ee08b4f427a4d2daf916ec59686061de751a55aafa22e4"},
{file = "numpy-2.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a7039a136017eaa92c1848152827e1424701532ca8e8967fe480fe1569dae581"},
{file = "numpy-2.0.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:46e161722e0f619749d1cd892167039015b2c2817296104487cd03ed4a955995"},
{file = "numpy-2.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0e50842b2295ba8414c8c1d9d957083d5dfe9e16828b37de883f51fc53c4016f"},
{file = "numpy-2.0.0-cp311-cp311-win32.whl", hash = "sha256:2ce46fd0b8a0c947ae047d222f7136fc4d55538741373107574271bc00e20e8f"},
{file = "numpy-2.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:fbd6acc766814ea6443628f4e6751d0da6593dae29c08c0b2606164db026970c"},
{file = "numpy-2.0.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:354f373279768fa5a584bac997de6a6c9bc535c482592d7a813bb0c09be6c76f"},
{file = "numpy-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4d2f62e55a4cd9c58c1d9a1c9edaedcd857a73cb6fda875bf79093f9d9086f85"},
{file = "numpy-2.0.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:1e72728e7501a450288fc8e1f9ebc73d90cfd4671ebbd631f3e7857c39bd16f2"},
{file = "numpy-2.0.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:84554fc53daa8f6abf8e8a66e076aff6ece62de68523d9f665f32d2fc50fd66e"},
{file = "numpy-2.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c73aafd1afca80afecb22718f8700b40ac7cab927b8abab3c3e337d70e10e5a2"},
{file = "numpy-2.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49d9f7d256fbc804391a7f72d4a617302b1afac1112fac19b6c6cec63fe7fe8a"},
{file = "numpy-2.0.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:0ec84b9ba0654f3b962802edc91424331f423dcf5d5f926676e0150789cb3d95"},
{file = "numpy-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:feff59f27338135776f6d4e2ec7aeeac5d5f7a08a83e80869121ef8164b74af9"},
{file = "numpy-2.0.0-cp312-cp312-win32.whl", hash = "sha256:c5a59996dc61835133b56a32ebe4ef3740ea5bc19b3983ac60cc32be5a665d54"},
{file = "numpy-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:a356364941fb0593bb899a1076b92dfa2029f6f5b8ba88a14fd0984aaf76d0df"},
{file = "numpy-2.0.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:9416a5c2e92ace094e9f0082c5fd473502c91651fb896bc17690d6fc475128d6"},
{file = "numpy-2.0.0-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:17067d097ed036636fa79f6a869ac26df7db1ba22039d962422506640314933a"},
{file = "numpy-2.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38ecb5b0582cd125f67a629072fed6f83562d9dd04d7e03256c9829bdec027ad"},
{file = "numpy-2.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:cef04d068f5fb0518a77857953193b6bb94809a806bd0a14983a8f12ada060c9"},
{file = "numpy-2.0.0.tar.gz", hash = "sha256:cf5d1c9e6837f8af9f92b6bd3e86d513cdc11f60fd62185cc49ec7d1aba34864"},
]
[[package]]
name = "openpyxl"
version = "3.1.5"
requires_python = ">=3.8"
summary = "A Python library to read/write Excel 2010 xlsx/xlsm files"
groups = ["default"]
dependencies = [
"et-xmlfile",
]
files = [
{file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"},
{file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"},
]
[[package]] [[package]]
name = "orjson" name = "orjson"
version = "3.10.3" version = "3.10.3"
@@ -1261,6 +1331,45 @@ files = [
{file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"},
] ]
[[package]]
name = "pandas"
version = "2.2.2"
requires_python = ">=3.9"
summary = "Powerful data structures for data analysis, time series, and statistics"
groups = ["default"]
dependencies = [
"numpy>=1.22.4; python_version < \"3.11\"",
"numpy>=1.23.2; python_version == \"3.11\"",
"numpy>=1.26.0; python_version >= \"3.12\"",
"python-dateutil>=2.8.2",
"pytz>=2020.1",
"tzdata>=2022.7",
]
files = [
{file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"},
{file = "pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"},
{file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"},
{file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"},
{file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"},
{file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8e5a0b00e1e56a842f922e7fae8ae4077aee4af0acb5ae3622bd4b4c30aedf99"},
{file = "pandas-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:ddf818e4e6c7c6f4f7c8a12709696d193976b591cc7dc50588d3d1a6b5dc8772"},
{file = "pandas-2.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:696039430f7a562b74fa45f540aca068ea85fa34c244d0deee539cb6d70aa288"},
{file = "pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8e90497254aacacbc4ea6ae5e7a8cd75629d6ad2b30025a4a8b09aa4faf55151"},
{file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58b84b91b0b9f4bafac2a0ac55002280c094dfc6402402332c0913a59654ab2b"},
{file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2123dc9ad6a814bcdea0f099885276b31b24f7edf40f6cdbc0912672e22eee"},
{file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2925720037f06e89af896c70bca73459d7e6a4be96f9de79e2d440bd499fe0db"},
{file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0cace394b6ea70c01ca1595f839cf193df35d1575986e484ad35c4aeae7266c1"},
{file = "pandas-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:873d13d177501a28b2756375d59816c365e42ed8417b41665f346289adc68d24"},
{file = "pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9dfde2a0ddef507a631dc9dc4af6a9489d5e2e740e226ad426a05cabfbd7c8ef"},
{file = "pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e9b79011ff7a0f4b1d6da6a61aa1aa604fb312d6647de5bad20013682d1429ce"},
{file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cb51fe389360f3b5a4d57dbd2848a5f033350336ca3b340d1c53a1fad33bcad"},
{file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eee3a87076c0756de40b05c5e9a6069c035ba43e8dd71c379e68cab2c20f16ad"},
{file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3e374f59e440d4ab45ca2fffde54b81ac3834cf5ae2cdfa69c90bc03bde04d76"},
{file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"},
{file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"},
{file = "pandas-2.2.2.tar.gz", hash = "sha256:9e79019aba43cb4fda9e4d983f8e88ca0373adbb697ae9c6c43093218de28b54"},
]
[[package]] [[package]]
name = "parse" name = "parse"
version = "1.20.2" version = "1.20.2"
@@ -1540,6 +1649,16 @@ files = [
{file = "python_multipart-0.0.9.tar.gz", hash = "sha256:03f54688c663f1b7977105f021043b0793151e4cb1c1a9d4a11fc13d622c4026"}, {file = "python_multipart-0.0.9.tar.gz", hash = "sha256:03f54688c663f1b7977105f021043b0793151e4cb1c1a9d4a11fc13d622c4026"},
] ]
[[package]]
name = "pytz"
version = "2024.1"
summary = "World timezone definitions, modern and historical"
groups = ["default"]
files = [
{file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
{file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
]
[[package]] [[package]]
name = "pyyaml" name = "pyyaml"
version = "6.0.1" version = "6.0.1"
@@ -1852,6 +1971,17 @@ files = [
{file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
] ]
[[package]]
name = "tzdata"
version = "2024.1"
requires_python = ">=2"
summary = "Provider of IANA time zone data"
groups = ["default"]
files = [
{file = "tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252"},
{file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"},
]
[[package]] [[package]]
name = "ujson" name = "ujson"
version = "5.10.0" version = "5.10.0"
@@ -2185,6 +2315,17 @@ files = [
{file = "wsproto-1.2.0.tar.gz", hash = "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065"}, {file = "wsproto-1.2.0.tar.gz", hash = "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065"},
] ]
[[package]]
name = "xlsxwriter"
version = "3.2.0"
requires_python = ">=3.6"
summary = "A Python module for creating Excel XLSX files."
groups = ["default"]
files = [
{file = "XlsxWriter-3.2.0-py3-none-any.whl", hash = "sha256:ecfd5405b3e0e228219bcaf24c2ca0915e012ca9464a14048021d21a995d490e"},
{file = "XlsxWriter-3.2.0.tar.gz", hash = "sha256:9977d0c661a72866a61f9f7a809e25ebbb0fb7036baa3b9fe74afcfca6b3cb8c"},
]
[[package]] [[package]]
name = "yarl" name = "yarl"
version = "1.9.4" version = "1.9.4"

View File

@@ -21,6 +21,9 @@ dependencies = [
"selenium>=4.22.0", "selenium>=4.22.0",
"webdriver-manager>=4.0.1", "webdriver-manager>=4.0.1",
"pydantic>=2.8.2", "pydantic>=2.8.2",
"pandas>=2.2.2",
"openpyxl>=3.1.5",
"xlsxwriter>=3.2.0",
] ]
requires-python = ">=3.10" requires-python = ">=3.10"
readme = "README.md" readme = "README.md"
@@ -40,9 +43,10 @@ ignore = []
defineConstant = { DEBUG = true } defineConstant = { DEBUG = true }
stubPath = "" stubPath = ""
reportUnknownMemberType=false reportUnknownMemberType= false
reportMissingImports = true reportMissingImports = true
reportMissingTypeStubs = false reportMissingTypeStubs = false
reportAny = false
pythonVersion = "3.9" pythonVersion = "3.9"
pythonPlatform = "Linux" pythonPlatform = "Linux"

124
src/components/JobTable.tsx Normal file
View File

@@ -0,0 +1,124 @@
import React from "react"; // BUG FIX: removed unused useState import
import {
  TextField,
  Table,
  TableBody,
  TableCell,
  TableHead,
  TableRow,
  Button,
} from "@mui/material";
import { useRouter } from "next/router";

interface Job {
  id: string;
  url: string;
  elements: Object[];
  result: Object;
  time_created: Date;
}

interface JobTableProps {
  jobs: Job[];
}

// Read-only table of previous scrape jobs with per-row Download / Rerun actions.
const JobTable: React.FC<JobTableProps> = ({ jobs }) => {
  const router = useRouter();

  // Fetch the job's stored result as CSV and trigger a browser download.
  const handleDownload = async (id: string) => {
    const response = await fetch("/api/download", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ id: id }),
    });

    if (response.ok) {
      const blob = await response.blob();
      const url = window.URL.createObjectURL(blob);
      const a = document.createElement("a");
      a.style.display = "none";
      a.href = url;
      a.download = `job_${id}.csv`;
      document.body.appendChild(a);
      a.click();
      // BUG FIX: remove the temporary anchor — it previously leaked into the DOM.
      document.body.removeChild(a);
      window.URL.revokeObjectURL(url);
    } else {
      console.error("Failed to download the file.");
    }
  };

  // Rerun: hand the job's url + elements back to the home page via the query string.
  const handleNavigate = (elements: Object[], url: string) => {
    router.push({
      pathname: "/",
      query: {
        elements: JSON.stringify(elements),
        url: url,
      },
    });
  };

  return (
    <>
      <Table>
        <TableHead>
          <TableRow>
            <TableCell>id</TableCell>
            <TableCell>url</TableCell>
            <TableCell>elements</TableCell>
            <TableCell>result</TableCell>
            <TableCell>time_created</TableCell>
          </TableRow>
        </TableHead>
        <TableBody>
          {/* BUG FIX: key by job id (stable) instead of array index */}
          {jobs.map((row) => (
            <TableRow key={row.id}>
              <TableCell>
                {/* BUG FIX: these fields are display-only; mark them readOnly
                    so React no longer warns about value without onChange */}
                <TextField
                  variant="outlined"
                  fullWidth
                  value={row.id}
                  InputProps={{ readOnly: true }}
                />
              </TableCell>
              <TableCell>
                <TextField
                  variant="outlined"
                  fullWidth
                  value={row.url}
                  InputProps={{ readOnly: true }}
                />
              </TableCell>
              <TableCell>
                <TextField
                  variant="outlined"
                  fullWidth
                  value={JSON.stringify(row.elements)}
                  InputProps={{ readOnly: true }}
                />
              </TableCell>
              <TableCell>
                <TextField
                  variant="outlined"
                  fullWidth
                  value={JSON.stringify(row.result)}
                  InputProps={{ readOnly: true }}
                />
              </TableCell>
              <TableCell>
                {/* BUG FIX: coerce the Date to a string for the text field */}
                <TextField
                  variant="outlined"
                  fullWidth
                  value={String(row.time_created)}
                  InputProps={{ readOnly: true }}
                />
              </TableCell>
              <TableCell>
                <Button
                  onClick={() => {
                    handleDownload(row.id);
                  }}
                >
                  Download
                </Button>
              </TableCell>
              <TableCell>
                <Button onClick={() => handleNavigate(row.elements, row.url)}>
                  Rerun
                </Button>
              </TableCell>
            </TableRow>
          ))}
        </TableBody>
      </Table>
    </>
  );
};

export default JobTable;

View File

@@ -1,34 +0,0 @@
import React from "react";
import { useAuth } from "../useAuth";
import { LogoutOptions, RedirectLoginOptions } from "@auth0/auth0-react";

// Minimal auth bar: greets the logged-in user with a Logout button,
// or shows a Login button when unauthenticated.
const NavBar: React.FC = () => {
  const { loginWithRedirect, logout, user, isAuthenticated } = useAuth();

  // Log out via Auth0 with default options.
  const handleLogout = () => {
    const logoutOptions: LogoutOptions = {};
    logout(logoutOptions);
  };

  // Redirect-based Auth0 login; sends the browser back to http://localhost.
  const handleLogin = () => {
    const loginOptions: RedirectLoginOptions = {
      authorizationParams: { redirect_uri: "http://localhost" },
    };
    loginWithRedirect(loginOptions);
  };

  return (
    <nav>
      {isAuthenticated ? (
        <>
          <p>Welcome, {user?.name}</p>
          <button onClick={handleLogout}>Logout</button>
        </>
      ) : (
        <button onClick={handleLogin}>Login</button>
      )}
    </nav>
  );
};

export default NavBar;

View File

@@ -0,0 +1,115 @@
import React, { useState } from "react";
import { useAuth } from "../useAuth";
import { LogoutOptions, RedirectLoginOptions } from "@auth0/auth0-react";
import {
  Box,
  Drawer,
  List,
  ListItem,
  ListItemIcon,
  ListItemButton,
  ListItemText,
  AppBar,
  Toolbar,
  IconButton,
  Typography,
  Button,
} from "@mui/material";
import HomeIcon from "@mui/icons-material/Home";
import HttpIcon from "@mui/icons-material/Http";
import MenuIcon from "@mui/icons-material/Menu";
import { useRouter } from "next/router";

// App-wide navigation: a top AppBar with auth controls plus a slide-out
// drawer linking to the Home and Previous Jobs pages.
const NavDrawer: React.FC = () => {
  const router = useRouter();
  const { loginWithRedirect, logout, user, isAuthenticated } = useAuth();
  // Whether the drawer is currently open.
  const [open, setOpen] = useState<boolean>(false);

  // Log out via Auth0 with default options.
  const handleLogout = () => {
    const logoutOptions: LogoutOptions = {};
    logout(logoutOptions);
  };

  // Redirect-based Auth0 login; sends the browser back to http://localhost.
  const handleLogin = () => {
    const loginOptions: RedirectLoginOptions = {
      authorizationParams: { redirect_uri: "http://localhost" },
    };
    loginWithRedirect(loginOptions);
  };

  // Returns a handler that opens/closes the drawer. Tab/Shift keydowns are
  // ignored so keyboard navigation inside the drawer doesn't dismiss it.
  const toggleDrawer =
    (open: boolean) => (event: React.KeyboardEvent | React.MouseEvent) => {
      if (
        event.type === "keydown" &&
        ((event as React.KeyboardEvent).key === "Tab" ||
          (event as React.KeyboardEvent).key === "Shift")
      ) {
        return;
      }
      setOpen(open);
    };

  // Drawer contents; any click or (non-Tab/Shift) keypress closes the drawer.
  const DrawerList = (
    <Box
      sx={{ width: 250 }}
      role="presentation"
      onClick={toggleDrawer(false)}
      onKeyDown={toggleDrawer(false)}
    >
      <List>
        <ListItem>
          <ListItemButton onClick={() => router.push("/")}>
            <ListItemIcon>
              <HomeIcon />
            </ListItemIcon>
            <ListItemText>Home</ListItemText>
          </ListItemButton>
        </ListItem>
        <ListItem>
          <ListItemButton onClick={() => router.push("/jobs")}>
            <ListItemIcon>
              <HttpIcon />
            </ListItemIcon>
            <ListItemText>Previous Jobs</ListItemText>
          </ListItemButton>
        </ListItem>
      </List>
    </Box>
  );

  return (
    <>
      <AppBar position="static">
        <Toolbar>
          <IconButton
            edge="start"
            color="inherit"
            aria-label="menu"
            onClick={toggleDrawer(true)}
          >
            <MenuIcon />
          </IconButton>
          {isAuthenticated ? (
            <>
              <Typography variant="body1" sx={{ marginRight: 2 }}>
                Welcome, {user?.name}
              </Typography>
              <Button color="inherit" onClick={handleLogout}>
                Logout
              </Button>
            </>
          ) : (
            <Button color="inherit" onClick={handleLogin}>
              Login
            </Button>
          )}
        </Toolbar>
      </AppBar>
      <Drawer open={open} onClose={toggleDrawer(false)}>
        {DrawerList}
      </Drawer>
    </>
  );
};

export default NavDrawer;

View File

@@ -5,6 +5,7 @@ import React from "react";
import type { AppProps } from "next/app"; import type { AppProps } from "next/app";
import Head from "next/head"; import Head from "next/head";
import { Auth0Provider } from "@auth0/auth0-react"; import { Auth0Provider } from "@auth0/auth0-react";
import NavDrawer from "../components/NavDrawer";
const domain = process.env.NEXT_PUBLIC_AUTH0_ISSUER_BASE_URL || ""; const domain = process.env.NEXT_PUBLIC_AUTH0_ISSUER_BASE_URL || "";
const clientId = process.env.NEXT_PUBLIC_AUTH0_CLIENT_ID || ""; const clientId = process.env.NEXT_PUBLIC_AUTH0_CLIENT_ID || "";
@@ -26,6 +27,7 @@ const App: React.FC<AppProps> = ({ Component, pageProps }) => {
cacheLocation="localstorage" cacheLocation="localstorage"
useRefreshTokens={true} useRefreshTokens={true}
> >
<NavDrawer></NavDrawer>
<Component {...pageProps} /> <Component {...pageProps} />
</Auth0Provider> </Auth0Provider>
</> </>

View File

@@ -1,5 +1,4 @@
import React, { useState } from "react"; import React, { useState, useEffect } from "react";
import NavBar from "../components/NavBar";
import { import {
Typography, Typography,
TextField, TextField,
@@ -13,6 +12,8 @@ import {
Box, Box,
} from "@mui/material"; } from "@mui/material";
import AddIcon from "@mui/icons-material/Add"; import AddIcon from "@mui/icons-material/Add";
import { useAuth0 } from "@auth0/auth0-react";
import { useRouter } from "next/router";
interface Element { interface Element {
name: string; name: string;
@@ -31,7 +32,12 @@ type Result = {
}; };
const Home = () => { const Home = () => {
const [url, setUrl] = useState(""); const { user } = useAuth0();
const router = useRouter();
const { elements, url } = router.query;
const [submittedURL, setUrl] = useState("");
const [rows, setRows] = useState<Element[]>([]); const [rows, setRows] = useState<Element[]>([]);
const [results, setResults] = useState<null | Result>(null); const [results, setResults] = useState<null | Result>(null);
const [newRow, setNewRow] = useState<Element>({ const [newRow, setNewRow] = useState<Element>({
@@ -40,8 +46,17 @@ const Home = () => {
url: "", url: "",
}); });
useEffect(() => {
if (elements) {
setRows(JSON.parse(elements as string));
}
if (url) {
setUrl(url as string);
}
}, [elements, url]);
const handleAddRow = () => { const handleAddRow = () => {
newRow.url = url; newRow.url = submittedURL;
setRows([...rows, newRow]); setRows([...rows, newRow]);
setNewRow({ name: "", xpath: "", url: "" }); setNewRow({ name: "", xpath: "", url: "" });
}; };
@@ -50,7 +65,12 @@ const Home = () => {
fetch("/api/submit-scrape-job", { fetch("/api/submit-scrape-job", {
method: "POST", method: "POST",
headers: { "content-type": "application/json" }, headers: { "content-type": "application/json" },
body: JSON.stringify({ url: url, elements: rows }), body: JSON.stringify({
url: url,
elements: rows,
user: user?.name,
time_created: new Date().toISOString(),
}),
}) })
.then((response) => response.json()) .then((response) => response.json())
.then((data) => setResults(data)); .then((data) => setResults(data));
@@ -58,7 +78,6 @@ const Home = () => {
return ( return (
<> <>
<NavBar />
<Container maxWidth="md"> <Container maxWidth="md">
<Typography variant="h1" gutterBottom> <Typography variant="h1" gutterBottom>
Web Scraper Web Scraper
@@ -97,9 +116,10 @@ const Home = () => {
startIcon={<AddIcon />} startIcon={<AddIcon />}
onClick={handleAddRow} onClick={handleAddRow}
> >
Add Row Add Elements
</Button> </Button>
</Box> </Box>
<Typography variant="h4">Elements</Typography>
<Table> <Table>
<TableHead> <TableHead>
<TableRow> <TableRow>

26
src/pages/jobs.tsx Normal file
View File

@@ -0,0 +1,26 @@
import { useAuth0 } from "@auth0/auth0-react";
import React, { useEffect, useState } from "react";
import JobTable from "../components/JobTable";

// Lists the current user's previous scrape jobs.
const Jobs = () => {
  const { user } = useAuth0();
  const [jobs, setJobs] = useState([]);

  useEffect(() => {
    // BUG FIX: the original effect ran once with an empty dependency list, so
    // when Auth0 had not yet resolved the user it queried with an undefined
    // name and never refetched. Wait for a name and re-run when it changes.
    if (!user?.name) {
      return;
    }
    fetch("/api/retrieve-scrape-jobs", {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ user: user.name }),
    })
      .then((response) => response.json())
      .then((data) => setJobs(data))
      // BUG FIX: surface fetch/parse failures instead of an unhandled rejection.
      .catch((error) => console.error("Failed to fetch jobs:", error));
  }, [user?.name]);

  return (
    <>
      <JobTable jobs={jobs} />
    </>
  );
};

export default Jobs;