wip: add in job rerunning

This commit is contained in:
Jayden Pyles
2024-07-06 16:56:56 -05:00
parent 70bdd01d9d
commit 8808b493e6
13 changed files with 607 additions and 70 deletions

View File

@@ -1,8 +1,49 @@
# STL
import os
import logging
from typing import Any
# PDM
import boto3
from mypy_boto3_dynamodb.service_resource import Table, DynamoDBServiceResource
LOG = logging.getLogger(__name__)
def test_dyanmo():
    """Smoke-test connectivity to the hard-coded 'scrape' DynamoDB table.

    NOTE(review): the name is a typo for ``test_dynamo``; kept as-is because
    the FastAPI app imports it under this exact name.
    """
    dynamodb = boto3.resource("dynamodb", region_name="us-west-2")
    table = dynamodb.Table("scrape")
    # Use the module logger instead of a leftover debug print.
    LOG.info("Connected to table: %s", table)
def connect_to_dynamo() -> Table:
    """Build a DynamoDB resource for the configured region and return the 'scrape' table.

    The region comes from the AWS_REGION environment variable (may be unset,
    in which case boto3 falls back to its own configuration chain).
    """
    resource: DynamoDBServiceResource = boto3.resource(
        "dynamodb",
        region_name=os.getenv("AWS_REGION"),
    )
    return resource.Table("scrape")
def insert(table: Table, item: dict[str, Any]) -> None:
    """Put a single item into `table` and log DynamoDB's response.

    Args:
        table: DynamoDB table handle.
        item: Attribute map to store (must contain the table's key attributes).
    """
    response = table.put_item(Item=item)
    # Lazy %-formatting: the message is only built if INFO logging is enabled.
    LOG.info("Inserted item: %s", response)
def query(table: Table, index_name: str, key_condition: Any) -> list[Any]:
    """Query a secondary index of `table` and return the matched items.

    Args:
        table: DynamoDB table handle.
        index_name: Secondary index to query.
        key_condition: boto3 key condition, e.g. ``Key("user").eq(name)``.

    Returns:
        Matched items (empty list when nothing matches).

    Raises:
        Exception: any boto3 query failure is logged with traceback and re-raised.
    """
    # Keep the try body minimal: only the call that can actually raise.
    try:
        response = table.query(
            IndexName=index_name, KeyConditionExpression=key_condition
        )
    except Exception:
        # logger.exception records the traceback; caller decides what to do.
        LOG.exception("Failed to query table")
        raise
    items = response.get("Items", [])
    for item in items:
        LOG.info("Queried item: %s", item)
    return items
def query_by_id(table: Table, key_condition: Any) -> list[Any]:
    """Query `table` by its primary key and return the matched items.

    Args:
        table: DynamoDB table handle.
        key_condition: boto3 key condition, e.g. ``Key("id").eq(job_id)``.

    Returns:
        Matched items (empty list when nothing matches).

    Raises:
        Exception: any boto3 query failure is logged with traceback and re-raised.
    """
    try:
        response = table.query(KeyConditionExpression=key_condition)
    except Exception:
        LOG.exception("Failed to query table")
        raise
    items = response.get("Items", [])
    for item in items:
        # Lazy %-formatting avoids building the message unless INFO is enabled.
        LOG.info("Queried item: %s", item)
    return items

View File

@@ -1,18 +1,28 @@
# STL
import uuid
import logging
from io import StringIO
# PDM
from fastapi import FastAPI
import pandas as pd
from fastapi import FastAPI, HTTPException
from fastapi.encoders import jsonable_encoder
from fastapi.responses import FileResponse, JSONResponse
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from boto3.dynamodb.conditions import Key
# LOCAL
from api.backend.amazon import test_dyanmo
from api.backend.models import SubmitScrapeJob
from api.backend.amazon import query, insert, query_by_id, connect_to_dynamo
from api.backend.models import DownloadJob, SubmitScrapeJob, RetrieveScrapeJobs
from api.backend.scraping import scrape
# Configure root logging once at import time: INFO level, single stderr handler.
logging.basicConfig(
    level=logging.INFO,
    format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
    handlers=[logging.StreamHandler()],
)

# Module-level logger for this API module.
LOG = logging.getLogger(__name__)

app = FastAPI(title="api")
@@ -33,19 +43,53 @@ def read_root():
return FileResponse("./dist/index.html")
@app.get("/api/endpoint")
async def test_endpoint():
    # Smoke-test endpoint: exercises DynamoDB connectivity via test_dyanmo(),
    # then returns a fixed string so the caller can see the service is up.
    test_dyanmo()
    return "Hello World!"
@app.post("/api/submit-scrape-job")
async def submit_scrape_job(job: SubmitScrapeJob):
    """Scrape the job's URL, persist the job + result to DynamoDB, and
    return the scraped data as JSON.

    On any failure a 500 response carrying the error text is returned.
    """
    # Fixed "Recieved" typo; lazy %-args instead of f-strings for log calls.
    LOG.info("Received job: %s", job)
    try:
        scraped = await scrape(job.url, job.elements)
        LOG.info(
            "Scraped result for url: %s, with elements: %s\n%s",
            job.url,
            job.elements,
            scraped,
        )
        json_scraped = jsonable_encoder(scraped)
        table = connect_to_dynamo()
        # Fill in the server-side fields before persisting.
        job.result = json_scraped
        job.id = uuid.uuid4().hex
        insert(table, jsonable_encoder(job))
        return JSONResponse(content=json_scraped)
    except Exception as e:
        # Log the traceback (the original swallowed it) and surface a 500.
        LOG.exception("Failed to process scrape job")
        return JSONResponse(content={"error": str(e)}, status_code=500)
@app.post("/api/retrieve-scrape-jobs")
async def retrieve_scrape_jobs(retrieve: RetrieveScrapeJobs):
    """Return every stored scrape job belonging to the requesting user."""
    LOG.info(f"Retrieving jobs for account: {retrieve.user}")
    try:
        jobs = query(connect_to_dynamo(), "user", Key("user").eq(retrieve.user))
        return JSONResponse(content=jobs)
    except Exception as e:
        LOG.error(f"Exception occurred: {e}")
        return JSONResponse(content={"error": str(e)}, status_code=500)
@app.post("/api/download")
async def download(download_job: DownloadJob):
    """Stream the stored results of a job as a CSV file attachment."""
    LOG.info("Downloading job with id: %s", download_job.id)
    try:
        table = connect_to_dynamo()
        results = query_by_id(table, Key("id").eq(download_job.id))
        # Render the items to CSV in memory and rewind for streaming.
        csv_buffer = StringIO()
        pd.DataFrame(results).to_csv(csv_buffer, index=False)
        _ = csv_buffer.seek(0)
        response = StreamingResponse(csv_buffer, media_type="text/csv")
        response.headers["Content-Disposition"] = "attachment; filename=export.csv"
        return response
    except Exception as e:
        LOG.exception("Exception occurred while building CSV download")
        # BUG FIX: the original except branch fell through and the handler
        # returned None, which FastAPI turns into an opaque server error.
        # Return an explicit 500 with the error text instead.
        return JSONResponse(content={"error": str(e)}, status_code=500)

View File

@@ -1,3 +1,6 @@
# STL
from typing import Any, Optional
# PDM
import pydantic
@@ -15,5 +18,17 @@ class CapturedElement(pydantic.BaseModel):
class SubmitScrapeJob(pydantic.BaseModel):
    # Request payload for POST /api/submit-scrape-job.
    # `id` and `result` are filled in server-side before the job is persisted.
    id: Optional[str] = None  # set by the server (uuid4 hex) before insert
    url: str  # page to scrape
    elements: list[Element]  # xpath selectors to capture
    user: str  # owning account name
    time_created: str  # timestamp string supplied by the client
    result: Optional[dict[str, Any]] = None  # scraped output, set after the scrape runs
class RetrieveScrapeJobs(pydantic.BaseModel):
    # Request payload for POST /api/retrieve-scrape-jobs: fetch all jobs
    # stored under this user name.
    user: str
class DownloadJob(pydantic.BaseModel):
    # Request payload for POST /api/download: the job id whose results
    # should be exported as CSV.
    id: str

View File

@@ -1,3 +1,6 @@
# STL
import logging
# PDM
from bs4 import BeautifulSoup
from lxml import etree
@@ -14,10 +17,26 @@ from selenium.webdriver.chrome.service import Service
# LOCAL
from api.backend.models import Element, CapturedElement
LOG = logging.getLogger(__name__)
class HtmlElement(_Element): ...
def clean_xpath(xpath: str) -> str:
    """Normalise a user-supplied xpath for use with lxml.

    Empty segments (produced by leading or doubled slashes) become "/",
    segments are re-joined with "//", any run of four slashes collapses
    back to two, and single quotes are backslash-escaped.

    Args:
        xpath: Raw xpath string from the submitted job.

    Returns:
        The normalised xpath string.
    """
    # Comprehension instead of the manual append loop; also removes the
    # local variable that shadowed the function's own name.
    segments = ["/" if part == "" else part for part in xpath.split("/")]
    cleaned = "//".join(segments).replace("////", "//")
    # NOTE(review): backslash-escaping quotes is not standard XPath 1.0
    # quoting — presumably meant to guard against quote injection; confirm
    # this behaves as intended with lxml before relying on it.
    return cleaned.replace("'", "\\'")
def sxpath(context: _Element, xpath: str) -> list[HtmlElement]:
    """Typed wrapper around lxml's untyped Element.xpath()."""
    return context.xpath(xpath)  # type: ignore [reportReturnType]
@@ -44,7 +63,7 @@ async def make_site_request(url: str) -> str:
finally:
driver.quit()
print(page_source)
LOG.debug(f"Page source for url: {url}\n{page_source}")
return page_source
@@ -55,7 +74,7 @@ async def collect_scraped_elements(page: str, xpaths: list[Element]):
elements: dict[str, list[CapturedElement]] = dict()
for elem in xpaths:
el = sxpath(root, elem.xpath)
el = sxpath(root, clean_xpath(elem.xpath))
text = ["".join(str(e) for e in e.itertext()) for e in el]
captured_element = CapturedElement(
xpath=elem.xpath, text=",".join(text), name=elem.name

View File

@@ -1,17 +1,37 @@
# STL
import asyncio
import os
# LOCAL
from api.backend.scraping import scrape
# PDM
import boto3
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
async def main():
url = "https://darksouls3.wiki.fextralife.com/Dark+Souls+3"
xpaths = [".//h3[@class='bonfire']", ".//div[@class='comment']"]
scraped = await scrape(url, xpaths)
def test_insert_and_delete():
# Get environment variables
region_name = os.getenv("AWS_REGION")
# Initialize DynamoDB resource
dynamodb = boto3.resource("dynamodb", region_name=region_name)
table = dynamodb.Table("scrape")
print(scraped)
# Item to insert
item = {
"id": "123", # Replace with the appropriate id value
"attribute1": "value1",
"attribute2": "value2",
# Add more attributes as needed
}
# Insert the item
table.put_item(Item=item)
print(f"Inserted item: {item}")
# Delete the item
table.delete_item(Key={"id": "123"}) # Replace with the appropriate id value
print(f"Deleted item with id: {item['id']}")
if __name__ == "__main__":
asyncio.run(main())
test_insert_and_delete()

143
pdm.lock generated
View File

@@ -5,7 +5,7 @@
groups = ["default", "dev"]
strategy = ["cross_platform", "inherit_metadata"]
lock_version = "4.4.2"
content_hash = "sha256:86d834de010a22751311e6b9553fd963dd2073e49bbf17872f5fff0630f99164"
content_hash = "sha256:a12cdcf1cdd6f91260a7d7126be4581a6820caf91ffc26386abfe9a6b3fbc9d9"
[[package]]
name = "aiohttp"
@@ -466,6 +466,17 @@ files = [
{file = "email_validator-2.1.1.tar.gz", hash = "sha256:200a70680ba08904be6d1eef729205cc0d687634399a5924d842533efb824b84"},
]
[[package]]
name = "et-xmlfile"
version = "1.1.0"
requires_python = ">=3.6"
summary = "An implementation of lxml.xmlfile for the standard library"
groups = ["default"]
files = [
{file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"},
{file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"},
]
[[package]]
name = "exceptiongroup"
version = "1.2.1"
@@ -1199,6 +1210,65 @@ files = [
{file = "mypy_boto3_sqs-1.34.121.tar.gz", hash = "sha256:bdbc623235ffc8127cb8753f49323f74a919df552247b0b2caaf85cf9bb495b8"},
]
[[package]]
name = "numpy"
version = "2.0.0"
requires_python = ">=3.9"
summary = "Fundamental package for array computing in Python"
groups = ["default"]
marker = "python_version <= \"3.11\" or python_version >= \"3.12\""
files = [
{file = "numpy-2.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:04494f6ec467ccb5369d1808570ae55f6ed9b5809d7f035059000a37b8d7e86f"},
{file = "numpy-2.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2635dbd200c2d6faf2ef9a0d04f0ecc6b13b3cad54f7c67c61155138835515d2"},
{file = "numpy-2.0.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:0a43f0974d501842866cc83471bdb0116ba0dffdbaac33ec05e6afed5b615238"},
{file = "numpy-2.0.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:8d83bb187fb647643bd56e1ae43f273c7f4dbcdf94550d7938cfc32566756514"},
{file = "numpy-2.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79e843d186c8fb1b102bef3e2bc35ef81160ffef3194646a7fdd6a73c6b97196"},
{file = "numpy-2.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d7696c615765091cc5093f76fd1fa069870304beaccfd58b5dcc69e55ef49c1"},
{file = "numpy-2.0.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b4c76e3d4c56f145d41b7b6751255feefae92edbc9a61e1758a98204200f30fc"},
{file = "numpy-2.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:acd3a644e4807e73b4e1867b769fbf1ce8c5d80e7caaef0d90dcdc640dfc9787"},
{file = "numpy-2.0.0-cp310-cp310-win32.whl", hash = "sha256:cee6cc0584f71adefe2c908856ccc98702baf95ff80092e4ca46061538a2ba98"},
{file = "numpy-2.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:ed08d2703b5972ec736451b818c2eb9da80d66c3e84aed1deeb0c345fefe461b"},
{file = "numpy-2.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ad0c86f3455fbd0de6c31a3056eb822fc939f81b1618f10ff3406971893b62a5"},
{file = "numpy-2.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e7f387600d424f91576af20518334df3d97bc76a300a755f9a8d6e4f5cadd289"},
{file = "numpy-2.0.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:34f003cb88b1ba38cb9a9a4a3161c1604973d7f9d5552c38bc2f04f829536609"},
{file = "numpy-2.0.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:b6f6a8f45d0313db07d6d1d37bd0b112f887e1369758a5419c0370ba915b3871"},
{file = "numpy-2.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f64641b42b2429f56ee08b4f427a4d2daf916ec59686061de751a55aafa22e4"},
{file = "numpy-2.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a7039a136017eaa92c1848152827e1424701532ca8e8967fe480fe1569dae581"},
{file = "numpy-2.0.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:46e161722e0f619749d1cd892167039015b2c2817296104487cd03ed4a955995"},
{file = "numpy-2.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0e50842b2295ba8414c8c1d9d957083d5dfe9e16828b37de883f51fc53c4016f"},
{file = "numpy-2.0.0-cp311-cp311-win32.whl", hash = "sha256:2ce46fd0b8a0c947ae047d222f7136fc4d55538741373107574271bc00e20e8f"},
{file = "numpy-2.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:fbd6acc766814ea6443628f4e6751d0da6593dae29c08c0b2606164db026970c"},
{file = "numpy-2.0.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:354f373279768fa5a584bac997de6a6c9bc535c482592d7a813bb0c09be6c76f"},
{file = "numpy-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4d2f62e55a4cd9c58c1d9a1c9edaedcd857a73cb6fda875bf79093f9d9086f85"},
{file = "numpy-2.0.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:1e72728e7501a450288fc8e1f9ebc73d90cfd4671ebbd631f3e7857c39bd16f2"},
{file = "numpy-2.0.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:84554fc53daa8f6abf8e8a66e076aff6ece62de68523d9f665f32d2fc50fd66e"},
{file = "numpy-2.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c73aafd1afca80afecb22718f8700b40ac7cab927b8abab3c3e337d70e10e5a2"},
{file = "numpy-2.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49d9f7d256fbc804391a7f72d4a617302b1afac1112fac19b6c6cec63fe7fe8a"},
{file = "numpy-2.0.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:0ec84b9ba0654f3b962802edc91424331f423dcf5d5f926676e0150789cb3d95"},
{file = "numpy-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:feff59f27338135776f6d4e2ec7aeeac5d5f7a08a83e80869121ef8164b74af9"},
{file = "numpy-2.0.0-cp312-cp312-win32.whl", hash = "sha256:c5a59996dc61835133b56a32ebe4ef3740ea5bc19b3983ac60cc32be5a665d54"},
{file = "numpy-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:a356364941fb0593bb899a1076b92dfa2029f6f5b8ba88a14fd0984aaf76d0df"},
{file = "numpy-2.0.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:9416a5c2e92ace094e9f0082c5fd473502c91651fb896bc17690d6fc475128d6"},
{file = "numpy-2.0.0-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:17067d097ed036636fa79f6a869ac26df7db1ba22039d962422506640314933a"},
{file = "numpy-2.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38ecb5b0582cd125f67a629072fed6f83562d9dd04d7e03256c9829bdec027ad"},
{file = "numpy-2.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:cef04d068f5fb0518a77857953193b6bb94809a806bd0a14983a8f12ada060c9"},
{file = "numpy-2.0.0.tar.gz", hash = "sha256:cf5d1c9e6837f8af9f92b6bd3e86d513cdc11f60fd62185cc49ec7d1aba34864"},
]
[[package]]
name = "openpyxl"
version = "3.1.5"
requires_python = ">=3.8"
summary = "A Python library to read/write Excel 2010 xlsx/xlsm files"
groups = ["default"]
dependencies = [
"et-xmlfile",
]
files = [
{file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"},
{file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"},
]
[[package]]
name = "orjson"
version = "3.10.3"
@@ -1261,6 +1331,45 @@ files = [
{file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"},
]
[[package]]
name = "pandas"
version = "2.2.2"
requires_python = ">=3.9"
summary = "Powerful data structures for data analysis, time series, and statistics"
groups = ["default"]
dependencies = [
"numpy>=1.22.4; python_version < \"3.11\"",
"numpy>=1.23.2; python_version == \"3.11\"",
"numpy>=1.26.0; python_version >= \"3.12\"",
"python-dateutil>=2.8.2",
"pytz>=2020.1",
"tzdata>=2022.7",
]
files = [
{file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"},
{file = "pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"},
{file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"},
{file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"},
{file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"},
{file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8e5a0b00e1e56a842f922e7fae8ae4077aee4af0acb5ae3622bd4b4c30aedf99"},
{file = "pandas-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:ddf818e4e6c7c6f4f7c8a12709696d193976b591cc7dc50588d3d1a6b5dc8772"},
{file = "pandas-2.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:696039430f7a562b74fa45f540aca068ea85fa34c244d0deee539cb6d70aa288"},
{file = "pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8e90497254aacacbc4ea6ae5e7a8cd75629d6ad2b30025a4a8b09aa4faf55151"},
{file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58b84b91b0b9f4bafac2a0ac55002280c094dfc6402402332c0913a59654ab2b"},
{file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2123dc9ad6a814bcdea0f099885276b31b24f7edf40f6cdbc0912672e22eee"},
{file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2925720037f06e89af896c70bca73459d7e6a4be96f9de79e2d440bd499fe0db"},
{file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0cace394b6ea70c01ca1595f839cf193df35d1575986e484ad35c4aeae7266c1"},
{file = "pandas-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:873d13d177501a28b2756375d59816c365e42ed8417b41665f346289adc68d24"},
{file = "pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9dfde2a0ddef507a631dc9dc4af6a9489d5e2e740e226ad426a05cabfbd7c8ef"},
{file = "pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e9b79011ff7a0f4b1d6da6a61aa1aa604fb312d6647de5bad20013682d1429ce"},
{file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cb51fe389360f3b5a4d57dbd2848a5f033350336ca3b340d1c53a1fad33bcad"},
{file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eee3a87076c0756de40b05c5e9a6069c035ba43e8dd71c379e68cab2c20f16ad"},
{file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3e374f59e440d4ab45ca2fffde54b81ac3834cf5ae2cdfa69c90bc03bde04d76"},
{file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"},
{file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"},
{file = "pandas-2.2.2.tar.gz", hash = "sha256:9e79019aba43cb4fda9e4d983f8e88ca0373adbb697ae9c6c43093218de28b54"},
]
[[package]]
name = "parse"
version = "1.20.2"
@@ -1540,6 +1649,16 @@ files = [
{file = "python_multipart-0.0.9.tar.gz", hash = "sha256:03f54688c663f1b7977105f021043b0793151e4cb1c1a9d4a11fc13d622c4026"},
]
[[package]]
name = "pytz"
version = "2024.1"
summary = "World timezone definitions, modern and historical"
groups = ["default"]
files = [
{file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
{file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
]
[[package]]
name = "pyyaml"
version = "6.0.1"
@@ -1852,6 +1971,17 @@ files = [
{file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
]
[[package]]
name = "tzdata"
version = "2024.1"
requires_python = ">=2"
summary = "Provider of IANA time zone data"
groups = ["default"]
files = [
{file = "tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252"},
{file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"},
]
[[package]]
name = "ujson"
version = "5.10.0"
@@ -2185,6 +2315,17 @@ files = [
{file = "wsproto-1.2.0.tar.gz", hash = "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065"},
]
[[package]]
name = "xlsxwriter"
version = "3.2.0"
requires_python = ">=3.6"
summary = "A Python module for creating Excel XLSX files."
groups = ["default"]
files = [
{file = "XlsxWriter-3.2.0-py3-none-any.whl", hash = "sha256:ecfd5405b3e0e228219bcaf24c2ca0915e012ca9464a14048021d21a995d490e"},
{file = "XlsxWriter-3.2.0.tar.gz", hash = "sha256:9977d0c661a72866a61f9f7a809e25ebbb0fb7036baa3b9fe74afcfca6b3cb8c"},
]
[[package]]
name = "yarl"
version = "1.9.4"

View File

@@ -21,6 +21,9 @@ dependencies = [
"selenium>=4.22.0",
"webdriver-manager>=4.0.1",
"pydantic>=2.8.2",
"pandas>=2.2.2",
"openpyxl>=3.1.5",
"xlsxwriter>=3.2.0",
]
requires-python = ">=3.10"
readme = "README.md"
@@ -40,9 +43,10 @@ ignore = []
defineConstant = { DEBUG = true }
stubPath = ""
reportUnknownMemberType=false
reportUnknownMemberType= false
reportMissingImports = true
reportMissingTypeStubs = false
reportAny = false
pythonVersion = "3.10"
pythonPlatform = "Linux"

124
src/components/JobTable.tsx Normal file
View File

@@ -0,0 +1,124 @@
import React, { useState } from "react";
import {
TextField,
Table,
TableBody,
TableCell,
TableHead,
TableRow,
Button,
} from "@mui/material";
import { useRouter } from "next/router";
interface Job {
  id: string;
  url: string;
  elements: object[];
  result: object;
  // NOTE(review): rows arrive via fetch(...).json(), so this is an ISO
  // timestamp string, never a Date instance — typed accordingly (the
  // original `Date` type could not actually occur at runtime).
  time_created: string;
}

interface JobTableProps {
  jobs: Job[];
}

// Table of previously-run scrape jobs with per-row Download (CSV export)
// and Rerun (prefill the home form via router query) actions.
const JobTable: React.FC<JobTableProps> = ({ jobs }) => {
  const router = useRouter();

  // POST the job id to /api/download and trigger a browser download of
  // the returned CSV blob. (Removed a leftover debug console.log.)
  const handleDownload = async (id: string) => {
    const response = await fetch("/api/download", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ id: id }),
    });

    if (response.ok) {
      const blob = await response.blob();
      const url = window.URL.createObjectURL(blob);
      const a = document.createElement("a");
      a.style.display = "none";
      a.href = url;
      a.download = `job_${id}.csv`;
      document.body.appendChild(a);
      a.click();
      window.URL.revokeObjectURL(url);
    } else {
      console.error("Failed to download the file.");
    }
  };

  // Navigate to the home page with the job's url/elements in the query so
  // the scrape form is prefilled and the job can be rerun.
  const handleNavigate = (elements: object[], url: string) => {
    router.push({
      pathname: "/",
      query: {
        elements: JSON.stringify(elements),
        url: url,
      },
    });
  };

  return (
    <>
      <Table>
        <TableHead>
          <TableRow>
            <TableCell>id</TableCell>
            <TableCell>url</TableCell>
            <TableCell>elements</TableCell>
            <TableCell>result</TableCell>
            <TableCell>time_created</TableCell>
          </TableRow>
        </TableHead>
        <TableBody>
          {jobs.map((row) => (
            // Use the stable job id as the key instead of the array index,
            // so React reconciles correctly if the list is reordered.
            <TableRow key={row.id}>
              <TableCell>
                <TextField variant="outlined" fullWidth value={row.id} />
              </TableCell>
              <TableCell>
                <TextField variant="outlined" fullWidth value={row.url} />
              </TableCell>
              <TableCell>
                <TextField
                  variant="outlined"
                  fullWidth
                  value={JSON.stringify(row.elements)}
                />
              </TableCell>
              <TableCell>
                <TextField
                  variant="outlined"
                  fullWidth
                  value={JSON.stringify(row.result)}
                />
              </TableCell>
              <TableCell>
                <TextField
                  variant="outlined"
                  fullWidth
                  value={row.time_created}
                />
              </TableCell>
              <TableCell>
                <Button
                  onClick={() => {
                    handleDownload(row.id);
                  }}
                >
                  Download
                </Button>
              </TableCell>
              <TableCell>
                <Button onClick={() => handleNavigate(row.elements, row.url)}>
                  Rerun
                </Button>
              </TableCell>
            </TableRow>
          ))}
        </TableBody>
      </Table>
    </>
  );
};

export default JobTable;

View File

@@ -1,34 +0,0 @@
import React from "react";
import { useAuth } from "../useAuth";
import { LogoutOptions, RedirectLoginOptions } from "@auth0/auth0-react";
// Minimal auth-aware navigation bar: greets an authenticated user with a
// Logout button, otherwise offers a redirect-based Auth0 Login.
const NavBar: React.FC = () => {
  const { loginWithRedirect, logout, user, isAuthenticated } = useAuth();

  // Log out with default (empty) options.
  const handleLogout = () => {
    const opts: LogoutOptions = {};
    logout(opts);
  };

  // Redirect-based login that lands back on localhost after auth.
  const handleLogin = () => {
    const opts: RedirectLoginOptions = {
      authorizationParams: { redirect_uri: "http://localhost" },
    };
    loginWithRedirect(opts);
  };

  if (!isAuthenticated) {
    return (
      <nav>
        <button onClick={handleLogin}>Login</button>
      </nav>
    );
  }

  return (
    <nav>
      <>
        <p>Welcome, {user?.name}</p>
        <button onClick={handleLogout}>Logout</button>
      </>
    </nav>
  );
};

export default NavBar;

View File

@@ -0,0 +1,115 @@
import React, { useState } from "react";
import { useAuth } from "../useAuth";
import { LogoutOptions, RedirectLoginOptions } from "@auth0/auth0-react";
import {
Box,
Drawer,
List,
ListItem,
ListItemIcon,
ListItemButton,
ListItemText,
AppBar,
Toolbar,
IconButton,
Typography,
Button,
} from "@mui/material";
import HomeIcon from "@mui/icons-material/Home";
import HttpIcon from "@mui/icons-material/Http";
import MenuIcon from "@mui/icons-material/Menu";
import { useRouter } from "next/router";
// Top app bar plus a hamburger-toggled navigation drawer.
// Shows Auth0 login/logout state and links to Home and Previous Jobs.
const NavDrawer: React.FC = () => {
  const router = useRouter();
  const { loginWithRedirect, logout, user, isAuthenticated } = useAuth();
  // Whether the drawer is currently open.
  const [open, setOpen] = useState<boolean>(false);

  // Log out with default (empty) options.
  const handleLogout = () => {
    const logoutOptions: LogoutOptions = {};
    logout(logoutOptions);
  };

  // Redirect-based Auth0 login; returns to localhost after auth.
  const handleLogin = () => {
    const loginOptions: RedirectLoginOptions = {
      authorizationParams: { redirect_uri: "http://localhost" },
    };
    loginWithRedirect(loginOptions);
  };

  // Returns an event handler that opens/closes the drawer. Tab/Shift
  // keydown events are ignored so keyboard focus traversal inside the
  // drawer does not close it.
  const toggleDrawer =
    (open: boolean) => (event: React.KeyboardEvent | React.MouseEvent) => {
      if (
        event.type === "keydown" &&
        ((event as React.KeyboardEvent).key === "Tab" ||
          (event as React.KeyboardEvent).key === "Shift")
      ) {
        return;
      }
      setOpen(open);
    };

  // Drawer contents: navigation list; any click/keydown closes the drawer.
  const DrawerList = (
    <Box
      sx={{ width: 250 }}
      role="presentation"
      onClick={toggleDrawer(false)}
      onKeyDown={toggleDrawer(false)}
    >
      <List>
        <ListItem>
          <ListItemButton onClick={() => router.push("/")}>
            <ListItemIcon>
              <HomeIcon />
            </ListItemIcon>
            <ListItemText>Home</ListItemText>
          </ListItemButton>
        </ListItem>
        <ListItem>
          <ListItemButton onClick={() => router.push("/jobs")}>
            <ListItemIcon>
              <HttpIcon />
            </ListItemIcon>
            <ListItemText>Previous Jobs</ListItemText>
          </ListItemButton>
        </ListItem>
      </List>
    </Box>
  );

  return (
    <>
      <AppBar position="static">
        <Toolbar>
          <IconButton
            edge="start"
            color="inherit"
            aria-label="menu"
            onClick={toggleDrawer(true)}
          >
            <MenuIcon />
          </IconButton>
          {isAuthenticated ? (
            <>
              <Typography variant="body1" sx={{ marginRight: 2 }}>
                Welcome, {user?.name}
              </Typography>
              <Button color="inherit" onClick={handleLogout}>
                Logout
              </Button>
            </>
          ) : (
            <Button color="inherit" onClick={handleLogin}>
              Login
            </Button>
          )}
        </Toolbar>
      </AppBar>
      <Drawer open={open} onClose={toggleDrawer(false)}>
        {DrawerList}
      </Drawer>
    </>
  );
};

export default NavDrawer;

View File

@@ -5,6 +5,7 @@ import React from "react";
import type { AppProps } from "next/app";
import Head from "next/head";
import { Auth0Provider } from "@auth0/auth0-react";
import NavDrawer from "../components/NavDrawer";
const domain = process.env.NEXT_PUBLIC_AUTH0_ISSUER_BASE_URL || "";
const clientId = process.env.NEXT_PUBLIC_AUTH0_CLIENT_ID || "";
@@ -26,6 +27,7 @@ const App: React.FC<AppProps> = ({ Component, pageProps }) => {
cacheLocation="localstorage"
useRefreshTokens={true}
>
<NavDrawer></NavDrawer>
<Component {...pageProps} />
</Auth0Provider>
</>

View File

@@ -1,5 +1,4 @@
import React, { useState } from "react";
import NavBar from "../components/NavBar";
import React, { useState, useEffect } from "react";
import {
Typography,
TextField,
@@ -13,6 +12,8 @@ import {
Box,
} from "@mui/material";
import AddIcon from "@mui/icons-material/Add";
import { useAuth0 } from "@auth0/auth0-react";
import { useRouter } from "next/router";
interface Element {
name: string;
@@ -31,7 +32,12 @@ type Result = {
};
const Home = () => {
const [url, setUrl] = useState("");
const { user } = useAuth0();
const router = useRouter();
const { elements, url } = router.query;
const [submittedURL, setUrl] = useState("");
const [rows, setRows] = useState<Element[]>([]);
const [results, setResults] = useState<null | Result>(null);
const [newRow, setNewRow] = useState<Element>({
@@ -40,8 +46,17 @@ const Home = () => {
url: "",
});
useEffect(() => {
if (elements) {
setRows(JSON.parse(elements as string));
}
if (url) {
setUrl(url as string);
}
}, [elements, url]);
const handleAddRow = () => {
newRow.url = url;
newRow.url = submittedURL;
setRows([...rows, newRow]);
setNewRow({ name: "", xpath: "", url: "" });
};
@@ -50,7 +65,12 @@ const Home = () => {
fetch("/api/submit-scrape-job", {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({ url: url, elements: rows }),
body: JSON.stringify({
url: url,
elements: rows,
user: user?.name,
time_created: new Date().toISOString(),
}),
})
.then((response) => response.json())
.then((data) => setResults(data));
@@ -58,7 +78,6 @@ const Home = () => {
return (
<>
<NavBar />
<Container maxWidth="md">
<Typography variant="h1" gutterBottom>
Web Scraper
@@ -97,9 +116,10 @@ const Home = () => {
startIcon={<AddIcon />}
onClick={handleAddRow}
>
Add Row
Add Elements
</Button>
</Box>
<Typography variant="h4">Elements</Typography>
<Table>
<TableHead>
<TableRow>

26
src/pages/jobs.tsx Normal file
View File

@@ -0,0 +1,26 @@
import { useAuth0 } from "@auth0/auth0-react";
import React, { useEffect, useState } from "react";
import JobTable from "../components/JobTable";
// Previous-jobs page: fetches the authenticated user's stored scrape jobs
// and renders them in a JobTable.
const Jobs = () => {
  const { user } = useAuth0();
  const [jobs, setJobs] = useState([]);

  useEffect(() => {
    // BUG FIX: the original used an empty dependency array, so this fired
    // once before Auth0 had resolved `user`, posting { user: undefined }
    // and never refetching. Depend on `user` and skip until it's known.
    if (!user?.name) {
      return;
    }
    fetch("/api/retrieve-scrape-jobs", {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ user: user.name }),
    })
      .then((response) => response.json())
      .then((data) => setJobs(data));
  }, [user]);

  return (
    <>
      <JobTable jobs={jobs} />
    </>
  );
};

export default Jobs;