Mirror of https://github.com/jaypyles/Scraperr.git (synced 2025-12-13 19:25:58 +00:00)

wip: add in job rerunning
@@ -1,8 +1,49 @@
# STL
import os
import logging
from typing import Any

# PDM
import boto3
from mypy_boto3_dynamodb.service_resource import Table, DynamoDBServiceResource

LOG = logging.getLogger(__name__)


def test_dyanmo():
    dynamodb = boto3.resource("dynamodb", region_name="us-west-2")
    table = dynamodb.Table("scrape")
    print(table)


def connect_to_dynamo() -> Table:
    region_name = os.getenv("AWS_REGION")
    dynamodb: DynamoDBServiceResource = boto3.resource(
        "dynamodb", region_name=region_name
    )
    return dynamodb.Table("scrape")


def insert(table: Table, item: dict[str, Any]) -> None:
    i = table.put_item(Item=item)
    LOG.info(f"Inserted item: {i}")


def query(table: Table, index_name: str, key_condition: Any) -> list[Any]:
    try:
        response = table.query(
            IndexName=index_name, KeyConditionExpression=key_condition
        )
        items = response.get("Items", [])
        for item in items:
            LOG.info(f"Queried item: {item}")
        return items
    except Exception as e:
        LOG.error(f"Failed to query table: {e}")
        raise


def query_by_id(table: Table, key_condition: Any) -> list[Any]:
    try:
        response = table.query(KeyConditionExpression=key_condition)
        items = response.get("Items", [])
        for item in items:
            LOG.info(f"Queried item: {item}")
        return items
    except Exception as e:
        LOG.error(f"Failed to query table: {e}")
        raise
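For orientation, a minimal usage sketch of the DynamoDB helpers above (not part of this commit). It assumes AWS credentials and AWS_REGION are configured, that the "scrape" table exists, and that the table has a secondary index named "user" keyed on the user attribute; the item values are placeholders.

# Hypothetical usage of the helpers defined above.
from boto3.dynamodb.conditions import Key

table = connect_to_dynamo()
insert(table, {"id": "abc123", "user": "someone@example.com", "url": "https://example.com"})
user_jobs = query(table, "user", Key("user").eq("someone@example.com"))  # query via the "user" index
same_job = query_by_id(table, Key("id").eq("abc123"))  # query by primary key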
@@ -1,18 +1,28 @@
# STL
import uuid
import logging
from io import StringIO

# PDM
from fastapi import FastAPI
import pandas as pd
from fastapi import FastAPI, HTTPException
from fastapi.encoders import jsonable_encoder
from fastapi.responses import FileResponse, JSONResponse
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from boto3.dynamodb.conditions import Key

# LOCAL
from api.backend.amazon import test_dyanmo
from api.backend.models import SubmitScrapeJob
from api.backend.amazon import query, insert, query_by_id, connect_to_dynamo
from api.backend.models import DownloadJob, SubmitScrapeJob, RetrieveScrapeJobs
from api.backend.scraping import scrape

logging.basicConfig(
    level=logging.INFO,
    format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
    handlers=[logging.StreamHandler()],
)

LOG = logging.getLogger(__name__)

app = FastAPI(title="api")

@@ -33,19 +43,53 @@ def read_root():
    return FileResponse("./dist/index.html")


@app.get("/api/endpoint")
async def test_endpoint():
    test_dyanmo()
    return "Hello World!"


@app.post("/api/submit-scrape-job")
async def submit_scrape_job(job: SubmitScrapeJob):
    LOG.info(f"Received job: {job}")
    try:
        scraped = await scrape(job.url, job.elements)
        print(scraped)

        LOG.info(
            f"Scraped result for url: {job.url}, with elements: {job.elements}\n{scraped}"
        )

        json_scraped = jsonable_encoder(scraped)
        print(json_scraped)
        table = connect_to_dynamo()
        job.result = json_scraped
        job.id = uuid.uuid4().hex
        insert(table, jsonable_encoder(job))
        return JSONResponse(content=json_scraped)
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=500)


@app.post("/api/retrieve-scrape-jobs")
async def retrieve_scrape_jobs(retrieve: RetrieveScrapeJobs):
    LOG.info(f"Retrieving jobs for account: {retrieve.user}")
    try:
        table = connect_to_dynamo()
        results = query(table, "user", Key("user").eq(retrieve.user))
        return JSONResponse(content=results)
    except Exception as e:
        LOG.error(f"Exception occurred: {e}")
        return JSONResponse(content={"error": str(e)}, status_code=500)


@app.post("/api/download")
async def download(download_job: DownloadJob):
    LOG.info(f"Downloading job with id: {download_job.id}")
    try:
        table = connect_to_dynamo()
        results = query_by_id(table, Key("id").eq(download_job.id))

        df = pd.DataFrame(results)

        csv_buffer = StringIO()
        df.to_csv(csv_buffer, index=False)
        _ = csv_buffer.seek(0)
        response = StreamingResponse(csv_buffer, media_type="text/csv")
        response.headers["Content-Disposition"] = "attachment; filename=export.csv"
        return response

    except Exception as e:
        LOG.error(f"Exception occurred: {e}")
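A hedged sketch of exercising the endpoints above from Python (not part of this commit): the base URL, user, and job values are placeholders, and the requests library is only assumed to be available here.

# Illustrative client calls against the endpoints above; all values are made up.
import requests

base = "http://localhost:8000"  # assumed host/port for the FastAPI app
job = {
    "url": "https://example.com",
    "elements": [{"name": "title", "xpath": "//h1", "url": "https://example.com"}],
    "user": "someone@example.com",
    "time_created": "2024-07-01T00:00:00Z",
}
requests.post(f"{base}/api/submit-scrape-job", json=job)
jobs = requests.post(f"{base}/api/retrieve-scrape-jobs", json={"user": "someone@example.com"}).json()
csv_bytes = requests.post(f"{base}/api/download", json={"id": jobs[0]["id"]}).content  # CSV export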
@@ -1,3 +1,6 @@
# STL
from typing import Any, Optional

# PDM
import pydantic

@@ -15,5 +18,17 @@ class CapturedElement(pydantic.BaseModel):


class SubmitScrapeJob(pydantic.BaseModel):
    id: Optional[str] = None
    url: str
    elements: list[Element]
    user: str
    time_created: str
    result: Optional[dict[str, Any]] = None


class RetrieveScrapeJobs(pydantic.BaseModel):
    user: str


class DownloadJob(pydantic.BaseModel):
    id: str
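Illustrative only (not from the commit): constructing the new request models directly. The values are made up, and Element is assumed to carry name/xpath/url fields, mirroring the frontend's Element interface.

retrieve = RetrieveScrapeJobs(user="someone@example.com")
download = DownloadJob(id="abc123")
job = SubmitScrapeJob(
    url="https://example.com",
    elements=[Element(name="title", xpath="//h1", url="https://example.com")],  # assumed Element fields
    user="someone@example.com",
    time_created="2024-07-01T00:00:00Z",
)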
@@ -1,3 +1,6 @@
# STL
import logging

# PDM
from bs4 import BeautifulSoup
from lxml import etree

@@ -14,10 +17,26 @@ from selenium.webdriver.chrome.service import Service

# LOCAL
from api.backend.models import Element, CapturedElement

LOG = logging.getLogger(__name__)


class HtmlElement(_Element): ...


def clean_xpath(xpath: str) -> str:
    parts = xpath.split("/")
    clean_parts: list[str] = []
    for part in parts:
        if part == "":
            clean_parts.append("/")
        else:
            clean_parts.append(part)
    clean_xpath = "//".join(clean_parts).replace("////", "//")

    clean_xpath = clean_xpath.replace("'", "\\'")
    return clean_xpath


def sxpath(context: _Element, xpath: str) -> list[HtmlElement]:
    return context.xpath(xpath)  # type: ignore [reportReturnType]

@@ -44,7 +63,7 @@ async def make_site_request(url: str) -> str:
    finally:
        driver.quit()

    print(page_source)
    LOG.debug(f"Page source for url: {url}\n{page_source}")
    return page_source


@@ -55,7 +74,7 @@ async def collect_scraped_elements(page: str, xpaths: list[Element]):

    elements: dict[str, list[CapturedElement]] = dict()

    for elem in xpaths:
        el = sxpath(root, elem.xpath)
        el = sxpath(root, clean_xpath(elem.xpath))
        text = ["".join(str(e) for e in e.itertext()) for e in el]
        captured_element = CapturedElement(
            xpath=elem.xpath, text=",".join(text), name=elem.name
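As a quick aside (not part of the commit), this is how an sxpath-style lookup behaves on a parsed lxml tree; the HTML snippet is made up, and the xpath is borrowed from the example script below.

# Standalone illustration of xpath + itertext() on an lxml element tree.
from lxml import etree

root = etree.HTML("<html><body><h3 class='bonfire'>Firelink Shrine</h3></body></html>")
matches = root.xpath(".//h3[@class='bonfire']")
print(["".join(m.itertext()) for m in matches])  # -> ['Firelink Shrine']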
ipython.py | 38
@@ -1,17 +1,37 @@
# STL
import asyncio
import os

# LOCAL
from api.backend.scraping import scrape
# PDM
import boto3
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()


async def main():
    url = "https://darksouls3.wiki.fextralife.com/Dark+Souls+3"
    xpaths = [".//h3[@class='bonfire']", ".//div[@class='comment']"]
    scraped = await scrape(url, xpaths)
def test_insert_and_delete():
    # Get environment variables
    region_name = os.getenv("AWS_REGION")
    # Initialize DynamoDB resource
    dynamodb = boto3.resource("dynamodb", region_name=region_name)
    table = dynamodb.Table("scrape")

    print(scraped)
    # Item to insert
    item = {
        "id": "123",  # Replace with the appropriate id value
        "attribute1": "value1",
        "attribute2": "value2",
        # Add more attributes as needed
    }

    # Insert the item
    table.put_item(Item=item)
    print(f"Inserted item: {item}")

    # Delete the item
    table.delete_item(Key={"id": "123"})  # Replace with the appropriate id value
    print(f"Deleted item with id: {item['id']}")


if __name__ == "__main__":
    asyncio.run(main())
    test_insert_and_delete()
pdm.lock | 143 (generated)
@@ -5,7 +5,7 @@
groups = ["default", "dev"]
strategy = ["cross_platform", "inherit_metadata"]
lock_version = "4.4.2"
content_hash = "sha256:86d834de010a22751311e6b9553fd963dd2073e49bbf17872f5fff0630f99164"
content_hash = "sha256:a12cdcf1cdd6f91260a7d7126be4581a6820caf91ffc26386abfe9a6b3fbc9d9"

[[package]]
name = "aiohttp"
@@ -466,6 +466,17 @@ files = [
    {file = "email_validator-2.1.1.tar.gz", hash = "sha256:200a70680ba08904be6d1eef729205cc0d687634399a5924d842533efb824b84"},
]

[[package]]
name = "et-xmlfile"
version = "1.1.0"
requires_python = ">=3.6"
summary = "An implementation of lxml.xmlfile for the standard library"
groups = ["default"]
files = [
    {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"},
    {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"},
]

[[package]]
name = "exceptiongroup"
version = "1.2.1"
@@ -1199,6 +1210,65 @@ files = [
    {file = "mypy_boto3_sqs-1.34.121.tar.gz", hash = "sha256:bdbc623235ffc8127cb8753f49323f74a919df552247b0b2caaf85cf9bb495b8"},
]

[[package]]
name = "numpy"
version = "2.0.0"
requires_python = ">=3.9"
summary = "Fundamental package for array computing in Python"
groups = ["default"]
marker = "python_version <= \"3.11\" or python_version >= \"3.12\""
files = [
    {file = "numpy-2.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:04494f6ec467ccb5369d1808570ae55f6ed9b5809d7f035059000a37b8d7e86f"},
    {file = "numpy-2.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2635dbd200c2d6faf2ef9a0d04f0ecc6b13b3cad54f7c67c61155138835515d2"},
    {file = "numpy-2.0.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:0a43f0974d501842866cc83471bdb0116ba0dffdbaac33ec05e6afed5b615238"},
    {file = "numpy-2.0.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:8d83bb187fb647643bd56e1ae43f273c7f4dbcdf94550d7938cfc32566756514"},
    {file = "numpy-2.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79e843d186c8fb1b102bef3e2bc35ef81160ffef3194646a7fdd6a73c6b97196"},
    {file = "numpy-2.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d7696c615765091cc5093f76fd1fa069870304beaccfd58b5dcc69e55ef49c1"},
    {file = "numpy-2.0.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b4c76e3d4c56f145d41b7b6751255feefae92edbc9a61e1758a98204200f30fc"},
    {file = "numpy-2.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:acd3a644e4807e73b4e1867b769fbf1ce8c5d80e7caaef0d90dcdc640dfc9787"},
    {file = "numpy-2.0.0-cp310-cp310-win32.whl", hash = "sha256:cee6cc0584f71adefe2c908856ccc98702baf95ff80092e4ca46061538a2ba98"},
    {file = "numpy-2.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:ed08d2703b5972ec736451b818c2eb9da80d66c3e84aed1deeb0c345fefe461b"},
    {file = "numpy-2.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ad0c86f3455fbd0de6c31a3056eb822fc939f81b1618f10ff3406971893b62a5"},
    {file = "numpy-2.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e7f387600d424f91576af20518334df3d97bc76a300a755f9a8d6e4f5cadd289"},
    {file = "numpy-2.0.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:34f003cb88b1ba38cb9a9a4a3161c1604973d7f9d5552c38bc2f04f829536609"},
    {file = "numpy-2.0.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:b6f6a8f45d0313db07d6d1d37bd0b112f887e1369758a5419c0370ba915b3871"},
    {file = "numpy-2.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f64641b42b2429f56ee08b4f427a4d2daf916ec59686061de751a55aafa22e4"},
    {file = "numpy-2.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a7039a136017eaa92c1848152827e1424701532ca8e8967fe480fe1569dae581"},
    {file = "numpy-2.0.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:46e161722e0f619749d1cd892167039015b2c2817296104487cd03ed4a955995"},
    {file = "numpy-2.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0e50842b2295ba8414c8c1d9d957083d5dfe9e16828b37de883f51fc53c4016f"},
    {file = "numpy-2.0.0-cp311-cp311-win32.whl", hash = "sha256:2ce46fd0b8a0c947ae047d222f7136fc4d55538741373107574271bc00e20e8f"},
    {file = "numpy-2.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:fbd6acc766814ea6443628f4e6751d0da6593dae29c08c0b2606164db026970c"},
    {file = "numpy-2.0.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:354f373279768fa5a584bac997de6a6c9bc535c482592d7a813bb0c09be6c76f"},
    {file = "numpy-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4d2f62e55a4cd9c58c1d9a1c9edaedcd857a73cb6fda875bf79093f9d9086f85"},
    {file = "numpy-2.0.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:1e72728e7501a450288fc8e1f9ebc73d90cfd4671ebbd631f3e7857c39bd16f2"},
    {file = "numpy-2.0.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:84554fc53daa8f6abf8e8a66e076aff6ece62de68523d9f665f32d2fc50fd66e"},
    {file = "numpy-2.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c73aafd1afca80afecb22718f8700b40ac7cab927b8abab3c3e337d70e10e5a2"},
    {file = "numpy-2.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49d9f7d256fbc804391a7f72d4a617302b1afac1112fac19b6c6cec63fe7fe8a"},
    {file = "numpy-2.0.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:0ec84b9ba0654f3b962802edc91424331f423dcf5d5f926676e0150789cb3d95"},
    {file = "numpy-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:feff59f27338135776f6d4e2ec7aeeac5d5f7a08a83e80869121ef8164b74af9"},
    {file = "numpy-2.0.0-cp312-cp312-win32.whl", hash = "sha256:c5a59996dc61835133b56a32ebe4ef3740ea5bc19b3983ac60cc32be5a665d54"},
    {file = "numpy-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:a356364941fb0593bb899a1076b92dfa2029f6f5b8ba88a14fd0984aaf76d0df"},
    {file = "numpy-2.0.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:9416a5c2e92ace094e9f0082c5fd473502c91651fb896bc17690d6fc475128d6"},
    {file = "numpy-2.0.0-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:17067d097ed036636fa79f6a869ac26df7db1ba22039d962422506640314933a"},
    {file = "numpy-2.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38ecb5b0582cd125f67a629072fed6f83562d9dd04d7e03256c9829bdec027ad"},
    {file = "numpy-2.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:cef04d068f5fb0518a77857953193b6bb94809a806bd0a14983a8f12ada060c9"},
    {file = "numpy-2.0.0.tar.gz", hash = "sha256:cf5d1c9e6837f8af9f92b6bd3e86d513cdc11f60fd62185cc49ec7d1aba34864"},
]

[[package]]
name = "openpyxl"
version = "3.1.5"
requires_python = ">=3.8"
summary = "A Python library to read/write Excel 2010 xlsx/xlsm files"
groups = ["default"]
dependencies = [
    "et-xmlfile",
]
files = [
    {file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"},
    {file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"},
]

[[package]]
name = "orjson"
version = "3.10.3"
@@ -1261,6 +1331,45 @@ files = [
    {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"},
]

[[package]]
name = "pandas"
version = "2.2.2"
requires_python = ">=3.9"
summary = "Powerful data structures for data analysis, time series, and statistics"
groups = ["default"]
dependencies = [
    "numpy>=1.22.4; python_version < \"3.11\"",
    "numpy>=1.23.2; python_version == \"3.11\"",
    "numpy>=1.26.0; python_version >= \"3.12\"",
    "python-dateutil>=2.8.2",
    "pytz>=2020.1",
    "tzdata>=2022.7",
]
files = [
    {file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"},
    {file = "pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"},
    {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"},
    {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"},
    {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"},
    {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8e5a0b00e1e56a842f922e7fae8ae4077aee4af0acb5ae3622bd4b4c30aedf99"},
    {file = "pandas-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:ddf818e4e6c7c6f4f7c8a12709696d193976b591cc7dc50588d3d1a6b5dc8772"},
    {file = "pandas-2.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:696039430f7a562b74fa45f540aca068ea85fa34c244d0deee539cb6d70aa288"},
    {file = "pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8e90497254aacacbc4ea6ae5e7a8cd75629d6ad2b30025a4a8b09aa4faf55151"},
    {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58b84b91b0b9f4bafac2a0ac55002280c094dfc6402402332c0913a59654ab2b"},
    {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2123dc9ad6a814bcdea0f099885276b31b24f7edf40f6cdbc0912672e22eee"},
    {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2925720037f06e89af896c70bca73459d7e6a4be96f9de79e2d440bd499fe0db"},
    {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0cace394b6ea70c01ca1595f839cf193df35d1575986e484ad35c4aeae7266c1"},
    {file = "pandas-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:873d13d177501a28b2756375d59816c365e42ed8417b41665f346289adc68d24"},
    {file = "pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9dfde2a0ddef507a631dc9dc4af6a9489d5e2e740e226ad426a05cabfbd7c8ef"},
    {file = "pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e9b79011ff7a0f4b1d6da6a61aa1aa604fb312d6647de5bad20013682d1429ce"},
    {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cb51fe389360f3b5a4d57dbd2848a5f033350336ca3b340d1c53a1fad33bcad"},
    {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eee3a87076c0756de40b05c5e9a6069c035ba43e8dd71c379e68cab2c20f16ad"},
    {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3e374f59e440d4ab45ca2fffde54b81ac3834cf5ae2cdfa69c90bc03bde04d76"},
    {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"},
    {file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"},
    {file = "pandas-2.2.2.tar.gz", hash = "sha256:9e79019aba43cb4fda9e4d983f8e88ca0373adbb697ae9c6c43093218de28b54"},
]

[[package]]
name = "parse"
version = "1.20.2"
@@ -1540,6 +1649,16 @@ files = [
    {file = "python_multipart-0.0.9.tar.gz", hash = "sha256:03f54688c663f1b7977105f021043b0793151e4cb1c1a9d4a11fc13d622c4026"},
]

[[package]]
name = "pytz"
version = "2024.1"
summary = "World timezone definitions, modern and historical"
groups = ["default"]
files = [
    {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
    {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
]

[[package]]
name = "pyyaml"
version = "6.0.1"
@@ -1852,6 +1971,17 @@ files = [
    {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
]

[[package]]
name = "tzdata"
version = "2024.1"
requires_python = ">=2"
summary = "Provider of IANA time zone data"
groups = ["default"]
files = [
    {file = "tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252"},
    {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"},
]

[[package]]
name = "ujson"
version = "5.10.0"
@@ -2185,6 +2315,17 @@ files = [
    {file = "wsproto-1.2.0.tar.gz", hash = "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065"},
]

[[package]]
name = "xlsxwriter"
version = "3.2.0"
requires_python = ">=3.6"
summary = "A Python module for creating Excel XLSX files."
groups = ["default"]
files = [
    {file = "XlsxWriter-3.2.0-py3-none-any.whl", hash = "sha256:ecfd5405b3e0e228219bcaf24c2ca0915e012ca9464a14048021d21a995d490e"},
    {file = "XlsxWriter-3.2.0.tar.gz", hash = "sha256:9977d0c661a72866a61f9f7a809e25ebbb0fb7036baa3b9fe74afcfca6b3cb8c"},
]

[[package]]
name = "yarl"
version = "1.9.4"
@@ -21,6 +21,9 @@ dependencies = [
    "selenium>=4.22.0",
    "webdriver-manager>=4.0.1",
    "pydantic>=2.8.2",
    "pandas>=2.2.2",
    "openpyxl>=3.1.5",
    "xlsxwriter>=3.2.0",
]
requires-python = ">=3.10"
readme = "README.md"
@@ -40,9 +43,10 @@ ignore = []
defineConstant = { DEBUG = true }
stubPath = ""

reportUnknownMemberType=false
reportUnknownMemberType= false
reportMissingImports = true
reportMissingTypeStubs = false
reportAny = false

pythonVersion = "3.9"
pythonPlatform = "Linux"
src/components/JobTable.tsx | 124 (new file)
@@ -0,0 +1,124 @@
import React, { useState } from "react";
import {
  TextField,
  Table,
  TableBody,
  TableCell,
  TableHead,
  TableRow,
  Button,
} from "@mui/material";
import { useRouter } from "next/router";

interface Job {
  id: string;
  url: string;
  elements: Object[];
  result: Object;
  time_created: Date;
}

interface JobTableProps {
  jobs: Job[];
}

const JobTable: React.FC<JobTableProps> = ({ jobs }) => {
  const router = useRouter();
  const handleDownload = async (id: string) => {
    console.log(id);
    const response = await fetch("/api/download", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ id: id }),
    });

    if (response.ok) {
      const blob = await response.blob();
      const url = window.URL.createObjectURL(blob);
      const a = document.createElement("a");
      a.style.display = "none";
      a.href = url;
      a.download = `job_${id}.csv`;
      document.body.appendChild(a);
      a.click();
      window.URL.revokeObjectURL(url);
    } else {
      console.error("Failed to download the file.");
    }
  };

  const handleNavigate = (elements: Object[], url: string) => {
    router.push({
      pathname: "/",
      query: {
        elements: JSON.stringify(elements),
        url: url,
      },
    });
  };

  return (
    <>
      <Table>
        <TableHead>
          <TableRow>
            <TableCell>id</TableCell>
            <TableCell>url</TableCell>
            <TableCell>elements</TableCell>
            <TableCell>result</TableCell>
            <TableCell>time_created</TableCell>
          </TableRow>
        </TableHead>
        <TableBody>
          {jobs.map((row, index) => (
            <TableRow key={index}>
              <TableCell>
                <TextField variant="outlined" fullWidth value={row.id} />
              </TableCell>
              <TableCell>
                <TextField variant="outlined" fullWidth value={row.url} />
              </TableCell>
              <TableCell>
                <TextField
                  variant="outlined"
                  fullWidth
                  value={JSON.stringify(row.elements)}
                />
              </TableCell>
              <TableCell>
                <TextField
                  variant="outlined"
                  fullWidth
                  value={JSON.stringify(row.result)}
                />
              </TableCell>
              <TableCell>
                <TextField
                  variant="outlined"
                  fullWidth
                  value={row.time_created}
                />
              </TableCell>
              <TableCell>
                <Button
                  onClick={() => {
                    handleDownload(row.id);
                  }}
                >
                  Download
                </Button>
              </TableCell>
              <TableCell>
                <Button onClick={() => handleNavigate(row.elements, row.url)}>
                  Rerun
                </Button>
              </TableCell>
            </TableRow>
          ))}
        </TableBody>
      </Table>
    </>
  );
};

export default JobTable;
@@ -1,34 +0,0 @@
import React from "react";
import { useAuth } from "../useAuth";
import { LogoutOptions, RedirectLoginOptions } from "@auth0/auth0-react";

const NavBar: React.FC = () => {
  const { loginWithRedirect, logout, user, isAuthenticated } = useAuth();

  const handleLogout = () => {
    const logoutOptions: LogoutOptions = {};
    logout(logoutOptions);
  };

  const handleLogin = () => {
    const loginOptions: RedirectLoginOptions = {
      authorizationParams: { redirect_uri: "http://localhost" },
    };
    loginWithRedirect(loginOptions);
  };

  return (
    <nav>
      {isAuthenticated ? (
        <>
          <p>Welcome, {user?.name}</p>
          <button onClick={handleLogout}>Logout</button>
        </>
      ) : (
        <button onClick={handleLogin}>Login</button>
      )}
    </nav>
  );
};

export default NavBar;
src/components/NavDrawer.tsx | 115 (new file)
@@ -0,0 +1,115 @@
import React, { useState } from "react";
import { useAuth } from "../useAuth";
import { LogoutOptions, RedirectLoginOptions } from "@auth0/auth0-react";
import {
  Box,
  Drawer,
  List,
  ListItem,
  ListItemIcon,
  ListItemButton,
  ListItemText,
  AppBar,
  Toolbar,
  IconButton,
  Typography,
  Button,
} from "@mui/material";
import HomeIcon from "@mui/icons-material/Home";
import HttpIcon from "@mui/icons-material/Http";
import MenuIcon from "@mui/icons-material/Menu";
import { useRouter } from "next/router";

const NavDrawer: React.FC = () => {
  const router = useRouter();
  const { loginWithRedirect, logout, user, isAuthenticated } = useAuth();
  const [open, setOpen] = useState<boolean>(false);

  const handleLogout = () => {
    const logoutOptions: LogoutOptions = {};
    logout(logoutOptions);
  };

  const handleLogin = () => {
    const loginOptions: RedirectLoginOptions = {
      authorizationParams: { redirect_uri: "http://localhost" },
    };
    loginWithRedirect(loginOptions);
  };

  const toggleDrawer =
    (open: boolean) => (event: React.KeyboardEvent | React.MouseEvent) => {
      if (
        event.type === "keydown" &&
        ((event as React.KeyboardEvent).key === "Tab" ||
          (event as React.KeyboardEvent).key === "Shift")
      ) {
        return;
      }
      setOpen(open);
    };

  const DrawerList = (
    <Box
      sx={{ width: 250 }}
      role="presentation"
      onClick={toggleDrawer(false)}
      onKeyDown={toggleDrawer(false)}
    >
      <List>
        <ListItem>
          <ListItemButton onClick={() => router.push("/")}>
            <ListItemIcon>
              <HomeIcon />
            </ListItemIcon>
            <ListItemText>Home</ListItemText>
          </ListItemButton>
        </ListItem>
        <ListItem>
          <ListItemButton onClick={() => router.push("/jobs")}>
            <ListItemIcon>
              <HttpIcon />
            </ListItemIcon>
            <ListItemText>Previous Jobs</ListItemText>
          </ListItemButton>
        </ListItem>
      </List>
    </Box>
  );

  return (
    <>
      <AppBar position="static">
        <Toolbar>
          <IconButton
            edge="start"
            color="inherit"
            aria-label="menu"
            onClick={toggleDrawer(true)}
          >
            <MenuIcon />
          </IconButton>
          {isAuthenticated ? (
            <>
              <Typography variant="body1" sx={{ marginRight: 2 }}>
                Welcome, {user?.name}
              </Typography>
              <Button color="inherit" onClick={handleLogout}>
                Logout
              </Button>
            </>
          ) : (
            <Button color="inherit" onClick={handleLogin}>
              Login
            </Button>
          )}
        </Toolbar>
      </AppBar>
      <Drawer open={open} onClose={toggleDrawer(false)}>
        {DrawerList}
      </Drawer>
    </>
  );
};

export default NavDrawer;
@@ -5,6 +5,7 @@ import React from "react";
import type { AppProps } from "next/app";
import Head from "next/head";
import { Auth0Provider } from "@auth0/auth0-react";
import NavDrawer from "../components/NavDrawer";

const domain = process.env.NEXT_PUBLIC_AUTH0_ISSUER_BASE_URL || "";
const clientId = process.env.NEXT_PUBLIC_AUTH0_CLIENT_ID || "";
@@ -26,6 +27,7 @@ const App: React.FC<AppProps> = ({ Component, pageProps }) => {
        cacheLocation="localstorage"
        useRefreshTokens={true}
      >
        <NavDrawer></NavDrawer>
        <Component {...pageProps} />
      </Auth0Provider>
    </>
@@ -1,5 +1,4 @@
import React, { useState } from "react";
import NavBar from "../components/NavBar";
import React, { useState, useEffect } from "react";
import {
  Typography,
  TextField,
@@ -13,6 +12,8 @@ import {
  Box,
} from "@mui/material";
import AddIcon from "@mui/icons-material/Add";
import { useAuth0 } from "@auth0/auth0-react";
import { useRouter } from "next/router";

interface Element {
  name: string;
@@ -31,7 +32,12 @@ type Result = {
};

const Home = () => {
  const [url, setUrl] = useState("");
  const { user } = useAuth0();
  const router = useRouter();

  const { elements, url } = router.query;

  const [submittedURL, setUrl] = useState("");
  const [rows, setRows] = useState<Element[]>([]);
  const [results, setResults] = useState<null | Result>(null);
  const [newRow, setNewRow] = useState<Element>({
@@ -40,8 +46,17 @@ const Home = () => {
    url: "",
  });

  useEffect(() => {
    if (elements) {
      setRows(JSON.parse(elements as string));
    }
    if (url) {
      setUrl(url as string);
    }
  }, [elements, url]);

  const handleAddRow = () => {
    newRow.url = url;
    newRow.url = submittedURL;
    setRows([...rows, newRow]);
    setNewRow({ name: "", xpath: "", url: "" });
  };
@@ -50,7 +65,12 @@ const Home = () => {
    fetch("/api/submit-scrape-job", {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ url: url, elements: rows }),
      body: JSON.stringify({
        url: url,
        elements: rows,
        user: user?.name,
        time_created: new Date().toISOString(),
      }),
    })
      .then((response) => response.json())
      .then((data) => setResults(data));
@@ -58,7 +78,6 @@ const Home = () => {

  return (
    <>
      <NavBar />
      <Container maxWidth="md">
        <Typography variant="h1" gutterBottom>
          Web Scraper
@@ -97,9 +116,10 @@ const Home = () => {
            startIcon={<AddIcon />}
            onClick={handleAddRow}
          >
            Add Row
            Add Elements
          </Button>
        </Box>
        <Typography variant="h4">Elements</Typography>
        <Table>
          <TableHead>
            <TableRow>
src/pages/jobs.tsx | 26 (new file)
@@ -0,0 +1,26 @@
import { useAuth0 } from "@auth0/auth0-react";
import React, { useEffect, useState } from "react";
import JobTable from "../components/JobTable";

const Jobs = () => {
  const { user } = useAuth0();
  const [jobs, setJobs] = useState([]);

  useEffect(() => {
    fetch("/api/retrieve-scrape-jobs", {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ user: user?.name }),
    })
      .then((response) => response.json())
      .then((data) => setJobs(data));
  }, []);

  return (
    <>
      <JobTable jobs={jobs} />
    </>
  );
};

export default Jobs;