Mirror of https://github.com/jaypyles/Scraperr.git, synced 2025-12-12 02:35:43 +00:00
wip: create webapp
api/backend/amazon.py (new file, 8 lines)
@@ -0,0 +1,8 @@
# PDM
import boto3


def test_dyanmo():
    dynamodb = boto3.resource("dynamodb", region_name="us-west-2")
    table = dynamodb.Table("scrape")
    print(table)
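A minimal sketch of exercising the "scrape" table handle above, assuming the table exists in us-west-2 and uses a string partition key named "id" (the key name is an assumption for illustration, not something this commit defines):

# PDM
import boto3


def dynamo_roundtrip_example() -> None:
    # Hypothetical helper: write one item to the "scrape" table and read it back.
    dynamodb = boto3.resource("dynamodb", region_name="us-west-2")
    table = dynamodb.Table("scrape")
    table.put_item(Item={"id": "example", "url": "https://example.com"})
    item = table.get_item(Key={"id": "example"}).get("Item")
    print(item)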
@@ -3,10 +3,16 @@ import logging

# PDM
from fastapi import FastAPI
-from fastapi.responses import FileResponse
+from fastapi.encoders import jsonable_encoder
+from fastapi.responses import FileResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware

# LOCAL
from api.backend.amazon import test_dyanmo
from api.backend.models import SubmitScrapeJob
from api.backend.scraping import scrape

LOG = logging.getLogger(__name__)

app = FastAPI(title="api")

@@ -19,14 +25,27 @@ app.add_middleware(
    allow_headers=["*"],
)

-app.mount("/_next/static", StaticFiles(directory="dist/_next/static"), name="static")
+app.mount("/_next/static", StaticFiles(directory="./dist/_next/static"), name="static")


@app.get("/")
def read_root():
-    return FileResponse("dist/index.html")
+    return FileResponse("./dist/index.html")


@app.get("/api/endpoint")
async def test_endpoint():
    test_dyanmo()
    return "Hello World!"


@app.post("/api/submit-scrape-job")
async def submit_scrape_job(job: SubmitScrapeJob):
    try:
        scraped = await scrape(job.url, job.elements)
        print(scraped)
        json_scraped = jsonable_encoder(scraped)
        print(json_scraped)
        return JSONResponse(content=json_scraped)
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=500)
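A quick way to exercise the new /api/submit-scrape-job route from a client, assuming the FastAPI app is served locally on port 8000 (host and port are assumptions, not part of this commit); the payload mirrors SubmitScrapeJob from api/backend/models.py:

# PDM
import requests

# Hypothetical local base URL; adjust to wherever the app is actually running.
payload = {
    "url": "https://example.com",
    "elements": [{"name": "title", "url": "https://example.com", "xpath": "//h1"}],
}
resp = requests.post("http://localhost:8000/api/submit-scrape-job", json=payload, timeout=120)
print(resp.status_code, resp.json())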
api/backend/models.py (new file, 19 lines)
@@ -0,0 +1,19 @@
# PDM
import pydantic


class Element(pydantic.BaseModel):
    name: str
    url: str
    xpath: str


class CapturedElement(pydantic.BaseModel):
    xpath: str
    text: str
    name: str


class SubmitScrapeJob(pydantic.BaseModel):
    url: str
    elements: list[Element]
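A small sketch of building these models directly, e.g. in a test; constructing SubmitScrapeJob this way runs the same pydantic validation the endpoint relies on (the field values are placeholders):

# LOCAL
from api.backend.models import Element, SubmitScrapeJob

job = SubmitScrapeJob(
    url="https://example.com",
    elements=[Element(name="title", url="https://example.com", xpath="//h1")],
)
print(job.url, job.elements[0].xpath)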
api/backend/scraping.py (new file, 77 lines)
@@ -0,0 +1,77 @@
# PDM
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver
from lxml.etree import _Element  # type: ignore [reportPrivateImport]
from fake_useragent import UserAgent
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.service import Service

# LOCAL
from api.backend.models import Element, CapturedElement


class HtmlElement(_Element): ...


def sxpath(context: _Element, xpath: str) -> list[HtmlElement]:
    return context.xpath(xpath)  # type: ignore [reportReturnType]


async def make_site_request(url: str) -> str:
    """Make basic `GET` request to site using Selenium."""
    ua = UserAgent()

    chrome_options = ChromeOptions()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument(f"user-agent={ua.random}")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(options=chrome_options, service=service)

    try:
        driver.get(url)
        _ = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        page_source = driver.page_source
    finally:
        driver.quit()

    print(page_source)
    return page_source


async def collect_scraped_elements(page: str, xpaths: list[Element]):
    soup = BeautifulSoup(page, "lxml")
    root = etree.HTML(str(soup))

    elements: dict[str, list[CapturedElement]] = dict()

    for elem in xpaths:
        el = sxpath(root, elem.xpath)
text = ["".join(str(e) for e in e.itertext()) for e in el]
|
||||
        captured_element = CapturedElement(
            xpath=elem.xpath, text=",".join(text), name=elem.name
        )

        if elem.name in elements:
            elements[elem.name].append(captured_element)
            continue

        elements[elem.name] = [captured_element]

    return elements


async def scrape(url: str, xpaths: list[Element]):
    page = await make_site_request(url)
    elements = await collect_scraped_elements(page, xpaths)

    return elements
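A standalone driver showing how scrape() could be invoked outside FastAPI, assuming Chrome is installed and network access is available (webdriver_manager fetches a matching chromedriver on first run); the URL and XPath below are placeholders:

import asyncio

# LOCAL
from api.backend.models import Element
from api.backend.scraping import scrape


async def main() -> None:
    elements = [Element(name="heading", url="https://example.com", xpath="//h1")]
    results = await scrape("https://example.com", elements)
    print(results)


if __name__ == "__main__":
    asyncio.run(main())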