Mirror of https://github.com/jaypyles/Scraperr.git (synced 2025-12-15 20:26:02 +00:00)
Feat: Site Mapping (#46)
Some checks failed
Unit Tests / unit-tests (push) Has been cancelled
* wip: add site mapping
* chore: cleanup
api/backend/job/__init__.py (new file, 19 lines)
@@ -0,0 +1,19 @@
from .job import (
    query,
    insert,
    update_job,
    delete_jobs,
    get_jobs_per_day,
    get_queued_job,
    average_elements_per_link,
)

__all__ = [
    "query",
    "insert",
    "update_job",
    "delete_jobs",
    "get_jobs_per_day",
    "get_queued_job",
    "average_elements_per_link",
]
@@ -6,8 +6,8 @@ from typing import Any, Optional
 from pymongo import DESCENDING

 # LOCAL
-from api.backend.models import FetchOptions
 from api.backend.database import get_job_collection
+from api.backend.job.models.job_options import FetchOptions

 LOG = logging.getLogger(__name__)
api/backend/job/models/__init__.py (new file, empty)
api/backend/job/models/job_options.py (new file, 14 lines)
@@ -0,0 +1,14 @@
from pydantic import BaseModel
from typing import Any, Optional
from api.backend.job.models.site_map import SiteMap


class FetchOptions(BaseModel):
    chat: Optional[bool] = None


class JobOptions(BaseModel):
    multi_page_scrape: bool = False
    custom_headers: dict[str, Any] = {}
    proxies: list[str] = []
    site_map: Optional[SiteMap] = None
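For orientation, a JobOptions value built from these models might look like the sketch below; the header and proxy values are invented for illustration, and site_map is covered by the next file.

from api.backend.job.models.job_options import JobOptions

# Illustrative only: the custom header and proxy values are made up.
options = JobOptions(
    multi_page_scrape=True,
    custom_headers={"User-Agent": "Mozilla/5.0 (example)"},
    proxies=["http://127.0.0.1:8080"],
    site_map=None,  # see api/backend/job/models/site_map.py below
)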
api/backend/job/models/site_map.py (new file, 14 lines)
@@ -0,0 +1,14 @@
from pydantic import BaseModel
from typing import Literal


class Action(BaseModel):
    type: Literal["click", "input"]
    xpath: str
    name: str
    input: str = ""
    do_once: bool = True


class SiteMap(BaseModel):
    actions: list[Action]
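As a rough sketch of how these models compose (the xpaths and names here are invented), a SiteMap is an ordered list of click/input Actions, and do_once controls whether an action is repeated when the map is replayed:

from api.backend.job.models.site_map import Action, SiteMap

# Hypothetical two-step map: fill a search box once, then keep clicking "next".
site_map = SiteMap(
    actions=[
        Action(type="input", xpath="//input[@name='q']", name="search box", input="laptops"),
        Action(type="click", xpath="//a[@rel='next']", name="next page", do_once=False),
    ]
)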
api/backend/job/scraping/scraping_utils.py (new file, 30 lines)
@@ -0,0 +1,30 @@
import time
from typing import cast

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def scrape_content(driver: webdriver.Chrome, pages: set[tuple[str, str]]):
    _ = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )

    last_height = cast(str, driver.execute_script("return document.body.scrollHeight"))
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        time.sleep(3)  # Wait for the page to load
        new_height = cast(
            str, driver.execute_script("return document.body.scrollHeight")
        )

        if new_height == last_height:
            break

        last_height = new_height

    pages.add((driver.page_source, driver.current_url))
    return driver.page_source
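A minimal usage sketch, assuming a plain Chrome driver (the project itself configures seleniumwire and Chrome options elsewhere in this diff): scrape_content scrolls until the page height stops growing, records (page_source, current_url) into the shared pages set, and returns the final page source.

from selenium import webdriver
from api.backend.job.scraping.scraping_utils import scrape_content

driver = webdriver.Chrome()  # simplified setup for the sketch
pages: set[tuple[str, str]] = set()

driver.get("https://example.com")  # placeholder URL
html = scrape_content(driver, pages)  # blocks while scrolling; adds (source, url) to pages
driver.quit()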
api/backend/job/site_mapping/__init__.py (new file, empty)
api/backend/job/site_mapping/site_mapping.py (new file, 94 lines)
@@ -0,0 +1,94 @@
from api.backend.job.models.site_map import Action, SiteMap
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from typing import Any
import logging
import time
from copy import deepcopy

from api.backend.job.scraping.scraping_utils import scrape_content
from selenium.webdriver.support.ui import WebDriverWait
from seleniumwire.inspect import TimeoutException
from seleniumwire.webdriver import Chrome
from selenium.webdriver.support import expected_conditions as EC

LOG = logging.getLogger(__name__)


def clear_done_actions(site_map: dict[str, Any]):
    """Clear all actions that have been clicked."""
    cleared_site_map = deepcopy(site_map)

    cleared_site_map["actions"] = [
        action for action in cleared_site_map["actions"] if not action["do_once"]
    ]

    return cleared_site_map


def handle_input(action: Action, driver: webdriver.Chrome):
    try:
        element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, action.xpath))
        )
        LOG.info(f"Sending keys: {action.input} to element: {element}")

        element.send_keys(action.input)

    except NoSuchElementException:
        LOG.info(f"Element not found: {action.xpath}")
        return False

    except TimeoutException:
        LOG.info(f"Timeout waiting for element: {action.xpath}")
        return False

    except Exception as e:
        LOG.info(f"Error handling input: {e}")
        return False

    return True


def handle_click(action: Action, driver: webdriver.Chrome):
    try:
        element = driver.find_element(By.XPATH, action.xpath)
        LOG.info(f"Clicking element: {element}")

        element.click()

    except NoSuchElementException:
        LOG.info(f"Element not found: {action.xpath}")
        return False

    return True


ACTION_MAP = {
    "click": handle_click,
    "input": handle_input,
}


async def handle_site_mapping(
    site_map_dict: dict[str, Any],
    driver: Chrome,
    pages: set[tuple[str, str]],
):
    site_map = SiteMap(**site_map_dict)
    LOG.info(f"Handling site map: {site_map}")

    for action in site_map.actions:
        action_handler = ACTION_MAP[action.type]
        if not action_handler(action, driver):
            return

        time.sleep(2)

    _ = scrape_content(driver, pages)

    cleared_site_map_dict = clear_done_actions(site_map_dict)

    if cleared_site_map_dict["actions"]:
        await handle_site_mapping(cleared_site_map_dict, driver, pages)
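To make the recursion concrete: handle_site_mapping validates the dict into a SiteMap, runs each action through ACTION_MAP, scrapes the resulting page, then calls itself with only the actions whose do_once is False. A hypothetical payload (xpaths and names invented):

site_map_dict = {
    "actions": [
        # Runs only on the first pass; dropped by clear_done_actions afterwards.
        {"type": "input", "xpath": "//input[@name='q']", "name": "search box", "input": "laptops", "do_once": True},
        # Kept between passes, so each recursion clicks "next" and scrapes again,
        # until the element can no longer be found and the handler returns False.
        {"type": "click", "xpath": "//a[@rel='next']", "name": "next page", "input": "", "do_once": False},
    ]
}
# Inside make_site_request this is awaited as:
# await handle_site_mapping(site_map_dict, driver, pages)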
@@ -2,12 +2,14 @@
 from typing import Any, Optional, Union
 from datetime import datetime

+# LOCAL
+from api.backend.job.models.job_options import JobOptions

 # PDM
 import pydantic


 class FetchOptions(pydantic.BaseModel):
     chat: Optional[bool] = None


 class Element(pydantic.BaseModel):
@@ -22,12 +24,6 @@ class CapturedElement(pydantic.BaseModel):
     name: str


-class JobOptions(pydantic.BaseModel):
-    multi_page_scrape: bool = False
-    custom_headers: Optional[dict[str, Any]] = {}
-    proxies: Optional[list[str]] = []


 class RetrieveScrapeJobs(pydantic.BaseModel):
     user: str
@@ -12,22 +12,17 @@ from fastapi.encoders import jsonable_encoder
 from fastapi.responses import JSONResponse, StreamingResponse

 # LOCAL
-from api.backend.job import (
-    query,
-    insert,
-    update_job,
-    delete_jobs,
-)
+from api.backend.job import query, insert, update_job, delete_jobs
 from api.backend.models import (
     UpdateJobs,
     DownloadJob,
-    FetchOptions,
     DeleteScrapeJobs,
     Job,
 )
 from api.backend.schemas import User
 from api.backend.auth.auth_utils import get_current_user
 from api.backend.utils import clean_text
+from api.backend.job.models.job_options import FetchOptions

 LOG = logging.getLogger(__name__)
@@ -1,19 +1,20 @@
 import logging
 from typing import Any, Optional
 import time
 import random

 from bs4 import BeautifulSoup
 from lxml import etree
 from seleniumwire import webdriver
-from lxml.etree import _Element  # type: ignore [reportPrivateImport]
+from lxml.etree import _Element  # pyright: ignore [reportPrivateUsage]
 from fake_useragent import UserAgent
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.chrome.options import Options as ChromeOptions
 from urllib.parse import urlparse, urljoin
 from api.backend.models import Element, CapturedElement
+from api.backend.job.site_mapping.site_mapping import (
+    handle_site_mapping,
+)
+from api.backend.job.scraping.scraping_utils import scrape_content
+from api.backend.job.models.site_map import SiteMap

 LOG = logging.getLogger(__name__)
@@ -95,6 +96,7 @@ async def make_site_request(
     pages: set[tuple[str, str]] = set(),
     original_url: str = "",
     proxies: Optional[list[str]] = [],
+    site_map: Optional[dict[str, Any]] = None,
 ) -> None:
     """Make basic `GET` request to site using Selenium."""
     # Check if URL has already been visited
@@ -114,27 +116,16 @@
         final_url = driver.current_url
         visited_urls.add(url)
         visited_urls.add(final_url)
-        _ = WebDriverWait(driver, 10).until(
-            EC.presence_of_element_located((By.TAG_NAME, "body"))
-        )
-
-        last_height = driver.execute_script("return document.body.scrollHeight")
-        while True:
-            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
-
-            time.sleep(3)  # Wait for the page to load
-            new_height = driver.execute_script("return document.body.scrollHeight")
-
-            if new_height == last_height:
-                break
-
-            last_height = new_height
-
-        final_height = driver.execute_script("return document.body.scrollHeight")
-
-        page_source = driver.page_source
+        page_source = scrape_content(driver, pages)

         LOG.debug(f"Page source for url: {url}\n{page_source}")
-        pages.add((page_source, final_url))
+        if site_map:
+            LOG.info("Site map: %s", site_map)
+            _ = await handle_site_mapping(
+                site_map,
+                driver,
+                pages,
+            )
     finally:
         driver.quit()
@@ -192,6 +183,7 @@ async def scrape(
     headers: Optional[dict[str, Any]],
     multi_page_scrape: bool = False,
     proxies: Optional[list[str]] = [],
+    site_map: Optional[SiteMap] = None,
 ):
     visited_urls: set[str] = set()
     pages: set[tuple[str, str]] = set()
@@ -204,6 +196,7 @@ async def scrape(
             pages=pages,
             original_url=url,
             proxies=proxies,
+            site_map=site_map,
         )

     elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()
@@ -24,6 +24,7 @@ async def process_job():
             job["job_options"]["custom_headers"],
             job["job_options"]["multi_page_scrape"],
             job["job_options"]["proxies"],
+            job["job_options"]["site_map"],
         )
         LOG.info(
             f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"
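For reference, the job_options dict the worker reads here might look like the following sketch; the keys come from this diff, the values are invented, and the nested site_map dict is what eventually reaches handle_site_mapping.

job_options = {
    "multi_page_scrape": False,
    "custom_headers": {},
    "proxies": [],
    "site_map": {
        "actions": [
            # Hypothetical single action; see site_map.py above for the fields.
            {"type": "click", "xpath": "//button[@id='load-more']", "name": "load more", "input": "", "do_once": True}
        ]
    },
}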