Feat: Site Mapping (#46)

* wip: add site mapping

* chore: cleanup
Jayden Pyles
2024-11-16 20:55:23 -06:00
committed by GitHub
parent 3a0762f1e3
commit 7d80ff5c7f
35 changed files with 853 additions and 349 deletions

View File

@@ -0,0 +1,19 @@
from .job import (
    query,
    insert,
    update_job,
    delete_jobs,
    get_jobs_per_day,
    get_queued_job,
    average_elements_per_link,
)

__all__ = [
    "query",
    "insert",
    "update_job",
    "delete_jobs",
    "get_jobs_per_day",
    "get_queued_job",
    "average_elements_per_link",
]

View File

@@ -6,8 +6,8 @@ from typing import Any, Optional
 from pymongo import DESCENDING

 # LOCAL
-from api.backend.models import FetchOptions
 from api.backend.database import get_job_collection
+from api.backend.job.models.job_options import FetchOptions

 LOG = logging.getLogger(__name__)

View File

@@ -0,0 +1,14 @@
from pydantic import BaseModel
from typing import Any, Optional
from api.backend.job.models.site_map import SiteMap


class FetchOptions(BaseModel):
    chat: Optional[bool] = None


class JobOptions(BaseModel):
    multi_page_scrape: bool = False
    custom_headers: dict[str, Any] = {}
    proxies: list[str] = []
    site_map: Optional[SiteMap] = None

View File

@@ -0,0 +1,14 @@
from pydantic import BaseModel
from typing import Literal


class Action(BaseModel):
    type: Literal["click", "input"]
    xpath: str
    name: str
    input: str = ""
    do_once: bool = True


class SiteMap(BaseModel):
    actions: list[Action]
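
Together these two models describe a scripted interaction sequence: each Action is a click or a keystroke sequence keyed to an XPath, and do_once controls whether it survives into the next pass. A minimal sketch of building one (the XPath and input values here are hypothetical):

from api.backend.job.models.site_map import Action, SiteMap

# Hypothetical: type a search term once, then click "next page" on every pass.
site_map = SiteMap(
    actions=[
        Action(type="input", xpath="//input[@name='q']", name="search", input="laptops"),
        Action(type="click", xpath="//a[@rel='next']", name="next_page", do_once=False),
    ]
)

print(site_map.dict())  # dict form, as stored in job_options["site_map"]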

View File

@@ -0,0 +1,30 @@
import time
from typing import cast

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def scrape_content(driver: webdriver.Chrome, pages: set[tuple[str, str]]):
    _ = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )

    last_height = cast(str, driver.execute_script("return document.body.scrollHeight"))
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  # Wait for the page to load
        new_height = cast(
            str, driver.execute_script("return document.body.scrollHeight")
        )

        if new_height == last_height:
            break

        last_height = new_height

    pages.add((driver.page_source, driver.current_url))
    return driver.page_source
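
As a usage sketch (assuming a local Chrome/chromedriver setup; the URL is an example): the helper scrolls until the page height stops changing, records a (source, url) pair, and returns the final page source.

from selenium import webdriver

from api.backend.job.scraping.scraping_utils import scrape_content

driver = webdriver.Chrome()  # assumes chromedriver is available locally
try:
    driver.get("https://example.com")  # example URL
    pages: set[tuple[str, str]] = set()
    html = scrape_content(driver, pages)  # blocks while scrolling; adds (source, url) to pages
finally:
    driver.quit()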

View File

@@ -0,0 +1,94 @@
from api.backend.job.models.site_map import Action, SiteMap
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from typing import Any
import logging
import time
from copy import deepcopy

from api.backend.job.scraping.scraping_utils import scrape_content
from selenium.webdriver.support.ui import WebDriverWait
from seleniumwire.inspect import TimeoutException
from seleniumwire.webdriver import Chrome
from selenium.webdriver.support import expected_conditions as EC

LOG = logging.getLogger(__name__)


def clear_done_actions(site_map: dict[str, Any]):
    """Clear all actions that have been clicked."""
    cleared_site_map = deepcopy(site_map)

    cleared_site_map["actions"] = [
        action for action in cleared_site_map["actions"] if not action["do_once"]
    ]

    return cleared_site_map


def handle_input(action: Action, driver: webdriver.Chrome):
    try:
        element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, action.xpath))
        )
        LOG.info(f"Sending keys: {action.input} to element: {element}")

        element.send_keys(action.input)

    except NoSuchElementException:
        LOG.info(f"Element not found: {action.xpath}")
        return False

    except TimeoutException:
        LOG.info(f"Timeout waiting for element: {action.xpath}")
        return False

    except Exception as e:
        LOG.info(f"Error handling input: {e}")
        return False

    return True


def handle_click(action: Action, driver: webdriver.Chrome):
    try:
        element = driver.find_element(By.XPATH, action.xpath)
        LOG.info(f"Clicking element: {element}")

        element.click()

    except NoSuchElementException:
        LOG.info(f"Element not found: {action.xpath}")
        return False

    return True


ACTION_MAP = {
    "click": handle_click,
    "input": handle_input,
}


async def handle_site_mapping(
    site_map_dict: dict[str, Any],
    driver: Chrome,
    pages: set[tuple[str, str]],
):
    site_map = SiteMap(**site_map_dict)
    LOG.info(f"Handling site map: {site_map}")

    for action in site_map.actions:
        action_handler = ACTION_MAP[action.type]

        if not action_handler(action, driver):
            return

        time.sleep(2)

    _ = scrape_content(driver, pages)

    cleared_site_map_dict = clear_done_actions(site_map_dict)

    if cleared_site_map_dict["actions"]:
        await handle_site_mapping(cleared_site_map_dict, driver, pages)
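
A hedged sketch of driving this directly (driver setup and values are illustrative; in the commit it is invoked from make_site_request). Note the recursion: do_once actions are stripped after each pass, so a repeatable click keeps paginating until its element disappears and the handler returns False.

import asyncio

from seleniumwire.webdriver import Chrome

from api.backend.job.site_mapping.site_mapping import handle_site_mapping


async def main():
    driver = Chrome()  # assumes a local Chrome/chromedriver setup
    try:
        driver.get("https://example.com")  # example URL
        pages: set[tuple[str, str]] = set()
        # Dict form of SiteMap: one repeatable "click" action (hypothetical XPath).
        site_map = {
            "actions": [
                {
                    "type": "click",
                    "xpath": "//a[@rel='next']",
                    "name": "next_page",
                    "input": "",
                    "do_once": False,
                }
            ]
        }
        await handle_site_mapping(site_map, driver, pages)
    finally:
        driver.quit()


asyncio.run(main())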

View File

@@ -2,12 +2,14 @@
 from typing import Any, Optional, Union
 from datetime import datetime

+# LOCAL
+from api.backend.job.models.job_options import JobOptions
+
 # PDM
 import pydantic

-class FetchOptions(pydantic.BaseModel):
-    chat: Optional[bool] = None

 class Element(pydantic.BaseModel):
@@ -22,12 +24,6 @@ class CapturedElement(pydantic.BaseModel):
     name: str

-class JobOptions(pydantic.BaseModel):
-    multi_page_scrape: bool = False
-    custom_headers: Optional[dict[str, Any]] = {}
-    proxies: Optional[list[str]] = []

 class RetrieveScrapeJobs(pydantic.BaseModel):
     user: str

View File

@@ -12,22 +12,17 @@ from fastapi.encoders import jsonable_encoder
 from fastapi.responses import JSONResponse, StreamingResponse

 # LOCAL
-from api.backend.job import (
-    query,
-    insert,
-    update_job,
-    delete_jobs,
-)
+from api.backend.job import query, insert, update_job, delete_jobs
 from api.backend.models import (
     UpdateJobs,
     DownloadJob,
-    FetchOptions,
     DeleteScrapeJobs,
     Job,
 )
 from api.backend.schemas import User
 from api.backend.auth.auth_utils import get_current_user
 from api.backend.utils import clean_text
+from api.backend.job.models.job_options import FetchOptions

 LOG = logging.getLogger(__name__)

View File

@@ -1,19 +1,20 @@
 import logging
 from typing import Any, Optional
 import time
 import random

 from bs4 import BeautifulSoup
 from lxml import etree
 from seleniumwire import webdriver
-from lxml.etree import _Element  # type: ignore [reportPrivateImport]
+from lxml.etree import _Element  # pyright: ignore [reportPrivateUsage]
 from fake_useragent import UserAgent
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.chrome.options import Options as ChromeOptions
 from urllib.parse import urlparse, urljoin

 from api.backend.models import Element, CapturedElement
+from api.backend.job.site_mapping.site_mapping import (
+    handle_site_mapping,
+)
+from api.backend.job.scraping.scraping_utils import scrape_content
+from api.backend.job.models.site_map import SiteMap

 LOG = logging.getLogger(__name__)
@@ -95,6 +96,7 @@ async def make_site_request(
     pages: set[tuple[str, str]] = set(),
     original_url: str = "",
     proxies: Optional[list[str]] = [],
+    site_map: Optional[dict[str, Any]] = None,
 ) -> None:
     """Make basic `GET` request to site using Selenium."""

     # Check if URL has already been visited
@@ -114,27 +116,16 @@ async def make_site_request(
         final_url = driver.current_url
         visited_urls.add(url)
         visited_urls.add(final_url)

-        _ = WebDriverWait(driver, 10).until(
-            EC.presence_of_element_located((By.TAG_NAME, "body"))
-        )
-
-        last_height = driver.execute_script("return document.body.scrollHeight")
-        while True:
-            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
-            time.sleep(3)  # Wait for the page to load
-            new_height = driver.execute_script("return document.body.scrollHeight")
-            if new_height == last_height:
-                break
-            last_height = new_height
-
-        final_height = driver.execute_script("return document.body.scrollHeight")
-        page_source = driver.page_source
+        page_source = scrape_content(driver, pages)

         LOG.debug(f"Page source for url: {url}\n{page_source}")
-        pages.add((page_source, final_url))
+
+        if site_map:
+            LOG.info("Site map: %s", site_map)
+            _ = await handle_site_mapping(
+                site_map,
+                driver,
+                pages,
+            )
     finally:
         driver.quit()
@@ -192,6 +183,7 @@ async def scrape(
     headers: Optional[dict[str, Any]],
     multi_page_scrape: bool = False,
     proxies: Optional[list[str]] = [],
+    site_map: Optional[SiteMap] = None,
 ):
     visited_urls: set[str] = set()
     pages: set[tuple[str, str]] = set()
@@ -204,6 +196,7 @@ async def scrape(
             pages=pages,
             original_url=url,
             proxies=proxies,
+            site_map=site_map,
         )

     elements: list[dict[str, dict[str, list[CapturedElement]]]] = list()

View File

@@ -24,6 +24,7 @@ async def process_job():
job["job_options"]["custom_headers"],
job["job_options"]["multi_page_scrape"],
job["job_options"]["proxies"],
job["job_options"]["site_map"],
)
LOG.info(
f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}"