mirror of
https://github.com/jaypyles/Scraperr.git
synced 2025-12-16 04:36:14 +00:00
fix: make id random to make text appear on separate rows
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
# STL
|
||||
import uuid
|
||||
import traceback
|
||||
import random
|
||||
from io import StringIO
|
||||
import csv
|
||||
import logging
|
||||
@@ -99,7 +100,9 @@ async def download(download_job: DownloadJob):
|
||||
text = clean_text(value.get("text", ""))
|
||||
csv_writer.writerow(
|
||||
[
|
||||
result.get("id", ""),
|
||||
result.get("id", "")
|
||||
+ "-"
|
||||
+ str(random.randint(0, 1000000)),
|
||||
url,
|
||||
element_name,
|
||||
value.get("xpath", ""),
|
||||
|
||||
@@ -43,7 +43,7 @@ def clean_xpath(xpath: str) -> str:
|
||||
|
||||
|
||||
def sxpath(context: _Element, xpath: str) -> list[HtmlElement]:
|
||||
return context.xpath(xpath) # type: ignore [reportReturnType]
|
||||
return context.xpath(xpath) # pyright: ignore [reportReturnType]
|
||||
|
||||
|
||||
def interceptor(headers: dict[str, Any]):
|
||||
@@ -139,9 +139,11 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
|
||||
|
||||
for elem in xpaths:
|
||||
el = sxpath(root, clean_xpath(elem.xpath))
|
||||
text = ["\t".join(str(e) for e in e.itertext()) for e in el]
|
||||
|
||||
for e in el:
|
||||
text = "\t".join(str(t) for t in e.itertext())
|
||||
captured_element = CapturedElement(
|
||||
xpath=elem.xpath, text=",".join(text), name=elem.name
|
||||
xpath=elem.xpath, text=text, name=elem.name
|
||||
)
|
||||
|
||||
if elem.name in elements:
|
||||
|
||||
@@ -6,11 +6,12 @@ RUN python -m pip --no-cache-dir install pdm
|
||||
RUN pdm config python.use_venv false
|
||||
|
||||
COPY pyproject.toml pdm.lock /project/app/
|
||||
COPY ./api/ /project/app/api
|
||||
|
||||
WORKDIR /project/app
|
||||
RUN pdm install
|
||||
|
||||
COPY ./api/ /project/app/api
|
||||
WORKDIR /project/app
|
||||
|
||||
# Create final image
|
||||
FROM python:3.10.12-slim
|
||||
|
||||
|
||||
Reference in New Issue
Block a user