mirror of
https://github.com/jaypyles/Scraperr.git
synced 2025-12-17 13:16:10 +00:00
fix: make id random to make text appear on separate rows
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
# STL
|
# STL
|
||||||
import uuid
|
import uuid
|
||||||
import traceback
|
import traceback
|
||||||
|
import random
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
import csv
|
import csv
|
||||||
import logging
|
import logging
|
||||||
@@ -99,7 +100,9 @@ async def download(download_job: DownloadJob):
|
|||||||
text = clean_text(value.get("text", ""))
|
text = clean_text(value.get("text", ""))
|
||||||
csv_writer.writerow(
|
csv_writer.writerow(
|
||||||
[
|
[
|
||||||
result.get("id", ""),
|
result.get("id", "")
|
||||||
|
+ "-"
|
||||||
|
+ str(random.randint(0, 1000000)),
|
||||||
url,
|
url,
|
||||||
element_name,
|
element_name,
|
||||||
value.get("xpath", ""),
|
value.get("xpath", ""),
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ def clean_xpath(xpath: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def sxpath(context: _Element, xpath: str) -> list[HtmlElement]:
|
def sxpath(context: _Element, xpath: str) -> list[HtmlElement]:
|
||||||
return context.xpath(xpath) # type: ignore [reportReturnType]
|
return context.xpath(xpath) # pyright: ignore [reportReturnType]
|
||||||
|
|
||||||
|
|
||||||
def interceptor(headers: dict[str, Any]):
|
def interceptor(headers: dict[str, Any]):
|
||||||
@@ -139,9 +139,11 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
|
|||||||
|
|
||||||
for elem in xpaths:
|
for elem in xpaths:
|
||||||
el = sxpath(root, clean_xpath(elem.xpath))
|
el = sxpath(root, clean_xpath(elem.xpath))
|
||||||
text = ["\t".join(str(e) for e in e.itertext()) for e in el]
|
|
||||||
|
for e in el:
|
||||||
|
text = "\t".join(str(t) for t in e.itertext())
|
||||||
captured_element = CapturedElement(
|
captured_element = CapturedElement(
|
||||||
xpath=elem.xpath, text=",".join(text), name=elem.name
|
xpath=elem.xpath, text=text, name=elem.name
|
||||||
)
|
)
|
||||||
|
|
||||||
if elem.name in elements:
|
if elem.name in elements:
|
||||||
|
|||||||
@@ -6,11 +6,12 @@ RUN python -m pip --no-cache-dir install pdm
|
|||||||
RUN pdm config python.use_venv false
|
RUN pdm config python.use_venv false
|
||||||
|
|
||||||
COPY pyproject.toml pdm.lock /project/app/
|
COPY pyproject.toml pdm.lock /project/app/
|
||||||
COPY ./api/ /project/app/api
|
|
||||||
|
|
||||||
WORKDIR /project/app
|
|
||||||
RUN pdm install
|
RUN pdm install
|
||||||
|
|
||||||
|
COPY ./api/ /project/app/api
|
||||||
|
WORKDIR /project/app
|
||||||
|
|
||||||
# Create final image
|
# Create final image
|
||||||
FROM python:3.10.12-slim
|
FROM python:3.10.12-slim
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user