fix: append a random suffix to each row id so text appears on separate rows

This commit is contained in:
Jayden Pyles
2024-11-06 18:27:40 -06:00
parent 186723460d
commit 8a74a8fbd6
3 changed files with 18 additions and 12 deletions

View File

@@ -1,6 +1,7 @@
# STL
import uuid
import traceback
import random
from io import StringIO
import csv
import logging
@@ -99,7 +100,9 @@ async def download(download_job: DownloadJob):
text = clean_text(value.get("text", ""))
csv_writer.writerow(
[
result.get("id", ""),
result.get("id", "")
+ "-"
+ str(random.randint(0, 1000000)),
url,
element_name,
value.get("xpath", ""),

View File

@@ -43,7 +43,7 @@ def clean_xpath(xpath: str) -> str:
def sxpath(context: _Element, xpath: str) -> list[HtmlElement]:
return context.xpath(xpath) # type: ignore [reportReturnType]
return context.xpath(xpath) # pyright: ignore [reportReturnType]
def interceptor(headers: dict[str, Any]):
@@ -139,9 +139,11 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element])
for elem in xpaths:
el = sxpath(root, clean_xpath(elem.xpath))
text = ["\t".join(str(e) for e in e.itertext()) for e in el]
for e in el:
text = "\t".join(str(t) for t in e.itertext())
captured_element = CapturedElement(
xpath=elem.xpath, text=",".join(text), name=elem.name
xpath=elem.xpath, text=text, name=elem.name
)
if elem.name in elements:

View File

@@ -6,11 +6,12 @@ RUN python -m pip --no-cache-dir install pdm
RUN pdm config python.use_venv false
COPY pyproject.toml pdm.lock /project/app/
COPY ./api/ /project/app/api
WORKDIR /project/app
RUN pdm install
COPY ./api/ /project/app/api
WORKDIR /project/app
# Create final image
FROM python:3.10.12-slim