diff --git a/api/backend/routers/job_router.py b/api/backend/routers/job_router.py index 2a6631d..26da1d5 100644 --- a/api/backend/routers/job_router.py +++ b/api/backend/routers/job_router.py @@ -1,6 +1,7 @@ # STL import uuid import traceback +import random from io import StringIO import csv import logging @@ -99,7 +100,9 @@ async def download(download_job: DownloadJob): text = clean_text(value.get("text", "")) csv_writer.writerow( [ - result.get("id", ""), + result.get("id", "") + + "-" + + str(random.randint(0, 1000000)), url, element_name, value.get("xpath", ""), diff --git a/api/backend/scraping.py b/api/backend/scraping.py index 8f0dadf..1238c82 100644 --- a/api/backend/scraping.py +++ b/api/backend/scraping.py @@ -43,7 +43,7 @@ def clean_xpath(xpath: str) -> str: def sxpath(context: _Element, xpath: str) -> list[HtmlElement]: - return context.xpath(xpath) # type: ignore [reportReturnType] + return context.xpath(xpath) # pyright: ignore [reportReturnType] def interceptor(headers: dict[str, Any]): @@ -139,16 +139,18 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element]) for elem in xpaths: el = sxpath(root, clean_xpath(elem.xpath)) - text = ["\t".join(str(e) for e in e.itertext()) for e in el] - captured_element = CapturedElement( - xpath=elem.xpath, text=",".join(text), name=elem.name - ) - if elem.name in elements: - elements[elem.name].append(captured_element) - continue + for e in el: + text = "\t".join(str(t) for t in e.itertext()) + captured_element = CapturedElement( + xpath=elem.xpath, text=text, name=elem.name + ) - elements[elem.name] = [captured_element] + if elem.name in elements: + elements[elem.name].append(captured_element) + continue + + elements[elem.name] = [captured_element] return {page[1]: elements} diff --git a/docker/api/Dockerfile b/docker/api/Dockerfile index 8a6faec..2023fb2 100644 --- a/docker/api/Dockerfile +++ b/docker/api/Dockerfile @@ -6,11 +6,12 @@ RUN python -m pip --no-cache-dir install pdm RUN pdm config python.use_venv false COPY pyproject.toml pdm.lock /project/app/ -COPY ./api/ /project/app/api -WORKDIR /project/app RUN pdm install +COPY ./api/ /project/app/api +WORKDIR /project/app + # Create final image FROM python:3.10.12-slim