Feature: Allow Multiple Download Options (#75)

* feat: allow downloading in MD format

* fix: unit tests

* fix: deployments [skip ci]

* fix: deployment
This commit is contained in:
Jayden Pyles
2025-05-13 18:23:59 -05:00
committed by GitHub
parent 267cc73657
commit 1b8c8c779a
10 changed files with 191 additions and 58 deletions

View File

@@ -0,0 +1,24 @@
from typing import Any
from api.backend.utils import clean_text
def stream_md_from_job_results(jobs: list[dict[str, Any]]):
md = "# Job Results Summary\n\n"
for i, job in enumerate(jobs, start=1):
md += f"## Job #{i}\n"
yield f"- **Job URL:** {job.get('url', 'N/A')}\n"
yield f"- **Timestamp:** {job.get('time_created', 'N/A')}\n"
yield f"- **ID:** {job.get('id', 'N/A')}\n"
yield "### Extracted Results:\n"
for res in job.get("result", []):
for url, elements in res.items():
yield f"\n#### URL: {url}\n"
for element_name, values in elements.items():
for value in values:
text = clean_text(value.get("text", "")).strip()
if text:
yield f"- **Element:** `{element_name}`\n"
yield f" - **Text:** {text}\n"
yield "\n---\n"

View File

@@ -1,5 +1,5 @@
# STL
from typing import Any, Optional, Union
from typing import Any, Literal, Optional, Union
from datetime import datetime
# LOCAL
@@ -27,6 +27,7 @@ class RetrieveScrapeJobs(pydantic.BaseModel):
class DownloadJob(pydantic.BaseModel):
ids: list[str]
job_format: Literal["csv", "md"]
class DeleteScrapeJobs(pydantic.BaseModel):

View File

@@ -40,6 +40,7 @@ from api.backend.job.cron_scheduling.cron_scheduling import (
)
from api.backend.job.utils.clean_job_format import clean_job_format
from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results
LOG = logging.getLogger(__name__)
@@ -106,40 +107,58 @@ async def download(download_job: DownloadJob):
)
results = query(job_query, tuple(download_job.ids))
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
if download_job.job_format == "csv":
csv_buffer = StringIO()
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"]
csv_writer.writerow(headers)
headers = [
"id",
"url",
"element_name",
"xpath",
"text",
"user",
"time_created",
]
csv_writer.writerow(headers)
for result in results:
for res in result["result"]:
for url, elements in res.items():
for element_name, values in elements.items():
for value in values:
text = clean_text(value.get("text", "")).strip()
if text:
csv_writer.writerow(
[
result.get("id", "")
+ "-"
+ str(random.randint(0, 1000000)),
url,
element_name,
value.get("xpath", ""),
text,
result.get("user", ""),
result.get("time_created", ""),
]
)
for result in results:
for res in result["result"]:
for url, elements in res.items():
for element_name, values in elements.items():
for value in values:
text = clean_text(value.get("text", "")).strip()
if text:
csv_writer.writerow(
[
result.get("id", "")
+ "-"
+ str(random.randint(0, 1000000)),
url,
element_name,
value.get("xpath", ""),
text,
result.get("user", ""),
result.get("time_created", ""),
]
)
_ = csv_buffer.seek(0)
response = StreamingResponse(
csv_buffer,
media_type="text/csv",
)
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
return response
_ = csv_buffer.seek(0)
response = StreamingResponse(
csv_buffer,
media_type="text/csv",
)
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
return response
elif download_job.job_format == "md":
response = StreamingResponse(
stream_md_from_job_results(results),
media_type="text/markdown",
)
response.headers["Content-Disposition"] = "attachment; filename=export.md"
return response
except Exception as e:
LOG.error(f"Exception occurred: {e}")

View File

@@ -21,7 +21,7 @@ async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
mock_randint.return_value = mocked_random_int
# Create a DownloadJob instance
download_job = DownloadJob(ids=[mocked_job["id"]])
download_job = DownloadJob(ids=[mocked_job["id"]], job_format="csv")
# Make a POST request to the /download endpoint
response = client.post("/download", json=download_job.model_dump())