mirror of
https://github.com/jaypyles/Scraperr.git
synced 2025-11-30 04:53:24 +00:00
Feature: Allow Multiple Download Options (#75)
* feat: allow downloading in MD format
* fix: unit tests
* fix: deployments [skip ci]
* fix: deployment
This commit is contained in:
24
api/backend/job/utils/stream_md_from_job_results.py
Normal file
24
api/backend/job/utils/stream_md_from_job_results.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from typing import Any
|
||||
|
||||
from api.backend.utils import clean_text
|
||||
|
||||
|
||||
def stream_md_from_job_results(jobs: list[dict[str, Any]]):
    """Stream a Markdown export of scrape-job results as successive text chunks.

    Args:
        jobs: Job records. Each record may carry "url", "time_created", and
            "id" keys, plus a "result" list of ``{url: {element_name: [values]}}``
            mappings where each value dict may have "text".

    Yields:
        str: Markdown fragments — a document header, then per-job headings,
        metadata bullets, and the cleaned extracted text for each element.
    """
    # Emit the document header once, up front.
    # BUG FIX: the original accumulated this header (and each per-job
    # "## Job #{i}" heading) into a local `md` string that was never
    # yielded, so they were silently missing from the downloaded file.
    yield "# Job Results Summary\n\n"

    for i, job in enumerate(jobs, start=1):
        yield f"## Job #{i}\n"
        yield f"- **Job URL:** {job.get('url', 'N/A')}\n"
        yield f"- **Timestamp:** {job.get('time_created', 'N/A')}\n"
        yield f"- **ID:** {job.get('id', 'N/A')}\n"
        yield "### Extracted Results:\n"

        for res in job.get("result", []):
            for url, elements in res.items():
                yield f"\n#### URL: {url}\n"
                for element_name, values in elements.items():
                    for value in values:
                        # Skip entries whose text is empty after cleaning,
                        # matching the CSV export's behavior.
                        text = clean_text(value.get("text", "")).strip()
                        if text:
                            yield f"- **Element:** `{element_name}`\n"
                            yield f"  - **Text:** {text}\n"

        # Horizontal rule separates consecutive jobs.
        yield "\n---\n"
|
||||
@@ -1,5 +1,5 @@
|
||||
# STL
|
||||
from typing import Any, Optional, Union
|
||||
from typing import Any, Literal, Optional, Union
|
||||
from datetime import datetime
|
||||
|
||||
# LOCAL
|
||||
@@ -27,6 +27,7 @@ class RetrieveScrapeJobs(pydantic.BaseModel):
|
||||
|
||||
class DownloadJob(pydantic.BaseModel):
    # Request payload for the job-download endpoint.
    # ids: primary keys of the scrape jobs to export.
    ids: list[str]
    # job_format: which exporter to run — "csv" streams a CSV file,
    # "md" streams a Markdown summary.
    job_format: Literal["csv", "md"]
|
||||
|
||||
|
||||
class DeleteScrapeJobs(pydantic.BaseModel):
|
||||
|
||||
@@ -40,6 +40,7 @@ from api.backend.job.cron_scheduling.cron_scheduling import (
|
||||
)
|
||||
|
||||
from api.backend.job.utils.clean_job_format import clean_job_format
|
||||
from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
@@ -106,40 +107,58 @@ async def download(download_job: DownloadJob):
|
||||
)
|
||||
results = query(job_query, tuple(download_job.ids))
|
||||
|
||||
csv_buffer = StringIO()
|
||||
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
|
||||
if download_job.job_format == "csv":
|
||||
csv_buffer = StringIO()
|
||||
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
|
||||
|
||||
headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"]
|
||||
csv_writer.writerow(headers)
|
||||
headers = [
|
||||
"id",
|
||||
"url",
|
||||
"element_name",
|
||||
"xpath",
|
||||
"text",
|
||||
"user",
|
||||
"time_created",
|
||||
]
|
||||
csv_writer.writerow(headers)
|
||||
|
||||
for result in results:
|
||||
for res in result["result"]:
|
||||
for url, elements in res.items():
|
||||
for element_name, values in elements.items():
|
||||
for value in values:
|
||||
text = clean_text(value.get("text", "")).strip()
|
||||
if text:
|
||||
csv_writer.writerow(
|
||||
[
|
||||
result.get("id", "")
|
||||
+ "-"
|
||||
+ str(random.randint(0, 1000000)),
|
||||
url,
|
||||
element_name,
|
||||
value.get("xpath", ""),
|
||||
text,
|
||||
result.get("user", ""),
|
||||
result.get("time_created", ""),
|
||||
]
|
||||
)
|
||||
for result in results:
|
||||
for res in result["result"]:
|
||||
for url, elements in res.items():
|
||||
for element_name, values in elements.items():
|
||||
for value in values:
|
||||
text = clean_text(value.get("text", "")).strip()
|
||||
if text:
|
||||
csv_writer.writerow(
|
||||
[
|
||||
result.get("id", "")
|
||||
+ "-"
|
||||
+ str(random.randint(0, 1000000)),
|
||||
url,
|
||||
element_name,
|
||||
value.get("xpath", ""),
|
||||
text,
|
||||
result.get("user", ""),
|
||||
result.get("time_created", ""),
|
||||
]
|
||||
)
|
||||
|
||||
_ = csv_buffer.seek(0)
|
||||
response = StreamingResponse(
|
||||
csv_buffer,
|
||||
media_type="text/csv",
|
||||
)
|
||||
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
|
||||
return response
|
||||
_ = csv_buffer.seek(0)
|
||||
response = StreamingResponse(
|
||||
csv_buffer,
|
||||
media_type="text/csv",
|
||||
)
|
||||
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
|
||||
return response
|
||||
|
||||
elif download_job.job_format == "md":
|
||||
response = StreamingResponse(
|
||||
stream_md_from_job_results(results),
|
||||
media_type="text/markdown",
|
||||
)
|
||||
|
||||
response.headers["Content-Disposition"] = "attachment; filename=export.md"
|
||||
return response
|
||||
|
||||
except Exception as e:
|
||||
LOG.error(f"Exception occurred: {e}")
|
||||
|
||||
@@ -21,7 +21,7 @@ async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
|
||||
mock_randint.return_value = mocked_random_int
|
||||
|
||||
# Create a DownloadJob instance
|
||||
download_job = DownloadJob(ids=[mocked_job["id"]])
|
||||
download_job = DownloadJob(ids=[mocked_job["id"]], job_format="csv")
|
||||
|
||||
# Make a POST request to the /download endpoint
|
||||
response = client.post("/download", json=download_job.model_dump())
|
||||
|
||||
Reference in New Issue
Block a user