mirror of
https://github.com/jaypyles/Scraperr.git
synced 2025-12-17 21:25:36 +00:00
Feature: Allow Multiple Download Options (#75)
* feat: allow downloading in MD format * fix: unit tests * fix: deployments [skip ci] * fix: deployment
This commit is contained in:
2
.github/workflows/docker-image.yml
vendored
2
.github/workflows/docker-image.yml
vendored
@@ -8,7 +8,7 @@ on:
|
|||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build:
|
build:
|
||||||
if: ${{ github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.head_branch == 'master' }} || github.event.workflow_dispatch.inputs.branch == 'feat/add-helm-chart'
|
if: ${{ github.event.workflow_run.conclusion == 'success' && github.ref == 'refs/heads/master' }}
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
|
|||||||
24
api/backend/job/utils/stream_md_from_job_results.py
Normal file
24
api/backend/job/utils/stream_md_from_job_results.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from api.backend.utils import clean_text
|
||||||
|
|
||||||
|
|
||||||
|
def stream_md_from_job_results(jobs: list[dict[str, Any]]):
    """Yield a Markdown report of scrape job results, one fragment at a time.

    The function is a generator so the report can be handed to a streaming
    response without first building the whole document in memory.

    Args:
        jobs: Job records. Each record is read with ``.get`` for the keys
            ``url``, ``time_created``, ``id`` and ``result``; missing scalar
            fields are rendered as ``N/A``. ``result`` is iterated as a list
            of ``{url: {element_name: [value, ...]}}`` mappings, where each
            value is a mapping whose ``text`` entry is rendered.

    Yields:
        Consecutive ``str`` fragments which, concatenated, form the document.
    """
    # The title and the per-job headings used to be appended to a local
    # string that was never emitted; yield them so they reach the output.
    yield "# Job Results Summary\n\n"

    for i, job in enumerate(jobs, start=1):
        yield f"## Job #{i}\n"
        yield f"- **Job URL:** {job.get('url', 'N/A')}\n"
        yield f"- **Timestamp:** {job.get('time_created', 'N/A')}\n"
        yield f"- **ID:** {job.get('id', 'N/A')}\n"
        yield "### Extracted Results:\n"

        # `or []` also covers a record whose "result" is present but None.
        for res in job.get("result") or []:
            for url, elements in res.items():
                yield f"\n#### URL: {url}\n"
                for element_name, values in elements.items():
                    for value in values:
                        text = clean_text(value.get("text", "")).strip()
                        # Elements with no text after cleaning are skipped.
                        if text:
                            yield f"- **Element:** `{element_name}`\n"
                            # Two leading spaces so Markdown nests this
                            # bullet under the "Element" bullet above.
                            yield f"  - **Text:** {text}\n"

        # Horizontal rule closing this job's section.
        yield "\n---\n"
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
# STL
|
# STL
|
||||||
from typing import Any, Optional, Union
|
from typing import Any, Literal, Optional, Union
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
# LOCAL
|
# LOCAL
|
||||||
@@ -27,6 +27,7 @@ class RetrieveScrapeJobs(pydantic.BaseModel):
|
|||||||
|
|
||||||
class DownloadJob(pydantic.BaseModel):
|
class DownloadJob(pydantic.BaseModel):
|
||||||
ids: list[str]
|
ids: list[str]
|
||||||
|
job_format: Literal["csv", "md"]
|
||||||
|
|
||||||
|
|
||||||
class DeleteScrapeJobs(pydantic.BaseModel):
|
class DeleteScrapeJobs(pydantic.BaseModel):
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ from api.backend.job.cron_scheduling.cron_scheduling import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
from api.backend.job.utils.clean_job_format import clean_job_format
|
from api.backend.job.utils.clean_job_format import clean_job_format
|
||||||
|
from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -106,10 +107,19 @@ async def download(download_job: DownloadJob):
|
|||||||
)
|
)
|
||||||
results = query(job_query, tuple(download_job.ids))
|
results = query(job_query, tuple(download_job.ids))
|
||||||
|
|
||||||
|
if download_job.job_format == "csv":
|
||||||
csv_buffer = StringIO()
|
csv_buffer = StringIO()
|
||||||
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
|
csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL)
|
||||||
|
|
||||||
headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"]
|
headers = [
|
||||||
|
"id",
|
||||||
|
"url",
|
||||||
|
"element_name",
|
||||||
|
"xpath",
|
||||||
|
"text",
|
||||||
|
"user",
|
||||||
|
"time_created",
|
||||||
|
]
|
||||||
csv_writer.writerow(headers)
|
csv_writer.writerow(headers)
|
||||||
|
|
||||||
for result in results:
|
for result in results:
|
||||||
@@ -141,6 +151,15 @@ async def download(download_job: DownloadJob):
|
|||||||
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
|
response.headers["Content-Disposition"] = "attachment; filename=export.csv"
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
elif download_job.job_format == "md":
|
||||||
|
response = StreamingResponse(
|
||||||
|
stream_md_from_job_results(results),
|
||||||
|
media_type="text/markdown",
|
||||||
|
)
|
||||||
|
|
||||||
|
response.headers["Content-Disposition"] = "attachment; filename=export.md"
|
||||||
|
return response
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
LOG.error(f"Exception occurred: {e}")
|
LOG.error(f"Exception occurred: {e}")
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock):
|
|||||||
mock_randint.return_value = mocked_random_int
|
mock_randint.return_value = mocked_random_int
|
||||||
|
|
||||||
# Create a DownloadJob instance
|
# Create a DownloadJob instance
|
||||||
download_job = DownloadJob(ids=[mocked_job["id"]])
|
download_job = DownloadJob(ids=[mocked_job["id"]], job_format="csv")
|
||||||
|
|
||||||
# Make a POST request to the /download endpoint
|
# Make a POST request to the /download endpoint
|
||||||
response = client.post("/download", json=download_job.model_dump())
|
response = client.post("/download", json=download_job.model_dump())
|
||||||
|
|||||||
@@ -30,4 +30,7 @@ EXPOSE 8000
|
|||||||
|
|
||||||
WORKDIR /project/app
|
WORKDIR /project/app
|
||||||
|
|
||||||
|
RUN mkdir -p /project/app/data
|
||||||
|
RUN touch /project/app/data/database.db
|
||||||
|
|
||||||
CMD [ "supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf" ]
|
CMD [ "supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf" ]
|
||||||
@@ -15,7 +15,7 @@ type: application
|
|||||||
# This is the chart version. This version number should be incremented each time you make changes
|
# This is the chart version. This version number should be incremented each time you make changes
|
||||||
# to the chart and its templates, including the app version.
|
# to the chart and its templates, including the app version.
|
||||||
# Versions are expected to follow Semantic Versioning (https://semver.org/)
|
# Versions are expected to follow Semantic Versioning (https://semver.org/)
|
||||||
version: 1.0.12
|
version: 1.0.13
|
||||||
|
|
||||||
# This is the version number of the application being deployed. This version number should be
|
# This is the version number of the application being deployed. This version number should be
|
||||||
# incremented each time you make changes to the application. Versions are not expected to
|
# incremented each time you make changes to the application. Versions are not expected to
|
||||||
|
|||||||
1
src/components/common/job-download-dialog/index.ts
Normal file
1
src/components/common/job-download-dialog/index.ts
Normal file
@@ -0,0 +1 @@
|
|||||||
|
export * from "./job-download-dialog";
|
||||||
@@ -0,0 +1,95 @@
|
|||||||
|
import {
|
||||||
|
Dialog,
|
||||||
|
DialogTitle,
|
||||||
|
DialogContent,
|
||||||
|
DialogActions,
|
||||||
|
Button,
|
||||||
|
FormControl,
|
||||||
|
RadioGroup,
|
||||||
|
FormControlLabel,
|
||||||
|
Radio,
|
||||||
|
FormLabel,
|
||||||
|
Typography,
|
||||||
|
Box,
|
||||||
|
} from "@mui/material";
|
||||||
|
import { useState } from "react";
|
||||||
|
|
||||||
|
/** Props accepted by {@link JobDownloadDialog}. */
export type JobDownloadDialogProps = {
  /** Whether the dialog is shown; forwarded to the underlying `Dialog`. */
  open: boolean;
  /** Called when the underlying `Dialog` asks to be closed. */
  onClose: () => void;
  /** IDs of the jobs sent in the download request. */
  ids: string[];
};

/**
 * Dialog that lets the user choose an export format (CSV or Markdown) and
 * then downloads the given jobs through `POST /api/download`.
 */
export const JobDownloadDialog = ({
  open,
  onClose,
  ids,
}: JobDownloadDialogProps) => {
  const [jobFormat, setJobFormat] = useState<string>("csv");

  /**
   * Requests the export and saves the response body via a temporary,
   * hidden anchor element. Failures are logged, never thrown.
   */
  const handleDownload = async () => {
    try {
      const response = await fetch("/api/download", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ data: { ids: ids, job_format: jobFormat } }),
      });

      if (!response.ok) {
        console.error("Failed to download the file.");
        return;
      }

      const blob = await response.blob();
      const url = window.URL.createObjectURL(blob);
      const a = document.createElement("a");
      a.style.display = "none";
      a.href = url;
      a.download = `job_${ids[0]}.${jobFormat}`;
      document.body.appendChild(a);
      a.click();
      window.URL.revokeObjectURL(url);
      document.body.removeChild(a);
    } catch (error) {
      // fetch()/blob() reject on network failure; without this the click
      // handler would surface an unhandled promise rejection.
      console.error("Failed to download the file.", error);
    }
  };

  return (
    <Dialog open={open} onClose={onClose}>
      <DialogTitle>Download Job</DialogTitle>
      <DialogContent>
        <FormControl>
          <Typography variant="body1">
            You are about to download {ids.length} job(s). Please select the
            format that you would like to download them in.
          </Typography>
          <br />
          <Box
            sx={{
              display: "flex",
              flexDirection: "column",
              backgroundColor: "background.paper",
              padding: 2,
              border: "1px solid",
            }}
          >
            {/* The id gives the RadioGroup's aria-labelledby a real target. */}
            <FormLabel id="job-download-format-radio-buttons">Format</FormLabel>
            <hr style={{ width: "100%", margin: "10px 0" }} />
            <RadioGroup
              aria-labelledby="job-download-format-radio-buttons"
              name="job-download-format-radio-buttons"
              value={jobFormat}
              onChange={(e) => setJobFormat(e.target.value)}
            >
              <FormControlLabel value="csv" control={<Radio />} label="CSV" />
              <FormControlLabel
                value="md"
                control={<Radio />}
                label="Markdown"
              />
            </RadioGroup>
          </Box>
          <br />
          <Button onClick={handleDownload} size="small">
            Download
          </Button>
        </FormControl>
      </DialogContent>
    </Dialog>
  );
};
|
||||||
@@ -20,6 +20,7 @@ import { Favorites, JobQueue } from ".";
|
|||||||
import { Job } from "../../types";
|
import { Job } from "../../types";
|
||||||
import Cookies from "js-cookie";
|
import Cookies from "js-cookie";
|
||||||
import { useSearchParams } from "next/navigation";
|
import { useSearchParams } from "next/navigation";
|
||||||
|
import { JobDownloadDialog } from "../common/job-download-dialog";
|
||||||
|
|
||||||
interface JobTableProps {
|
interface JobTableProps {
|
||||||
jobs: Job[];
|
jobs: Job[];
|
||||||
@@ -47,31 +48,15 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
|
|||||||
const [searchQuery, setSearchQuery] = useState<string>(search || "");
|
const [searchQuery, setSearchQuery] = useState<string>(search || "");
|
||||||
const [searchMode, setSearchMode] = useState<string>(type || "url");
|
const [searchMode, setSearchMode] = useState<string>(type || "url");
|
||||||
const [favoriteView, setFavoriteView] = useState<boolean>(false);
|
const [favoriteView, setFavoriteView] = useState<boolean>(false);
|
||||||
|
const [jobDownloadDialogOpen, setJobDownloadDialogOpen] =
|
||||||
|
useState<boolean>(false);
|
||||||
|
|
||||||
const token = Cookies.get("token");
|
const token = Cookies.get("token");
|
||||||
const router = useRouter();
|
const router = useRouter();
|
||||||
|
|
||||||
const handleDownload = async (ids: string[]) => {
|
const handleDownload = (ids: string[]) => {
|
||||||
const response = await fetch("/api/download", {
|
setSelectedJobs(new Set(ids));
|
||||||
method: "POST",
|
setJobDownloadDialogOpen(true);
|
||||||
headers: { "Content-Type": "application/json" },
|
|
||||||
body: JSON.stringify({ data: { ids: ids } }),
|
|
||||||
});
|
|
||||||
|
|
||||||
if (response.ok) {
|
|
||||||
const blob = await response.blob();
|
|
||||||
const url = window.URL.createObjectURL(blob);
|
|
||||||
const a = document.createElement("a");
|
|
||||||
a.style.display = "none";
|
|
||||||
a.href = url;
|
|
||||||
a.download = `job_${ids[0]}.csv`;
|
|
||||||
document.body.appendChild(a);
|
|
||||||
a.click();
|
|
||||||
window.URL.revokeObjectURL(url);
|
|
||||||
document.body.removeChild(a);
|
|
||||||
} else {
|
|
||||||
console.error("Failed to download the file.");
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const handleNavigate = (elements: Object[], url: string, options: any) => {
|
const handleNavigate = (elements: Object[], url: string, options: any) => {
|
||||||
@@ -259,17 +244,22 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
|
|||||||
onSelectJob={handleSelectJob}
|
onSelectJob={handleSelectJob}
|
||||||
onFavorite={favoriteJob}
|
onFavorite={favoriteJob}
|
||||||
onJobClick={handleJobClick}
|
onJobClick={handleJobClick}
|
||||||
></JobQueue>
|
/>
|
||||||
) : (
|
) : (
|
||||||
<Favorites
|
<Favorites
|
||||||
stateProps={{ selectedJobs, filteredJobs }}
|
stateProps={{ selectedJobs, filteredJobs }}
|
||||||
onNavigate={handleNavigate}
|
onNavigate={handleNavigate}
|
||||||
onSelectJob={handleSelectJob}
|
onSelectJob={handleSelectJob}
|
||||||
onFavorite={favoriteJob}
|
onFavorite={favoriteJob}
|
||||||
></Favorites>
|
/>
|
||||||
)}
|
)}
|
||||||
</Box>
|
</Box>
|
||||||
</Box>
|
</Box>
|
||||||
|
<JobDownloadDialog
|
||||||
|
open={jobDownloadDialogOpen}
|
||||||
|
onClose={() => setJobDownloadDialogOpen(false)}
|
||||||
|
ids={Array.from(selectedJobs)}
|
||||||
|
/>
|
||||||
</Box>
|
</Box>
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user