Mirror of https://github.com/jaypyles/Scraperr.git (synced 2025-10-30 05:57:12 +00:00)

Commit: chore: wip add upload/import
@@ -64,7 +64,7 @@ async def scrape_with_agent(agent_job: dict[str, Any]):
     xpaths = parse_response(response)
 
     captured_elements = await capture_elements(
-        page, xpaths, agent_job["job_options"]["return_html"]
+        page, xpaths, agent_job["job_options"].get("return_html", False)
     )
 
     final_url = page.url
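A minimal sketch (not from the repo) of the failure mode the .get("return_html", False) default guards against: a job_options dict stored before the flag existed raises KeyError under direct indexing.

    # Hypothetical older job whose job_options predates the return_html flag.
    job_options = {}
    try:
        job_options["return_html"]          # old code path: raises KeyError
    except KeyError:
        pass
    assert job_options.get("return_html", False) is False   # new code path: safe default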
@@ -29,6 +29,7 @@ def insert(query: str, values: tuple[Any, ...]):
     except sqlite3.Error as e:
         LOG.error(f"An error occurred: {e}")
+        raise e
 
     finally:
         cursor.close()
 
@@ -49,10 +49,15 @@ async def get_queued_job():
     return res[0] if res else None
 
 
-async def update_job(ids: list[str], field: str, value: Any):
-    query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}"
-    res = update(query, tuple([value] + ids))
-    LOG.info(f"Updated job: {res}")
+async def update_job(ids: list[str], updates: dict[str, Any]):
+    if not updates:
+        return
+
+    set_clause = ", ".join(f"{field} = ?" for field in updates.keys())
+    query = f"UPDATE jobs SET {set_clause} WHERE id IN {format_list_for_query(ids)}"
+    values = list(updates.values()) + ids
+    res = update(query, tuple(values))
+    LOG.debug(f"Updated job: {res}")
 
 
 async def delete_jobs(jobs: list[str]):
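A small sketch (values hypothetical, and assuming format_list_for_query expands the ids into a "(?, ?, ...)" placeholder list) of the statement the rewritten update_job produces:

    updates = {"status": "Queued", "chat": None}
    ids = ["id1", "id2"]
    set_clause = ", ".join(f"{field} = ?" for field in updates.keys())
    placeholders = f"({', '.join('?' for _ in ids)})"   # assumed format_list_for_query output
    print(f"UPDATE jobs SET {set_clause} WHERE id IN {placeholders}")
    # UPDATE jobs SET status = ?, chat = ? WHERE id IN (?, ?)
    print(tuple(list(updates.values()) + ids))
    # ('Queued', None, 'id1', 'id2')

Field names are interpolated into the SQL while every value stays bound as a ? parameter.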
@@ -1,5 +1,6 @@
 # STL
 import logging
+import datetime
 from typing import Any
 
 # LOCAL
@@ -12,7 +13,19 @@ from api.backend.database.queries.job.job_queries import JOB_INSERT_QUERY
 LOG = logging.getLogger("Job")
 
 
-def insert(item: dict[str, Any]) -> None:
+async def insert(item: dict[str, Any]) -> None:
+    if check_for_job_completion(item["id"]):
+        await multi_field_update_job(
+            item["id"],
+            {
+                "status": "Queued",
+                "result": [],
+                "time_created": datetime.datetime.now().isoformat(),
+                "chat": None,
+            },
+        )
+        return
+
     common_insert(
         JOB_INSERT_QUERY,
         (
@@ -33,6 +46,12 @@ def insert(item: dict[str, Any]) -> None:
     LOG.debug(f"Inserted item: {item}")
 
 
+def check_for_job_completion(id: str) -> dict[str, Any]:
+    query = f"SELECT * FROM jobs WHERE id = ?"
+    res = common_query(query, (id,))
+    return res[0] if res else {}
+
+
 async def get_queued_job():
     query = (
         "SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1"
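Despite its name, check_for_job_completion returns the whole stored row (or an empty dict), and the new insert() branches on its truthiness alone. A minimal sketch of that contract, with hypothetical in-memory rows standing in for common_query:

    # Sketch only: rows is a stand-in for the jobs table.
    def check_for_job_completion_sketch(rows: list[dict], id: str) -> dict:
        res = [row for row in rows if row["id"] == id]
        return res[0] if res else {}

    rows = [{"id": "abc123", "status": "Completed"}]
    assert check_for_job_completion_sketch(rows, "abc123")       # truthy: re-queue path
    assert not check_for_job_completion_sketch(rows, "missing")  # falsy ({}): fresh insert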
@@ -48,6 +67,12 @@ async def update_job(ids: list[str], field: str, value: Any):
     LOG.debug(f"Updated job: {res}")
 
 
+async def multi_field_update_job(id: str, fields: dict[str, Any]):
+    query = f"UPDATE jobs SET {', '.join(f'{field} = ?' for field in fields.keys())} WHERE id = ?"
+    res = common_update(query, tuple(list(fields.values()) + [id]))
+    LOG.debug(f"Updated job: {res}")
+
+
 async def delete_jobs(jobs: list[str]):
     if not jobs:
         LOG.debug("No jobs to delete.")
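For reference, a sketch (field values and id hypothetical) of the parameterized statement multi_field_update_job builds for a single row:

    fields = {"status": "Queued", "chat": None}
    query = f"UPDATE jobs SET {', '.join(f'{field} = ?' for field in fields.keys())} WHERE id = ?"
    print(query)                                      # UPDATE jobs SET status = ?, chat = ? WHERE id = ?
    print(tuple(list(fields.values()) + ["abc123"]))  # ('Queued', None, 'abc123')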
@@ -43,10 +43,8 @@ job_router = APIRouter()
 @job_router.post("/update")
 @handle_exceptions(logger=LOG)
 async def update(update_jobs: UpdateJobs, _: User = Depends(get_current_user)):
     """Used to update jobs"""
-    await update_job(update_jobs.ids, update_jobs.field, update_jobs.value)
-
-    return JSONResponse(content={"message": "Jobs updated successfully."})
+    return {"message": "Jobs updated successfully"}
 
 
 @job_router.post("/submit-scrape-job")
@@ -54,9 +52,11 @@ async def update(update_jobs: UpdateJobs, _: User = Depends(get_current_user)):
 async def submit_scrape_job(job: Job):
     LOG.info(f"Recieved job: {job}")
 
-    job.id = uuid.uuid4().hex
+    if not job.id:
+        job.id = uuid.uuid4().hex
 
     job_dict = job.model_dump()
-    insert(job_dict)
+    await insert(job_dict)
 
     return JSONResponse(
         content={"id": job.id, "message": "Job submitted successfully."}
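The id guard is what lets an imported or re-submitted job keep its original id (feeding the re-queue path in insert()); only brand-new jobs get a fresh uuid. A minimal sketch of that rule, assuming Job.id is falsy when unset:

    import uuid

    def assign_id(job_id: str | None) -> str:
        # Keep a client-supplied id; otherwise mint a new one.
        return job_id if job_id else uuid.uuid4().hex

    assert assign_id("abc123") == "abc123"   # re-submitted job keeps its id
    assert len(assign_id(None)) == 32        # new job gets a fresh hex uuid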
@@ -70,7 +70,9 @@ async def retrieve_scrape_jobs(
 ):
     LOG.info(f"Retrieving jobs for account: {user.email}")
     ATTRIBUTES = "chat" if fetch_options.chat else "*"
-    job_query = f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ?"
+    job_query = (
+        f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ? ORDER BY time_created ASC"
+    )
     results = query(job_query, (user.email,))
     return JSONResponse(content=jsonable_encoder(results[::-1]))
@@ -174,7 +174,9 @@ async def scrape(
 
     for page in pages:
         elements.append(
-            await collect_scraped_elements(page, xpaths, job_options["return_html"])
+            await collect_scraped_elements(
+                page, xpaths, job_options.get("return_html", False)
+            )
         )
 
     return elements
@@ -13,21 +13,45 @@ import { useJobSubmitterProvider } from "./provider";
 
 export const JobSubmitter = () => {
   const router = useRouter();
-  const { job_options } = router.query;
+  const { job_options, id } = router.query;
+  console.log(id);
   const { user } = useUser();
 
   const { submitJob, loading, error } = useSubmitJob();
-  const { submittedURL, rows, siteMap, setSiteMap, jobOptions, setJobOptions } =
-    useJobSubmitterProvider();
+  const {
+    jobId,
+    setJobId,
+    submittedURL,
+    rows,
+    siteMap,
+    setSiteMap,
+    jobOptions,
+    setJobOptions,
+  } = useJobSubmitterProvider();
 
   useEffect(() => {
     if (job_options) {
-      parseJobOptions(job_options as string, setJobOptions, setSiteMap);
+      parseJobOptions(
+        id as string,
+        job_options as string,
+        setJobOptions,
+        setSiteMap,
+        setJobId
+      );
     }
   }, [job_options]);
 
   const handleSubmit = async () => {
-    await submitJob(submittedURL, rows, user, jobOptions, siteMap, false, null);
+    await submitJob(
+      submittedURL,
+      rows,
+      user,
+      jobOptions,
+      siteMap,
+      false,
+      null,
+      jobId
+    );
   };
 
   return (
@@ -10,6 +10,8 @@ import React, {
 } from "react";
 
 type JobSubmitterProviderType = {
+  jobId: string;
+  setJobId: Dispatch<React.SetStateAction<string>>;
   submittedURL: string;
   setSubmittedURL: Dispatch<React.SetStateAction<string>>;
   rows: Element[];
@@ -36,6 +38,7 @@ const JobSubmitterProvider = createContext<JobSubmitterProviderType>(
 );
 
 export const Provider = ({ children }: PropsWithChildren) => {
+  const [jobId, setJobId] = useState<string>("");
   const [submittedURL, setSubmittedURL] = useState<string>("");
   const [rows, setRows] = useState<Element[]>([]);
   const [results, setResults] = useState<Result>({});
@@ -55,6 +58,8 @@ export const Provider = ({ children }: PropsWithChildren) => {
 
   const value: JobSubmitterProviderType = useMemo(
     () => ({
+      jobId,
+      setJobId,
       submittedURL,
       setSubmittedURL,
       rows,
@@ -76,6 +81,7 @@ export const Provider = ({ children }: PropsWithChildren) => {
       closeSnackbar,
     }),
     [
+      jobId,
       submittedURL,
       rows,
       results,
@@ -23,10 +23,17 @@ export const useImportJobConfig = () => {
       });
     }
 
-    setJobOptions(jobConfig.job_options);
+    if (
+      jobConfig.job_options &&
+      Array.isArray(jobConfig.job_options.proxies)
+    ) {
+      jobConfig.job_options.proxies = "";
+    }
+
+    setJobOptions(jobConfig.job_options || {});
     setSiteMap(jobConfig.site_map);
-    setSubmittedURL(jobConfig.url);
-    setRows(jobConfig.elements);
+    setSubmittedURL(jobConfig.url || "");
+    setRows(jobConfig.elements || []);
   };
 
   reader.readAsText(file);
@@ -82,7 +82,10 @@ export const useSubmitJob = () => {
         setSnackbarOpen(true);
       })
       .catch((error) => {
-        setSnackbarMessage(error || "An error occurred.");
+        const errorMessage =
+          error instanceof Error ? error.message : "An error occurred.";
+        console.log(errorMessage);
+        setSnackbarMessage(errorMessage);
         setSnackbarSeverity("error");
         setSnackbarOpen(true);
       })
@@ -3,9 +3,11 @@ import { Dispatch, SetStateAction } from "react";
 import { RawJobOptions, SiteMap } from "@/types";
 
 export const parseJobOptions = (
+  id: string,
   job_options: string,
   setJobOptions: Dispatch<SetStateAction<RawJobOptions>>,
-  setSiteMap?: Dispatch<SetStateAction<SiteMap | null>>
+  setSiteMap?: Dispatch<SetStateAction<SiteMap | null>>,
+  setJobId?: Dispatch<SetStateAction<string>>
 ) => {
   if (job_options) {
     const jsonOptions = JSON.parse(job_options as string);
@@ -47,6 +49,10 @@ export const parseJobOptions = (
       newJobOptions.return_html = true;
     }
 
+    if (id && setJobId) {
+      setJobId(id);
+    }
+
     setJobOptions(newJobOptions);
   }
 };
@@ -21,15 +21,16 @@ export default async function handler(
       }
     );
 
-    if (!response.ok) {
-      throw new Error(`Error: ${response.statusText}`);
+    const result = await response.json();
+
+    if (response.status === 500) {
+      res.status(500).json({ error: result.error });
     }
 
-    const result = await response.json();
     res.status(200).json(result);
   } catch (error) {
     console.error("Error submitting scrape job:", error);
-    res.status(500).json({ error: "Internal Server Error" });
+    res.status(500).json({ error: error });
   }
 } else {
   res.setHeader("Allow", ["POST"]);