mirror of
https://github.com/jaypyles/Scraperr.git
synced 2025-12-18 05:35:47 +00:00
wip: cleanup
This commit is contained in:
96
README.md
96
README.md
@@ -1,68 +1,68 @@
|
|||||||
# Webapp Template
|
# Scraperr
|
||||||
|
|
||||||
Template designed to quickly build full stack apps.
|
Scraperr is a self-hosted web application that allows users to scrape data from web pages by specifying elements via XPath. Users can submit URLs and the corresponding elements to be scraped, and the results will be displayed in a table.
|
||||||
|
|
||||||
Utilizes Github Actions and Ansible to build Docker images to quickly deploy onto an AWS EC2 Debian instance.
|
From the table, users can download a CSV of a job's results or rerun the job.
|
||||||
|
|
||||||
## Technologies
|
## Features
|
||||||
|
|
||||||
- Containerization: Docker/Docker Compose
|
- Submit URLs for web scraping
|
||||||
|
- Add and manage elements to scrape using XPath
|
||||||
|
- Display results of scraped data
|
||||||
|
- Download csv containing results
|
||||||
|
- Rerun jobs
|
||||||
|
|
||||||
- Frontend: React/Next.js
|
## Installation
|
||||||
|
|
||||||
- Backend: FastAPI
|
1. Clone the repository:
|
||||||
|
|
||||||
- Frameworks/Libraries: PDM, TailwindCSS
|
```sh
|
||||||
|
git clone https://github.com/jaypyles/scraperr.git
|
||||||
|
cd scraperr
|
||||||
|
```
|
||||||
|
|
||||||
## Prerequisites
|
2. Deploy:
|
||||||
|
```sh
|
||||||
|
make up
|
||||||
|
```
|
||||||
|
|
||||||
- Install Ansible
|
The app ships with its own `traefik` configuration so it can run standalone, but it can just as easily be placed behind any other reverse proxy you already use.
|
||||||
|
|
||||||
- Create a Dockerhub account/repo and fill out the Github repo environmental variables:
|
## Usage
|
||||||
|
|
||||||
- DOCKERHUB_TOKEN
|
1. Open the application in your browser at `http://localhost`.
|
||||||
- DOCKERHUB_USERNAME
|
2. Enter the URL you want to scrape in the URL field.
|
||||||
- DOCKERHUB_REPO
|
3. Add elements to scrape by specifying a name and the corresponding XPath.
|
||||||
|
4. Click the "Submit" button to start the scraping process.
|
||||||
|
5. The results will be displayed in the "Results" section.
|
||||||
|
|
||||||
- Complete the `config.yaml` and the `inventory.yaml` in the `ansible` directory
|
## API Endpoints
|
||||||
|
|
||||||
- `github_repo`: Github repo clone address
|
Use this service as an API for your own jobs.
|
||||||
- `deploy_path`: Path where to clone the repo to on the server
|
|
||||||
- `deploy_command`: `Make` command to run to deploy on the server
|
|
||||||
|
|
||||||
- Add your domain in HOSTNAME_PROD in the `Makefile`
|
- `/api/submit-scrape-job`: Endpoint to submit the scraping job. Accepts a POST request with the following payload:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"url": "http://example.com",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"name": "ElementName",
|
||||||
|
"xpath": "/div[@class='example']"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"user": "user@example.com",
|
||||||
|
"time_created": "2024-07-07T12:34:56.789Z"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
## Deployment
|
## License
|
||||||
|
|
||||||
### Local Deployment
|
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
|
||||||
|
|
||||||
Uses `make` to quickly dispatch `docker-compose` commands.
|
### Contributions
|
||||||
|
|
||||||
- `deps`: rebuilds the frontend to deploy statically using the api
|
Development made easy by developing from [webapp template](https://github.com/jaypyles/webapp-template). View documentation for extra information.
|
||||||
|
|
||||||
- `build`: builds the container using `docker-compose build `
|
Start development server:
|
||||||
|
|
||||||
- `up-prd`: ups the container using `docker-compose -f docker-compose.yml up`
|
`make deps build up-dev`
|
||||||
|
|
||||||
- `up-dev`: ups the container using `docker-compose -f docker-compose.yml -f docker-compose.dev.yml up`
|
|
||||||
which will deploy with local volumes.
|
|
||||||
|
|
||||||
Ex: `make deps build up-dev`
|
|
||||||
|
|
||||||
### Server Deployment
|
|
||||||
|
|
||||||
Easy deployment using `make setup deploy` after completing the required config files.
|
|
||||||
|
|
||||||
- `setup`: Install dependencies and clone repo onto server
|
|
||||||
|
|
||||||
- `deploy`: Deploy on server
|
|
||||||
|
|
||||||
To use a SSL certificate, uncomment the volumes under the `traefik` service. Add your own certificates for use in Traefik.
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
volumes:
|
|
||||||
- "/var/run/docker.sock:/var/run/docker.sock:ro"
|
|
||||||
- "./dynamic_conf.yaml:/etc/traefik/dynamic_conf.yaml"
|
|
||||||
- "/etc/letsencrypt/live/domain/fullchain.pem:/etc/certs/ssl-cert.pem"
|
|
||||||
- "/etc/letsencrypt/live/domain/privkey.pem:/etc/certs/ssl-cert.key"
|
|
||||||
```
|
|
||||||
|
|||||||
@@ -44,6 +44,11 @@ def read_root():
|
|||||||
return FileResponse("./dist/index.html")
|
return FileResponse("./dist/index.html")
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/favicon.ico")
|
||||||
|
def read_favicon():
|
||||||
|
return FileResponse("dist/favicon.ico")
|
||||||
|
|
||||||
|
|
||||||
@app.post("/api/submit-scrape-job")
|
@app.post("/api/submit-scrape-job")
|
||||||
async def submit_scrape_job(job: SubmitScrapeJob):
|
async def submit_scrape_job(job: SubmitScrapeJob):
|
||||||
LOG.info(f"Recieved job: {job}")
|
LOG.info(f"Recieved job: {job}")
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
version: "3"
|
version: "3"
|
||||||
services:
|
services:
|
||||||
webscrape:
|
scraperr:
|
||||||
labels:
|
labels:
|
||||||
- "traefik.enable=true"
|
- "traefik.enable=true"
|
||||||
- "traefik.http.routers.frontend.rule=Host(`${HOSTNAME}`)"
|
- "traefik.http.routers.frontend.rule=Host(`${HOSTNAME}`)"
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
services:
|
services:
|
||||||
webscrape:
|
scraperr:
|
||||||
build:
|
build:
|
||||||
context: ./
|
context: ./
|
||||||
container_name: webscrape
|
container_name: webscrape
|
||||||
|
|||||||
BIN
public/favicon.ico
Normal file
BIN
public/favicon.ico
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 454 B |
Binary file not shown.
|
Before Width: | Height: | Size: 12 KiB |
@@ -3,7 +3,7 @@
|
|||||||
"name": "Create React App Sample",
|
"name": "Create React App Sample",
|
||||||
"icons": [
|
"icons": [
|
||||||
{
|
{
|
||||||
"src": "favicon.ico",
|
"src": "favicon.png",
|
||||||
"sizes": "64x64 32x32 24x24 16x16",
|
"sizes": "64x64 32x32 24x24 16x16",
|
||||||
"type": "image/x-icon"
|
"type": "image/x-icon"
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -69,16 +69,17 @@ const JobTable: React.FC<JobTableProps> = ({ jobs }) => {
|
|||||||
bgcolor="background.default"
|
bgcolor="background.default"
|
||||||
display="flex"
|
display="flex"
|
||||||
justifyContent="center"
|
justifyContent="center"
|
||||||
|
minHeight="100vh"
|
||||||
|
p={3}
|
||||||
>
|
>
|
||||||
<Box
|
<Box
|
||||||
className="flex flex-col justify-center align-center items-center"
|
className="flex flex-col justify-start align-center items-center"
|
||||||
width="100%"
|
width="100%"
|
||||||
maxWidth="100%"
|
maxWidth="100%"
|
||||||
bgcolor="background.default"
|
bgcolor="background.default"
|
||||||
p={3}
|
|
||||||
overflow="auto"
|
overflow="auto"
|
||||||
>
|
>
|
||||||
<Typography variant="h4" gutterBottom>
|
<Typography variant="h4" gutterBottom sx={{ mt: 3 }}>
|
||||||
Scrape Jobs
|
Scrape Jobs
|
||||||
</Typography>
|
</Typography>
|
||||||
<Box sx={{ overflow: "auto", width: "75%" }}>
|
<Box sx={{ overflow: "auto", width: "75%" }}>
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ import "../styles/globals.css";
|
|||||||
import React, { useState, useEffect } from "react";
|
import React, { useState, useEffect } from "react";
|
||||||
import type { AppProps } from "next/app";
|
import type { AppProps } from "next/app";
|
||||||
import Head from "next/head";
|
import Head from "next/head";
|
||||||
import { SessionProvider } from "next-auth/react";
|
|
||||||
import { ThemeProvider, CssBaseline } from "@mui/material";
|
import { ThemeProvider, CssBaseline } from "@mui/material";
|
||||||
import NavDrawer from "../components/NavDrawer";
|
import NavDrawer from "../components/NavDrawer";
|
||||||
import { darkTheme, lightTheme } from "../styles/themes";
|
import { darkTheme, lightTheme } from "../styles/themes";
|
||||||
@@ -34,7 +33,7 @@ const App: React.FC<AppProps> = ({ Component, pageProps }) => {
|
|||||||
return (
|
return (
|
||||||
<>
|
<>
|
||||||
<Head>
|
<Head>
|
||||||
<title>Webapp Template</title>
|
<title>Scraperr</title>
|
||||||
</Head>
|
</Head>
|
||||||
<AuthProvider>
|
<AuthProvider>
|
||||||
<ThemeProvider theme={isDarkMode ? darkTheme : lightTheme}>
|
<ThemeProvider theme={isDarkMode ? darkTheme : lightTheme}>
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ export default function Document() {
|
|||||||
httpEquiv="Content-Security-Policy"
|
httpEquiv="Content-Security-Policy"
|
||||||
content="upgrade-insecure-requests"
|
content="upgrade-insecure-requests"
|
||||||
/>
|
/>
|
||||||
<meta name="description" content="Webapp Template" />
|
<meta name="description" content="Scraperr" />
|
||||||
</Head>
|
</Head>
|
||||||
<body>
|
<body>
|
||||||
<noscript>You need to enable JavaScript to run this app.</noscript>
|
<noscript>You need to enable JavaScript to run this app.</noscript>
|
||||||
|
|||||||
@@ -148,7 +148,7 @@ const Home = () => {
|
|||||||
>
|
>
|
||||||
<Container maxWidth="md">
|
<Container maxWidth="md">
|
||||||
<Typography variant="h1" gutterBottom textAlign="center">
|
<Typography variant="h1" gutterBottom textAlign="center">
|
||||||
Web Scraper
|
Scraperr
|
||||||
</Typography>
|
</Typography>
|
||||||
<div
|
<div
|
||||||
style={{ marginBottom: "20px" }}
|
style={{ marginBottom: "20px" }}
|
||||||
|
|||||||
@@ -1,3 +1,7 @@
|
|||||||
@tailwind base;
|
@tailwind base;
|
||||||
@tailwind components;
|
@tailwind components;
|
||||||
@tailwind utilities;
|
@tailwind utilities;
|
||||||
|
|
||||||
|
#__next {
|
||||||
|
min-height: 100vh;
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user