diff --git a/README.md b/README.md index 136fe84..e0c896c 100644 --- a/README.md +++ b/README.md @@ -1,68 +1,68 @@ -# Webapp Template +# Scraperr -Template designed to quickly build full stack apps. +Scraperr is a self-hosted web application that allows users to scrape data from web pages by specifying elements via XPath. Users can submit URLs and the corresponding elements to be scraped, and the results will be displayed in a table. -Utilizes Github Actions and Ansible to build Docker images to quickly deploy onto an AWS EC2 Debian instance. +From the table, users can download a csv of the job's results, along with an option to rerun the job. -## Technologies +## Features -- Containerization: Docker/Docker Compose +- Submit URLs for web scraping +- Add and manage elements to scrape using XPath +- Display results of scraped data +- Download csv containing results +- Rerun jobs -- Frontend: React/Next.js +## Installation -- Backend: FastAPI +1. Clone the repository: -- Frameworks/Libraries: PDM, TailwindCSS + ```sh + git clone https://github.com/jaypyles/scraperr.git + cd scraperr + ``` -## Prerequisites +1. Deploy + ```sh + make up + ``` -- Install Ansible +The app provides its own `traefik` configuration to use independently, but can easily be reverse-proxied by any other app, or your own reverse-proxy. -- Create a Dockerhub account/repo and fill out the Github repo environmental variables: +## Usage - - DOCKERHUB_TOKEN - - DOCKERHUB_USERNAME - - DOCKERHUB_REPO +1. Open the application in your browser at `http://localhost`. +2. Enter the URL you want to scrape in the URL field. +3. Add elements to scrape by specifying a name and the corresponding XPath. +4. Click the "Submit" button to start the scraping process. +5. The results will be displayed in the "Results" section. 
-- Complete the `config.yaml` and the `inventory.yaml` in the `ansible` directory +## API Endpoints - - `github_repo`: Github repo clone address - - `deploy_path`: Path where to clone the repo to on the server - - `deploy_command`: `Make` command to run to deploy on the server +Use this service as an API for your own jobs. -- Add your domain in HOSTNAME_PROD in the `Makefile` +- `/api/submit-scrape-job`: Endpoint to submit the scraping job. Accepts a POST request with the following payload: + ```json + { + "url": "http://example.com", + "elements": [ + { + "name": "ElementName", + "xpath": "/div[@class='example']" + } + ], + "user": "user@example.com", + "time_created": "2024-07-07T12:34:56.789Z" + } + ``` -## Deployment +## License -### Local Deployment +This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. -Uses `make` to quickly dispatch `docker-compose` commands. +### Contributions -- `deps`: rebuilds the frontend to deploy statically using the api +Development made easy by developing from [webapp template](https://github.com/jaypyles/webapp-template). View documentation for extra information. -- `build`: builds the container using `docker-compose build ` +Start development server: -- `up-prd`: ups the container using `docker-compose -f docker-compose.yml up` - -- `up-dev`: ups the container using `docker-compose -f docker-compose.yml -f docker-compose.dev.yml up` - which will deploy with local volumes. - -Ex: `make deps build up-dev` - -### Server Deployment - -Easy deployment using `make setup deploy` after completing the required config files. - -- `setup`: Install dependencies and clone repo onto server - -- `deploy`: Deploy on server - -To use a SSL certificate, uncomment the volumes under the `traefik` service. Add your own certificates for use in Traefik. 
- -```yaml -volumes: - - "/var/run/docker.sock:/var/run/docker.sock:ro" - - "./dynamic_conf.yaml:/etc/traefik/dynamic_conf.yaml" - - "/etc/letsencrypt/live/domain/fullchain.pem:/etc/certs/ssl-cert.pem" - - "/etc/letsencrypt/live/domain/privkey.pem:/etc/certs/ssl-cert.key" -``` +`make deps build up-dev` diff --git a/api/backend/app.py b/api/backend/app.py index a0759d5..9d5246c 100644 --- a/api/backend/app.py +++ b/api/backend/app.py @@ -44,6 +44,11 @@ def read_root(): return FileResponse("./dist/index.html") +@app.get("/favicon.ico") +def read_favicon(): + return FileResponse("dist/favicon.ico") + + @app.post("/api/submit-scrape-job") async def submit_scrape_job(job: SubmitScrapeJob): LOG.info(f"Recieved job: {job}") diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index ad5ad2f..b49eb00 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -1,6 +1,6 @@ version: "3" services: - webscrape: + scraperr: labels: - "traefik.enable=true" - "traefik.http.routers.frontend.rule=Host(`${HOSTNAME}`)" diff --git a/docker-compose.yml b/docker-compose.yml index 30e4024..fe16695 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,5 @@ services: - webscrape: + scraperr: build: context: ./ container_name: webscrape diff --git a/public/favicon.ico b/public/favicon.ico new file mode 100644 index 0000000..d92a3be Binary files /dev/null and b/public/favicon.ico differ diff --git a/public/favicon.png b/public/favicon.png deleted file mode 100644 index 09c972f..0000000 Binary files a/public/favicon.png and /dev/null differ diff --git a/public/manifest.json b/public/manifest.json index 080d6c7..e83b198 100644 --- a/public/manifest.json +++ b/public/manifest.json @@ -3,7 +3,7 @@ "name": "Create React App Sample", "icons": [ { - "src": "favicon.ico", + "src": "favicon.png", "sizes": "64x64 32x32 24x24 16x16", "type": "image/x-icon" }, diff --git a/src/components/JobTable.tsx b/src/components/JobTable.tsx index 6632a5a..7d6153a 100644 --- 
a/src/components/JobTable.tsx +++ b/src/components/JobTable.tsx @@ -69,16 +69,17 @@ const JobTable: React.FC = ({ jobs }) => { bgcolor="background.default" display="flex" justifyContent="center" + minHeight="100vh" + p={3} > - + Scrape Jobs diff --git a/src/pages/_app.tsx b/src/pages/_app.tsx index 321e141..f586dc0 100644 --- a/src/pages/_app.tsx +++ b/src/pages/_app.tsx @@ -4,7 +4,6 @@ import "../styles/globals.css"; import React, { useState, useEffect } from "react"; import type { AppProps } from "next/app"; import Head from "next/head"; -import { SessionProvider } from "next-auth/react"; import { ThemeProvider, CssBaseline } from "@mui/material"; import NavDrawer from "../components/NavDrawer"; import { darkTheme, lightTheme } from "../styles/themes"; @@ -34,7 +33,7 @@ const App: React.FC = ({ Component, pageProps }) => { return ( <> - Webapp Template + Scraperr diff --git a/src/pages/_document.tsx b/src/pages/_document.tsx index 160afaa..4d1f08d 100644 --- a/src/pages/_document.tsx +++ b/src/pages/_document.tsx @@ -10,7 +10,7 @@ export default function Document() { httpEquiv="Content-Security-Policy" content="upgrade-insecure-requests" /> - + diff --git a/src/pages/index.tsx b/src/pages/index.tsx index 677ed9f..707ba82 100644 --- a/src/pages/index.tsx +++ b/src/pages/index.tsx @@ -148,7 +148,7 @@ const Home = () => { > - Web Scraper + Scraperr