wip: cleanup

This commit is contained in:
Jayden Pyles
2024-07-07 13:33:56 -05:00
parent 30e4d5c5f7
commit 6844880c17
12 changed files with 67 additions and 58 deletions

View File

@@ -1,68 +1,68 @@
# Webapp Template
# Scraperr
Template designed to quickly build full stack apps.
Scraperr is a self-hosted web application that allows users to scrape data from web pages by specifying elements via XPath. Users can submit URLs and the corresponding elements to be scraped, and the results will be displayed in a table.
Utilizes Github Actions and Ansible to build Docker images to quickly deploy onto an AWS EC2 Debian instance.
From the table, users can download a CSV of the job's results, along with an option to rerun the job.
## Technologies
## Features
- Containerization: Docker/Docker Compose
- Submit URLs for web scraping
- Add and manage elements to scrape using XPath
- Display results of scraped data
- Download a CSV containing the results
- Rerun jobs
- Frontend: React/Next.js
## Installation
- Backend: FastAPI
1. Clone the repository:
- Frameworks/Libraries: PDM, TailwindCSS
```sh
git clone https://github.com/jaypyles/scraperr.git
cd scraperr
```
## Prerequisites
1. Deploy
```sh
make up
```
- Install Ansible
The app provides its own `traefik` configuration to use independently, but it can also easily be placed behind any other reverse proxy of your choosing.
- Create a Dockerhub account/repo and fill out the Github repo environmental variables:
## Usage
- DOCKERHUB_TOKEN
- DOCKERHUB_USERNAME
- DOCKERHUB_REPO
1. Open the application in your browser at `http://localhost`.
2. Enter the URL you want to scrape in the URL field.
3. Add elements to scrape by specifying a name and the corresponding XPath.
4. Click the "Submit" button to start the scraping process.
5. The results will be displayed in the "Results" section.
- Complete the `config.yaml` and the `inventory.yaml` in the `ansible` directory
## API Endpoints
- `github_repo`: Github repo clone address
- `deploy_path`: Path where to clone the repo to on the server
- `deploy_command`: `Make` command to run to deploy on the server
Use this service as an API for your own jobs.
- Add your domain in HOSTNAME_PROD in the `Makefile`
- `/api/submit-scrape-job`: Endpoint to submit the scraping job. Accepts a POST request with the following payload:
```json
{
"url": "http://example.com",
"elements": [
{
"name": "ElementName",
"xpath": "/div[@class='example']"
}
],
"user": "user@example.com",
"time_created": "2024-07-07T12:34:56.789Z"
}
```
## Deployment
## License
### Local Deployment
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
Uses `make` to quickly dispatch `docker-compose` commands.
### Contributions
- `deps`: rebuilds the frontend to deploy statically using the api
Development made easy by developing from the [webapp template](https://github.com/jaypyles/webapp-template). View its documentation for extra information.
- `build`: builds the container using `docker-compose build `
Start development server:
- `up-prd`: ups the container using `docker-compose -f docker-compose.yml up`
- `up-dev`: ups the container using `docker-compose -f docker-compose.yml -f docker-compose.dev.yml up`
which will deploy with local volumes.
Ex: `make deps build up-dev`
### Server Deployment
Easy deployment using `make setup deploy` after completing the required config files.
- `setup`: Install dependencies and clone repo onto server
- `deploy`: Deploy on server
To use a SSL certificate, uncomment the volumes under the `traefik` service. Add your own certificates for use in Traefik.
```yaml
volumes:
- "/var/run/docker.sock:/var/run/docker.sock:ro"
- "./dynamic_conf.yaml:/etc/traefik/dynamic_conf.yaml"
- "/etc/letsencrypt/live/domain/fullchain.pem:/etc/certs/ssl-cert.pem"
- "/etc/letsencrypt/live/domain/privkey.pem:/etc/certs/ssl-cert.key"
```
`make deps build up-dev`

View File

@@ -44,6 +44,11 @@ def read_root():
return FileResponse("./dist/index.html")
@app.get("/favicon.ico")
def read_favicon():
return FileResponse("dist/favicon.ico")
@app.post("/api/submit-scrape-job")
async def submit_scrape_job(job: SubmitScrapeJob):
LOG.info(f"Recieved job: {job}")

View File

@@ -1,6 +1,6 @@
version: "3"
services:
webscrape:
scraperr:
labels:
- "traefik.enable=true"
- "traefik.http.routers.frontend.rule=Host(`${HOSTNAME}`)"

View File

@@ -1,5 +1,5 @@
services:
webscrape:
scraperr:
build:
context: ./
container_name: webscrape

BIN
public/favicon.ico Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 454 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 12 KiB

View File

@@ -3,7 +3,7 @@
"name": "Create React App Sample",
"icons": [
{
"src": "favicon.ico",
"src": "favicon.png",
"sizes": "64x64 32x32 24x24 16x16",
"type": "image/x-icon"
},

View File

@@ -69,16 +69,17 @@ const JobTable: React.FC<JobTableProps> = ({ jobs }) => {
bgcolor="background.default"
display="flex"
justifyContent="center"
minHeight="100vh"
p={3}
>
<Box
className="flex flex-col justify-center align-center items-center"
className="flex flex-col justify-start align-center items-center"
width="100%"
maxWidth="100%"
bgcolor="background.default"
p={3}
overflow="auto"
>
<Typography variant="h4" gutterBottom>
<Typography variant="h4" gutterBottom sx={{ mt: 3 }}>
Scrape Jobs
</Typography>
<Box sx={{ overflow: "auto", width: "75%" }}>

View File

@@ -4,7 +4,6 @@ import "../styles/globals.css";
import React, { useState, useEffect } from "react";
import type { AppProps } from "next/app";
import Head from "next/head";
import { SessionProvider } from "next-auth/react";
import { ThemeProvider, CssBaseline } from "@mui/material";
import NavDrawer from "../components/NavDrawer";
import { darkTheme, lightTheme } from "../styles/themes";
@@ -34,7 +33,7 @@ const App: React.FC<AppProps> = ({ Component, pageProps }) => {
return (
<>
<Head>
<title>Webapp Template</title>
<title>Scraperr</title>
</Head>
<AuthProvider>
<ThemeProvider theme={isDarkMode ? darkTheme : lightTheme}>

View File

@@ -10,7 +10,7 @@ export default function Document() {
httpEquiv="Content-Security-Policy"
content="upgrade-insecure-requests"
/>
<meta name="description" content="Webapp Template" />
<meta name="description" content="Scraperr" />
</Head>
<body>
<noscript>You need to enable JavaScript to run this app.</noscript>

View File

@@ -148,7 +148,7 @@ const Home = () => {
>
<Container maxWidth="md">
<Typography variant="h1" gutterBottom textAlign="center">
Web Scraper
Scraperr
</Typography>
<div
style={{ marginBottom: "20px" }}

View File

@@ -1,3 +1,7 @@
@tailwind base;
@tailwind components;
@tailwind utilities;
#__next {
min-height: 100vh;
}