diff --git a/README.md b/README.md
index 14a66d0..f63b03a 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@ Scraperr is a self-hosted web application that allows users to scrape data from
 
 From the table, users can download an excel sheet of the job's results, along with an option to rerun the job.
 
-View the [docs](https://scraperr-docs.pages.dev).
+View the [docs](https://scraperr-docs.pages.dev) for a quickstart guide and more information.
 
 ## Features
 
@@ -64,87 +64,12 @@ View the [docs](https://scraperr-docs.pages.dev).
 
 ![chat](https://github.com/jaypyles/www-scrape/blob/master/docs/chat_page.png)
 
-## Installation
-
-1. Clone the repository:
-
-   ```sh
-   git clone https://github.com/jaypyles/scraperr.git
-
-   ```
-
-2. Set environmental variables and labels in `docker-compose.yml`.
-
-```yaml
-scraperr:
-  labels:
-    - "traefik.enable=true"
-    - "traefik.http.routers.scraperr.rule=Host(`localhost`)" # change this to your domain, if not running on localhost
-    - "traefik.http.routers.scraperr.entrypoints=web" # websecure if using https
-    - "traefik.http.services.scraperr.loadbalancer.server.port=3000"
-
-scraperr_api:
-  environment:
-    - LOG_LEVEL=INFO
-    - MONGODB_URI=mongodb://root:example@webscrape-mongo:27017 # used to access MongoDB
-    - SECRET_KEY=your_secret_key # used to encode authentication tokens (can be a random string)
-    - ALGORITHM=HS256 # authentication encoding algorithm
-    - ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token expire minutes
-  labels:
-    - "traefik.enable=true"
-    - "traefik.http.routers.scraperr_api.rule=Host(`localhost`) && PathPrefix(`/api`)" # change this to your domain, if not running on localhost
-    - "traefik.http.routers.scraperr_api.entrypoints=web" # websecure if using https
-    - "traefik.http.middlewares.api-stripprefix.stripprefix.prefixes=/api"
-    - "traefik.http.routers.scraperr_api.middlewares=api-stripprefix"
-    - "traefik.http.services.scraperr_api.loadbalancer.server.port=8000"
-
-mongo:
-  environment:
-    MONGO_INITDB_ROOT_USERNAME: root
-    MONGO_INITDB_ROOT_PASSWORD: example
-```
-
-Don't want to use `traefik`? This configuration can be used in other reverse proxies, as long as the API is proxied to `/api` of the frontend container. This is currently
-not able to be used without a reverse proxy, due to limitations of runtime client-side environmental variables in `next.js`.
-
-3. Deploy
-
-```sh
-make up
-```
-
-The app provides its own `traefik` configuration to use independently, but can easily be reverse-proxied by any other app, or your own reverse-proxy.
-
-## Usage
-
-1. Open the application in your browser at `http://localhost`.
-2. Enter the URL you want to scrape in the URL field.
-3. Add elements to scrape by specifying a name and the corresponding XPath.
-4. Click the "Submit" button to queue URL to be scraped.
-5. View queue in the "Previous Jobs" section.
-
 ## API Endpoints
 
 Use this service as an API for your own projects. Due to this using FastAPI, a docs page is available at `/docs` for the API.
 
 ![docs](https://github.com/jaypyles/www-scrape/blob/master/docs/docs_page.png)
 
-## AI
-
-Currently supports either an Ollama instance or OpenAI's ChatGPT, using your own API key. Setting up is easy as either setting the Ollama url or the OpenAI API key in the API's environmental variables in the `docker-compose.yml` file:
-
-```yaml
-scraperr_api:
-  environment:
-    - OLLAMA_URL=http://ollama:11434
-    - OLLAMA_MODEL=llama3.1
-    # or
-    - OPENAI_KEY=
-    - OPENAI_MODEL=gpt3.5-turbo
-```
-
-The model's names are taken from the documentation of their respective technologies.
-
 ## Troubleshooting
 
 Q: When running Scraperr, I'm met with "404 Page not found".
diff --git a/api/backend/app.py b/api/backend/app.py
index 9415ec9..9f130b1 100644
--- a/api/backend/app.py
+++ b/api/backend/app.py
@@ -25,7 +25,7 @@ logging.basicConfig(
 
 LOG = logging.getLogger(__name__)
 
-app = FastAPI(title="api")
+app = FastAPI(title="api", root_path="/api")
 
 app.add_middleware(
     CORSMiddleware,
diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml
index 0c47476..8c2d902 100644
--- a/docker-compose.dev.yml
+++ b/docker-compose.dev.yml
@@ -2,12 +2,6 @@ version: "3"
 services:
   scraperr:
     command: ["npm", "run", "dev"]
-    labels:
-      - "traefik.enable=true"
-      - "traefik.http.routers.scraperr.rule=Host(`localhost`)"
-      - "traefik.http.routers.scraperr.entrypoints=web"
-      - "traefik.http.services.scraperr.loadbalancer.server.port=3000"
-      - "traefik.http.routers.scraperr.tls=false"
     volumes:
       - "$PWD/src:/app/src"
      - "$PWD/public:/app/public"
@@ -16,7 +10,5 @@ services:
       - "$PWD/package-lock.json:/app/package-lock.json"
       - "$PWD/tsconfig.json:/app/tsconfig.json"
   scraperr_api:
-    ports:
-      - "8000:8000"
     volumes:
       - "$PWD/api:/project/api"
diff --git a/docker-compose.yml b/docker-compose.yml
index fcf51db..30f26c3 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -6,11 +6,11 @@ services:
       dockerfile: docker/frontend/Dockerfile
     container_name: scraperr
     command: ["npm", "run", "start"]
-    labels:
-      - "traefik.enable=true"
-      - "traefik.http.routers.scraperr.rule=Host(`localhost`)" # change this to your domain, if not running on localhost
-      - "traefik.http.routers.scraperr.entrypoints=web" # websecure if using https
-      - "traefik.http.services.scraperr.loadbalancer.server.port=3000"
+    environment:
+      - NEXT_PUBLIC_API_URL=http://localhost:8000 # your API URL
+      - SERVER_URL=http://scraperr_api:8000 # your docker container API URL
+    ports:
+      - 80:3000
     networks:
       - web
   scraperr_api:
@@ -21,36 +21,15 @@ services:
       dockerfile: docker/api/Dockerfile
     environment:
       - LOG_LEVEL=INFO
-      - OLLAMA_URL=http://ollama:11434
-      - OLLAMA_MODEL=phi3
       - MONGODB_URI=mongodb://root:example@webscrape-mongo:27017 # used to access MongoDB
       - SECRET_KEY=your_secret_key # used to encode authentication tokens (can be a random string)
       - ALGORITHM=HS256 # authentication encoding algorithm
       - ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token expire minutes
     container_name: scraperr_api
+    ports:
+      - 8000:8000
     volumes:
       - /var/run/docker.sock:/var/run/docker.sock
-    labels:
-      - "traefik.enable=true"
-      - "traefik.http.routers.scraperr_api.rule=Host(`localhost`) && PathPrefix(`/api`)" # change this to your domain, if not running on localhost
-      - "traefik.http.routers.scraperr_api.entrypoints=web" # websecure if using https
-      - "traefik.http.middlewares.api-stripprefix.stripprefix.prefixes=/api"
-      - "traefik.http.routers.scraperr_api.middlewares=api-stripprefix"
-      - "traefik.http.services.scraperr_api.loadbalancer.server.port=8000"
-    networks:
-      - web
-  traefik:
-    image: traefik:latest
-    container_name: traefik
-    command:
-      - "--providers.docker=true"
-      - "--entrypoints.web.address=:80"
-      - "--entrypoints.websecure.address=:443"
-    ports:
-      - 80:80
-      - 443:443
-    volumes:
-      - /var/run/docker.sock:/var/run/docker.sock:ro"
     networks:
       - web
   mongo:
diff --git a/src/components/jobs/JobTable.tsx b/src/components/jobs/JobTable.tsx
index fecea28..53f56b4 100644
--- a/src/components/jobs/JobTable.tsx
+++ b/src/components/jobs/JobTable.tsx
@@ -48,11 +48,14 @@ export const JobTable: React.FC = ({ jobs, setJobs }) => {
   const router = useRouter();
 
   const handleDownload = async (ids: string[]) => {
-    const response = await fetch(`${Constants.DOMAIN}/api/download`, {
-      method: "POST",
-      headers: { "Content-Type": "application/json" },
-      body: JSON.stringify({ ids: ids }),
-    });
+    const response = await fetch(
+      `${process.env.NEXT_PUBLIC_API_URL}/api/download`,
+      {
+        method: "POST",
+        headers: { "Content-Type": "application/json" },
+        body: JSON.stringify({ ids: ids }),
+      }
+    );
 
     if (response.ok) {
       const blob = await response.blob();
@@ -104,11 +107,14 @@ export const JobTable: React.FC = ({ jobs, setJobs }) => {
   };
 
   const handleDeleteSelected = async () => {
-    const response = await fetch(`${Constants.DOMAIN}/api/delete-scrape-jobs`, {
-      method: "POST",
-      headers: { "Content-Type": "application/json" },
-      body: JSON.stringify({ ids: Array.from(selectedJobs) }),
-    });
+    const response = await fetch(
+      `${process.env.NEXT_PUBLIC_API_URL}/api/delete-scrape-jobs`,
+      {
+        method: "POST",
+        headers: { "Content-Type": "application/json" },
+        body: JSON.stringify({ ids: Array.from(selectedJobs) }),
+      }
+    );
 
     if (response.ok) {
       setJobs((jobs) =>
@@ -142,7 +148,7 @@ export const JobTable: React.FC = ({ jobs, setJobs }) => {
       value: value,
     };
 
-    await fetch(`${Constants.DOMAIN}/api/update`, {
+    await fetch(`${process.env.NEXT_PUBLIC_API_URL}/api/update`, {
       method: "POST",
       headers: {
         "Content-Type": "application/json",
diff --git a/src/contexts/AuthContext.tsx b/src/contexts/AuthContext.tsx
index 63e0207..8d41065 100644
--- a/src/contexts/AuthContext.tsx
+++ b/src/contexts/AuthContext.tsx
@@ -25,7 +25,7 @@ export const AuthProvider: React.FC = ({ children }) => {
     const token = Cookies.get("token");
     if (token) {
       axios
-        .get(`${Constants.DOMAIN}/api/auth/users/me`, {
+        .get(`${process.env.NEXT_PUBLIC_API_URL}/api/auth/users/me`, {
           headers: { Authorization: `Bearer ${token}` },
         })
         .then((response) => {
@@ -43,7 +43,7 @@ export const AuthProvider: React.FC = ({ children }) => {
     params.append("username", email);
     params.append("password", password);
     const response = await axios.post(
-      `${Constants.DOMAIN}/api/auth/token`,
+      `${process.env.NEXT_PUBLIC_API_URL}/api/auth/token`,
       params
     );
     Cookies.set("token", response.data.access_token, {
@@ -54,7 +54,7 @@ export const AuthProvider: React.FC = ({ children }) => {
       sameSite: "Lax",
     });
     const userResponse = await axios.get(
-      `${Constants.DOMAIN}/api/auth/users/me`,
+      `${process.env.NEXT_PUBLIC_API_URL}/api/auth/users/me`,
       {
         headers: { Authorization: `Bearer ${response.data.access_token}` },
       }
diff --git a/src/lib/utils.ts b/src/lib/utils.ts
index ac16466..7bb2798 100644
--- a/src/lib/utils.ts
+++ b/src/lib/utils.ts
@@ -11,7 +11,7 @@ export const fetchJobs = async (
   fetchOptions: fetchOptions = {}
 ) => {
   const token = Cookies.get("token");
-  await fetch(`/api/retrieve-scrape-jobs`, {
+  await fetch(`${process.env.NEXT_PUBLIC_API_URL}/api/retrieve-scrape-jobs`, {
     method: "POST",
     headers: {
       "content-type": "application/json",
@@ -29,12 +29,15 @@ export const fetchJobs = async (
 export const fetchJob = async (id: string) => {
   const token = Cookies.get("token");
   try {
-    const response = await fetch(`/api/job/${id}`, {
-      headers: {
-        "content-type": "application/json",
-        Authorization: `Bearer ${token}`,
-      },
-    });
+    const response = await fetch(
+      `${process.env.NEXT_PUBLIC_API_URL}/api/job/${id}`,
+      {
+        headers: {
+          "content-type": "application/json",
+          Authorization: `Bearer ${token}`,
+        },
+      }
+    );
     const data = await response.json();
     return data;
   } catch (error) {
@@ -48,12 +51,15 @@ export const checkAI = async (
 ) => {
   const token = Cookies.get("token");
   try {
-    const response = await fetch(`/api/ai/check`, {
-      headers: {
-        "content-type": "application/json",
-        Authorization: `Bearer ${token}`,
-      },
-    });
+    const response = await fetch(
+      `${process.env.NEXT_PUBLIC_API_URL}/api/ai/check`,
+      {
+        headers: {
+          "content-type": "application/json",
+          Authorization: `Bearer ${token}`,
+        },
+      }
+    );
     const data = await response.json();
     setAiEnabled(data);
   } catch (error) {
@@ -69,7 +75,7 @@ export const updateJob = async (ids: string[], field: string, value: any) => {
     field: field,
     value: value,
   };
-  await fetch(`/api/update`, {
+  await fetch(`${process.env.NEXT_PUBLIC_API_URL}/api/update`, {
     method: "POST",
     headers: {
       "content-type": "application/json",
diff --git a/src/pages/chat.tsx b/src/pages/chat.tsx
index 9f44de1..3a9d26f 100644
--- a/src/pages/chat.tsx
+++ b/src/pages/chat.tsx
@@ -81,7 +81,7 @@ const AI: React.FC = () => {
       }. The following messages will pertain to the content of the scraped job.`,
     };
 
-    const response = await fetch("/api/ai", {
+    const response = await fetch(`${process.env.NEXT_PUBLIC_API_URL}/api/ai`, {
       method: "POST",
       headers: {
         "Content-Type": "application/json",
diff --git a/src/pages/jobs.tsx b/src/pages/jobs.tsx
index 9a8a008..3e131ee 100644
--- a/src/pages/jobs.tsx
+++ b/src/pages/jobs.tsx
@@ -22,7 +22,7 @@ export const getServerSideProps: GetServerSideProps = async (context) => {
   if (token) {
     try {
       const userResponse = await axios.get(
-        `http://scraperr_api:8000/api/auth/users/me`,
+        `${process.env.SERVER_URL}/api/auth/users/me`,
         {
           headers: { Authorization: `Bearer ${token}` },
         }
@@ -30,7 +30,7 @@ export const getServerSideProps: GetServerSideProps = async (context) => {
       user = userResponse.data;
 
       const jobsResponse = await axios.post(
-        `http://scraperr_api:8000/api/retrieve-scrape-jobs`,
+        `${process.env.SERVER_URL}/api/retrieve-scrape-jobs`,
         { user: user.email },
         {
           headers: {
diff --git a/src/pages/logs.tsx b/src/pages/logs.tsx
index ec146fa..aecb74b 100644
--- a/src/pages/logs.tsx
+++ b/src/pages/logs.tsx
@@ -6,7 +6,9 @@ interface logs {
 
 export async function getStaticProps() {
   try {
-    const response = await fetch(`http://scraperr_api:8000/initial_logs`);
+    const response = await fetch(
+      `${process.env.SERVER_URL}/initial_logs`
+    );
     const logJson: logs = await response.json();
     const initialLogs = logJson.logs;
 
diff --git a/src/pages/statistics.tsx b/src/pages/statistics.tsx
index 11b10c2..66c3457 100644
--- a/src/pages/statistics.tsx
+++ b/src/pages/statistics.tsx
@@ -30,7 +30,7 @@ export const getServerSideProps: GetServerSideProps = async (context) => {
   if (token) {
     try {
       const averageElementResponse = await fetch(
-        `http://scraperr_api:8000/statistics/get-average-element-per-link`,
+        `${process.env.SERVER_URL}/statistics/get-average-element-per-link`,
         {
           headers: { Authorization: `Bearer ${token}` },
         }
@@ -39,7 +39,7 @@ export const getServerSideProps: GetServerSideProps = async (context) => {
       averageElement = await averageElementResponse.json();
 
       const averageJobResponse = await fetch(
-        `http://scraperr_api:8000/statistics/get-average-jobs-per-day`,
+        `${process.env.SERVER_URL}/statistics/get-average-jobs-per-day`,
         {
           headers: { Authorization: `Bearer ${token}` },
         }
@@ -76,7 +76,7 @@ const Statistics: React.FC = ({ averageElement, averageJob }) => {
   const fetchElementsData = async () => {
     try {
       const response = await fetch(
-        `${Constants.DOMAIN}/api/statistics/get-average-element-per-link`,
+        `${process.env.NEXT_PUBLIC_API_URL}/api/statistics/get-average-element-per-link`,
         {
           headers: {
             "Content-Type": "application/json",
@@ -94,7 +94,7 @@ const Statistics: React.FC = ({ averageElement, averageJob }) => {
   const fetchJobsData = async () => {
     try {
       const response = await fetch(
-        `${Constants.DOMAIN}/api/statistics/get-average-jobs-per-day`,
+        `${process.env.NEXT_PUBLIC_API_URL}/api/statistics/get-average-jobs-per-day`,
         {
           headers: {
             "Content-Type": "application/json",
diff --git a/src/services/api-service/functions/submit-job.ts b/src/services/api-service/functions/submit-job.ts
index 9e82a71..3147dfe 100644
--- a/src/services/api-service/functions/submit-job.ts
+++ b/src/services/api-service/functions/submit-job.ts
@@ -7,19 +7,22 @@ export const submitJob = async (
   jobOptions: any,
   customHeaders: any
 ) => {
-  return await fetch(`${Constants.DOMAIN}/api/submit-scrape-job`, {
-    method: "POST",
-    headers: { "content-type": "application/json" },
-    body: JSON.stringify({
-      url: submittedURL,
-      elements: rows,
-      user: user?.email,
-      time_created: new Date().toISOString(),
-      job_options: {
-        ...jobOptions,
-        custom_headers: customHeaders,
-        proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [],
-      },
-    }),
-  });
+  return await fetch(
+    `${process.env.NEXT_PUBLIC_API_URL}/api/submit-scrape-job`,
+    {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        url: submittedURL,
+        elements: rows,
+        user: user?.email,
+        time_created: new Date().toISOString(),
+        job_options: {
+          ...jobOptions,
+          custom_headers: customHeaders,
+          proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [],
+        },
+      }),
+    }
+  );
 };
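
A note on the pattern this patch introduces: with traefik removed, the frontend resolves two views of the same API, `NEXT_PUBLIC_API_URL` for requests issued by the browser and `SERVER_URL` for requests issued from `getServerSideProps`/`getStaticProps` inside the Docker network. Below is a minimal sketch of how that choice could be centralized instead of repeated at every call site; the helper name, file path, and fallback URLs are illustrative assumptions, not part of this diff.

```ts
// src/lib/api-base.ts (hypothetical helper, not part of this diff)

// NEXT_PUBLIC_* variables are inlined into the client bundle at build time,
// so the browser branch works without runtime env access. SERVER_URL is only
// readable in Node, where getServerSideProps/getStaticProps run and where
// only the Docker hostname (e.g. http://scraperr_api:8000) reaches the API.
export const apiBase = (): string =>
  typeof window === "undefined"
    ? process.env.SERVER_URL ?? "http://scraperr_api:8000"
    : process.env.NEXT_PUBLIC_API_URL ?? "http://localhost:8000";

// Example call site, mirroring the fetches rewritten in this diff:
export const fetchJob = async (id: string, token: string) => {
  const response = await fetch(`${apiBase()}/api/job/${id}`, {
    headers: {
      "content-type": "application/json",
      Authorization: `Bearer ${token}`,
    },
  });
  return response.json();
};
```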