Refactor: Remove Proxy Dependency (#44)

This commit is contained in:
Jayden Pyles
2024-11-12 17:30:07 -06:00
committed by GitHub
parent 1dfd3ca92a
commit b3bf780eda
12 changed files with 77 additions and 164 deletions

View File

@@ -13,7 +13,7 @@ Scraperr is a self-hosted web application that allows users to scrape data from
From the table, users can download an Excel sheet of a job's results, along with an option to rerun the job.
View the [docs](https://scraperr-docs.pages.dev).
View the [docs](https://scraperr-docs.pages.dev) for a quickstart guide and more information.
## Features
@@ -64,87 +64,12 @@ View the [docs](https://scraperr-docs.pages.dev).
![chat](https://github.com/jaypyles/www-scrape/blob/master/docs/chat_page.png)
## Installation
1. Clone the repository:
```sh
git clone https://github.com/jaypyles/scraperr.git
```
2. Set environment variables and labels in `docker-compose.yml`.
```yaml
scraperr:
labels:
- "traefik.enable=true"
- "traefik.http.routers.scraperr.rule=Host(`localhost`)" # change this to your domain, if not running on localhost
- "traefik.http.routers.scraperr.entrypoints=web" # websecure if using https
- "traefik.http.services.scraperr.loadbalancer.server.port=3000"
scraperr_api:
environment:
- LOG_LEVEL=INFO
- MONGODB_URI=mongodb://root:example@webscrape-mongo:27017 # used to access MongoDB
- SECRET_KEY=your_secret_key # used to encode authentication tokens (can be a random string)
- ALGORITHM=HS256 # authentication encoding algorithm
- ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token lifetime in minutes
labels:
- "traefik.enable=true"
- "traefik.http.routers.scraperr_api.rule=Host(`localhost`) && PathPrefix(`/api`)" # change this to your domain, if not running on localhost
- "traefik.http.routers.scraperr_api.entrypoints=web" # websecure if using https
- "traefik.http.middlewares.api-stripprefix.stripprefix.prefixes=/api"
- "traefik.http.routers.scraperr_api.middlewares=api-stripprefix"
- "traefik.http.services.scraperr_api.loadbalancer.server.port=8000"
mongo:
environment:
MONGO_INITDB_ROOT_USERNAME: root
MONGO_INITDB_ROOT_PASSWORD: example
```
Don't want to use `traefik`? This configuration can be adapted to any other reverse proxy, as long as the API is proxied to `/api` of the frontend container. Scraperr currently cannot run without a reverse proxy, due to the limitations of runtime client-side environment variables in `next.js` (see the sketch after these steps).
3. Deploy
```sh
make up
```
The app provides its own `traefik` configuration to use independently, but it can easily be placed behind any other reverse proxy, including your own.
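For context on that `next.js` limitation: `NEXT_PUBLIC_*` variables are inlined into the client bundle at build time rather than read at runtime, so the frontend cannot discover the API's address dynamically. A minimal illustration (not code from this repository):
```ts
// Illustration only: Next.js inlines NEXT_PUBLIC_* values at build time.
// In source, the frontend reads:
const apiUrl = process.env.NEXT_PUBLIC_API_URL;

// but the shipped client bundle contains the literal value instead:
//   const apiUrl = "http://localhost:8000";
// Changing the variable on a running container therefore has no effect
// until the frontend image is rebuilt.
```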
## Usage
1. Open the application in your browser at `http://localhost`.
2. Enter the URL you want to scrape in the URL field.
3. Add elements to scrape by specifying a name and the corresponding XPath.
4. Click the "Submit" button to queue the URL to be scraped.
5. View the queue in the "Previous Jobs" section.
## API Endpoints
Use this service as an API for your own projects. Because the API is built with FastAPI, interactive documentation is available at `/docs`.
![docs](https://github.com/jaypyles/www-scrape/blob/master/docs/docs_page.png)
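For example, here is a minimal sketch that queues a scrape job against a local instance. The endpoint and body shape mirror the frontend's `submitJob` call shown later in this diff; the base URL and the exact key names inside `elements` are assumptions:
```ts
// Sketch: queue a scrape job directly against the API.
// Base URL and the { name, xpath } element shape are assumptions.
async function submitExampleJob(): Promise<void> {
  const response = await fetch("http://localhost:8000/api/submit-scrape-job", {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify({
      url: "https://example.com",
      elements: [{ name: "title", xpath: "//h1" }],
      time_created: new Date().toISOString(),
      job_options: { custom_headers: {}, proxies: [] },
    }),
  });
  console.log(await response.json());
}
```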
## AI
Currently supports either an Ollama instance or OpenAI's ChatGPT, using your own API key. Setup is as simple as setting either the Ollama URL or the OpenAI API key in the API's environment variables in the `docker-compose.yml` file:
```yaml
scraperr_api:
environment:
- OLLAMA_URL=http://ollama:11434
- OLLAMA_MODEL=llama3.1
# or
- OPENAI_KEY=<your_key>
- OPENAI_MODEL=gpt-3.5-turbo
```
Model names are taken from the documentation of their respective providers.
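To confirm the configuration from a client, the frontend's `checkAI` helper (visible later in this diff) issues a GET request to `/api/ai/check`. A standalone sketch, where the base URL and the boolean response payload are assumptions:
```ts
// Sketch: ask the API whether an AI backend is configured.
// Mirrors the frontend's checkAI call; token comes from the auth cookie.
async function isAiEnabled(token: string): Promise<boolean> {
  const response = await fetch("http://localhost:8000/api/ai/check", {
    headers: {
      "content-type": "application/json",
      Authorization: `Bearer ${token}`,
    },
  });
  return await response.json(); // assumed to be a boolean
}
```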
## Troubleshooting
Q: When running Scraperr, I'm met with "404 Page not found".

View File

@@ -25,7 +25,7 @@ logging.basicConfig(
LOG = logging.getLogger(__name__)
app = FastAPI(title="api")
app = FastAPI(title="api", root_path="/api")
app.add_middleware(
CORSMiddleware,

View File

@@ -2,12 +2,6 @@ version: "3"
services:
scraperr:
command: ["npm", "run", "dev"]
labels:
- "traefik.enable=true"
- "traefik.http.routers.scraperr.rule=Host(`localhost`)"
- "traefik.http.routers.scraperr.entrypoints=web"
- "traefik.http.services.scraperr.loadbalancer.server.port=3000"
- "traefik.http.routers.scraperr.tls=false"
volumes:
- "$PWD/src:/app/src"
- "$PWD/public:/app/public"
@@ -16,7 +10,5 @@ services:
- "$PWD/package-lock.json:/app/package-lock.json"
- "$PWD/tsconfig.json:/app/tsconfig.json"
scraperr_api:
ports:
- "8000:8000"
volumes:
- "$PWD/api:/project/api"

View File

@@ -6,11 +6,11 @@ services:
dockerfile: docker/frontend/Dockerfile
container_name: scraperr
command: ["npm", "run", "start"]
labels:
- "traefik.enable=true"
- "traefik.http.routers.scraperr.rule=Host(`localhost`)" # change this to your domain, if not running on localhost
- "traefik.http.routers.scraperr.entrypoints=web" # websecure if using https
- "traefik.http.services.scraperr.loadbalancer.server.port=3000"
environment:
- NEXT_PUBLIC_API_URL=http://localhost:8000 # your API URL
- SERVER_URL=http://scraperr_api:8000 # your docker container API URL
ports:
- 80:3000
networks:
- web
scraperr_api:
@@ -21,36 +21,15 @@ services:
dockerfile: docker/api/Dockerfile
environment:
- LOG_LEVEL=INFO
- OLLAMA_URL=http://ollama:11434
- OLLAMA_MODEL=phi3
- MONGODB_URI=mongodb://root:example@webscrape-mongo:27017 # used to access MongoDB
- SECRET_KEY=your_secret_key # used to encode authentication tokens (can be a random string)
- ALGORITHM=HS256 # authentication encoding algorithm
- ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token lifetime in minutes
container_name: scraperr_api
ports:
- 8000:8000
volumes:
- /var/run/docker.sock:/var/run/docker.sock
labels:
- "traefik.enable=true"
- "traefik.http.routers.scraperr_api.rule=Host(`localhost`) && PathPrefix(`/api`)" # change this to your domain, if not running on localhost
- "traefik.http.routers.scraperr_api.entrypoints=web" # websecure if using https
- "traefik.http.middlewares.api-stripprefix.stripprefix.prefixes=/api"
- "traefik.http.routers.scraperr_api.middlewares=api-stripprefix"
- "traefik.http.services.scraperr_api.loadbalancer.server.port=8000"
networks:
- web
traefik:
image: traefik:latest
container_name: traefik
command:
- "--providers.docker=true"
- "--entrypoints.web.address=:80"
- "--entrypoints.websecure.address=:443"
ports:
- 80:80
- 443:443
volumes:
- "/var/run/docker.sock:/var/run/docker.sock:ro"
networks:
- web
mongo:

View File

@@ -48,11 +48,14 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
const router = useRouter();
const handleDownload = async (ids: string[]) => {
const response = await fetch(`${Constants.DOMAIN}/api/download`, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ ids: ids }),
});
const response = await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/api/download`,
{
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ ids: ids }),
}
);
if (response.ok) {
const blob = await response.blob();
@@ -104,11 +107,14 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
};
const handleDeleteSelected = async () => {
const response = await fetch(`${Constants.DOMAIN}/api/delete-scrape-jobs`, {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ ids: Array.from(selectedJobs) }),
});
const response = await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/api/delete-scrape-jobs`,
{
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify({ ids: Array.from(selectedJobs) }),
}
);
if (response.ok) {
setJobs((jobs) =>
@@ -142,7 +148,7 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
value: value,
};
await fetch(`${Constants.DOMAIN}/api/update`, {
await fetch(`${process.env.NEXT_PUBLIC_API_URL}/api/update`, {
method: "POST",
headers: {
"Content-Type": "application/json",

View File

@@ -25,7 +25,7 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => {
const token = Cookies.get("token");
if (token) {
axios
.get(`${Constants.DOMAIN}/api/auth/users/me`, {
.get(`${process.env.NEXT_PUBLIC_API_URL}/api/auth/users/me`, {
headers: { Authorization: `Bearer ${token}` },
})
.then((response) => {
@@ -43,7 +43,7 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => {
params.append("username", email);
params.append("password", password);
const response = await axios.post(
`${Constants.DOMAIN}/api/auth/token`,
`${process.env.NEXT_PUBLIC_API_URL}/api/auth/token`,
params
);
Cookies.set("token", response.data.access_token, {
@@ -54,7 +54,7 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => {
sameSite: "Lax",
});
const userResponse = await axios.get(
`${Constants.DOMAIN}/api/auth/users/me`,
`${process.env.NEXT_PUBLIC_API_URL}/api/auth/users/me`,
{
headers: { Authorization: `Bearer ${response.data.access_token}` },
}

View File

@@ -11,7 +11,7 @@ export const fetchJobs = async (
fetchOptions: fetchOptions = {}
) => {
const token = Cookies.get("token");
await fetch(`/api/retrieve-scrape-jobs`, {
await fetch(`${process.env.NEXT_PUBLIC_API_URL}/api/retrieve-scrape-jobs`, {
method: "POST",
headers: {
"content-type": "application/json",
@@ -29,12 +29,15 @@ export const fetchJobs = async (
export const fetchJob = async (id: string) => {
const token = Cookies.get("token");
try {
const response = await fetch(`/api/job/${id}`, {
headers: {
"content-type": "application/json",
Authorization: `Bearer ${token}`,
},
});
const response = await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/api/job/${id}`,
{
headers: {
"content-type": "application/json",
Authorization: `Bearer ${token}`,
},
}
);
const data = await response.json();
return data;
} catch (error) {
@@ -48,12 +51,15 @@ export const checkAI = async (
) => {
const token = Cookies.get("token");
try {
const response = await fetch(`/api/ai/check`, {
headers: {
"content-type": "application/json",
Authorization: `Bearer ${token}`,
},
});
const response = await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/api/ai/check`,
{
headers: {
"content-type": "application/json",
Authorization: `Bearer ${token}`,
},
}
);
const data = await response.json();
setAiEnabled(data);
} catch (error) {
@@ -69,7 +75,7 @@ export const updateJob = async (ids: string[], field: string, value: any) => {
field: field,
value: value,
};
await fetch(`/api/update`, {
await fetch(`${process.env.NEXT_PUBLIC_API_URL}/api/update`, {
method: "POST",
headers: {
"content-type": "application/json",

View File

@@ -81,7 +81,7 @@ const AI: React.FC = () => {
}. The following messages will pertain to the content of the scraped job.`,
};
const response = await fetch("/api/ai", {
const response = await fetch(`${process.env.NEXT_PUBLIC_API_URL}/api/ai`, {
method: "POST",
headers: {
"Content-Type": "application/json",

View File

@@ -22,7 +22,7 @@ export const getServerSideProps: GetServerSideProps = async (context) => {
if (token) {
try {
const userResponse = await axios.get(
`http://scraperr_api:8000/api/auth/users/me`,
`${process.env.SERVER_URL}/api/auth/users/me`,
{
headers: { Authorization: `Bearer ${token}` },
}
@@ -30,7 +30,7 @@ export const getServerSideProps: GetServerSideProps = async (context) => {
user = userResponse.data;
const jobsResponse = await axios.post(
`http://scraperr_api:8000/api/retrieve-scrape-jobs`,
`${process.env.SERVER_URL}/api/retrieve-scrape-jobs`,
{ user: user.email },
{
headers: {

View File

@@ -6,7 +6,9 @@ interface logs {
export async function getStaticProps() {
try {
const response = await fetch(`http://scraperr_api:8000/initial_logs`);
const response = await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/initial_logs`
);
const logJson: logs = await response.json();
const initialLogs = logJson.logs;

View File

@@ -30,7 +30,7 @@ export const getServerSideProps: GetServerSideProps = async (context) => {
if (token) {
try {
const averageElementResponse = await fetch(
`http://scraperr_api:8000/statistics/get-average-element-per-link`,
`${process.env.SERVER_URL}/statistics/get-average-element-per-link`,
{
headers: { Authorization: `Bearer ${token}` },
}
@@ -39,7 +39,7 @@ export const getServerSideProps: GetServerSideProps = async (context) => {
averageElement = await averageElementResponse.json();
const averageJobResponse = await fetch(
`http://scraperr_api:8000/statistics/get-average-jobs-per-day`,
`${process.env.SERVER_URL}/statistics/get-average-jobs-per-day`,
{
headers: { Authorization: `Bearer ${token}` },
}
@@ -76,7 +76,7 @@ const Statistics: React.FC<StatProps> = ({ averageElement, averageJob }) => {
const fetchElementsData = async () => {
try {
const response = await fetch(
`${Constants.DOMAIN}/api/statistics/get-average-element-per-link`,
`${process.env.NEXT_PUBLIC_API_URL}/api/statistics/get-average-element-per-link`,
{
headers: {
"Content-Type": "application/json",
@@ -94,7 +94,7 @@ const Statistics: React.FC<StatProps> = ({ averageElement, averageJob }) => {
const fetchJobsData = async () => {
try {
const response = await fetch(
`${Constants.DOMAIN}/api/statistics/get-average-jobs-per-day`,
`${process.env.NEXT_PUBLIC_API_URL}/api/statistics/get-average-jobs-per-day`,
{
headers: {
"Content-Type": "application/json",

View File

@@ -7,19 +7,22 @@ export const submitJob = async (
jobOptions: any,
customHeaders: any
) => {
return await fetch(`${Constants.DOMAIN}/api/submit-scrape-job`, {
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
url: submittedURL,
elements: rows,
user: user?.email,
time_created: new Date().toISOString(),
job_options: {
...jobOptions,
custom_headers: customHeaders,
proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [],
},
}),
});
return await fetch(
`${process.env.NEXT_PUBLIC_API_URL}/api/submit-scrape-job`,
{
method: "POST",
headers: { "content-type": "application/json" },
body: JSON.stringify({
url: submittedURL,
elements: rows,
user: user?.email,
time_created: new Date().toISOString(),
job_options: {
...jobOptions,
custom_headers: customHeaders,
proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [],
},
}),
}
);
};
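Every frontend change in this commit applies the same substitution: `Constants.DOMAIN` and hard-coded `http://scraperr_api:8000` URLs become `process.env.NEXT_PUBLIC_API_URL` in browser code and `process.env.SERVER_URL` in server-side code. A hypothetical helper (not part of this commit) could centralize that choice; the fallback values are assumptions matching the defaults in `docker-compose.yml`:
```ts
// Hypothetical helper, not in this commit: pick the API base URL
// depending on where the code runs. Fallbacks mirror docker-compose.yml.
export const apiBase = (): string =>
  typeof window === "undefined"
    ? process.env.SERVER_URL ?? "http://scraperr_api:8000" // inside Docker
    : process.env.NEXT_PUBLIC_API_URL ?? "http://localhost:8000"; // browser
```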