Mirror of https://github.com/jaypyles/Scraperr.git (synced 2025-12-14 11:46:17 +00:00)

Refactor: Remove Proxy Dependency (#44)
README.md
@@ -13,7 +13,7 @@ Scraperr is a self-hosted web application that allows users to scrape data from
From the table, users can download an Excel sheet of the job's results, along with an option to rerun the job.
-View the [docs](https://scraperr-docs.pages.dev).
+View the [docs](https://scraperr-docs.pages.dev) for a quickstart guide and more information.

## Features
@@ -64,87 +64,12 @@ View the [docs](https://scraperr-docs.pages.dev).

## Installation

1. Clone the repository:

```sh
git clone https://github.com/jaypyles/scraperr.git
```

2. Set environment variables and labels in `docker-compose.yml`.
```yaml
scraperr:
  labels:
    - "traefik.enable=true"
    - "traefik.http.routers.scraperr.rule=Host(`localhost`)" # change this to your domain, if not running on localhost
    - "traefik.http.routers.scraperr.entrypoints=web" # websecure if using https
    - "traefik.http.services.scraperr.loadbalancer.server.port=3000"

scraperr_api:
  environment:
    - LOG_LEVEL=INFO
    - MONGODB_URI=mongodb://root:example@webscrape-mongo:27017 # used to access MongoDB
    - SECRET_KEY=your_secret_key # used to encode authentication tokens (can be a random string)
    - ALGORITHM=HS256 # authentication encoding algorithm
    - ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token expire minutes
  labels:
    - "traefik.enable=true"
    - "traefik.http.routers.scraperr_api.rule=Host(`localhost`) && PathPrefix(`/api`)" # change this to your domain, if not running on localhost
    - "traefik.http.routers.scraperr_api.entrypoints=web" # websecure if using https
    - "traefik.http.middlewares.api-stripprefix.stripprefix.prefixes=/api"
    - "traefik.http.routers.scraperr_api.middlewares=api-stripprefix"
    - "traefik.http.services.scraperr_api.loadbalancer.server.port=8000"

mongo:
  environment:
    MONGO_INITDB_ROOT_USERNAME: root
    MONGO_INITDB_ROOT_PASSWORD: example
```
Don't want to use `traefik`? This configuration works with other reverse proxies as well, as long as the API is proxied to `/api` of the frontend container. Scraperr cannot currently run without a reverse proxy, due to the limitations of runtime client-side environment variables in Next.js.
3. Deploy:

```sh
make up
```

The app ships with its own `traefik` configuration that can be used independently, but it can just as easily sit behind any other reverse proxy of your own.
## Usage

1. Open the application in your browser at `http://localhost`.
2. Enter the URL you want to scrape in the URL field.
3. Add elements to scrape by specifying a name and the corresponding XPath.
4. Click the "Submit" button to queue the URL to be scraped.
5. View the queue in the "Previous Jobs" section.
## API Endpoints

Use this service as an API for your own projects. Because the API is built with FastAPI, interactive documentation is available at `/docs`.
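For example, a job can be queued with a plain HTTP request. The sketch below mirrors the `submit-scrape-job` payload used by the frontend code later in this diff; the element key names (`name`, `xpath`), the JSON response, and the fallback URL are assumptions for illustration, not part of this commit.

```ts
// Minimal sketch: queue a scrape job against a local Scraperr API.
// Payload mirrors the frontend's submitJob call in this commit; the element
// key names and the JSON response shape are assumptions.
const API_URL = process.env.NEXT_PUBLIC_API_URL ?? "http://localhost:8000";

export async function submitScrapeJob(url: string) {
  const response = await fetch(`${API_URL}/api/submit-scrape-job`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify({
      url,
      elements: [{ name: "title", xpath: "//title" }], // hypothetical element row
      time_created: new Date().toISOString(),
      job_options: { custom_headers: {}, proxies: [] },
    }),
  });

  if (!response.ok) {
    throw new Error(`Failed to submit job: ${response.status}`);
  }

  return response.json();
}
```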
## AI

Scraperr currently supports either an Ollama instance or OpenAI's ChatGPT, using your own API key. Setup is as simple as setting either the Ollama URL or the OpenAI API key in the API's environment variables in the `docker-compose.yml` file:

```yaml
scraperr_api:
  environment:
    - OLLAMA_URL=http://ollama:11434
    - OLLAMA_MODEL=llama3.1
    # or
    - OPENAI_KEY=<your_key>
    - OPENAI_MODEL=gpt-3.5-turbo
```

The model names are taken from the documentation of their respective technologies.
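To see whether an AI backend is configured, the frontend calls `/api/ai/check` (shown later in this diff). The sketch below mirrors that call; the boolean response shape and the fallback URL are assumptions.

```ts
// Sketch: ask the API whether an AI backend (Ollama or OpenAI) is configured.
// Mirrors the frontend's checkAI call; the boolean response is an assumption.
const API_URL = process.env.NEXT_PUBLIC_API_URL ?? "http://localhost:8000";

export async function isAiEnabled(token: string): Promise<boolean> {
  const response = await fetch(`${API_URL}/api/ai/check`, {
    headers: {
      "content-type": "application/json",
      Authorization: `Bearer ${token}`, // token issued by /api/auth/token
    },
  });
  return response.json();
}
```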
## Troubleshooting

Q: When running Scraperr, I'm met with "404 Page not found".
@@ -25,7 +25,7 @@ logging.basicConfig(

LOG = logging.getLogger(__name__)

-app = FastAPI(title="api")
+app = FastAPI(title="api", root_path="/api")

app.add_middleware(
    CORSMiddleware,
@@ -2,12 +2,6 @@ version: "3"
services:
  scraperr:
    command: ["npm", "run", "dev"]
-    labels:
-      - "traefik.enable=true"
-      - "traefik.http.routers.scraperr.rule=Host(`localhost`)"
-      - "traefik.http.routers.scraperr.entrypoints=web"
-      - "traefik.http.services.scraperr.loadbalancer.server.port=3000"
-      - "traefik.http.routers.scraperr.tls=false"
    volumes:
      - "$PWD/src:/app/src"
      - "$PWD/public:/app/public"
@@ -16,7 +10,5 @@ services:
      - "$PWD/package-lock.json:/app/package-lock.json"
      - "$PWD/tsconfig.json:/app/tsconfig.json"
  scraperr_api:
-    ports:
-      - "8000:8000"
    volumes:
      - "$PWD/api:/project/api"
@@ -6,11 +6,11 @@ services:
      dockerfile: docker/frontend/Dockerfile
    container_name: scraperr
    command: ["npm", "run", "start"]
-    labels:
-      - "traefik.enable=true"
-      - "traefik.http.routers.scraperr.rule=Host(`localhost`)" # change this to your domain, if not running on localhost
-      - "traefik.http.routers.scraperr.entrypoints=web" # websecure if using https
-      - "traefik.http.services.scraperr.loadbalancer.server.port=3000"
+    environment:
+      - NEXT_PUBLIC_API_URL=http://localhost:8000 # your API URL
+      - SERVER_URL=http://scraperr_api:8000 # your docker container API URL
+    ports:
+      - 80:3000
    networks:
      - web
  scraperr_api:
@@ -21,36 +21,15 @@ services:
      dockerfile: docker/api/Dockerfile
    environment:
      - LOG_LEVEL=INFO
-      - OLLAMA_URL=http://ollama:11434
-      - OLLAMA_MODEL=phi3
      - MONGODB_URI=mongodb://root:example@webscrape-mongo:27017 # used to access MongoDB
      - SECRET_KEY=your_secret_key # used to encode authentication tokens (can be a random string)
      - ALGORITHM=HS256 # authentication encoding algorithm
      - ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token expire minutes
    container_name: scraperr_api
+    ports:
+      - 8000:8000
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
-    labels:
-      - "traefik.enable=true"
-      - "traefik.http.routers.scraperr_api.rule=Host(`localhost`) && PathPrefix(`/api`)" # change this to your domain, if not running on localhost
-      - "traefik.http.routers.scraperr_api.entrypoints=web" # websecure if using https
-      - "traefik.http.middlewares.api-stripprefix.stripprefix.prefixes=/api"
-      - "traefik.http.routers.scraperr_api.middlewares=api-stripprefix"
-      - "traefik.http.services.scraperr_api.loadbalancer.server.port=8000"
    networks:
      - web
-  traefik:
-    image: traefik:latest
-    container_name: traefik
-    command:
-      - "--providers.docker=true"
-      - "--entrypoints.web.address=:80"
-      - "--entrypoints.websecure.address=:443"
-    ports:
-      - 80:80
-      - 443:443
-    volumes:
-      - /var/run/docker.sock:/var/run/docker.sock:ro
-    networks:
-      - web
  mongo:
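The compose file above introduces two base URLs: `SERVER_URL` for server-side requests over the Docker network and `NEXT_PUBLIC_API_URL` for requests made from the browser. The frontend hunks below follow that split. The helper here is purely illustrative (not part of this commit), with assumed fallback values:

```ts
// Illustrative helper showing the base-URL convention used by the frontend
// changes below: server-side code (e.g. getServerSideProps) reaches the API
// over the Docker network via SERVER_URL, while browser code uses the
// build-time NEXT_PUBLIC_API_URL. Fallback values are assumptions.
export function apiBaseUrl(): string {
  if (typeof window === "undefined") {
    // Running inside the Next.js server container.
    return process.env.SERVER_URL ?? "http://scraperr_api:8000";
  }
  // Running in the browser; NEXT_PUBLIC_* variables are inlined at build time.
  return process.env.NEXT_PUBLIC_API_URL ?? "http://localhost:8000";
}
```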
@@ -48,11 +48,14 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
  const router = useRouter();

  const handleDownload = async (ids: string[]) => {
-    const response = await fetch(`${Constants.DOMAIN}/api/download`, {
-      method: "POST",
-      headers: { "Content-Type": "application/json" },
-      body: JSON.stringify({ ids: ids }),
-    });
+    const response = await fetch(
+      `${process.env.NEXT_PUBLIC_API_URL}/api/download`,
+      {
+        method: "POST",
+        headers: { "Content-Type": "application/json" },
+        body: JSON.stringify({ ids: ids }),
+      }
+    );

    if (response.ok) {
      const blob = await response.blob();
@@ -104,11 +107,14 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
  };

  const handleDeleteSelected = async () => {
-    const response = await fetch(`${Constants.DOMAIN}/api/delete-scrape-jobs`, {
-      method: "POST",
-      headers: { "Content-Type": "application/json" },
-      body: JSON.stringify({ ids: Array.from(selectedJobs) }),
-    });
+    const response = await fetch(
+      `${process.env.NEXT_PUBLIC_API_URL}/api/delete-scrape-jobs`,
+      {
+        method: "POST",
+        headers: { "Content-Type": "application/json" },
+        body: JSON.stringify({ ids: Array.from(selectedJobs) }),
+      }
+    );

    if (response.ok) {
      setJobs((jobs) =>
@@ -142,7 +148,7 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => {
      value: value,
    };

-    await fetch(`${Constants.DOMAIN}/api/update`, {
+    await fetch(`${process.env.NEXT_PUBLIC_API_URL}/api/update`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
@@ -25,7 +25,7 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => {
    const token = Cookies.get("token");
    if (token) {
      axios
-        .get(`${Constants.DOMAIN}/api/auth/users/me`, {
+        .get(`${process.env.NEXT_PUBLIC_API_URL}/api/auth/users/me`, {
          headers: { Authorization: `Bearer ${token}` },
        })
        .then((response) => {
@@ -43,7 +43,7 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => {
    params.append("username", email);
    params.append("password", password);
    const response = await axios.post(
-      `${Constants.DOMAIN}/api/auth/token`,
+      `${process.env.NEXT_PUBLIC_API_URL}/api/auth/token`,
      params
    );
    Cookies.set("token", response.data.access_token, {
@@ -54,7 +54,7 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => {
      sameSite: "Lax",
    });
    const userResponse = await axios.get(
-      `${Constants.DOMAIN}/api/auth/users/me`,
+      `${process.env.NEXT_PUBLIC_API_URL}/api/auth/users/me`,
      {
        headers: { Authorization: `Bearer ${response.data.access_token}` },
      }
@@ -11,7 +11,7 @@ export const fetchJobs = async (
  fetchOptions: fetchOptions = {}
) => {
  const token = Cookies.get("token");
-  await fetch(`/api/retrieve-scrape-jobs`, {
+  await fetch(`${process.env.NEXT_PUBLIC_API_URL}/api/retrieve-scrape-jobs`, {
    method: "POST",
    headers: {
      "content-type": "application/json",
@@ -29,12 +29,15 @@ export const fetchJobs = async (
export const fetchJob = async (id: string) => {
  const token = Cookies.get("token");
  try {
-    const response = await fetch(`/api/job/${id}`, {
-      headers: {
-        "content-type": "application/json",
-        Authorization: `Bearer ${token}`,
-      },
-    });
+    const response = await fetch(
+      `${process.env.NEXT_PUBLIC_API_URL}/api/job/${id}`,
+      {
+        headers: {
+          "content-type": "application/json",
+          Authorization: `Bearer ${token}`,
+        },
+      }
+    );
    const data = await response.json();
    return data;
  } catch (error) {
@@ -48,12 +51,15 @@ export const checkAI = async (
) => {
  const token = Cookies.get("token");
  try {
-    const response = await fetch(`/api/ai/check`, {
-      headers: {
-        "content-type": "application/json",
-        Authorization: `Bearer ${token}`,
-      },
-    });
+    const response = await fetch(
+      `${process.env.NEXT_PUBLIC_API_URL}/api/ai/check`,
+      {
+        headers: {
+          "content-type": "application/json",
+          Authorization: `Bearer ${token}`,
+        },
+      }
+    );
    const data = await response.json();
    setAiEnabled(data);
  } catch (error) {
@@ -69,7 +75,7 @@ export const updateJob = async (ids: string[], field: string, value: any) => {
    field: field,
    value: value,
  };
-  await fetch(`/api/update`, {
+  await fetch(`${process.env.NEXT_PUBLIC_API_URL}/api/update`, {
    method: "POST",
    headers: {
      "content-type": "application/json",
@@ -81,7 +81,7 @@ const AI: React.FC = () => {
      }. The following messages will pertain to the content of the scraped job.`,
    };

-    const response = await fetch("/api/ai", {
+    const response = await fetch(`${process.env.NEXT_PUBLIC_API_URL}/api/ai`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
@@ -22,7 +22,7 @@ export const getServerSideProps: GetServerSideProps = async (context) => {
  if (token) {
    try {
      const userResponse = await axios.get(
-        `http://scraperr_api:8000/api/auth/users/me`,
+        `${process.env.SERVER_URL}/api/auth/users/me`,
        {
          headers: { Authorization: `Bearer ${token}` },
        }
@@ -30,7 +30,7 @@ export const getServerSideProps: GetServerSideProps = async (context) => {
      user = userResponse.data;

      const jobsResponse = await axios.post(
-        `http://scraperr_api:8000/api/retrieve-scrape-jobs`,
+        `${process.env.SERVER_URL}/api/retrieve-scrape-jobs`,
        { user: user.email },
        {
          headers: {
@@ -6,7 +6,9 @@ interface logs {

export async function getStaticProps() {
  try {
-    const response = await fetch(`http://scraperr_api:8000/initial_logs`);
+    const response = await fetch(
+      `${process.env.NEXT_PUBLIC_API_URL}/initial_logs`
+    );
    const logJson: logs = await response.json();
    const initialLogs = logJson.logs;
@@ -30,7 +30,7 @@ export const getServerSideProps: GetServerSideProps = async (context) => {
  if (token) {
    try {
      const averageElementResponse = await fetch(
-        `http://scraperr_api:8000/statistics/get-average-element-per-link`,
+        `${process.env.SERVER_URL}/statistics/get-average-element-per-link`,
        {
          headers: { Authorization: `Bearer ${token}` },
        }
@@ -39,7 +39,7 @@ export const getServerSideProps: GetServerSideProps = async (context) => {
      averageElement = await averageElementResponse.json();

      const averageJobResponse = await fetch(
-        `http://scraperr_api:8000/statistics/get-average-jobs-per-day`,
+        `${process.env.SERVER_URL}/statistics/get-average-jobs-per-day`,
        {
          headers: { Authorization: `Bearer ${token}` },
        }
@@ -76,7 +76,7 @@ const Statistics: React.FC<StatProps> = ({ averageElement, averageJob }) => {
  const fetchElementsData = async () => {
    try {
      const response = await fetch(
-        `${Constants.DOMAIN}/api/statistics/get-average-element-per-link`,
+        `${process.env.NEXT_PUBLIC_API_URL}/api/statistics/get-average-element-per-link`,
        {
          headers: {
            "Content-Type": "application/json",
@@ -94,7 +94,7 @@ const Statistics: React.FC<StatProps> = ({ averageElement, averageJob }) => {
  const fetchJobsData = async () => {
    try {
      const response = await fetch(
-        `${Constants.DOMAIN}/api/statistics/get-average-jobs-per-day`,
+        `${process.env.NEXT_PUBLIC_API_URL}/api/statistics/get-average-jobs-per-day`,
        {
          headers: {
            "Content-Type": "application/json",
@@ -7,19 +7,22 @@ export const submitJob = async (
  jobOptions: any,
  customHeaders: any
) => {
-  return await fetch(`${Constants.DOMAIN}/api/submit-scrape-job`, {
-    method: "POST",
-    headers: { "content-type": "application/json" },
-    body: JSON.stringify({
-      url: submittedURL,
-      elements: rows,
-      user: user?.email,
-      time_created: new Date().toISOString(),
-      job_options: {
-        ...jobOptions,
-        custom_headers: customHeaders,
-        proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [],
-      },
-    }),
-  });
+  return await fetch(
+    `${process.env.NEXT_PUBLIC_API_URL}/api/submit-scrape-job`,
+    {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({
+        url: submittedURL,
+        elements: rows,
+        user: user?.email,
+        time_created: new Date().toISOString(),
+        job_options: {
+          ...jobOptions,
+          custom_headers: customHeaders,
+          proxies: jobOptions.proxies ? jobOptions.proxies.split(",") : [],
+        },
+      }),
+    }
+  );
};