mirror of https://github.com/jaypyles/Scraperr.git
synced 2025-10-31 06:27:06 +00:00

Compare commits

65 Commits

| Author | SHA1 | Date | |
|---|---|---|---|
|   | d4edb9d93e | ||
|   | 5ebd96b62b | ||
|   | d602d3330a | ||
|   | 6639e8b48f | ||
|   | 263e46ba4d | ||
|   | f815a58efc | ||
|   | 50ec5df657 | ||
|   | 28de0f362c | ||
|   | 6b33723cac | ||
|   | 5c89e4d7d2 | ||
|   | ed0828a585 | ||
|   | 1b8c8c779a | ||
|   | 267cc73657 | ||
|   | 92ff16d9c3 | ||
|   | 8b2e5dc9c3 | ||
|   | 7f1bc295ac | ||
|   | 031572325f | ||
|   | 48d3bf9214 | ||
|   | e07abcd089 | ||
|   | 8a933b88a7 | ||
|   | 863dbcd044 | ||
|   | de40181a6f | ||
|   | 8703f706a1 | ||
|   | b40d378bbf | ||
|   | 8123e1f149 | ||
|   | 8cd30599fa | ||
|   | a58212b214 | ||
|   | a6ab6ec71d | ||
|   | c5c9427af4 | ||
|   | e8d80c1a77 | ||
|   | ee8047ac78 | ||
|   | e74c4f392c | ||
|   | 6b484952a3 | ||
|   | 2283808605 | ||
|   | ee5ada70f7 | ||
|   | 56cc457e6e | ||
|   | 21a38181de | ||
|   | 3063bc0d53 | ||
|   | f42e7ed531 | ||
|   | c197f2becd | ||
|   | a534129702 | ||
|   | 455ed049c9 | ||
|   | de4ccfbf3a | ||
|   | 3475d66995 | ||
|   | 186b4a0231 | ||
|   | 0af0ebf5b5 | ||
|   | ef35db00d7 | ||
|   | d65e600ec3 | ||
|   | 6fe145f649 | ||
|   | 563ca2245e | ||
|   | d54fdbd405 | ||
|   | 7169755cd2 | ||
|   | 15b56b5704 | ||
|   | bf6b740005 | ||
|   | c339e75e06 | ||
|   | b6ed40e6cf | ||
|   | 3085f9d31a | ||
|   | 7d80ff5c7f | ||
|   | 3a0762f1e3 | ||
|   | dc4d219205 | ||
|   | b3bf780eda | ||
|   | 1dfd3ca92a | ||
|   | fe51140a0e | ||
|   | dd6cec6679 | ||
|   | 2339ba1b77 | ||

4  .dockerignore  Normal file

							| @@ -0,0 +1,4 @@ | ||||
| node_modules | ||||
| npm-debug.log | ||||
| Dockerfile | ||||
| .dockerignore | ||||

32  .github/ISSUE_TEMPLATE/bug_report.md  vendored  Normal file

							| @@ -0,0 +1,32 @@ | ||||
| --- | ||||
| name: Bug report | ||||
| about: 'Bug reporting ' | ||||
| title: '' | ||||
| labels: '' | ||||
| assignees: '' | ||||
|  | ||||
| --- | ||||
|  | ||||
| **Describe the bug** | ||||
| A clear and concise description of what the bug is. | ||||
|  | ||||
| **To Reproduce** | ||||
| Steps to reproduce the behavior: | ||||
| 1. Go to '...' | ||||
| 2. Click on '....' | ||||
| 3. Scroll down to '....' | ||||
| 4. See error | ||||
|  | ||||
| **Expected behavior** | ||||
| A clear and concise description of what you expected to happen. | ||||
|  | ||||
| **Screenshots** | ||||
| If applicable, add screenshots to help explain your problem. | ||||
|  | ||||
| **Desktop (please complete the following information):** | ||||
|  - OS: [e.g. iOS] | ||||
|  - Browser [e.g. chrome, safari] | ||||
|  - Version [e.g. 22] | ||||
|  | ||||
| **Additional context** | ||||
| Add any other context about the problem here. | ||||

50  .github/actions/push-to-helm/action.yaml  vendored  Normal file

							| @@ -0,0 +1,50 @@ | ||||
| name: Publish Helm Chart | ||||
| description: Publish a Helm chart to a target repository | ||||
|  | ||||
| inputs: | ||||
|   app-repo-token: | ||||
|     required: true | ||||
|     description: "The token for the target repository" | ||||
|  | ||||
| runs: | ||||
|   using: 'composite' | ||||
|   steps: | ||||
|     - name: Checkout app repo | ||||
|       uses: actions/checkout@v4 | ||||
|  | ||||
|     - name: Set up Helm | ||||
|       uses: azure/setup-helm@v3 | ||||
|  | ||||
|     - name: Package Helm chart | ||||
|       run: | | ||||
|         mkdir -p packaged | ||||
|         helm package helm -d packaged | ||||
|       shell: bash | ||||
|  | ||||
|     - name: Clone target Helm repo | ||||
|       run: | | ||||
|         git clone https://github.com/jaypyles/helm.git target-repo | ||||
|         cd target-repo | ||||
|         git config user.name "github-actions" | ||||
|         git config user.email "github-actions@github.com" | ||||
|         git fetch origin gh-pages  # Fetch gh-pages explicitly | ||||
|         git checkout gh-pages      # Checkout gh-pages branch | ||||
|         git pull origin gh-pages    # Pull latest changes from gh-pages | ||||
|       shell: bash | ||||
|  | ||||
|     - name: Copy package and update index | ||||
|       run: | | ||||
|         APP_NAME="scraperr" | ||||
|         mkdir -p target-repo/charts/$APP_NAME | ||||
|         cp packaged/*.tgz target-repo/charts/$APP_NAME/ | ||||
|         cd target-repo/charts/$APP_NAME | ||||
|         helm repo index . --url https://jaypyles.github.io/helm/charts/$APP_NAME | ||||
|       shell: bash | ||||
|  | ||||
|     - name: Commit and push to target repo | ||||
|       run: | | ||||
|         cd target-repo | ||||
|         git add charts/ | ||||
|         git commit -m "Update $APP_NAME chart $(date +'%Y-%m-%d %H:%M:%S')" || echo "No changes" | ||||
|         git push https://x-access-token:${{ inputs.app-repo-token }}@github.com/jaypyles/helm.git gh-pages | ||||
|       shell: bash | ||||

58  .github/actions/run-cypress-tests/action.yaml  vendored  Normal file

							| @@ -0,0 +1,58 @@ | ||||
| name: Run Cypress Tests | ||||
|  | ||||
| description: Run Cypress tests | ||||
|  | ||||
| runs: | ||||
|   using: "composite" | ||||
|   steps: | ||||
|     - name: Checkout code | ||||
|       uses: actions/checkout@v4 | ||||
|  | ||||
|     - name: Setup Node | ||||
|       uses: actions/setup-node@v4 | ||||
|       with: | ||||
|         node-version: 22 | ||||
|  | ||||
|     - name: Setup Docker project | ||||
|       shell: bash | ||||
|       run: make build-ci up-ci | ||||
|  | ||||
|     - name: Install dependencies | ||||
|       shell: bash | ||||
|       run: yarn install | ||||
|  | ||||
|     - name: Wait for frontend to be ready | ||||
|       shell: bash | ||||
|       run: | | ||||
|         for i in {1..10}; do | ||||
|           curl -s http://127.0.0.1:80 && echo "Frontend is ready" && exit 0 | ||||
|           echo "Waiting for frontend to be ready... attempt $i" | ||||
|           sleep 1 | ||||
|         done | ||||
|         echo "Frontend failed to be ready after 10 retries" | ||||
|         exit 1 | ||||
|  | ||||
|     - name: Wait for backend to be ready | ||||
|       shell: bash | ||||
|       run: | | ||||
|         for i in {1..10}; do | ||||
|           curl -s http://127.0.0.1:8000 && echo "Backend is ready" && exit 0 | ||||
|           echo "Waiting for backend to be ready... attempt $i" | ||||
|           sleep 1 | ||||
|         done | ||||
|         echo "Backend failed to be ready after 10 retries" | ||||
|         exit 1 | ||||
|  | ||||
|     - name: Show backend logs on failure | ||||
|       if: failure() | ||||
|       shell: bash | ||||
|       run: | | ||||
|         echo "== Docker Containers ==" | ||||
|         docker ps -a | ||||
|         echo "== Backend Logs ==" | ||||
|         docker logs $(docker ps -a --filter "name=scraperr_api" --format "{{.Names}}") || echo "Could not get backend logs" | ||||
|  | ||||
|     - name: Run Cypress tests | ||||
|       shell: bash | ||||
|       run: npm run cy:run | ||||
|  | ||||

51  .github/workflows/docker-image.yml  vendored

							| @@ -1,9 +1,6 @@ | ||||
| name: ci | ||||
| requires: | ||||
|   - unit-tests | ||||
| name: Docker Image | ||||
| on: | ||||
|   push: | ||||
|     branches: ["master"] | ||||
|   workflow_dispatch: | ||||
|  | ||||
| jobs: | ||||
|   build: | ||||
| @@ -12,6 +9,12 @@ jobs: | ||||
|       - name: Checkout | ||||
|         uses: actions/checkout@v4 | ||||
|  | ||||
|       - name: Get version from helm chart | ||||
|         run: | | ||||
|           VERSION=$(grep "version:" ./helm/Chart.yaml | cut -d: -f2 | tr -d ' ') | ||||
|           echo "VERSION=$VERSION" >> $GITHUB_ENV | ||||
|           echo "Version is $VERSION" | ||||
|  | ||||
|       - name: Login to Docker Hub | ||||
|         uses: docker/login-action@v3 | ||||
|         with: | ||||
| @@ -27,7 +30,9 @@ jobs: | ||||
|           context: . | ||||
|           file: ./docker/frontend/Dockerfile | ||||
|           push: true | ||||
|           tags: ${{ secrets.DOCKERHUB_USERNAME }}/${{ secrets.DOCKERHUB_REPO }}:latest | ||||
|           tags: | | ||||
|             ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:latest | ||||
|             ${{ secrets.DOCKERHUB_USERNAME }}/scraperr:${{ env.VERSION }} | ||||
|  | ||||
|       - name: Build and push api | ||||
|         uses: docker/build-push-action@v5 | ||||
| @@ -35,4 +40,36 @@ jobs: | ||||
|           context: . | ||||
|           file: ./docker/api/Dockerfile | ||||
|           push: true | ||||
|           tags: ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest | ||||
|           tags: | | ||||
|             ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest | ||||
|             ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:${{ env.VERSION }} | ||||
|  | ||||
|   push-helm-chart: | ||||
|     runs-on: ubuntu-latest | ||||
|     needs: | ||||
|       - build | ||||
|     steps: | ||||
|       - uses: actions/checkout@v4 | ||||
|  | ||||
|       - name: Push Helm Chart | ||||
|         uses: ./.github/actions/push-to-helm | ||||
|         with: | ||||
|           app-repo-token: ${{ secrets.GPAT_TOKEN }} | ||||
|  | ||||
|   success-message: | ||||
|     runs-on: ubuntu-latest | ||||
|     needs: | ||||
|       - build | ||||
|       - push-helm-chart | ||||
|     steps: | ||||
|       - name: Send Discord Message | ||||
|         uses: jaypyles/discord-webhook-action@v1.0.0 | ||||
|         with: | ||||
|           webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }} | ||||
|           content: "Scraperr Successfully Built Docker Images" | ||||
|           username: "Scraperr CI" | ||||
|           embed-title: "✅ Deployment Status" | ||||
|           embed-description: "Scraperr successfully built docker images." | ||||
|           embed-color: 3066993 # Green | ||||
|           embed-footer-text: "Scraperr CI" | ||||
|           embed-timestamp: ${{ github.event.head_commit.timestamp }} | ||||
|   | ||||

38  .github/workflows/unit-tests.yml  vendored

							| @@ -4,9 +4,11 @@ on: | ||||
|   push: | ||||
|     branches: | ||||
|       - master | ||||
|  | ||||
|   pull_request: | ||||
|     branches: | ||||
|       - master | ||||
|     types: [opened, synchronize, reopened] | ||||
|  | ||||
|   workflow_dispatch: | ||||
|  | ||||
| jobs: | ||||
|   unit-tests: | ||||
| @@ -15,11 +17,41 @@ jobs: | ||||
|       - name: Checkout | ||||
|         uses: actions/checkout@v4 | ||||
|  | ||||
|       - name: Set env | ||||
|         run: echo "ENV=test" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: Install pdm | ||||
|         run: pip install pdm | ||||
|  | ||||
|       - name: Install project dependencies | ||||
|         run: pdm install | ||||
|  | ||||
|       - name: Install playwright | ||||
|         run: pdm run playwright install | ||||
|  | ||||
|       - name: Run tests | ||||
|         run: PYTHONPATH=. pdm run pytest api/backend/tests | ||||
|         run: PYTHONPATH=. pdm run pytest -v -ra api/backend/tests | ||||
|  | ||||
|   cypress-tests: | ||||
|     runs-on: ubuntu-latest | ||||
|     steps: | ||||
|       - uses: actions/checkout@v4 | ||||
|       - uses: ./.github/actions/run-cypress-tests | ||||
|  | ||||
|   success-message: | ||||
|     runs-on: ubuntu-latest | ||||
|     needs: | ||||
|       - unit-tests | ||||
|       - cypress-tests | ||||
|     steps: | ||||
|       - name: Send Discord Message | ||||
|         uses: jaypyles/discord-webhook-action@v1.0.0 | ||||
|         with: | ||||
|           webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }} | ||||
|           content: "Scraperr Successfully Passed Tests" | ||||
|           username: "Scraperr CI" | ||||
|           embed-title: "✅ Deployment Status" | ||||
|           embed-description: "Scraperr successfully passed all tests." | ||||
|           embed-color: 3066993 # Green | ||||
|           embed-footer-text: "Scraperr CI" | ||||
|           embed-timestamp: ${{ github.event.head_commit.timestamp }} | ||||
|   | ||||

16  .gitignore  vendored

							| @@ -187,3 +187,19 @@ cython_debug/ | ||||
| postgres_data | ||||
| .vscode | ||||
| ollama | ||||
| data | ||||
|  | ||||
| media/images | ||||
| media/videos | ||||
| media/audio | ||||
| media/pdfs | ||||
| media/spreadsheets | ||||
| media/presentations | ||||
| media/documents | ||||
| media/recordings | ||||
| media/download_summary.txt | ||||
|  | ||||
| cypress/screenshots | ||||
| cypress/videos | ||||
|  | ||||
| docker-compose.dev.local.yml | ||||

2  .prettierignore  Normal file

							| @@ -0,0 +1,2 @@ | ||||
| *.yaml | ||||
| *.yml | ||||

1  .python-version  Normal file

							| @@ -0,0 +1 @@ | ||||
| 3.10.12 | ||||

1  FUNDING.yml  Normal file

							| @@ -0,0 +1 @@ | ||||
| custom: ["https://www.buymeacoffee.com/jaypyles"] | ||||

12  Makefile

							| @@ -1,6 +1,6 @@ | ||||
| .DEFAULT_GOAL := help | ||||
|  | ||||
| COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.yml | ||||
| COMPOSE_DEV = docker compose -f docker-compose.yml -f docker-compose.dev.local.yml | ||||
| COMPOSE_PROD = docker compose -f docker-compose.yml | ||||
|  | ||||
| .PHONY: help deps build pull up up-dev down setup deploy | ||||
| @@ -17,6 +17,7 @@ help: | ||||
| 	@echo "  make down    		- Stop and remove containers, networks, images, and volumes" | ||||
| 	@echo "  make setup   		- Setup server with dependencies and clone repo" | ||||
| 	@echo "  make deploy  		- Deploy site onto server" | ||||
| 	@echo "  make cypress-start	- Start Cypress" | ||||
| 	@echo "" | ||||
|  | ||||
| logs: | ||||
| @@ -51,3 +52,12 @@ setup: | ||||
|  | ||||
| deploy: | ||||
| 	ansible-playbook -i ./ansible/inventory.yaml ./ansible/deploy_site.yaml -v | ||||
|  | ||||
| build-ci: | ||||
| 	docker compose -f docker-compose.yml -f docker-compose.dev.yml build | ||||
|  | ||||
| up-ci: | ||||
| 	docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d --force-recreate | ||||
|  | ||||
| cypress-start: | ||||
| 	DISPLAY=:0 npx cypress open | ||||

193  README.md

							| @@ -1,178 +1,71 @@ | ||||
|  | ||||
|  | ||||
| <div align="center"> | ||||
|   <img src="https://img.shields.io/badge/MongoDB-%234ea94b.svg?style=for-the-badge&logo=mongodb&logoColor=white" alt="MongoDB" /> | ||||
|   <img src="https://img.shields.io/badge/FastAPI-005571?style=for-the-badge&logo=fastapi" alt="FastAPI" /> | ||||
|   <img src="https://img.shields.io/badge/Next-black?style=for-the-badge&logo=next.js&logoColor=white" alt="Next JS" /> | ||||
|   <img src="https://img.shields.io/badge/tailwindcss-%2338B2AC.svg?style=for-the-badge&logo=tailwind-css&logoColor=white" alt="TailwindCSS" /> | ||||
|   <img src="https://github.com/jaypyles/www-scrape/blob/master/docs/logo_picture.png" alt="Scraperr Logo" width="250px"> | ||||
|    | ||||
|   **A powerful self-hosted web scraping solution** | ||||
|    | ||||
|   <div> | ||||
|     <img src="https://img.shields.io/badge/MongoDB-%234ea94b.svg?style=for-the-badge&logo=mongodb&logoColor=white" alt="MongoDB" /> | ||||
|     <img src="https://img.shields.io/badge/FastAPI-005571?style=for-the-badge&logo=fastapi" alt="FastAPI" /> | ||||
|     <img src="https://img.shields.io/badge/Next-black?style=for-the-badge&logo=next.js&logoColor=white" alt="Next JS" /> | ||||
|     <img src="https://img.shields.io/badge/tailwindcss-%2338B2AC.svg?style=for-the-badge&logo=tailwind-css&logoColor=white" alt="TailwindCSS" /> | ||||
|   </div> | ||||
| </div> | ||||
|  | ||||
| # Summary | ||||
| ## 📋 Overview | ||||
|  | ||||
| Scraperr is a self-hosted web application that allows users to scrape data from web pages by specifying elements via XPath. Users can submit URLs and the corresponding elements to be scraped, and the results will be displayed in a table. | ||||
| Scrape websites without writing a single line of code. | ||||
|  | ||||
| From the table, users can download an excel sheet of the job's results, along with an option to rerun the job. | ||||
| > 📚 **[Check out the docs](https://scraperr-docs.pages.dev)** for a comprehensive quickstart guide and detailed information. | ||||
|  | ||||
| View the [docs](https://scraperr-docs.pages.dev). | ||||
| <div align="center"> | ||||
|   <img src="https://github.com/jaypyles/www-scrape/blob/master/docs/main_page.png" alt="Scraperr Main Interface" width="800px"> | ||||
| </div> | ||||
|  | ||||
| ## Features | ||||
| ## ✨ Key Features | ||||
|  | ||||
| ### Submitting URLs for Scraping | ||||
| - **XPath-Based Extraction**: Precisely target page elements | ||||
| - **Queue Management**: Submit and manage multiple scraping jobs | ||||
| - **Domain Spidering**: Option to scrape all pages within the same domain | ||||
| - **Custom Headers**: Add JSON headers to your scraping requests | ||||
| - **Media Downloads**: Automatically download images, videos, and other media | ||||
| - **Results Visualization**: View scraped data in a structured table format | ||||
| - **Data Export**: Export your results in markdown and csv formats | ||||
| - **Notification Channels**: Send completion notifications through various channels | ||||
|  | ||||
| - Submit/Queue URLs for web scraping | ||||
| - Add and manage elements to scrape using XPath | ||||
| - Scrape all pages within same domain | ||||
| - Add custom json headers to send in requests to URLs | ||||
| - Display results of scraped data | ||||
| ## 🚀 Getting Started | ||||
|  | ||||
|  | ||||
| ### Docker | ||||
|  | ||||
| ### Managing Previous Jobs | ||||
|  | ||||
| - Download csv containing results | ||||
| - Rerun jobs | ||||
| - View status of queued jobs | ||||
| - Favorite and view favorited jobs | ||||
|  | ||||
|  | ||||
|  | ||||
| ### User Management | ||||
|  | ||||
| - User login/signup to organize jobs (optional) | ||||
|  | ||||
|  | ||||
|  | ||||
| ### Log Viewing | ||||
|  | ||||
| - View app logs inside of web ui | ||||
|  | ||||
|  | ||||
|  | ||||
| ### Statistics View | ||||
|  | ||||
| - View a small statistics view of jobs ran | ||||
|  | ||||
|  | ||||
|  | ||||
| ### AI Integration | ||||
|  | ||||
| - Include the results of a selected job into the context of a conversation | ||||
| - Currently supports: | ||||
|  | ||||
| 1. Ollama | ||||
| 2. OpenAI | ||||
|  | ||||
|  | ||||
|  | ||||
| ## Installation | ||||
|  | ||||
| 1. Clone the repository: | ||||
|  | ||||
|    ```sh | ||||
|    git clone https://github.com/jaypyles/scraperr.git | ||||
|  | ||||
|    ``` | ||||
|  | ||||
| 2. Set environment variables and labels in `docker-compose.yml`. | ||||
|  | ||||
| ```yaml | ||||
| scraperr: | ||||
|     labels: | ||||
|       - "traefik.enable=true" | ||||
|       - "traefik.http.routers.scraperr.rule=Host(`localhost`)" # change this to your domain, if not running on localhost | ||||
|       - "traefik.http.routers.scraperr.entrypoints=web" # websecure if using https | ||||
|       - "traefik.http.services.scraperr.loadbalancer.server.port=3000" | ||||
|  | ||||
| scraperr_api: | ||||
|  environment: | ||||
|       - LOG_LEVEL=INFO | ||||
|       - MONGODB_URI=mongodb://root:example@webscrape-mongo:27017 # used to access MongoDB | ||||
|       - SECRET_KEY=your_secret_key # used to encode authentication tokens (can be a random string) | ||||
|       - ALGORITHM=HS256 # authentication encoding algorithm | ||||
|       - ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token expire minutes | ||||
|   labels: | ||||
|         - "traefik.enable=true" | ||||
|         - "traefik.http.routers.scraperr_api.rule=Host(`localhost`) && PathPrefix(`/api`)" # change this to your domain, if not running on localhost | ||||
|         - "traefik.http.routers.scraperr_api.entrypoints=web" # websecure if using https | ||||
|         - "traefik.http.middlewares.api-stripprefix.stripprefix.prefixes=/api" | ||||
|         - "traefik.http.routers.scraperr_api.middlewares=api-stripprefix" | ||||
|         - "traefik.http.services.scraperr_api.loadbalancer.server.port=8000" | ||||
|  | ||||
| mongo: | ||||
|     environment: | ||||
|       MONGO_INITDB_ROOT_USERNAME: root | ||||
|       MONGO_INITDB_ROOT_PASSWORD: example | ||||
| ``` | ||||
|  | ||||
| Don't want to use `traefik`? This configuration can be used with other reverse proxies, as long as the API is proxied to `/api` of the frontend container. Scraperr currently | ||||
| cannot run without a reverse proxy, due to limitations of runtime client-side environment variables in `next.js`. | ||||
|  | ||||
| 3. Deploy | ||||
|  | ||||
| ```sh | ||||
| ```bash | ||||
| make up | ||||
| ``` | ||||
|  | ||||
| The app provides its own `traefik` configuration to use independently, but can easily be reverse-proxied by any other app, or your own reverse-proxy. | ||||
| ### Helm | ||||
|  | ||||
| ## Usage | ||||
| > Refer to the docs for helm deployment: https://scraperr-docs.pages.dev/guides/helm-deployment | ||||
|  | ||||
| 1. Open the application in your browser at `http://localhost`. | ||||
| 2. Enter the URL you want to scrape in the URL field. | ||||
| 3. Add elements to scrape by specifying a name and the corresponding XPath. | ||||
| 4. Click the "Submit" button to queue URL to be scraped. | ||||
| 5. View queue in the "Previous Jobs" section. | ||||
| ## ⚖️ Legal and Ethical Guidelines | ||||
|  | ||||
| ## API Endpoints | ||||
| When using Scraperr, please remember to: | ||||
|  | ||||
| Use this service as an API for your own projects. Since it is built with FastAPI, a docs page for the API is available at `/docs`. | ||||
| 1. **Respect `robots.txt`**: Always check a website's `robots.txt` file to verify which pages permit scraping | ||||
| 2. **Terms of Service**: Adhere to each website's Terms of Service regarding data extraction | ||||
| 3. **Rate Limiting**: Implement reasonable delays between requests to avoid overloading servers | ||||
|  | ||||
|  | ||||
| > **Disclaimer**: Scraperr is intended for use only on websites that explicitly permit scraping. The creator accepts no responsibility for misuse of this tool. | ||||
|  | ||||
| ## AI | ||||
| ## 💬 Join the Community | ||||
|  | ||||
| Currently supports either an Ollama instance or OpenAI's ChatGPT, using your own API key. Setup is as simple as setting either the Ollama URL or the OpenAI API key in the API's environment variables in the `docker-compose.yml` file: | ||||
| Get support, report bugs, and chat with other users and contributors. | ||||
|  | ||||
| ```yaml | ||||
| scraperr_api: | ||||
|   environment: | ||||
|     - OLLAMA_URL=http://ollama:11434 | ||||
|     - OLLAMA_MODEL=llama3.1 | ||||
|     # or | ||||
|     - OPENAI_KEY=<your_key> | ||||
|     - OPENAI_MODEL=gpt3.5-turbo | ||||
| ``` | ||||
| 👉 [Join the Scraperr Discord](https://discord.gg/89q7scsGEK) | ||||
|  | ||||
| The model names are taken from the documentation of their respective technologies. | ||||
|  | ||||
| ## Troubleshooting | ||||
|  | ||||
| Q: When running Scraperr, I'm met with "404 Page not found".   | ||||
| A: This is probably an issue with MongoDB related to running Scraperr in a VM. You should see something like this in `make logs`: | ||||
|  | ||||
| ``` | ||||
| WARNING: MongoDB 5.0+ requires a CPU with AVX support, and your current system does not appear to have that! | ||||
| ``` | ||||
|  | ||||
| To resolve this issue, simply set CPU host type to `host`. This can be done in Proxmox in the VM settings > Processor. [Related issue](https://github.com/jaypyles/Scraperr/issues/9). | ||||
|  | ||||
| ## Legal and Ethical Considerations | ||||
|  | ||||
| When using Scraperr, please ensure that you: | ||||
|  | ||||
| 1. **Check Robots.txt**: Verify allowed pages by reviewing the `robots.txt` file of the target website. | ||||
| 2. **Compliance**: Always comply with the website's Terms of Service (ToS) regarding web scraping. | ||||
|  | ||||
| **Disclaimer**: This tool is intended for use only on websites that permit scraping. The author is not responsible for any misuse of this tool. | ||||
|  | ||||
| ## License | ||||
| ## 📄 License | ||||
|  | ||||
| This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. | ||||
|  | ||||
| ### Contributions | ||||
| ## 👏 Contributions | ||||
|  | ||||
| Development made easy by developing from [webapp template](https://github.com/jaypyles/webapp-template). View documentation for extra information. | ||||
| Development made easier with the [webapp template](https://github.com/jaypyles/webapp-template). | ||||
|  | ||||
| Start development server: | ||||
|  | ||||
| `make deps build up-dev` | ||||
| To get started, simply run `make build up-dev`. | ||||
| @@ -1,3 +0,0 @@ | ||||
| github_repo: https://github.com/jaypyles/webapp-template.git | ||||
| deploy_path: /home/admin/site-test6 | ||||
| deploy_command: make pull up-prd | ||||
| @@ -1,10 +0,0 @@ | ||||
| - name: Deploy site | ||||
|   hosts: all | ||||
|   become: true | ||||
|   vars_files: | ||||
|     - ./config.yaml | ||||
|   tasks: | ||||
|     - name: Deploy | ||||
|       command: "{{deploy_command}}" | ||||
|       args: | ||||
|         chdir: "{{deploy_path}}" | ||||
| @@ -1,6 +0,0 @@ | ||||
| all: | ||||
|   hosts: | ||||
|     host1: | ||||
|       ansible_host: 192.168.0.1 | ||||
|       ansible_user: admin | ||||
|       ansible_ssh_private_key_file: private_key.pem | ||||
| @@ -1,54 +0,0 @@ | ||||
| - name: Install Docker and run make pull up | ||||
|   hosts: all | ||||
|   become: true | ||||
|   vars_files: | ||||
|     - ./config.yaml | ||||
|   tasks: | ||||
|     - name: Update apt cache | ||||
|       apt: | ||||
|         update_cache: yes | ||||
|     - name: Install required packages | ||||
|       apt: | ||||
|         name: | ||||
|           - apt-transport-https | ||||
|           - ca-certificates | ||||
|           - curl | ||||
|           - gnupg-agent | ||||
|           - software-properties-common | ||||
|           - rsync | ||||
|           - make | ||||
|         state: present | ||||
|     - name: Add Docker’s official GPG key | ||||
|       apt_key: | ||||
|         url: https://download.docker.com/linux/ubuntu/gpg | ||||
|         state: present | ||||
|     - name: Add Docker APT repository | ||||
|       apt_repository: | ||||
|         repo: deb [arch=amd64] https://download.docker.com/linux/ubuntu focal stable | ||||
|         state: present | ||||
|     - name: Update apt cache again after adding Docker repo | ||||
|       apt: | ||||
|         update_cache: yes | ||||
|     - name: Install Docker | ||||
|       apt: | ||||
|         name: docker-ce | ||||
|         state: present | ||||
|     - name: Start and enable Docker service | ||||
|       systemd: | ||||
|         name: docker | ||||
|         enabled: yes | ||||
|         state: started | ||||
|     - name: Install Docker Compose | ||||
|       apt: | ||||
|         name: docker-compose-plugin | ||||
|         state: present | ||||
|     - name: Verify Docker is installed | ||||
|       command: docker --version | ||||
|       register: docker_version | ||||
|     - name: Display Docker version | ||||
|       debug: | ||||
|         msg: "Docker version: {{ docker_version.stdout }}" | ||||
|     - name: Clone repo | ||||
|       ansible.builtin.git: | ||||
|         repo: "{{github_repo}}" | ||||
|         dest: "{{deploy_path}}" | ||||

6  api/backend/ai/agent/actions.py  Normal file

							| @@ -0,0 +1,6 @@ | ||||
| from typing_extensions import TypedDict | ||||
|  | ||||
|  | ||||
| class Action(TypedDict): | ||||
|     type: str | ||||
|     url: str | ||||

94  api/backend/ai/agent/agent.py  Normal file

							| @@ -0,0 +1,94 @@ | ||||
| import random | ||||
| from typing import Any | ||||
|  | ||||
| from camoufox import AsyncCamoufox | ||||
| from playwright.async_api import Page | ||||
|  | ||||
| from api.backend.ai.agent.utils import ( | ||||
|     capture_elements, | ||||
|     convert_to_markdown, | ||||
|     parse_response, | ||||
| ) | ||||
|  | ||||
| from api.backend.ai.clients import ask_open_ai, ask_ollama, open_ai_key | ||||
|  | ||||
| from api.backend.ai.agent.prompts import ( | ||||
|     ELEMENT_EXTRACTION_PROMPT, | ||||
|     EXTRACT_ELEMENTS_PROMPT, | ||||
| ) | ||||
|  | ||||
| from api.backend.job.scraping.collect_media import collect_media | ||||
| from api.backend.worker.logger import LOG | ||||
|  | ||||
| from api.backend.job.scraping.add_custom import add_custom_items | ||||
|  | ||||
| from api.backend.models import CapturedElement | ||||
|  | ||||
|  | ||||
| ask_ai = ask_open_ai if open_ai_key else ask_ollama | ||||
|  | ||||
|  | ||||
| async def scrape_with_agent(agent_job: dict[str, Any]): | ||||
|     LOG.info(f"Starting work for agent job: {agent_job}") | ||||
|     pages = set() | ||||
|  | ||||
|     if agent_job["job_options"]["proxies"]: | ||||
|         proxy = random.choice(agent_job["job_options"]["proxies"]) | ||||
|         LOG.info(f"Using proxy: {proxy}") | ||||
|  | ||||
|     async with AsyncCamoufox(headless=True) as browser: | ||||
|         page: Page = await browser.new_page() | ||||
|  | ||||
|         await add_custom_items( | ||||
|             agent_job["url"], | ||||
|             page, | ||||
|             agent_job["job_options"]["custom_cookies"], | ||||
|             agent_job["job_options"]["custom_headers"], | ||||
|         ) | ||||
|  | ||||
|         try: | ||||
|             await page.set_viewport_size({"width": 1920, "height": 1080}) | ||||
|             await page.goto(agent_job["url"], timeout=60000) | ||||
|  | ||||
|             if agent_job["job_options"]["collect_media"]: | ||||
|                 await collect_media(agent_job["id"], page) | ||||
|  | ||||
|             html_content = await page.content() | ||||
|             markdown_content = convert_to_markdown(html_content) | ||||
|  | ||||
|             response = await ask_ai( | ||||
|                 ELEMENT_EXTRACTION_PROMPT.format( | ||||
|                     extraction_prompt=EXTRACT_ELEMENTS_PROMPT, | ||||
|                     webpage=markdown_content, | ||||
|                     prompt=agent_job["prompt"], | ||||
|                 ) | ||||
|             ) | ||||
|  | ||||
|             xpaths = parse_response(response) | ||||
|  | ||||
|             captured_elements = await capture_elements(page, xpaths) | ||||
|  | ||||
|             final_url = page.url | ||||
|  | ||||
|             pages.add((html_content, final_url)) | ||||
|         finally: | ||||
|             await page.close() | ||||
|             await browser.close() | ||||
|  | ||||
|     name_to_elements = {} | ||||
|  | ||||
|     for page in pages: | ||||
|         for element in captured_elements: | ||||
|             if element.name not in name_to_elements: | ||||
|                 name_to_elements[element.name] = [] | ||||
|  | ||||
|             name_to_elements[element.name].append(element) | ||||
|  | ||||
|     scraped_elements: list[dict[str, dict[str, list[CapturedElement]]]] = [ | ||||
|         { | ||||
|             page[1]: name_to_elements, | ||||
|         } | ||||
|         for page in pages | ||||
|     ] | ||||
|  | ||||
|     return scraped_elements | ||||
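
`scrape_with_agent` takes its input as a plain dictionary rather than a model class. A minimal sketch of the shape it appears to expect, inferred only from the keys read in the function above; every concrete value is an illustrative placeholder:

```python
# Hypothetical agent job for scrape_with_agent; the keys mirror what agent.py
# reads, the values are placeholders. Running this requires the project's
# dependencies (camoufox, playwright) to be installed and importable.
import asyncio

from api.backend.ai.agent.agent import scrape_with_agent

agent_job = {
    "id": "job-123",                         # used when media collection is enabled
    "url": "https://example.com",            # page the agent navigates to
    "prompt": "Extract the article titles",  # target content passed to the LLM
    "job_options": {
        "proxies": [],           # non-empty list -> one proxy chosen at random
        "custom_cookies": [],    # forwarded to add_custom_items
        "custom_headers": {},    # forwarded to add_custom_items
        "collect_media": False,  # True -> collect_media(job_id, page) runs
    },
}

scraped = asyncio.run(scrape_with_agent(agent_job))
print(scraped)  # list of {final_url: {name: [CapturedElement, ...]}}
```
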

58  api/backend/ai/agent/prompts.py  Normal file

							| @@ -0,0 +1,58 @@ | ||||
| EXTRACT_ELEMENTS_PROMPT = """ | ||||
| You are an assistant that extracts XPath expressions from webpages. | ||||
|  | ||||
| You will receive HTML content in markdown format. | ||||
|  | ||||
| Each element in the markdown has their xpath shown above them in a path like: | ||||
| <!-- //div --> | ||||
|  | ||||
| Respond only with a list of general XPath expressions inside `<xpaths>...</xpaths>` tags. | ||||
|  | ||||
| You will also decide the decision of what to do next. If there is no decision available, return nothing for that section. | ||||
| """ | ||||
|  | ||||
| ELEMENT_EXTRACTION_PROMPT = """ | ||||
| {extraction_prompt} | ||||
|  | ||||
| **Guidelines:** | ||||
| - Prefer shorter, more general XPaths like `//div[...]` or `//span[...]`. | ||||
| - Avoid overly specific or deep paths like `//div[3]/ul/li[2]/a`. | ||||
| - Do **not** chain multiple elements deeply (e.g., `//div/span/a`). | ||||
| - Use XPaths further down the tree when possible. | ||||
| - Do not include any extra explanation or text. | ||||
| - One XPath is acceptable if that's all that's needed. | ||||
| - Try and limit it down to 1 - 3 xpaths. | ||||
| - Include a name for each xpath. | ||||
|  | ||||
| <important> | ||||
| - USE THE MOST SIMPLE XPATHS POSSIBLE. | ||||
| - USE THE MOST GENERAL XPATHS POSSIBLE. | ||||
| - USE THE MOST SPECIFIC XPATHS POSSIBLE. | ||||
| - USE THE MOST GENERAL XPATHS POSSIBLE. | ||||
| </important> | ||||
|  | ||||
| **Example Format:** | ||||
| ```xml | ||||
| <xpaths> | ||||
| - <name: insert_name_here>: <xpath: //div> | ||||
| - <name: insert_name_here>: <xpath: //span> | ||||
| - <name: insert_name_here>: <xpath: //span[contains(@text, 'example')]> | ||||
| - <name: insert_name_here>: <xpath: //div[contains(@text, 'example')]> | ||||
| - <name: insert_name_here>: <xpath: //a[@href]> | ||||
| - etc | ||||
| </xpaths> | ||||
|  | ||||
| <decision> | ||||
|     <next_page> | ||||
|         - //a[@href='next_page_url'] | ||||
|     </next_page> | ||||
| </decision> | ||||
| ``` | ||||
|  | ||||
| **Input webpage:** | ||||
| {webpage} | ||||
|  | ||||
| **Target content:** | ||||
| {prompt} | ||||
|  | ||||
| """ | ||||

252  api/backend/ai/agent/utils.py  Normal file

							| @@ -0,0 +1,252 @@ | ||||
| from lxml import html, etree | ||||
| import re | ||||
| from playwright.async_api import Page | ||||
|  | ||||
| from api.backend.models import CapturedElement | ||||
|  | ||||
| from api.backend.job.scraping.scraping_utils import clean_format_characters | ||||
|  | ||||
|  | ||||
| def convert_to_markdown(html_str: str): | ||||
|     parser = html.HTMLParser() | ||||
|     tree = html.fromstring(html_str, parser=parser) | ||||
|     root = tree.getroottree() | ||||
|  | ||||
|     def format_attributes(el: etree._Element) -> str: | ||||
|         """Convert element attributes into a string.""" | ||||
|         return " ".join(f'{k}="{v}"' for k, v in el.attrib.items()) | ||||
|  | ||||
|     def is_visible(el: etree._Element) -> bool: | ||||
|         style = el.attrib.get("style", "").lower() | ||||
|         class_ = el.attrib.get("class", "").lower() | ||||
|  | ||||
|         # Check for visibility styles | ||||
|         if "display: none" in style or "visibility: hidden" in style: | ||||
|             return False | ||||
|         if "opacity: 0" in style or "opacity:0" in style: | ||||
|             return False | ||||
|         if "height: 0" in style or "width: 0" in style: | ||||
|             return False | ||||
|  | ||||
|         # Check for common hidden classes | ||||
|         if any( | ||||
|             hidden in class_ | ||||
|             for hidden in ["hidden", "invisible", "truncate", "collapse"] | ||||
|         ): | ||||
|             return False | ||||
|  | ||||
|         # Check for hidden attributes | ||||
|         if el.attrib.get("hidden") is not None: | ||||
|             return False | ||||
|         if el.attrib.get("aria-hidden") == "true": | ||||
|             return False | ||||
|  | ||||
|         # Check for empty or whitespace-only content | ||||
|         if not el.text and len(el) == 0: | ||||
|             return False | ||||
|  | ||||
|         return True | ||||
|  | ||||
|     def is_layout_or_decorative(el: etree._Element) -> bool: | ||||
|         tag = el.tag.lower() | ||||
|  | ||||
|         # Layout elements | ||||
|         if tag in {"nav", "footer", "header", "aside", "main", "section"}: | ||||
|             return True | ||||
|  | ||||
|         # Decorative elements | ||||
|         if tag in {"svg", "path", "circle", "rect", "line", "polygon", "polyline"}: | ||||
|             return True | ||||
|  | ||||
|         # Check id and class for layout/decorative keywords | ||||
|         id_class = " ".join( | ||||
|             [el.attrib.get("id", ""), el.attrib.get("class", "")] | ||||
|         ).lower() | ||||
|  | ||||
|         layout_keywords = { | ||||
|             "sidebar", | ||||
|             "nav", | ||||
|             "header", | ||||
|             "footer", | ||||
|             "menu", | ||||
|             "advert", | ||||
|             "ads", | ||||
|             "breadcrumb", | ||||
|             "container", | ||||
|             "wrapper", | ||||
|             "layout", | ||||
|             "grid", | ||||
|             "flex", | ||||
|             "row", | ||||
|             "column", | ||||
|             "section", | ||||
|             "banner", | ||||
|             "hero", | ||||
|             "card", | ||||
|             "modal", | ||||
|             "popup", | ||||
|             "tooltip", | ||||
|             "dropdown", | ||||
|             "overlay", | ||||
|         } | ||||
|  | ||||
|         return any(keyword in id_class for keyword in layout_keywords) | ||||
|  | ||||
|     # Tags to ignore in the final markdown output | ||||
|     included_tags = { | ||||
|         "div", | ||||
|         "span", | ||||
|         "a", | ||||
|         "p", | ||||
|         "h1", | ||||
|         "h2", | ||||
|         "h3", | ||||
|         "h4", | ||||
|         "h5", | ||||
|         "h6", | ||||
|         "img", | ||||
|         "button", | ||||
|         "input", | ||||
|         "textarea", | ||||
|         "ul", | ||||
|         "ol", | ||||
|         "li", | ||||
|         "table", | ||||
|         "tr", | ||||
|         "td", | ||||
|         "th", | ||||
|         "input", | ||||
|         "textarea", | ||||
|         "select", | ||||
|         "option", | ||||
|         "optgroup", | ||||
|         "fieldset", | ||||
|         "legend", | ||||
|     } | ||||
|  | ||||
|     special_elements = [] | ||||
|     normal_elements = [] | ||||
|  | ||||
|     for el in tree.iter(): | ||||
|         if el.tag is etree.Comment: | ||||
|             continue | ||||
|  | ||||
|         tag = el.tag.lower() | ||||
|  | ||||
|         if tag not in included_tags: | ||||
|             continue | ||||
|  | ||||
|         if not is_visible(el): | ||||
|             continue | ||||
|  | ||||
|         if is_layout_or_decorative(el): | ||||
|             continue | ||||
|  | ||||
|         path = root.getpath(el) | ||||
|         attrs = format_attributes(el) | ||||
|         attrs_str = f" {attrs}" if attrs else "" | ||||
|         text = el.text.strip() if el.text else "" | ||||
|  | ||||
|         if not text and not attrs: | ||||
|             continue | ||||
|  | ||||
|         # input elements | ||||
|         if tag == "button": | ||||
|             prefix = "🔘 **<button>**" | ||||
|             special_elements.append(f"<!-- {path} -->\n{prefix} {text}") | ||||
|         elif tag == "a": | ||||
|             href = el.attrib.get("href", "") | ||||
|             prefix = f"🔗 **<a href='{href}'>**" | ||||
|             special_elements.append(f"<!-- {path} -->\n{prefix} {text}") | ||||
|         elif tag == "input": | ||||
|             input_type = el.attrib.get("type", "text") | ||||
|             prefix = f"📝 **<input type='{input_type}'>**" | ||||
|             special_elements.append(f"<!-- {path} -->\n{prefix}") | ||||
|         else: | ||||
|             prefix = f"**<{tag}{attrs_str}>**" | ||||
|  | ||||
|             if text: | ||||
|                 normal_elements.append(f"<!-- {path} -->\n{prefix} {text}") | ||||
|  | ||||
|     return "\n\n".join(normal_elements + special_elements)  # type: ignore | ||||
|  | ||||
|  | ||||
| def parse_response(text: str) -> list[dict[str, str]]: | ||||
|     xpaths = re.findall(r"<xpaths>(.*?)</xpaths>", text, re.DOTALL) | ||||
|     results = [] | ||||
|  | ||||
|     if xpaths: | ||||
|         lines = xpaths[0].strip().splitlines() | ||||
|         for line in lines: | ||||
|             if line.strip().startswith("-"): | ||||
|                 name = re.findall(r"<name: (.*?)>", line)[0] | ||||
|                 xpath = re.findall(r"<xpath: (.*?)>", line)[0] | ||||
|                 results.append({"name": name, "xpath": xpath}) | ||||
|             else: | ||||
|                 results.append({"name": "", "xpath": line.strip()}) | ||||
|  | ||||
|     return results | ||||
|  | ||||
|  | ||||
| def parse_next_page(text: str) -> str | None: | ||||
|     next_page = re.findall(r"<next_page>(.*?)</next_page>", text, re.DOTALL) | ||||
|  | ||||
|     if next_page: | ||||
|         lines = next_page[0].strip().splitlines() | ||||
|         next_page = [ | ||||
|             line.strip().lstrip("-").strip() | ||||
|             for line in lines | ||||
|             if line.strip().startswith("-") | ||||
|         ] | ||||
|  | ||||
|     return next_page[0] if next_page else None | ||||
|  | ||||
|  | ||||
| async def capture_elements( | ||||
|     page: Page, xpaths: list[dict[str, str]] | ||||
| ) -> list[CapturedElement]: | ||||
|     captured_elements = [] | ||||
|     seen_texts = set() | ||||
|  | ||||
|     for xpath in xpaths: | ||||
|         try: | ||||
|             locator = page.locator(f"xpath={xpath['xpath']}") | ||||
|             count = await locator.count() | ||||
|  | ||||
|             for i in range(count): | ||||
|                 element_text = "" | ||||
|  | ||||
|                 element_handle = await locator.nth(i).element_handle() | ||||
|  | ||||
|                 if not element_handle: | ||||
|                     continue | ||||
|  | ||||
|                 link = await element_handle.get_attribute("href") or "" | ||||
|  | ||||
|                 text = await element_handle.text_content() | ||||
|  | ||||
|                 if text: | ||||
|                     element_text += text | ||||
|  | ||||
|                 if link: | ||||
|                     element_text += f" ({link})" | ||||
|  | ||||
|                 cleaned = clean_format_characters(element_text) | ||||
|  | ||||
|                 if cleaned in seen_texts: | ||||
|                     continue | ||||
|  | ||||
|                 seen_texts.add(cleaned) | ||||
|  | ||||
|                 captured_elements.append( | ||||
|                     CapturedElement( | ||||
|                         name=xpath["name"], | ||||
|                         text=cleaned, | ||||
|                         xpath=xpath["xpath"], | ||||
|                     ) | ||||
|                 ) | ||||
|  | ||||
|         except Exception as e: | ||||
|             print(f"Error processing xpath {xpath}: {e}") | ||||
|  | ||||
|     return captured_elements | ||||
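
`parse_response` and `parse_next_page` only understand the `<xpaths>`/`<decision>` layout that `ELEMENT_EXTRACTION_PROMPT` asks the model to produce. A small illustrative check against a made-up model reply (the reply text is fabricated for the example, not real model output):

```python
# Run from the repo root so the api package is importable; the reply below is
# a fabricated example that follows the format documented in prompts.py.
from api.backend.ai.agent.utils import parse_next_page, parse_response

reply = """
<xpaths>
- <name: title>: <xpath: //h1>
- <name: links>: <xpath: //a[@href]>
</xpaths>

<decision>
    <next_page>
        - //a[@rel='next']
    </next_page>
</decision>
"""

print(parse_response(reply))
# [{'name': 'title', 'xpath': '//h1'}, {'name': 'links', 'xpath': '//a[@href]'}]

print(parse_next_page(reply))
# //a[@rel='next']
```
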
| @@ -1,32 +1,29 @@ | ||||
| # STL | ||||
| import os | ||||
| import logging | ||||
| from collections.abc import Iterable, AsyncGenerator | ||||
|  | ||||
| # PDM | ||||
| from openai import OpenAI | ||||
| from fastapi import APIRouter | ||||
| from fastapi.responses import JSONResponse, StreamingResponse | ||||
| from openai.types.chat import ChatCompletionMessageParam | ||||
|  | ||||
| # LOCAL | ||||
| from ollama import Message, AsyncClient | ||||
| from ollama import Message | ||||
| from api.backend.models import AI | ||||
|  | ||||
| from api.backend.ai.clients import ( | ||||
|     llama_client, | ||||
|     llama_model, | ||||
|     openai_client, | ||||
|     open_ai_model, | ||||
|     open_ai_key, | ||||
| ) | ||||
|  | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
| ai_router = APIRouter() | ||||
|  | ||||
| # Load environment variables | ||||
| open_ai_key = os.getenv("OPENAI_KEY") | ||||
| open_ai_model = os.getenv("OPENAI_MODEL") | ||||
| llama_url = os.getenv("OLLAMA_URL") | ||||
| llama_model = os.getenv("OLLAMA_MODEL") | ||||
|  | ||||
| # Initialize clients | ||||
| openai_client = OpenAI(api_key=open_ai_key) if open_ai_key else None | ||||
| llama_client = AsyncClient(host=llama_url) if llama_url else None | ||||
|  | ||||
|  | ||||
| async def llama_chat(chat_messages: list[Message]) -> AsyncGenerator[str, None]: | ||||
|     if llama_client and llama_model: | ||||
| @@ -43,6 +40,14 @@ async def llama_chat(chat_messages: list[Message]) -> AsyncGenerator[str, None]: | ||||
| async def openai_chat( | ||||
|     chat_messages: Iterable[ChatCompletionMessageParam], | ||||
| ) -> AsyncGenerator[str, None]: | ||||
|     if openai_client and not open_ai_model: | ||||
|         LOG.error("OpenAI model is not set") | ||||
|         yield "An error occurred while processing your request." | ||||
|  | ||||
|     if not openai_client: | ||||
|         LOG.error("OpenAI client is not set") | ||||
|         yield "An error occurred while processing your request." | ||||
|  | ||||
|     if openai_client and open_ai_model: | ||||
|         try: | ||||
|             response = openai_client.chat.completions.create( | ||||
| @@ -67,4 +72,4 @@ async def ai(c: AI): | ||||
|  | ||||
| @ai_router.get("/ai/check") | ||||
| async def check(): | ||||
|     return JSONResponse(content=bool(open_ai_key or llama_model)) | ||||
|     return JSONResponse(content={"ai_enabled": bool(open_ai_key or llama_model)}) | ||||
|   | ||||

38  api/backend/ai/clients.py  Normal file

							| @@ -0,0 +1,38 @@ | ||||
| import os | ||||
|  | ||||
| from openai import OpenAI | ||||
| from ollama import AsyncClient | ||||
|  | ||||
|  | ||||
| # Load environment variables | ||||
| open_ai_key = os.getenv("OPENAI_KEY") | ||||
| open_ai_model = os.getenv("OPENAI_MODEL") | ||||
| llama_url = os.getenv("OLLAMA_URL") | ||||
| llama_model = os.getenv("OLLAMA_MODEL") | ||||
|  | ||||
| # Initialize clients | ||||
| openai_client = OpenAI(api_key=open_ai_key) if open_ai_key else None | ||||
| llama_client = AsyncClient(host=llama_url) if llama_url else None | ||||
|  | ||||
|  | ||||
| async def ask_open_ai(prompt: str) -> str: | ||||
|     if not openai_client: | ||||
|         raise ValueError("OpenAI client not initialized") | ||||
|  | ||||
|     response = openai_client.chat.completions.create( | ||||
|         model=open_ai_model or "gpt-4.1-mini", | ||||
|         messages=[{"role": "user", "content": prompt}], | ||||
|     ) | ||||
|  | ||||
|     return response.choices[0].message.content or "" | ||||
|  | ||||
|  | ||||
| async def ask_ollama(prompt: str) -> str: | ||||
|     if not llama_client: | ||||
|         raise ValueError("Ollama client not initialized") | ||||
|  | ||||
|     response = await llama_client.chat( | ||||
|         model=llama_model or "", messages=[{"role": "user", "content": prompt}] | ||||
|     ) | ||||
|  | ||||
|     return response.message.content or "" | ||||
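
`agent.py` picks between these helpers with `ask_ai = ask_open_ai if open_ai_key else ask_ollama`, so which backend answers is decided entirely by the environment at import time. A minimal sketch of calling them directly; the environment variables are assumed to have been exported before the interpreter started:

```python
# Assumes OPENAI_KEY, or OLLAMA_URL and OLLAMA_MODEL, are already set in the
# environment; clients.py reads them at import time. Either helper raises
# ValueError when its client was not initialized.
import asyncio

from api.backend.ai.clients import ask_ollama, ask_open_ai, open_ai_key

ask_ai = ask_open_ai if open_ai_key else ask_ollama  # same fallback agent.py uses

answer = asyncio.run(ask_ai("Summarize what Scraperr does in one sentence."))
print(answer)
```
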
| @@ -1,9 +1,14 @@ | ||||
| # STL | ||||
| import os | ||||
| import logging | ||||
| import apscheduler  # type: ignore | ||||
| from contextlib import asynccontextmanager | ||||
|  | ||||
| # PDM | ||||
| from fastapi import FastAPI | ||||
| import apscheduler.schedulers | ||||
| import apscheduler.schedulers.background | ||||
| from fastapi import FastAPI, Request, status | ||||
| from fastapi.exceptions import RequestValidationError | ||||
| from fastapi.middleware.cors import CORSMiddleware | ||||
|  | ||||
| # LOCAL | ||||
| @@ -11,8 +16,12 @@ from api.backend.ai.ai_router import ai_router | ||||
| from api.backend.auth.auth_router import auth_router | ||||
| from api.backend.utils import get_log_level | ||||
| from api.backend.routers.job_router import job_router | ||||
| from api.backend.routers.log_router import log_router | ||||
| from api.backend.routers.stats_router import stats_router | ||||
| from api.backend.database.startup import init_database | ||||
| from fastapi.responses import JSONResponse | ||||
|  | ||||
| from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler | ||||
| from api.backend.scheduler import scheduler | ||||
|  | ||||
| log_level = os.getenv("LOG_LEVEL") | ||||
| LOG_LEVEL = get_log_level(log_level) | ||||
| @@ -25,7 +34,30 @@ logging.basicConfig( | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
| app = FastAPI(title="api") | ||||
|  | ||||
| @asynccontextmanager | ||||
| async def lifespan(app: FastAPI): | ||||
|     # Startup | ||||
|     LOG.info("Starting application...") | ||||
|  | ||||
|     init_database() | ||||
|  | ||||
|     LOG.info("Starting cron scheduler...") | ||||
|     start_cron_scheduler(scheduler) | ||||
|     scheduler.start() | ||||
|     LOG.info("Cron scheduler started successfully") | ||||
|  | ||||
|     yield | ||||
|  | ||||
|     # Shutdown | ||||
|     LOG.info("Shutting down application...") | ||||
|     LOG.info("Stopping cron scheduler...") | ||||
|     scheduler.shutdown(wait=False)  # Set wait=False to not block shutdown | ||||
|     LOG.info("Cron scheduler stopped") | ||||
|     LOG.info("Application shutdown complete") | ||||
|  | ||||
|  | ||||
| app = FastAPI(title="api", root_path="/api", lifespan=lifespan) | ||||
|  | ||||
| app.add_middleware( | ||||
|     CORSMiddleware, | ||||
| @@ -35,9 +67,17 @@ app.add_middleware( | ||||
|     allow_headers=["*"], | ||||
| ) | ||||
|  | ||||
|  | ||||
| app.include_router(auth_router) | ||||
| app.include_router(ai_router) | ||||
| app.include_router(job_router) | ||||
| app.include_router(log_router) | ||||
| app.include_router(stats_router) | ||||
|  | ||||
|  | ||||
| @app.exception_handler(RequestValidationError) | ||||
| async def validation_exception_handler(request: Request, exc: RequestValidationError): | ||||
|     exc_str = f"{exc}".replace("\n", " ").replace("   ", " ") | ||||
|     logging.error(f"{request}: {exc_str}") | ||||
|     content = {"status_code": 10422, "message": exc_str, "data": None} | ||||
|     return JSONResponse( | ||||
|         content=content, status_code=status.HTTP_422_UNPROCESSABLE_ENTITY | ||||
|     ) | ||||
|   | ||||
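
With this handler in place, any request-validation failure is returned as HTTP 422 with a flat JSON body instead of FastAPI's default error list. A hypothetical client-side check; the endpoint path, port, and payload are placeholders, not taken from the diff:

```python
# Hypothetical smoke test: send a deliberately malformed body to a route that
# declares a Pydantic model and inspect the normalized error shape produced by
# validation_exception_handler. Endpoint, port, and payload are guesses.
import requests

resp = requests.post("http://localhost:8000/ai", json={"bogus": True}, timeout=10)
if resp.status_code == 422:
    body = resp.json()
    print(body["status_code"], body["message"])  # 10422 plus the validation details
```
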
| @@ -1,5 +1,6 @@ | ||||
| # STL | ||||
| from datetime import timedelta | ||||
| import os | ||||
|  | ||||
| # PDM | ||||
| from fastapi import Depends, APIRouter, HTTPException, status | ||||
| @@ -7,7 +8,6 @@ from fastapi.security import OAuth2PasswordRequestForm | ||||
|  | ||||
| # LOCAL | ||||
| from api.backend.schemas import User, Token, UserCreate | ||||
| from api.backend.database import get_user_collection | ||||
| from api.backend.auth.auth_utils import ( | ||||
|     ACCESS_TOKEN_EXPIRE_MINUTES, | ||||
|     get_current_user, | ||||
| @@ -15,9 +15,14 @@ from api.backend.auth.auth_utils import ( | ||||
|     get_password_hash, | ||||
|     create_access_token, | ||||
| ) | ||||
| import logging | ||||
|  | ||||
| from api.backend.database.common import update | ||||
|  | ||||
| auth_router = APIRouter() | ||||
|  | ||||
| LOG = logging.getLogger("auth_router") | ||||
|  | ||||
|  | ||||
| @auth_router.post("/auth/token", response_model=Token) | ||||
| async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()): | ||||
| @@ -43,15 +48,26 @@ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends( | ||||
|  | ||||
| @auth_router.post("/auth/signup", response_model=User) | ||||
| async def create_user(user: UserCreate): | ||||
|     users_collection = get_user_collection() | ||||
|     hashed_password = get_password_hash(user.password) | ||||
|     user_dict = user.model_dump() | ||||
|     user_dict["hashed_password"] = hashed_password | ||||
|     del user_dict["password"] | ||||
|     _ = await users_collection.insert_one(user_dict) | ||||
|  | ||||
|     query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)" | ||||
|     _ = update(query, (user_dict["email"], hashed_password, user_dict["full_name"])) | ||||
|  | ||||
|     return user_dict | ||||
|  | ||||
|  | ||||
| @auth_router.get("/auth/users/me", response_model=User) | ||||
| async def read_users_me(current_user: User = Depends(get_current_user)): | ||||
|     return current_user | ||||
|  | ||||
|  | ||||
| @auth_router.get("/auth/check") | ||||
| async def check_auth(): | ||||
|     return { | ||||
|         "registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True", | ||||
|         "recordings_enabled": os.environ.get("RECORDINGS_ENABLED", "true").lower() | ||||
|         == "true", | ||||
|     } | ||||
|   | ||||
| @@ -1,7 +1,5 @@ | ||||
| # STL | ||||
| import os | ||||
| from gc import disable | ||||
| from queue import Empty | ||||
| from typing import Any, Optional | ||||
| from datetime import datetime, timedelta | ||||
| import logging | ||||
| @@ -15,15 +13,16 @@ from fastapi.security import OAuth2PasswordBearer | ||||
|  | ||||
| # LOCAL | ||||
| from api.backend.schemas import User, UserInDB, TokenData | ||||
| from api.backend.database import get_user_collection | ||||
|  | ||||
| from api.backend.database.common import query | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
| _ = load_dotenv() | ||||
|  | ||||
| SECRET_KEY = os.getenv("SECRET_KEY") or "" | ||||
| ALGORITHM = os.getenv("ALGORITHM") or "" | ||||
| ACCESS_TOKEN_EXPIRE_MINUTES = os.getenv("ACCESS_TOKEN_EXPIRE_MINUTES") | ||||
| SECRET_KEY = os.getenv("SECRET_KEY") or "secret" | ||||
| ALGORITHM = os.getenv("ALGORITHM") or "HS256" | ||||
| ACCESS_TOKEN_EXPIRE_MINUTES = int(os.getenv("ACCESS_TOKEN_EXPIRE_MINUTES") or 600) | ||||
|  | ||||
| pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto") | ||||
| oauth2_scheme = OAuth2PasswordBearer(tokenUrl="auth/token") | ||||
| @@ -40,8 +39,8 @@ def get_password_hash(password: str): | ||||
|  | ||||
|  | ||||
| async def get_user(email: str): | ||||
|     user_collection = get_user_collection() | ||||
|     user = await user_collection.find_one({"email": email}) | ||||
|     user_query = "SELECT * FROM users WHERE email = ?" | ||||
|     user = query(user_query, (email,))[0] | ||||
|  | ||||
|     if not user: | ||||
|         return | ||||
| @@ -77,27 +76,42 @@ def create_access_token( | ||||
|  | ||||
|  | ||||
| async def get_current_user(token: str = Depends(oauth2_scheme)): | ||||
|     LOG.info(f"Getting current user with token: {token}") | ||||
|     LOG.debug(f"Getting current user with token: {token}") | ||||
|  | ||||
|     if not token: | ||||
|         LOG.debug("No token provided") | ||||
|         return EMPTY_USER | ||||
|  | ||||
|     if len(token.split(".")) != 3: | ||||
|         LOG.error(f"Malformed token: {token}") | ||||
|         return EMPTY_USER | ||||
|  | ||||
|     try: | ||||
|         LOG.debug( | ||||
|             f"Decoding token: {token} with secret key: {SECRET_KEY} and algorithm: {ALGORITHM}" | ||||
|         ) | ||||
|  | ||||
|         if token.startswith("Bearer "): | ||||
|             token = token.split(" ")[1] | ||||
|  | ||||
|         payload: Optional[dict[str, Any]] = jwt.decode( | ||||
|             token, SECRET_KEY, algorithms=[ALGORITHM] | ||||
|         ) | ||||
|  | ||||
|         if not payload: | ||||
|             LOG.error("No payload found in token") | ||||
|             return EMPTY_USER | ||||
|  | ||||
|         email = payload.get("sub") | ||||
|  | ||||
|         if email is None: | ||||
|             LOG.error("No email found in payload") | ||||
|             return EMPTY_USER | ||||
|  | ||||
|         token_data = TokenData(email=email) | ||||
|  | ||||
|     except JWTError: | ||||
|     except JWTError as e: | ||||
|         LOG.error(f"JWTError occurred: {e}") | ||||
|         return EMPTY_USER | ||||
|  | ||||
|     except Exception as e: | ||||
| @@ -105,7 +119,6 @@ async def get_current_user(token: str = Depends(oauth2_scheme)): | ||||
|         return EMPTY_USER | ||||
|  | ||||
|     user = await get_user(email=token_data.email) | ||||
|  | ||||
|     if user is None: | ||||
|         return EMPTY_USER | ||||
|  | ||||
|   | ||||
16 api/backend/constants.py Normal file
							| @@ -0,0 +1,16 @@ | ||||
| from pathlib import Path | ||||
| import os | ||||
|  | ||||
| DATABASE_PATH = "data/database.db" | ||||
| RECORDINGS_DIR = Path("media/recordings") | ||||
| RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true" | ||||
| MEDIA_DIR = Path("media") | ||||
| MEDIA_TYPES = [ | ||||
|     "audio", | ||||
|     "documents", | ||||
|     "images", | ||||
|     "pdfs", | ||||
|     "presentations", | ||||
|     "spreadsheets", | ||||
|     "videos", | ||||
| ] | ||||
| @@ -1,23 +0,0 @@ | ||||
| # STL | ||||
| import os | ||||
| from typing import Any | ||||
|  | ||||
| # PDM | ||||
| from dotenv import load_dotenv | ||||
| from motor.motor_asyncio import AsyncIOMotorClient | ||||
|  | ||||
| _ = load_dotenv() | ||||
|  | ||||
| MONGODB_URI = os.getenv("MONGODB_URI") | ||||
|  | ||||
|  | ||||
| def get_user_collection(): | ||||
|     client: AsyncIOMotorClient[dict[str, Any]] = AsyncIOMotorClient(MONGODB_URI) | ||||
|     db = client["scrape"] | ||||
|     return db["users"] | ||||
|  | ||||
|  | ||||
| def get_job_collection(): | ||||
|     client: AsyncIOMotorClient[dict[str, Any]] = AsyncIOMotorClient(MONGODB_URI) | ||||
|     db = client["scrape"] | ||||
|     return db["jobs"] | ||||
3 api/backend/database/__init__.py Normal file
							| @@ -0,0 +1,3 @@ | ||||
| from .common import insert, QUERIES, update | ||||
|  | ||||
| __all__ = ["insert", "QUERIES", "update"] | ||||
92 api/backend/database/common.py Normal file
							| @@ -0,0 +1,92 @@ | ||||
| import sqlite3 | ||||
| from typing import Any, Optional | ||||
| from api.backend.constants import DATABASE_PATH | ||||
| from api.backend.utils import format_json, format_sql_row_to_python | ||||
| from api.backend.database.schema import INIT_QUERY | ||||
| from api.backend.database.queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY | ||||
| import logging | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| def connect(): | ||||
|     connection = sqlite3.connect(DATABASE_PATH) | ||||
|     connection.set_trace_callback(print) | ||||
|     cursor = connection.cursor() | ||||
|     return cursor | ||||
|  | ||||
|  | ||||
| def insert(query: str, values: tuple[Any, ...]): | ||||
|     connection = sqlite3.connect(DATABASE_PATH) | ||||
|     cursor = connection.cursor() | ||||
|     copy = list(values) | ||||
|     format_json(copy) | ||||
|  | ||||
|     try: | ||||
|         _ = cursor.execute(query, copy) | ||||
|         connection.commit() | ||||
|     except sqlite3.Error as e: | ||||
|         LOG.error(f"An error occurred: {e}") | ||||
|     finally: | ||||
|         cursor.close() | ||||
|         connection.close() | ||||
|  | ||||
|  | ||||
| def query(query: str, values: Optional[tuple[Any, ...]] = None): | ||||
|     connection = sqlite3.connect(DATABASE_PATH) | ||||
|     connection.row_factory = sqlite3.Row | ||||
|     cursor = connection.cursor() | ||||
|     rows = [] | ||||
|     try: | ||||
|         if values: | ||||
|             _ = cursor.execute(query, values) | ||||
|         else: | ||||
|             _ = cursor.execute(query) | ||||
|  | ||||
|         rows = cursor.fetchall() | ||||
|  | ||||
|     finally: | ||||
|         cursor.close() | ||||
|         connection.close() | ||||
|  | ||||
|     formatted_rows: list[dict[str, Any]] = [] | ||||
|  | ||||
|     for row in rows: | ||||
|         row = dict(row) | ||||
|         formatted_row = format_sql_row_to_python(row) | ||||
|         formatted_rows.append(formatted_row) | ||||
|  | ||||
|     return formatted_rows | ||||
|  | ||||
|  | ||||
| def update(query: str, values: Optional[tuple[Any, ...]] = None): | ||||
|     connection = sqlite3.connect(DATABASE_PATH) | ||||
|     cursor = connection.cursor() | ||||
|  | ||||
|     copy = None | ||||
|  | ||||
|     if values: | ||||
|         copy = list(values) | ||||
|         format_json(copy) | ||||
|  | ||||
|     try: | ||||
|         if copy: | ||||
|             res = cursor.execute(query, copy) | ||||
|         else: | ||||
|             res = cursor.execute(query) | ||||
|         connection.commit() | ||||
|         return res.rowcount | ||||
|     except sqlite3.Error as e: | ||||
|         LOG.error(f"An error occurred: {e}") | ||||
|     finally: | ||||
|         cursor.close() | ||||
|         connection.close() | ||||
|  | ||||
|     return 0 | ||||
|  | ||||
|  | ||||
| QUERIES = { | ||||
|     "init": INIT_QUERY, | ||||
|     "insert_job": JOB_INSERT_QUERY, | ||||
|     "delete_job": DELETE_JOB_QUERY, | ||||
| } | ||||
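A minimal usage sketch of the helpers above (illustrative only: the statements and values are assumptions that follow the users schema introduced later in this diff):

    from api.backend.database.common import insert, query, update

    # Parameterized INSERT: values travel as a tuple and are never interpolated into the SQL.
    insert(
        "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)",
        ("user@example.com", "<bcrypt hash>", "Example User"),
    )

    # SELECT: rows come back as a list of dicts (sqlite3.Row plus format_sql_row_to_python).
    rows = query("SELECT * FROM users WHERE email = ?", ("user@example.com",))

    # UPDATE/DELETE: update() commits and returns the affected row count (0 on error).
    affected = update(
        "UPDATE users SET full_name = ? WHERE email = ?",
        ("Renamed User", "user@example.com"),
    )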
3 api/backend/database/queries/__init__.py Normal file
							| @@ -0,0 +1,3 @@ | ||||
| from .queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY | ||||
|  | ||||
| __all__ = ["JOB_INSERT_QUERY", "DELETE_JOB_QUERY"] | ||||
9 api/backend/database/queries/queries.py Normal file
							| @@ -0,0 +1,9 @@ | ||||
| JOB_INSERT_QUERY = """ | ||||
| INSERT INTO jobs  | ||||
| (id, url, elements, user, time_created, result, status, chat, job_options, agent_mode, prompt) | ||||
| VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) | ||||
| """ | ||||
|  | ||||
| DELETE_JOB_QUERY = """ | ||||
| DELETE FROM jobs WHERE id IN () | ||||
| """ | ||||
3 api/backend/database/schema/__init__.py Normal file
							| @@ -0,0 +1,3 @@ | ||||
| from .schema import INIT_QUERY | ||||
|  | ||||
| __all__ = ["INIT_QUERY"] | ||||
33 api/backend/database/schema/schema.py Normal file
							| @@ -0,0 +1,33 @@ | ||||
| INIT_QUERY = """ | ||||
| CREATE TABLE IF NOT EXISTS jobs ( | ||||
|     id STRING PRIMARY KEY NOT NULL, | ||||
|     url STRING NOT NULL, | ||||
|     elements JSON NOT NULL, | ||||
|     user STRING, | ||||
|     time_created DATETIME NOT NULL, | ||||
|     result JSON NOT NULL, | ||||
|     status STRING NOT NULL, | ||||
|     chat JSON, | ||||
|     job_options JSON | ||||
| ); | ||||
|  | ||||
| CREATE TABLE IF NOT EXISTS users ( | ||||
|     email STRING PRIMARY KEY NOT NULL, | ||||
|     hashed_password STRING NOT NULL, | ||||
|     full_name STRING, | ||||
|     disabled BOOLEAN | ||||
| ); | ||||
|  | ||||
| CREATE TABLE IF NOT EXISTS cron_jobs ( | ||||
|     id STRING PRIMARY KEY NOT NULL, | ||||
|     user_email STRING NOT NULL, | ||||
|     job_id STRING NOT NULL, | ||||
|     cron_expression STRING NOT NULL, | ||||
|     time_created DATETIME NOT NULL, | ||||
|     time_updated DATETIME NOT NULL, | ||||
|     FOREIGN KEY (job_id) REFERENCES jobs(id) | ||||
| ); | ||||
|  | ||||
| ALTER TABLE jobs ADD COLUMN agent_mode BOOLEAN NOT NULL DEFAULT FALSE; | ||||
| ALTER TABLE jobs ADD COLUMN prompt STRING; | ||||
| """ | ||||
55 api/backend/database/startup.py Normal file
							| @@ -0,0 +1,55 @@ | ||||
| import os | ||||
| from api.backend.database.common import connect, QUERIES, insert | ||||
| import logging | ||||
| import sqlite3 | ||||
|  | ||||
| from api.backend.auth.auth_utils import get_password_hash | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| def init_database(): | ||||
|     cursor = connect() | ||||
|  | ||||
|     for query in QUERIES["init"].strip().split(";"): | ||||
|         query = query.strip() | ||||
|         if not query: | ||||
|             continue | ||||
|  | ||||
|         try: | ||||
|             LOG.info(f"Executing query: {query}") | ||||
|             _ = cursor.execute(query) | ||||
|         except sqlite3.OperationalError as e: | ||||
|             if "duplicate column name" in str(e).lower(): | ||||
|                 LOG.warning(f"Skipping duplicate column error: {e}") | ||||
|                 continue | ||||
|             else: | ||||
|                 LOG.error(f"Error executing query: {query}") | ||||
|                 raise | ||||
|  | ||||
|     if os.environ.get("REGISTRATION_ENABLED", "true").lower() == "false": | ||||
|         default_user_email = os.environ.get("DEFAULT_USER_EMAIL") | ||||
|         default_user_password = os.environ.get("DEFAULT_USER_PASSWORD") | ||||
|         default_user_full_name = os.environ.get("DEFAULT_USER_FULL_NAME") | ||||
|  | ||||
|         if ( | ||||
|             not default_user_email | ||||
|             or not default_user_password | ||||
|             or not default_user_full_name | ||||
|         ): | ||||
|             LOG.error( | ||||
|                 "DEFAULT_USER_EMAIL, DEFAULT_USER_PASSWORD, or DEFAULT_USER_FULL_NAME is not set!" | ||||
|             ) | ||||
|             exit(1) | ||||
|  | ||||
|         query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)" | ||||
|         _ = insert( | ||||
|             query, | ||||
|             ( | ||||
|                 default_user_email, | ||||
|                 get_password_hash(default_user_password), | ||||
|                 default_user_full_name, | ||||
|             ), | ||||
|         ) | ||||
|  | ||||
|     cursor.close() | ||||
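The default-user bootstrap above only runs when registration is disabled; a hedged example of the environment it expects (variable names come from the code above, values are placeholders):

    REGISTRATION_ENABLED=false
    DEFAULT_USER_EMAIL=admin@example.com
    DEFAULT_USER_PASSWORD=change-me
    DEFAULT_USER_FULL_NAME=Admin User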
| @@ -1,119 +0,0 @@ | ||||
| # STL | ||||
| import logging | ||||
| from typing import Any, Optional | ||||
|  | ||||
| # PDM | ||||
| from pymongo import DESCENDING | ||||
|  | ||||
| # LOCAL | ||||
| from api.backend.models import FetchOptions | ||||
| from api.backend.database import get_job_collection | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| async def insert(item: dict[str, Any]) -> None: | ||||
|     collection = get_job_collection() | ||||
|     i = await collection.insert_one(item) | ||||
|     LOG.info(f"Inserted item: {i}") | ||||
|  | ||||
|  | ||||
| async def get_queued_job(): | ||||
|     collection = get_job_collection() | ||||
|     return await collection.find_one( | ||||
|         {"status": "Queued"}, sort=[("created_at", DESCENDING)] | ||||
|     ) | ||||
|  | ||||
|  | ||||
| async def query( | ||||
|     filter: dict[str, Any], fetch_options: Optional[FetchOptions] = None | ||||
| ) -> list[dict[str, Any]]: | ||||
|     collection = get_job_collection() | ||||
|     cursor = collection.find(filter) | ||||
|     results: list[dict[str, Any]] = [] | ||||
|  | ||||
|     async for document in cursor: | ||||
|         del document["_id"] | ||||
|  | ||||
|         if fetch_options and not fetch_options.chat and document.get("chat"): | ||||
|             del document["chat"] | ||||
|  | ||||
|         results.append(document) | ||||
|  | ||||
|     return results | ||||
|  | ||||
|  | ||||
| async def update_job(ids: list[str], field: str, value: Any): | ||||
|     collection = get_job_collection() | ||||
|     for id in ids: | ||||
|         _ = await collection.update_one( | ||||
|             {"id": id}, | ||||
|             {"$set": {field: value}}, | ||||
|         ) | ||||
|  | ||||
|  | ||||
| async def delete_jobs(jobs: list[str]): | ||||
|     collection = get_job_collection() | ||||
|     result = await collection.delete_many({"id": {"$in": jobs}}) | ||||
|     LOG.info(f"{result.deleted_count} documents deleted") | ||||
|  | ||||
|     return True if result.deleted_count > 0 else False | ||||
|  | ||||
|  | ||||
| async def average_elements_per_link(user: str): | ||||
|     collection = get_job_collection() | ||||
|     pipeline = [ | ||||
|         {"$match": {"status": "Completed", "user": user}}, | ||||
|         { | ||||
|             "$project": { | ||||
|                 "date": { | ||||
|                     "$dateToString": {"format": "%Y-%m-%d", "date": "$time_created"} | ||||
|                 }, | ||||
|                 "num_elements": {"$size": "$elements"}, | ||||
|             } | ||||
|         }, | ||||
|         { | ||||
|             "$group": { | ||||
|                 "_id": "$date", | ||||
|                 "average_elements": {"$avg": "$num_elements"}, | ||||
|                 "count": {"$sum": 1}, | ||||
|             } | ||||
|         }, | ||||
|         {"$sort": {"_id": 1}}, | ||||
|     ] | ||||
|     cursor = collection.aggregate(pipeline) | ||||
|     results: list[dict[str, Any]] = [] | ||||
|  | ||||
|     async for document in cursor: | ||||
|         results.append( | ||||
|             { | ||||
|                 "date": document["_id"], | ||||
|                 "average_elements": document["average_elements"], | ||||
|                 "count": document["count"], | ||||
|             } | ||||
|         ) | ||||
|  | ||||
|     return results | ||||
|  | ||||
|  | ||||
| async def get_jobs_per_day(user: str): | ||||
|     collection = get_job_collection() | ||||
|     pipeline = [ | ||||
|         {"$match": {"status": "Completed", "user": user}}, | ||||
|         { | ||||
|             "$project": { | ||||
|                 "date": { | ||||
|                     "$dateToString": {"format": "%Y-%m-%d", "date": "$time_created"} | ||||
|                 } | ||||
|             } | ||||
|         }, | ||||
|         {"$group": {"_id": "$date", "job_count": {"$sum": 1}}}, | ||||
|         {"$sort": {"_id": 1}}, | ||||
|     ] | ||||
|     cursor = collection.aggregate(pipeline) | ||||
|  | ||||
|     results: list[dict[str, Any]] = [] | ||||
|     async for document in cursor: | ||||
|         results.append({"date": document["_id"], "job_count": document["job_count"]}) | ||||
|  | ||||
|     return results | ||||
17 api/backend/job/__init__.py Normal file
							| @@ -0,0 +1,17 @@ | ||||
| from .job import ( | ||||
|     insert, | ||||
|     update_job, | ||||
|     delete_jobs, | ||||
|     get_jobs_per_day, | ||||
|     get_queued_job, | ||||
|     average_elements_per_link, | ||||
| ) | ||||
|  | ||||
| __all__ = [ | ||||
|     "insert", | ||||
|     "update_job", | ||||
|     "delete_jobs", | ||||
|     "get_jobs_per_day", | ||||
|     "get_queued_job", | ||||
|     "average_elements_per_link", | ||||
| ] | ||||
100 api/backend/job/cron_scheduling/cron_scheduling.py Normal file
							| @@ -0,0 +1,100 @@ | ||||
| import datetime | ||||
| from typing import Any | ||||
| import uuid | ||||
| from api.backend.database.common import insert, query | ||||
| from api.backend.models import CronJob | ||||
| from apscheduler.schedulers.background import BackgroundScheduler  # type: ignore | ||||
| from apscheduler.triggers.cron import CronTrigger  # type: ignore | ||||
|  | ||||
| from api.backend.job import insert as insert_job | ||||
| import logging | ||||
|  | ||||
| LOG = logging.getLogger("Cron Scheduler") | ||||
|  | ||||
|  | ||||
| def insert_cron_job(cron_job: CronJob): | ||||
|     query = """ | ||||
|     INSERT INTO cron_jobs (id, user_email, job_id, cron_expression, time_created, time_updated) | ||||
|     VALUES (?, ?, ?, ?, ?, ?) | ||||
|     """ | ||||
|     values = ( | ||||
|         cron_job.id, | ||||
|         cron_job.user_email, | ||||
|         cron_job.job_id, | ||||
|         cron_job.cron_expression, | ||||
|         cron_job.time_created, | ||||
|         cron_job.time_updated, | ||||
|     ) | ||||
|  | ||||
|     insert(query, values) | ||||
|  | ||||
|     return True | ||||
|  | ||||
|  | ||||
| def delete_cron_job(id: str, user_email: str): | ||||
|     query = """ | ||||
|     DELETE FROM cron_jobs | ||||
|     WHERE id = ? AND user_email = ? | ||||
|     """ | ||||
|     values = (id, user_email) | ||||
|     insert(query, values) | ||||
|  | ||||
|     return True | ||||
|  | ||||
|  | ||||
| def get_cron_jobs(user_email: str): | ||||
|     cron_jobs = query("SELECT * FROM cron_jobs WHERE user_email = ?", (user_email,)) | ||||
|  | ||||
|     return cron_jobs | ||||
|  | ||||
|  | ||||
| def get_all_cron_jobs(): | ||||
|     cron_jobs = query("SELECT * FROM cron_jobs") | ||||
|  | ||||
|     return cron_jobs | ||||
|  | ||||
|  | ||||
| def insert_job_from_cron_job(job: dict[str, Any]): | ||||
|     insert_job( | ||||
|         { | ||||
|             **job, | ||||
|             "id": uuid.uuid4().hex, | ||||
|             "status": "Queued", | ||||
|             "result": "", | ||||
|             "chat": None, | ||||
|             "time_created": datetime.datetime.now(), | ||||
|             "time_updated": datetime.datetime.now(), | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def get_cron_job_trigger(cron_expression: str): | ||||
|     expression_parts = cron_expression.split() | ||||
|  | ||||
|     if len(expression_parts) != 5: | ||||
|         print(f"Invalid cron expression: {cron_expression}") | ||||
|         return None | ||||
|  | ||||
|     minute, hour, day, month, day_of_week = expression_parts | ||||
|  | ||||
|     return CronTrigger( | ||||
|         minute=minute, hour=hour, day=day, month=month, day_of_week=day_of_week | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def start_cron_scheduler(scheduler: BackgroundScheduler): | ||||
|     cron_jobs = get_all_cron_jobs() | ||||
|  | ||||
|     LOG.info(f"Cron jobs: {cron_jobs}") | ||||
|  | ||||
|     for job in cron_jobs: | ||||
|         queried_job = query("SELECT * FROM jobs WHERE id = ?", (job["job_id"],)) | ||||
|  | ||||
|         LOG.info(f"Adding job: {queried_job}") | ||||
|  | ||||
|         scheduler.add_job( | ||||
|             insert_job_from_cron_job, | ||||
|             get_cron_job_trigger(job["cron_expression"]), | ||||
|             id=job["id"], | ||||
|             args=[queried_job[0]], | ||||
|         ) | ||||
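A rough sketch of how a single cron job would be registered at request time, mirroring the /schedule-cron-job route later in this diff (assumes the referenced jobs row already exists; identifiers and the cron expression are placeholders):

    import datetime
    import uuid

    from api.backend.scheduler import scheduler
    from api.backend.models import CronJob
    from api.backend.database.common import query
    from api.backend.job.cron_scheduling.cron_scheduling import (
        insert_cron_job,
        get_cron_job_trigger,
        insert_job_from_cron_job,
    )

    cron_job = CronJob(
        id=uuid.uuid4().hex,
        user_email="user@example.com",
        job_id="existing-job-id",            # must reference a row in the jobs table
        cron_expression="0 * * * *",         # standard 5-field cron: top of every hour
        time_created=datetime.datetime.now(),
        time_updated=datetime.datetime.now(),
    )
    insert_cron_job(cron_job)

    # Schedule the recurring insert the same way start_cron_scheduler() does at startup.
    queried_job = query("SELECT * FROM jobs WHERE id = ?", (cron_job.job_id,))[0]
    scheduler.add_job(
        insert_job_from_cron_job,
        get_cron_job_trigger(cron_job.cron_expression),
        id=cron_job.id,
        args=[queried_job],
    )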
99 api/backend/job/job.py Normal file
							| @@ -0,0 +1,99 @@ | ||||
| # STL | ||||
| import logging | ||||
| from typing import Any | ||||
|  | ||||
| # LOCAL | ||||
| from api.backend.utils import format_list_for_query | ||||
| from api.backend.database.common import ( | ||||
|     insert as common_insert, | ||||
|     query as common_query, | ||||
|     QUERIES, | ||||
|     update as common_update, | ||||
| ) | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| def insert(item: dict[str, Any]) -> None: | ||||
|     common_insert( | ||||
|         QUERIES["insert_job"], | ||||
|         ( | ||||
|             item["id"], | ||||
|             item["url"], | ||||
|             item["elements"], | ||||
|             item["user"], | ||||
|             item["time_created"], | ||||
|             item["result"], | ||||
|             item["status"], | ||||
|             item["chat"], | ||||
|             item["job_options"], | ||||
|             item["agent_mode"], | ||||
|             item["prompt"], | ||||
|         ), | ||||
|     ) | ||||
|     LOG.info(f"Inserted item: {item}") | ||||
|  | ||||
|  | ||||
| async def get_queued_job(): | ||||
|     query = ( | ||||
|         "SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1" | ||||
|     ) | ||||
|     res = common_query(query) | ||||
|     LOG.info(f"Got queued job: {res}") | ||||
|     return res[0] if res else None | ||||
|  | ||||
|  | ||||
| async def update_job(ids: list[str], field: str, value: Any): | ||||
|     query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}" | ||||
|     res = common_update(query, tuple([value] + ids)) | ||||
|     LOG.info(f"Updated job: {res}") | ||||
|  | ||||
|  | ||||
| async def delete_jobs(jobs: list[str]): | ||||
|     if not jobs: | ||||
|         LOG.info("No jobs to delete.") | ||||
|         return False | ||||
|  | ||||
|     query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}" | ||||
|     res = common_update(query, tuple(jobs)) | ||||
|  | ||||
|     return res > 0 | ||||
|  | ||||
|  | ||||
| async def average_elements_per_link(user: str): | ||||
|     job_query = """ | ||||
|     SELECT  | ||||
|         DATE(time_created) AS date, | ||||
|         AVG(json_array_length(elements)) AS average_elements, | ||||
|         COUNT(*) AS count | ||||
|     FROM  | ||||
|         jobs | ||||
|     WHERE  | ||||
|         status = 'Completed' AND user = ? | ||||
|     GROUP BY  | ||||
|         DATE(time_created) | ||||
|     ORDER BY  | ||||
|         date ASC; | ||||
|     """ | ||||
|     results = common_query(job_query, (user,)) | ||||
|  | ||||
|     return results | ||||
|  | ||||
|  | ||||
| async def get_jobs_per_day(user: str): | ||||
|     job_query = """ | ||||
|     SELECT  | ||||
|         DATE(time_created) AS date, | ||||
|         COUNT(*) AS job_count | ||||
|     FROM  | ||||
|         jobs | ||||
|     WHERE  | ||||
|         status = 'Completed' AND user = ? | ||||
|     GROUP BY  | ||||
|         DATE(time_created) | ||||
|     ORDER BY  | ||||
|         date ASC; | ||||
|     """ | ||||
|     results = common_query(job_query, (user,)) | ||||
|  | ||||
|     return results | ||||
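format_list_for_query is imported from api.backend.utils but not shown in this diff; a plausible implementation (an assumption, not the repository's actual code) that keeps the IN clauses above parameterized would be:

    def format_list_for_query(values: list[str]) -> str:
        # Builds a placeholder group such as "(?, ?, ?)"; callers pass the values tuple separately.
        return f"({', '.join('?' for _ in values)})"

    # e.g. delete_jobs(["a", "b"]) would execute
    #   DELETE FROM jobs WHERE id IN (?, ?)   with parameters ("a", "b")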
3 api/backend/job/models/__init__.py Normal file
							| @@ -0,0 +1,3 @@ | ||||
| from .job_options import JobOptions | ||||
|  | ||||
| __all__ = ["JobOptions"] | ||||
16 api/backend/job/models/job_options.py Normal file
							| @@ -0,0 +1,16 @@ | ||||
| from pydantic import BaseModel | ||||
| from typing import Any, Optional | ||||
| from api.backend.job.models.site_map import SiteMap | ||||
|  | ||||
|  | ||||
| class FetchOptions(BaseModel): | ||||
|     chat: Optional[bool] = None | ||||
|  | ||||
|  | ||||
| class JobOptions(BaseModel): | ||||
|     multi_page_scrape: bool = False | ||||
|     custom_headers: dict[str, Any] = {} | ||||
|     proxies: list[str] = [] | ||||
|     site_map: Optional[SiteMap] = None | ||||
|     collect_media: bool = False | ||||
|     custom_cookies: list[dict[str, Any]] = [] | ||||
14 api/backend/job/models/site_map.py Normal file
							| @@ -0,0 +1,14 @@ | ||||
| from pydantic import BaseModel | ||||
| from typing import Literal | ||||
|  | ||||
|  | ||||
| class Action(BaseModel): | ||||
|     type: Literal["click", "input"] | ||||
|     xpath: str | ||||
|     name: str | ||||
|     input: str = "" | ||||
|     do_once: bool = True | ||||
|  | ||||
|  | ||||
| class SiteMap(BaseModel): | ||||
|     actions: list[Action] | ||||
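An illustrative site-map payload matching these models (XPaths and values are placeholders), as consumed by handle_site_mapping later in this diff:

    site_map_dict = {
        "actions": [
            {"type": "input", "xpath": "//input[@name='q']",
             "name": "search box", "input": "laptops", "do_once": True},
            {"type": "click", "xpath": "//button[@type='submit']",
             "name": "submit button", "do_once": False},
        ]
    }
    site_map = SiteMap(**site_map_dict)  # validated by pydantic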
48 api/backend/job/scraping/add_custom.py Normal file
							| @@ -0,0 +1,48 @@ | ||||
| from typing import Any, Optional | ||||
| from urllib.parse import urlparse | ||||
|  | ||||
| from playwright.async_api import Page, BrowserContext | ||||
|  | ||||
| import logging | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| async def add_custom_cookies( | ||||
|     custom_cookies: list[dict[str, Any]], | ||||
|     url: str, | ||||
|     context: BrowserContext, | ||||
| ) -> None: | ||||
|     parsed_url = urlparse(url) | ||||
|     domain = parsed_url.netloc | ||||
|  | ||||
|     for cookie in custom_cookies: | ||||
|         cookie_dict = { | ||||
|             "name": cookie.get("name", "default_name"), | ||||
|             "value": cookie.get("value", "default_value"), | ||||
|             "domain": domain, | ||||
|             "path": "/", | ||||
|         } | ||||
|  | ||||
|         LOG.info(f"Adding cookie: {cookie_dict}") | ||||
|         await context.add_cookies([cookie_dict])  # type: ignore | ||||
|  | ||||
|  | ||||
| async def add_custom_headers( | ||||
|     custom_headers: dict[str, Any], | ||||
|     page: Page, | ||||
| ) -> None: | ||||
|     await page.set_extra_http_headers(custom_headers) | ||||
|  | ||||
|  | ||||
| async def add_custom_items( | ||||
|     url: str, | ||||
|     page: Page, | ||||
|     cookies: Optional[list[dict[str, Any]]] = None, | ||||
|     headers: Optional[dict[str, Any]] = None, | ||||
| ) -> None: | ||||
|     if cookies: | ||||
|         await add_custom_cookies(cookies, url, page.context) | ||||
|  | ||||
|     if headers: | ||||
|         await add_custom_headers(headers, page) | ||||
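A minimal sketch of how these helpers are meant to be called from the scraping coroutine (the page object, URL, and cookie/header values are placeholders):

    # Inside an async scraping function, before navigating to the target URL:
    await add_custom_items(
        url="https://example.com",
        page=page,                                   # an open playwright.async_api.Page
        cookies=[{"name": "session", "value": "abc123"}],
        headers={"User-Agent": "Scraperr/1.0"},
    )
    await page.goto("https://example.com")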
110 api/backend/job/scraping/collect_media.py Normal file
							| @@ -0,0 +1,110 @@ | ||||
| import os | ||||
| from pathlib import Path | ||||
| import re | ||||
| from urllib.parse import urljoin, urlparse | ||||
| from typing import Dict, List | ||||
|  | ||||
| import aiohttp | ||||
| from playwright.async_api import Page | ||||
|  | ||||
| from api.backend.utils import LOG | ||||
|  | ||||
|  | ||||
| async def collect_media(id: str, page: Page) -> dict[str, list[dict[str, str]]]: | ||||
|     media_types = { | ||||
|         "images": "img", | ||||
|         "videos": "video", | ||||
|         "audio": "audio", | ||||
|         "pdfs": 'a[href$=".pdf"], a[href*=".pdf#page="]', | ||||
|         "documents": 'a[href$=".doc"], a[href$=".docx"], a[href$=".txt"], a[href$=".rtf"]', | ||||
|         "presentations": 'a[href$=".ppt"], a[href$=".pptx"]', | ||||
|         "spreadsheets": 'a[href$=".xls"], a[href$=".xlsx"], a[href$=".csv"]', | ||||
|     } | ||||
|  | ||||
|     base_dir = Path("media") | ||||
|     base_dir.mkdir(exist_ok=True) | ||||
|  | ||||
|     media_urls = {} | ||||
|  | ||||
|     async with aiohttp.ClientSession() as session: | ||||
|         for media_type, selector in media_types.items(): | ||||
|             elements = await page.query_selector_all(selector) | ||||
|             urls: List[Dict[str, str]] = [] | ||||
|  | ||||
|             media_dir = base_dir / media_type | ||||
|             media_dir.mkdir(exist_ok=True) | ||||
|  | ||||
|             for element in elements: | ||||
|                 if media_type == "images": | ||||
|                     url = await element.get_attribute("src") | ||||
|                 elif media_type == "videos": | ||||
|                     url = await element.get_attribute( | ||||
|                         "src" | ||||
|                     ) or await element.get_attribute("data-src") | ||||
|                 else: | ||||
|                     url = await element.get_attribute("href") | ||||
|  | ||||
|                 if url and url.startswith("/"): | ||||
|                     root_url = urlparse(page.url) | ||||
|                     root_domain = f"{root_url.scheme}://{root_url.netloc}" | ||||
|                     url = f"{root_domain}{url}" | ||||
|  | ||||
|                 if url and re.match(r"^[\w\-]+/", url): | ||||
|                     root_url = urlparse(page.url) | ||||
|                     root_domain = f"{root_url.scheme}://{root_url.netloc}" | ||||
|                     url = urljoin(root_domain + "/", url) | ||||
|  | ||||
|                 if url and url.startswith(("http://", "https://")): | ||||
|                     try: | ||||
|                         parsed = urlparse(url) | ||||
|                         filename = ( | ||||
|                             os.path.basename(parsed.path) or f"{media_type}_{len(urls)}" | ||||
|                         ) | ||||
|  | ||||
|                         if "." not in filename: | ||||
|                             ext = { | ||||
|                                 "images": ".jpg", | ||||
|                                 "videos": ".mp4", | ||||
|                                 "audio": ".mp3", | ||||
|                                 "pdfs": ".pdf", | ||||
|                                 "documents": ".doc", | ||||
|                                 "presentations": ".ppt", | ||||
|                                 "spreadsheets": ".xls", | ||||
|                             }.get(media_type, "") | ||||
|                             filename += ext | ||||
|  | ||||
|                         if not os.path.exists(media_dir / id): | ||||
|                             os.makedirs(media_dir / id, exist_ok=True) | ||||
|  | ||||
|                         file_path = media_dir / id / f"{filename}" | ||||
|  | ||||
|                         async with session.get(url) as response: | ||||
|                             response.raise_for_status() | ||||
|  | ||||
|                             with open(file_path, "wb") as f: | ||||
|                                 while True: | ||||
|                                     chunk = await response.content.read(8192) | ||||
|                                     if not chunk: | ||||
|                                         break | ||||
|  | ||||
|                                     f.write(chunk) | ||||
|  | ||||
|                         urls.append({"url": url, "local_path": str(file_path)}) | ||||
|                         LOG.info(f"Downloaded {filename} to {file_path}") | ||||
|  | ||||
|                     except Exception as e: | ||||
|                         LOG.error(f"Error downloading {url}: {str(e)}") | ||||
|                         continue | ||||
|  | ||||
|             media_urls[media_type] = urls | ||||
|  | ||||
|     # Write summary | ||||
|     with open(base_dir / "download_summary.txt", "w") as f: | ||||
|         for media_type, downloads in media_urls.items(): | ||||
|             if downloads: | ||||
|                 f.write(f"\n=== {media_type.upper()} ===\n") | ||||
|                 for download in downloads: | ||||
|                     f.write(f"URL: {download['url']}\n") | ||||
|                     f.write(f"Saved to: {download['local_path']}\n\n") | ||||
|  | ||||
|     return media_urls | ||||
45 api/backend/job/scraping/scraping_utils.py Normal file
							| @@ -0,0 +1,45 @@ | ||||
| import asyncio | ||||
| from typing import Set, Tuple | ||||
| from playwright.async_api import Page | ||||
|  | ||||
| from api.backend.utils import LOG | ||||
|  | ||||
| from api.backend.job.scraping.collect_media import collect_media as collect_media_utils | ||||
|  | ||||
|  | ||||
| async def scrape_content( | ||||
|     id: str, page: Page, pages: Set[Tuple[str, str]], collect_media: bool | ||||
| ) -> str: | ||||
|     last_height = await page.evaluate("document.body.scrollHeight") | ||||
|  | ||||
|     while True: | ||||
|         await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") | ||||
|         await asyncio.sleep(3) | ||||
|         new_height = await page.evaluate("document.body.scrollHeight") | ||||
|  | ||||
|         if new_height == last_height: | ||||
|             break | ||||
|  | ||||
|         last_height = new_height | ||||
|  | ||||
|     html = await page.content() | ||||
|     pages.add((html, page.url)) | ||||
|  | ||||
|     if collect_media: | ||||
|         LOG.info("Collecting media") | ||||
|         await collect_media_utils(id, page) | ||||
|  | ||||
|     return html | ||||
|  | ||||
|  | ||||
| def clean_format_characters(text: str) -> str: | ||||
|     text = text.strip() | ||||
|     text = text.replace("\n", " ") | ||||
|     text = text.replace("\t", " ") | ||||
|     text = text.replace("\r", " ") | ||||
|     text = text.replace("\f", " ") | ||||
|     text = text.replace("\v", " ") | ||||
|     text = text.replace("\b", " ") | ||||
|     text = text.replace("\a", " ") | ||||
|  | ||||
|     return text | ||||
0 api/backend/job/site_mapping/__init__.py Normal file
77 api/backend/job/site_mapping/site_mapping.py Normal file
							| @@ -0,0 +1,77 @@ | ||||
| import logging | ||||
| import asyncio | ||||
| from copy import deepcopy | ||||
| from typing import Any | ||||
|  | ||||
| from playwright.async_api import Page | ||||
|  | ||||
| from api.backend.job.models.site_map import Action, SiteMap | ||||
| from api.backend.job.scraping.scraping_utils import scrape_content | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]: | ||||
|     """Clear all actions that have been clicked.""" | ||||
|     cleared_site_map = deepcopy(site_map) | ||||
|     cleared_site_map["actions"] = [ | ||||
|         action for action in cleared_site_map["actions"] if not action["do_once"] | ||||
|     ] | ||||
|  | ||||
|     return cleared_site_map | ||||
|  | ||||
|  | ||||
| async def handle_input(action: Action, page: Page) -> bool: | ||||
|     try: | ||||
|         element = page.locator(f"xpath={action.xpath}") | ||||
|         LOG.info(f"Sending keys: {action.input} to element: {action.xpath}") | ||||
|         await element.fill(action.input) | ||||
|         return True | ||||
|     except Exception as e: | ||||
|         LOG.warning(f"Error handling input for xpath '{action.xpath}': {e}") | ||||
|         return False | ||||
|  | ||||
|  | ||||
| async def handle_click(action: Action, page: Page) -> bool: | ||||
|     try: | ||||
|         element = page.locator(f"xpath={action.xpath}") | ||||
|         LOG.info(f"Clicking element: {action.xpath}") | ||||
|         await element.click() | ||||
|         return True | ||||
|     except Exception as e: | ||||
|         LOG.warning(f"Error clicking element at xpath '{action.xpath}': {e}") | ||||
|         return False | ||||
|  | ||||
|  | ||||
| ACTION_MAP = { | ||||
|     "click": handle_click, | ||||
|     "input": handle_input, | ||||
| } | ||||
|  | ||||
|  | ||||
| async def handle_site_mapping( | ||||
|     id: str, | ||||
|     site_map_dict: dict[str, Any], | ||||
|     page: Page, | ||||
|     pages: set[tuple[str, str]], | ||||
|     collect_media: bool = False, | ||||
| ): | ||||
|     site_map = SiteMap(**site_map_dict) | ||||
|  | ||||
|     for action in site_map.actions: | ||||
|         action_handler = ACTION_MAP[action.type] | ||||
|         success = await action_handler(action, page) | ||||
|  | ||||
|         if not success: | ||||
|             return | ||||
|  | ||||
|         await asyncio.sleep(2) | ||||
|  | ||||
|     await scrape_content(id, page, pages, collect_media=collect_media) | ||||
|  | ||||
|     cleared_site_map_dict = clear_done_actions(site_map_dict) | ||||
|  | ||||
|     if cleared_site_map_dict["actions"]: | ||||
|         await handle_site_mapping( | ||||
|             id, cleared_site_map_dict, page, pages, collect_media=collect_media | ||||
|         ) | ||||
36 api/backend/job/utils/clean_job_format.py Normal file
							| @@ -0,0 +1,36 @@ | ||||
| from typing import Any | ||||
|  | ||||
| from api.backend.utils import clean_text | ||||
|  | ||||
|  | ||||
| def clean_job_format(jobs: list[dict[str, Any]]) -> dict[str, Any]: | ||||
|     """ | ||||
|     Convert a single job to a dictionary format. | ||||
|     """ | ||||
|     headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"] | ||||
|  | ||||
|     cleaned_rows = [] | ||||
|  | ||||
|     for job in jobs: | ||||
|         for res in job["result"]: | ||||
|             for url, elements in res.items(): | ||||
|                 for element_name, values in elements.items(): | ||||
|                     for value in values: | ||||
|                         text = clean_text(value.get("text", "")).strip() | ||||
|                         if text: | ||||
|                             cleaned_rows.append( | ||||
|                                 { | ||||
|                                     "id": job.get("id", ""), | ||||
|                                     "url": url, | ||||
|                                     "element_name": element_name, | ||||
|                                     "xpath": value.get("xpath", ""), | ||||
|                                     "text": text, | ||||
|                                     "user": job.get("user", ""), | ||||
|                                     "time_created": job.get("time_created", ""), | ||||
|                                 } | ||||
|                             ) | ||||
|  | ||||
|     return { | ||||
|         "headers": headers, | ||||
|         "rows": cleaned_rows, | ||||
|     } | ||||
24 api/backend/job/utils/stream_md_from_job_results.py Normal file
							| @@ -0,0 +1,24 @@ | ||||
| from typing import Any | ||||
|  | ||||
| from api.backend.utils import clean_text | ||||
|  | ||||
|  | ||||
| def stream_md_from_job_results(jobs: list[dict[str, Any]]): | ||||
|     md = "# Job Results Summary\n\n" | ||||
|     for i, job in enumerate(jobs, start=1): | ||||
|         md += f"## Job #{i}\n" | ||||
|         yield f"- **Job URL:** {job.get('url', 'N/A')}\n" | ||||
|         yield f"- **Timestamp:** {job.get('time_created', 'N/A')}\n" | ||||
|         yield f"- **ID:** {job.get('id', 'N/A')}\n" | ||||
|         yield "### Extracted Results:\n" | ||||
|  | ||||
|         for res in job.get("result", []): | ||||
|             for url, elements in res.items(): | ||||
|                 yield f"\n#### URL: {url}\n" | ||||
|                 for element_name, values in elements.items(): | ||||
|                     for value in values: | ||||
|                         text = clean_text(value.get("text", "")).strip() | ||||
|                         if text: | ||||
|                             yield f"- **Element:** `{element_name}`\n" | ||||
|                             yield f"  - **Text:** {text}\n" | ||||
|         yield "\n---\n" | ||||
| @@ -1,15 +1,14 @@ | ||||
| # STL | ||||
| from typing import Any, Optional, Union | ||||
| from typing import Any, Literal, Optional, Union | ||||
| from datetime import datetime | ||||
|  | ||||
| # LOCAL | ||||
| from api.backend.job.models.job_options import JobOptions | ||||
|  | ||||
| # PDM | ||||
| import pydantic | ||||
|  | ||||
|  | ||||
| class FetchOptions(pydantic.BaseModel): | ||||
|     chat: Optional[bool] = None | ||||
|  | ||||
|  | ||||
| class Element(pydantic.BaseModel): | ||||
|     name: str | ||||
|     xpath: str | ||||
| @@ -22,18 +21,13 @@ class CapturedElement(pydantic.BaseModel): | ||||
|     name: str | ||||
|  | ||||
|  | ||||
| class JobOptions(pydantic.BaseModel): | ||||
|     multi_page_scrape: bool = False | ||||
|     custom_headers: Optional[dict[str, Any]] = {} | ||||
|     proxies: Optional[list[str]] = [] | ||||
|  | ||||
|  | ||||
| class RetrieveScrapeJobs(pydantic.BaseModel): | ||||
|     user: str | ||||
|  | ||||
|  | ||||
| class DownloadJob(pydantic.BaseModel): | ||||
|     ids: list[str] | ||||
|     job_format: Literal["csv", "md"] | ||||
|  | ||||
|  | ||||
| class DeleteScrapeJobs(pydantic.BaseModel): | ||||
| @@ -64,3 +58,19 @@ class Job(pydantic.BaseModel): | ||||
|     job_options: JobOptions | ||||
|     status: str = "Queued" | ||||
|     chat: Optional[str] = None | ||||
|     agent_mode: bool = False | ||||
|     prompt: Optional[str] = None | ||||
|  | ||||
|  | ||||
| class CronJob(pydantic.BaseModel): | ||||
|     id: Optional[str] = None | ||||
|     user_email: str | ||||
|     job_id: str | ||||
|     cron_expression: str | ||||
|     time_created: Optional[Union[datetime, str]] = None | ||||
|     time_updated: Optional[Union[datetime, str]] = None | ||||
|  | ||||
|  | ||||
| class DeleteCronJob(pydantic.BaseModel): | ||||
|     id: str | ||||
|     user_email: str | ||||
|   | ||||
| @@ -1,4 +1,5 @@ | ||||
| # STL | ||||
| import datetime | ||||
| import uuid | ||||
| import traceback | ||||
| from io import StringIO | ||||
| @@ -9,25 +10,39 @@ import random | ||||
| # PDM | ||||
| from fastapi import Depends, APIRouter | ||||
| from fastapi.encoders import jsonable_encoder | ||||
| from fastapi.responses import JSONResponse, StreamingResponse | ||||
| from fastapi.responses import FileResponse, JSONResponse, StreamingResponse | ||||
| from api.backend.scheduler import scheduler | ||||
| from apscheduler.triggers.cron import CronTrigger  # type: ignore | ||||
|  | ||||
| # LOCAL | ||||
| from api.backend.job import ( | ||||
|     query, | ||||
|     insert, | ||||
|     update_job, | ||||
|     delete_jobs, | ||||
| ) | ||||
| from api.backend.job import insert, update_job, delete_jobs | ||||
| from api.backend.models import ( | ||||
|     DeleteCronJob, | ||||
|     UpdateJobs, | ||||
|     DownloadJob, | ||||
|     FetchOptions, | ||||
|     DeleteScrapeJobs, | ||||
|     Job, | ||||
|     CronJob, | ||||
| ) | ||||
| from api.backend.schemas import User | ||||
| from api.backend.auth.auth_utils import get_current_user | ||||
| from api.backend.utils import clean_text | ||||
| from api.backend.utils import clean_text, format_list_for_query | ||||
| from api.backend.job.models.job_options import FetchOptions | ||||
|  | ||||
| from api.backend.database.common import query | ||||
|  | ||||
| from api.backend.job.cron_scheduling.cron_scheduling import ( | ||||
|     delete_cron_job, | ||||
|     get_cron_job_trigger, | ||||
|     insert_cron_job, | ||||
|     get_cron_jobs, | ||||
|     insert_job_from_cron_job, | ||||
| ) | ||||
|  | ||||
| from api.backend.job.utils.clean_job_format import clean_job_format | ||||
| from api.backend.job.utils.stream_md_from_job_results import stream_md_from_job_results | ||||
|  | ||||
| from api.backend.constants import MEDIA_DIR, MEDIA_TYPES, RECORDINGS_DIR | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
| @@ -47,10 +62,11 @@ async def submit_scrape_job(job: Job): | ||||
|         job.id = uuid.uuid4().hex | ||||
|  | ||||
|         job_dict = job.model_dump() | ||||
|         await insert(job_dict) | ||||
|         insert(job_dict) | ||||
|  | ||||
|         return JSONResponse(content={"id": job.id}) | ||||
|     except Exception as e: | ||||
|         LOG.error(f"Exception occurred: {traceback.format_exc()}") | ||||
|         return JSONResponse(content={"error": str(e)}, status_code=500) | ||||
|  | ||||
|  | ||||
| @@ -59,8 +75,11 @@ async def retrieve_scrape_jobs( | ||||
|     fetch_options: FetchOptions, user: User = Depends(get_current_user) | ||||
| ): | ||||
|     LOG.info(f"Retrieving jobs for account: {user.email}") | ||||
|     ATTRIBUTES = "chat" if fetch_options.chat else "*" | ||||
|  | ||||
|     try: | ||||
|         results = await query({"user": user.email}, fetch_options=fetch_options) | ||||
|         job_query = f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ?" | ||||
|         results = query(job_query, (user.email,)) | ||||
|         return JSONResponse(content=jsonable_encoder(results[::-1])) | ||||
|     except Exception as e: | ||||
|         LOG.error(f"Exception occurred: {e}") | ||||
| @@ -72,8 +91,8 @@ async def job(id: str, user: User = Depends(get_current_user)): | ||||
|     LOG.info(f"Retrieving jobs for account: {user.email}") | ||||
|  | ||||
|     try: | ||||
|         filter = {"user": user.email, "id": id} | ||||
|         results = await query(filter) | ||||
|         job_query = "SELECT * FROM jobs WHERE user = ? AND id = ?" | ||||
|         results = query(job_query, (user.email, id)) | ||||
|         return JSONResponse(content=jsonable_encoder(results)) | ||||
|     except Exception as e: | ||||
|         LOG.error(f"Exception occurred: {e}") | ||||
| @@ -85,43 +104,77 @@ async def download(download_job: DownloadJob): | ||||
|     LOG.info(f"Downloading job with ids: {download_job.ids}") | ||||
|  | ||||
|     try: | ||||
|         results = await query({"id": {"$in": download_job.ids}}) | ||||
|  | ||||
|         csv_buffer = StringIO() | ||||
|         csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL) | ||||
|  | ||||
|         headers = ["id", "url", "element_name", "xpath", "text", "user", "time_created"] | ||||
|         csv_writer.writerow(headers) | ||||
|  | ||||
|         for result in results: | ||||
|             for res in result["result"]: | ||||
|                 for url, elements in res.items(): | ||||
|                     for element_name, values in elements.items(): | ||||
|                         for value in values: | ||||
|                             text = clean_text(value.get("text", "")).strip() | ||||
|                             if text: | ||||
|                                 csv_writer.writerow( | ||||
|                                     [ | ||||
|                                         result.get("id", "") | ||||
|                                         + "-" | ||||
|                                         + str(random.randint(0, 1000000)), | ||||
|                                         url, | ||||
|                                         element_name, | ||||
|                                         value.get("xpath", ""), | ||||
|                                         text, | ||||
|                                         result.get("user", ""), | ||||
|                                         result.get("time_created", ""), | ||||
|                                     ] | ||||
|                                 ) | ||||
|  | ||||
|         _ = csv_buffer.seek(0) | ||||
|         response = StreamingResponse( | ||||
|             csv_buffer, | ||||
|             media_type="text/csv", | ||||
|         job_query = ( | ||||
|             f"SELECT * FROM jobs WHERE id IN {format_list_for_query(download_job.ids)}" | ||||
|         ) | ||||
|         response.headers["Content-Disposition"] = "attachment; filename=export.csv" | ||||
|         return response | ||||
|         results = query(job_query, tuple(download_job.ids)) | ||||
|  | ||||
|         if download_job.job_format == "csv": | ||||
|             csv_buffer = StringIO() | ||||
|             csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL) | ||||
|  | ||||
|             headers = [ | ||||
|                 "id", | ||||
|                 "url", | ||||
|                 "element_name", | ||||
|                 "xpath", | ||||
|                 "text", | ||||
|                 "user", | ||||
|                 "time_created", | ||||
|             ] | ||||
|             csv_writer.writerow(headers) | ||||
|  | ||||
|             for result in results: | ||||
|                 for res in result["result"]: | ||||
|                     for url, elements in res.items(): | ||||
|                         for element_name, values in elements.items(): | ||||
|                             for value in values: | ||||
|                                 text = clean_text(value.get("text", "")).strip() | ||||
|                                 if text: | ||||
|                                     csv_writer.writerow( | ||||
|                                         [ | ||||
|                                             result.get("id", "") | ||||
|                                             + "-" | ||||
|                                             + str(random.randint(0, 1000000)), | ||||
|                                             url, | ||||
|                                             element_name, | ||||
|                                             value.get("xpath", ""), | ||||
|                                             text, | ||||
|                                             result.get("user", ""), | ||||
|                                             result.get("time_created", ""), | ||||
|                                         ] | ||||
|                                     ) | ||||
|  | ||||
|             _ = csv_buffer.seek(0) | ||||
|             response = StreamingResponse( | ||||
|                 csv_buffer, | ||||
|                 media_type="text/csv", | ||||
|             ) | ||||
|             response.headers["Content-Disposition"] = "attachment; filename=export.csv" | ||||
|             return response | ||||
|  | ||||
|         elif download_job.job_format == "md": | ||||
|             response = StreamingResponse( | ||||
|                 stream_md_from_job_results(results), | ||||
|                 media_type="text/markdown", | ||||
|             ) | ||||
|  | ||||
|             response.headers["Content-Disposition"] = "attachment; filename=export.md" | ||||
|             return response | ||||
|  | ||||
|     except Exception as e: | ||||
|         LOG.error(f"Exception occurred: {e}") | ||||
|         traceback.print_exc() | ||||
|         return {"error": str(e)} | ||||
|  | ||||
|  | ||||
| @job_router.get("/job/{id}/convert-to-csv") | ||||
| async def convert_to_csv(id: str): | ||||
|     try: | ||||
|         job_query = f"SELECT * FROM jobs WHERE id = ?" | ||||
|         results = query(job_query, (id,)) | ||||
|  | ||||
|         return JSONResponse(content=clean_job_format(results)) | ||||
|     except Exception as e: | ||||
|         LOG.error(f"Exception occurred: {e}") | ||||
|         traceback.print_exc() | ||||
| @@ -136,3 +189,85 @@ async def delete(delete_scrape_jobs: DeleteScrapeJobs): | ||||
|         if result | ||||
|         else JSONResponse({"error": "Jobs not deleted."}) | ||||
|     ) | ||||
|  | ||||
|  | ||||
| @job_router.post("/schedule-cron-job") | ||||
| async def schedule_cron_job(cron_job: CronJob): | ||||
|     if not cron_job.id: | ||||
|         cron_job.id = uuid.uuid4().hex | ||||
|  | ||||
|     if not cron_job.time_created: | ||||
|         cron_job.time_created = datetime.datetime.now() | ||||
|  | ||||
|     if not cron_job.time_updated: | ||||
|         cron_job.time_updated = datetime.datetime.now() | ||||
|  | ||||
|     insert_cron_job(cron_job) | ||||
|  | ||||
|     queried_job = query("SELECT * FROM jobs WHERE id = ?", (cron_job.job_id,)) | ||||
|  | ||||
|     scheduler.add_job( | ||||
|         insert_job_from_cron_job, | ||||
|         get_cron_job_trigger(cron_job.cron_expression), | ||||
|         id=cron_job.id, | ||||
|         args=[queried_job[0]], | ||||
|     ) | ||||
|  | ||||
|     return JSONResponse(content={"message": "Cron job scheduled successfully."}) | ||||
|  | ||||
|  | ||||
| @job_router.post("/delete-cron-job") | ||||
| async def delete_cron_job_request(request: DeleteCronJob): | ||||
|     if not request.id: | ||||
|         return JSONResponse( | ||||
|             content={"error": "Cron job id is required."}, status_code=400 | ||||
|         ) | ||||
|  | ||||
|     delete_cron_job(request.id, request.user_email) | ||||
|     scheduler.remove_job(request.id) | ||||
|  | ||||
|     return JSONResponse(content={"message": "Cron job deleted successfully."}) | ||||
|  | ||||
|  | ||||
| @job_router.get("/cron-jobs") | ||||
| async def get_cron_jobs_request(user: User = Depends(get_current_user)): | ||||
|     cron_jobs = get_cron_jobs(user.email) | ||||
|     return JSONResponse(content=jsonable_encoder(cron_jobs)) | ||||
|  | ||||
|  | ||||
| @job_router.get("/recordings/{id}") | ||||
| async def get_recording(id: str): | ||||
|     path = RECORDINGS_DIR / f"{id}.mp4" | ||||
|     if not path.exists(): | ||||
|         return JSONResponse(content={"error": "Recording not found."}, status_code=404) | ||||
|  | ||||
|     return FileResponse( | ||||
|         path, headers={"Content-Type": "video/mp4", "Accept-Ranges": "bytes"} | ||||
|     ) | ||||
|  | ||||
|  | ||||
| @job_router.get("/get-media") | ||||
| async def get_media(id: str): | ||||
|     try: | ||||
|         files: dict[str, list[str]] = {} | ||||
|  | ||||
|         for media_type in MEDIA_TYPES: | ||||
|             path = MEDIA_DIR / media_type / f"{id}" | ||||
|  | ||||
|             files[media_type] = [file.name for file in path.glob("*")] | ||||
|  | ||||
|         return JSONResponse(content={"files": files}) | ||||
|     except Exception as e: | ||||
|         LOG.error(f"Exception occurred: {e}") | ||||
|         traceback.print_exc() | ||||
|         return JSONResponse(content={"error": str(e)}, status_code=500) | ||||
|  | ||||
|  | ||||
| @job_router.get("/media") | ||||
| async def get_media_file(id: str, type: str, file: str): | ||||
|     path = MEDIA_DIR / type / f"{id}" / file | ||||
|  | ||||
|     if not path.exists(): | ||||
|         return JSONResponse(content={"error": "Media file not found."}, status_code=404) | ||||
|  | ||||
|     return FileResponse(path) | ||||
|   | ||||
| @@ -1,46 +0,0 @@ | ||||
| # STL | ||||
| import logging | ||||
| import docker | ||||
|  | ||||
| # PDM | ||||
| from fastapi import APIRouter, HTTPException | ||||
| from fastapi.responses import JSONResponse, StreamingResponse | ||||
|  | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
| log_router = APIRouter() | ||||
|  | ||||
| client = docker.from_env() | ||||
|  | ||||
|  | ||||
| @log_router.get("/initial_logs") | ||||
| async def get_initial_logs(): | ||||
|     container_id = "scraperr_api" | ||||
|  | ||||
|     try: | ||||
|         container = client.containers.get(container_id) | ||||
|         log_stream = container.logs(stream=False).decode("utf-8") | ||||
|         return JSONResponse(content={"logs": log_stream}) | ||||
|     except Exception as e: | ||||
|         raise HTTPException(status_code=500, detail=f"Unexpected error: {e}") | ||||
|  | ||||
|  | ||||
| @log_router.get("/logs") | ||||
| async def get_own_logs(): | ||||
|     container_id = "scraperr_api" | ||||
|  | ||||
|     try: | ||||
|         container = client.containers.get(container_id) | ||||
|         log_stream = container.logs(stream=True, follow=True) | ||||
|  | ||||
|         def log_generator(): | ||||
|             try: | ||||
|                 for log in log_stream: | ||||
|                     yield f"data: {log.decode('utf-8')}\n\n" | ||||
|             except Exception as e: | ||||
|                 yield f"data: {str(e)}\n\n" | ||||
|  | ||||
|         return StreamingResponse(log_generator(), media_type="text/event-stream") | ||||
|     except Exception as e: | ||||
|         raise HTTPException(status_code=500, detail=str(e)) | ||||
							
								
								
									
api/backend/scheduler.py (new file, 3 lines)
							| @@ -0,0 +1,3 @@ | ||||
| from apscheduler.schedulers.background import BackgroundScheduler  # type: ignore | ||||
|  | ||||
| scheduler = BackgroundScheduler() | ||||
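The `BackgroundScheduler` above is the shared instance that the `/schedule-cron-job` endpoint registers cron jobs against. The `get_cron_job_trigger` helper that endpoint calls is not shown in this diff; a minimal sketch of what such a helper could look like, assuming a standard five-field crontab expression and APScheduler's `CronTrigger`, is:

    from apscheduler.triggers.cron import CronTrigger  # type: ignore

    def get_cron_job_trigger(cron_expression: str) -> CronTrigger:
        # Assumes a five-field crontab string, e.g. "0 * * * *" for hourly.
        return CronTrigger.from_crontab(cron_expression)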
| @@ -1,26 +1,27 @@ | ||||
| import logging | ||||
| from typing import Any, Optional | ||||
| import time | ||||
| import random | ||||
| from typing import Any, Optional, cast | ||||
|  | ||||
| from bs4 import BeautifulSoup | ||||
| from bs4 import BeautifulSoup, Tag | ||||
| from lxml import etree | ||||
| from seleniumwire import webdriver | ||||
| from lxml.etree import _Element  # type: ignore [reportPrivateImport] | ||||
| from fake_useragent import UserAgent | ||||
| from selenium.webdriver.support import expected_conditions as EC | ||||
| from selenium.webdriver.common.by import By | ||||
| from selenium.webdriver.support.ui import WebDriverWait | ||||
| from selenium.webdriver.chrome.options import Options as ChromeOptions | ||||
| from camoufox import AsyncCamoufox | ||||
| from playwright.async_api import Page | ||||
| from urllib.parse import urlparse, urljoin | ||||
|  | ||||
| from api.backend.models import Element, CapturedElement | ||||
| from api.backend.job.scraping.scraping_utils import ( | ||||
|     clean_format_characters, | ||||
|     scrape_content, | ||||
| ) | ||||
| from api.backend.job.site_mapping.site_mapping import handle_site_mapping | ||||
|  | ||||
| from api.backend.job.scraping.add_custom import add_custom_items | ||||
|  | ||||
| from api.backend.constants import RECORDINGS_ENABLED | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| class HtmlElement(_Element): ... | ||||
|  | ||||
|  | ||||
| def is_same_domain(url: str, original_url: str) -> bool: | ||||
|     parsed_url = urlparse(url) | ||||
|     parsed_original_url = urlparse(original_url) | ||||
| @@ -29,184 +30,164 @@ def is_same_domain(url: str, original_url: str) -> bool: | ||||
|  | ||||
| def clean_xpath(xpath: str) -> str: | ||||
|     parts = xpath.split("/") | ||||
|     clean_parts: list[str] = [] | ||||
|     for part in parts: | ||||
|         if part == "": | ||||
|             clean_parts.append("/") | ||||
|         else: | ||||
|             clean_parts.append(part) | ||||
|     clean_xpath = "//".join(clean_parts).replace("////", "//") | ||||
|     clean_xpath = clean_xpath.replace("'", "\\'") | ||||
|     clean_parts = ["/" if part == "" else part for part in parts] | ||||
|     clean_xpath = "//".join(clean_parts).replace("////", "//").replace("'", "\\'") | ||||
|     LOG.info(f"Cleaned xpath: {clean_xpath}") | ||||
|  | ||||
|     return clean_xpath | ||||
|  | ||||
|  | ||||
| def sxpath(context: _Element, xpath: str) -> list[HtmlElement]: | ||||
|     return context.xpath(xpath)  # pyright: ignore [reportReturnType] | ||||
|  | ||||
|  | ||||
| def interceptor(headers: dict[str, Any]): | ||||
|     def _interceptor(request: Any): | ||||
|         for key, val in headers.items(): | ||||
|             if request.headers.get(key): | ||||
|                 del request.headers[key] | ||||
|             request.headers[key] = val | ||||
|         if "sec-ch-ua" in request.headers: | ||||
|             original_value = request.headers["sec-ch-ua"] | ||||
|             del request.headers["sec-ch-ua"] | ||||
|             modified_value = original_value.replace("HeadlessChrome", "Chrome") | ||||
|             request.headers["sec-ch-ua"] = modified_value | ||||
|  | ||||
|     return _interceptor | ||||
|  | ||||
|  | ||||
| def create_driver(proxies: Optional[list[str]] = []): | ||||
|     ua = UserAgent() | ||||
|     chrome_options = ChromeOptions() | ||||
|     chrome_options.add_argument("--headless") | ||||
|     chrome_options.add_argument("--no-sandbox") | ||||
|     chrome_options.add_argument("--disable-dev-shm-usage") | ||||
|     chrome_options.add_argument(f"user-agent={ua.random}") | ||||
|  | ||||
|     sw_options = {} | ||||
|     if proxies: | ||||
|         selected_proxy = proxies[random.randint(0, len(proxies) - 1)] | ||||
|         LOG.info(f"Using proxy: {selected_proxy}") | ||||
|  | ||||
|         sw_options = { | ||||
|             "proxy": { | ||||
|                 "https": f"https://{selected_proxy}", | ||||
|                 "http": f"http://{selected_proxy}", | ||||
|             } | ||||
|         } | ||||
|  | ||||
|     driver = webdriver.Chrome( | ||||
|         options=chrome_options, | ||||
|         seleniumwire_options=sw_options, | ||||
|     ) | ||||
|     return driver | ||||
| def sxpath(context: etree._Element, xpath: str): | ||||
|     return context.xpath(xpath) | ||||
|  | ||||
|  | ||||
| async def make_site_request( | ||||
|     id: str, | ||||
|     url: str, | ||||
|     headers: Optional[dict[str, Any]], | ||||
|     multi_page_scrape: bool = False, | ||||
|     visited_urls: set[str] = set(), | ||||
|     pages: set[tuple[str, str]] = set(), | ||||
|     original_url: str = "", | ||||
|     proxies: Optional[list[str]] = [], | ||||
| ) -> None: | ||||
|     """Make basic `GET` request to site using Selenium.""" | ||||
|     # Check if URL has already been visited | ||||
|     proxies: Optional[list[str]] = None, | ||||
|     site_map: Optional[dict[str, Any]] = None, | ||||
|     collect_media: bool = False, | ||||
|     custom_cookies: Optional[list[dict[str, Any]]] = None, | ||||
| ): | ||||
|     if url in visited_urls: | ||||
|         return | ||||
|  | ||||
|     driver = create_driver(proxies) | ||||
|     driver.implicitly_wait(10) | ||||
|     proxy = None | ||||
|  | ||||
|     if headers: | ||||
|         driver.request_interceptor = interceptor(headers) | ||||
|     if proxies: | ||||
|         proxy = random.choice(proxies) | ||||
|         LOG.info(f"Using proxy: {proxy}") | ||||
|  | ||||
|     async with AsyncCamoufox(headless=not RECORDINGS_ENABLED, proxy=proxy) as browser: | ||||
|         page: Page = await browser.new_page() | ||||
|         await page.set_viewport_size({"width": 1920, "height": 1080}) | ||||
|  | ||||
|         # Add cookies and headers | ||||
|         await add_custom_items(url, page, custom_cookies, headers) | ||||
|  | ||||
|     try: | ||||
|         LOG.info(f"Visiting URL: {url}") | ||||
|         driver.get(url) | ||||
|  | ||||
|         final_url = driver.current_url | ||||
|         visited_urls.add(url) | ||||
|         visited_urls.add(final_url) | ||||
|         _ = WebDriverWait(driver, 10).until( | ||||
|             EC.presence_of_element_located((By.TAG_NAME, "body")) | ||||
|         ) | ||||
|         try: | ||||
|             await page.goto(url, timeout=60000) | ||||
|             await page.wait_for_load_state("networkidle") | ||||
|  | ||||
|         last_height = driver.execute_script("return document.body.scrollHeight") | ||||
|         while True: | ||||
|             driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") | ||||
|             final_url = page.url | ||||
|  | ||||
|             time.sleep(3)  # Wait for the page to load | ||||
|             new_height = driver.execute_script("return document.body.scrollHeight") | ||||
|             visited_urls.add(url) | ||||
|             visited_urls.add(final_url) | ||||
|  | ||||
|             if new_height == last_height: | ||||
|                 break | ||||
|             html_content = await scrape_content(id, page, pages, collect_media) | ||||
|  | ||||
|             last_height = new_height | ||||
|             html_content = await page.content() | ||||
|             pages.add((html_content, final_url)) | ||||
|  | ||||
|         final_height = driver.execute_script("return document.body.scrollHeight") | ||||
|             if site_map: | ||||
|                 await handle_site_mapping( | ||||
|                     id, site_map, page, pages, collect_media=collect_media | ||||
|                 ) | ||||
|  | ||||
|         page_source = driver.page_source | ||||
|         LOG.debug(f"Page source for url: {url}\n{page_source}") | ||||
|         pages.add((page_source, final_url)) | ||||
|     finally: | ||||
|         driver.quit() | ||||
|         finally: | ||||
|             await page.close() | ||||
|             await browser.close() | ||||
|  | ||||
|     if not multi_page_scrape: | ||||
|         return | ||||
|  | ||||
|     soup = BeautifulSoup(page_source, "html.parser") | ||||
|     soup = BeautifulSoup(html_content, "html.parser") | ||||
|  | ||||
|     for a_tag in soup.find_all("a"): | ||||
|         link = a_tag.get("href") | ||||
|         if not isinstance(a_tag, Tag): | ||||
|             continue | ||||
|  | ||||
|         if link: | ||||
|             if not urlparse(link).netloc: | ||||
|                 base_url = "{0.scheme}://{0.netloc}".format(urlparse(final_url)) | ||||
|                 link = urljoin(base_url, link) | ||||
|         link = cast(str, a_tag.get("href", "")) | ||||
|  | ||||
|             if link not in visited_urls and is_same_domain(link, original_url): | ||||
|                 await make_site_request( | ||||
|                     link, | ||||
|                     headers=headers, | ||||
|                     multi_page_scrape=multi_page_scrape, | ||||
|                     visited_urls=visited_urls, | ||||
|                     pages=pages, | ||||
|                     original_url=original_url, | ||||
|                 ) | ||||
|         if not link: | ||||
|             continue | ||||
|  | ||||
|         if not urlparse(link).netloc: | ||||
|             base_url = "{0.scheme}://{0.netloc}".format(urlparse(final_url)) | ||||
|             link = urljoin(base_url, link) | ||||
|  | ||||
|         if link not in visited_urls and is_same_domain(link, original_url): | ||||
|             await make_site_request( | ||||
|                 id, | ||||
|                 link, | ||||
|                 headers=headers, | ||||
|                 multi_page_scrape=multi_page_scrape, | ||||
|                 visited_urls=visited_urls, | ||||
|                 pages=pages, | ||||
|                 original_url=original_url, | ||||
|                 proxies=proxies, | ||||
|                 site_map=site_map, | ||||
|                 collect_media=collect_media, | ||||
|                 custom_cookies=custom_cookies, | ||||
|             ) | ||||
|  | ||||
|  | ||||
| async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element]): | ||||
|     soup = BeautifulSoup(page[0], "lxml") | ||||
|     root = etree.HTML(str(soup)) | ||||
|  | ||||
|     elements: dict[str, list[CapturedElement]] = dict() | ||||
|     elements: dict[str, list[CapturedElement]] = {} | ||||
|  | ||||
|     for elem in xpaths: | ||||
|         el = sxpath(root, elem.xpath) | ||||
|  | ||||
|         for e in el: | ||||
|             text = "\t".join(str(t) for t in e.itertext()) | ||||
|         for e in el:  # type: ignore | ||||
|             text = ( | ||||
|                 " ".join(str(t) for t in e.itertext()) | ||||
|                 if isinstance(e, etree._Element) | ||||
|                 else str(e)  # type: ignore | ||||
|             ) | ||||
|  | ||||
|             text = clean_format_characters(text) | ||||
|  | ||||
|             captured_element = CapturedElement( | ||||
|                 xpath=elem.xpath, text=text, name=elem.name | ||||
|             ) | ||||
|  | ||||
|             if elem.name in elements: | ||||
|                 elements[elem.name].append(captured_element) | ||||
|                 continue | ||||
|  | ||||
|             elements[elem.name] = [captured_element] | ||||
|             else: | ||||
|                 elements[elem.name] = [captured_element] | ||||
|  | ||||
|     return {page[1]: elements} | ||||
|  | ||||
|  | ||||
| async def scrape( | ||||
|     id: str, | ||||
|     url: str, | ||||
|     xpaths: list[Element], | ||||
|     headers: Optional[dict[str, Any]], | ||||
|     headers: Optional[dict[str, Any]] = None, | ||||
|     multi_page_scrape: bool = False, | ||||
|     proxies: Optional[list[str]] = [], | ||||
|     proxies: Optional[list[str]] = None, | ||||
|     site_map: Optional[dict[str, Any]] = None, | ||||
|     collect_media: bool = False, | ||||
|     custom_cookies: Optional[list[dict[str, Any]]] = None, | ||||
| ): | ||||
|     visited_urls: set[str] = set() | ||||
|     pages: set[tuple[str, str]] = set() | ||||
|  | ||||
|     _ = await make_site_request( | ||||
|     await make_site_request( | ||||
|         id, | ||||
|         url, | ||||
|         headers, | ||||
|         headers=headers, | ||||
|         multi_page_scrape=multi_page_scrape, | ||||
|         visited_urls=visited_urls, | ||||
|         pages=pages, | ||||
|         original_url=url, | ||||
|         proxies=proxies, | ||||
|         site_map=site_map, | ||||
|         collect_media=collect_media, | ||||
|         custom_cookies=custom_cookies, | ||||
|     ) | ||||
|  | ||||
|     elements: list[dict[str, dict[str, list[CapturedElement]]]] = list() | ||||
|     elements: list[dict[str, dict[str, list[CapturedElement]]]] = [] | ||||
|  | ||||
|     for page in pages: | ||||
|         elements.append(await collect_scraped_elements(page, xpaths)) | ||||
|   | ||||
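For orientation, here is a minimal sketch of how the rewritten `scrape` entry point might be invoked. The `Element` field names (`name`, `xpath`) and the import paths are assumptions inferred from how they are used elsewhere in this diff, not taken from their definitions:

    import asyncio

    from api.backend.models import Element      # assumed import path
    from api.backend.scraping import scrape     # assumed import path

    async def run_example() -> None:
        # One element targeting the page body; all optional features left off.
        results = await scrape(
            id="example-job-id",
            url="https://example.com",
            xpaths=[Element(name="body", xpath="//body")],  # hypothetical field names
            multi_page_scrape=False,
            collect_media=False,
        )
        print(results)

    asyncio.run(run_example())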
| @@ -21,7 +21,7 @@ async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock): | ||||
|     mock_randint.return_value = mocked_random_int | ||||
|  | ||||
|     # Create a DownloadJob instance | ||||
|     download_job = DownloadJob(ids=[mocked_job["id"]]) | ||||
|     download_job = DownloadJob(ids=[mocked_job["id"]], job_format="csv") | ||||
|  | ||||
|     # Make a POST request to the /download endpoint | ||||
|     response = client.post("/download", json=download_job.model_dump()) | ||||
|   | ||||
| @@ -1,33 +1,53 @@ | ||||
| import pytest | ||||
| from unittest.mock import AsyncMock, patch, MagicMock | ||||
| from api.backend.tests.factories.job_factory import create_job | ||||
| from api.backend.models import JobOptions | ||||
| from api.backend.scraping import create_driver | ||||
| import logging | ||||
| from typing import Dict | ||||
| from playwright.async_api import async_playwright, Cookie, Route | ||||
| from api.backend.job.scraping.add_custom import add_custom_items | ||||
|  | ||||
|  | ||||
| mocked_job = create_job( | ||||
|     job_options=JobOptions( | ||||
|         multi_page_scrape=False, custom_headers={}, proxies=["127.0.0.1:8080"] | ||||
|     ) | ||||
| ).model_dump() | ||||
| logging.basicConfig(level=logging.DEBUG) | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| @pytest.mark.asyncio | ||||
| @patch("seleniumwire.webdriver.Chrome.get") | ||||
| async def test_proxy(mock_get: AsyncMock): | ||||
|     # Mock the response of the requests.get call | ||||
|     mock_response = MagicMock() | ||||
|     mock_get.return_value = mock_response | ||||
| async def test_add_custom_items(): | ||||
|     test_cookies = [{"name": "big", "value": "cookie"}] | ||||
|     test_headers = {"User-Agent": "test-agent", "Accept": "application/json"} | ||||
|  | ||||
|     driver = create_driver(proxies=["127.0.0.1:8080"]) | ||||
|     assert driver is not None | ||||
|     async with async_playwright() as p: | ||||
|         browser = await p.chromium.launch(headless=True) | ||||
|         context = await browser.new_context() | ||||
|         page = await context.new_page() | ||||
|  | ||||
|     # Simulate a request | ||||
|     driver.get("http://example.com") | ||||
|     response = driver.last_request | ||||
|         # Set up request interception | ||||
|         captured_headers: Dict[str, str] = {} | ||||
|  | ||||
|     # Check if the proxy header is set correctly | ||||
|     if response: | ||||
|         assert response.headers["Proxy"] == "127.0.0.1:8080" | ||||
|         async def handle_route(route: Route) -> None: | ||||
|             nonlocal captured_headers | ||||
|             captured_headers = route.request.headers | ||||
|             await route.continue_() | ||||
|  | ||||
|     driver.quit() | ||||
|         await page.route("**/*", handle_route) | ||||
|  | ||||
|         await add_custom_items( | ||||
|             url="http://example.com", | ||||
|             page=page, | ||||
|             cookies=test_cookies, | ||||
|             headers=test_headers, | ||||
|         ) | ||||
|  | ||||
|         # Navigate to example.com | ||||
|         await page.goto("http://example.com") | ||||
|  | ||||
|         # Verify cookies were added | ||||
|         cookies: list[Cookie] = await page.context.cookies() | ||||
|         test_cookie = next((c for c in cookies if c.get("name") == "big"), None) | ||||
|  | ||||
|         assert test_cookie is not None | ||||
|         assert test_cookie.get("value") == "cookie" | ||||
|         assert test_cookie.get("path") == "/"  # Default path should be set | ||||
|         assert test_cookie.get("sameSite") == "Lax"  # Default sameSite should be set | ||||
|  | ||||
|         # Verify headers were added | ||||
|         assert captured_headers.get("user-agent") == "test-agent" | ||||
|  | ||||
|         await browser.close() | ||||
|   | ||||
| @@ -1,5 +1,8 @@ | ||||
| from typing import Optional | ||||
| from typing import Any, Optional | ||||
| import logging | ||||
| import json | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| def clean_text(text: str): | ||||
| @@ -17,3 +20,30 @@ def get_log_level(level_name: Optional[str]) -> int: | ||||
|         level = getattr(logging, level_name, logging.INFO) | ||||
|  | ||||
|     return level | ||||
|  | ||||
|  | ||||
| def format_list_for_query(ids: list[str]): | ||||
|     return ( | ||||
|         f"({','.join(['?' for _ in ids])})"  # Returns placeholders, e.g., "(?, ?, ?)" | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def format_sql_row_to_python(row: dict[str, Any]): | ||||
|     new_row: dict[str, Any] = {} | ||||
|     for key, value in row.items(): | ||||
|         if isinstance(value, str): | ||||
|             try: | ||||
|                 new_row[key] = json.loads(value) | ||||
|             except json.JSONDecodeError: | ||||
|                 new_row[key] = value | ||||
|         else: | ||||
|             new_row[key] = value | ||||
|  | ||||
|     return new_row | ||||
|  | ||||
|  | ||||
| def format_json(items: list[Any]): | ||||
|     for idx, item in enumerate(items): | ||||
|         if isinstance(item, (dict, list)): | ||||
|             formatted_item = json.dumps(item) | ||||
|             items[idx] = formatted_item | ||||
|   | ||||
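As a usage note, `format_list_for_query` only builds the placeholder list, so the ids themselves are still bound separately (as the download endpoint above does), and `format_sql_row_to_python` turns JSON-encoded TEXT columns back into Python values. A small sketch using only the helpers defined above:

    from api.backend.utils import format_list_for_query, format_sql_row_to_python

    ids = ["job-1", "job-2", "job-3"]
    placeholders = format_list_for_query(ids)            # "(?,?,?)"
    sql = f"SELECT * FROM jobs WHERE id IN {placeholders}"
    # ...executed elsewhere with the ids bound separately, e.g. query(sql, tuple(ids)).

    row = {"id": "job-1", "elements": '[{"name": "body", "xpath": "//body"}]'}
    print(format_sql_row_to_python(row)["elements"][0]["xpath"])  # -> "//body"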
| @@ -1,30 +1,97 @@ | ||||
| import os | ||||
| import json | ||||
| from pathlib import Path | ||||
|  | ||||
| from api.backend.job import get_queued_job, update_job | ||||
| from api.backend.scraping import scrape | ||||
| from api.backend.models import Element | ||||
| from fastapi.encoders import jsonable_encoder | ||||
| import subprocess | ||||
|  | ||||
| import asyncio | ||||
| import logging | ||||
| import sys | ||||
| import traceback | ||||
|  | ||||
| logging.basicConfig(stream=sys.stdout, level=logging.INFO) | ||||
| LOG = logging.getLogger(__name__) | ||||
| from api.backend.database.startup import init_database | ||||
|  | ||||
| from api.backend.worker.post_job_complete.post_job_complete import post_job_complete | ||||
| from api.backend.worker.logger import LOG | ||||
|  | ||||
| from api.backend.ai.agent.agent import scrape_with_agent | ||||
|  | ||||
|  | ||||
| NOTIFICATION_CHANNEL = os.getenv("NOTIFICATION_CHANNEL", "") | ||||
| NOTIFICATION_WEBHOOK_URL = os.getenv("NOTIFICATION_WEBHOOK_URL", "") | ||||
| SCRAPERR_FRONTEND_URL = os.getenv("SCRAPERR_FRONTEND_URL", "") | ||||
| EMAIL = os.getenv("EMAIL", "") | ||||
| TO = os.getenv("TO", "") | ||||
| SMTP_HOST = os.getenv("SMTP_HOST", "") | ||||
| SMTP_PORT = int(os.getenv("SMTP_PORT", 587)) | ||||
| SMTP_USER = os.getenv("SMTP_USER", "") | ||||
| SMTP_PASSWORD = os.getenv("SMTP_PASSWORD", "") | ||||
| USE_TLS = os.getenv("USE_TLS", "false").lower() == "true" | ||||
|  | ||||
| RECORDINGS_ENABLED = os.getenv("RECORDINGS_ENABLED", "true").lower() == "true" | ||||
| RECORDINGS_DIR = Path("/project/app/media/recordings") | ||||
|  | ||||
|  | ||||
| async def process_job(): | ||||
|     job = await get_queued_job() | ||||
|     ffmpeg_proc = None | ||||
|     status = "Queued" | ||||
|  | ||||
|     if job: | ||||
|         LOG.info(f"Beginning processing job: {job}.") | ||||
|  | ||||
|         try: | ||||
|             output_path = RECORDINGS_DIR / f"{job['id']}.mp4" | ||||
|  | ||||
|             if RECORDINGS_ENABLED: | ||||
|                 ffmpeg_proc = subprocess.Popen( | ||||
|                     [ | ||||
|                         "ffmpeg", | ||||
|                         "-y", | ||||
|                         "-video_size", | ||||
|                         "1280x1024", | ||||
|                         "-framerate", | ||||
|                         "15", | ||||
|                         "-f", | ||||
|                         "x11grab", | ||||
|                         "-i", | ||||
|                         ":99", | ||||
|                         "-codec:v", | ||||
|                         "libx264", | ||||
|                         "-preset", | ||||
|                         "ultrafast", | ||||
|                         output_path, | ||||
|                     ] | ||||
|                 ) | ||||
|  | ||||
|             _ = await update_job([job["id"]], field="status", value="Scraping") | ||||
|             scraped = await scrape( | ||||
|                 job["url"], | ||||
|                 [Element(**j) for j in job["elements"]], | ||||
|                 job["job_options"]["custom_headers"], | ||||
|                 job["job_options"]["multi_page_scrape"], | ||||
|                 job["job_options"]["proxies"], | ||||
|             ) | ||||
|  | ||||
|             proxies = job["job_options"]["proxies"] | ||||
|  | ||||
|             if proxies and isinstance(proxies[0], str) and proxies[0].startswith("{"): | ||||
|                 try: | ||||
|                     proxies = [json.loads(p) for p in proxies] | ||||
|                 except json.JSONDecodeError: | ||||
|                     LOG.error(f"Failed to parse proxy JSON: {proxies}") | ||||
|                     proxies = [] | ||||
|  | ||||
|             if job["agent_mode"]: | ||||
|                 scraped = await scrape_with_agent(job) | ||||
|             else: | ||||
|                 scraped = await scrape( | ||||
|                     job["id"], | ||||
|                     job["url"], | ||||
|                     [Element(**j) for j in job["elements"]], | ||||
|                     job["job_options"]["custom_headers"], | ||||
|                     job["job_options"]["multi_page_scrape"], | ||||
|                     proxies, | ||||
|                     job["job_options"]["site_map"], | ||||
|                     job["job_options"]["collect_media"], | ||||
|                     job["job_options"]["custom_cookies"], | ||||
|                 ) | ||||
|  | ||||
|             LOG.info( | ||||
|                 f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}" | ||||
|             ) | ||||
| @@ -32,14 +99,43 @@ async def process_job(): | ||||
|                 [job["id"]], field="result", value=jsonable_encoder(scraped) | ||||
|             ) | ||||
|             _ = await update_job([job["id"]], field="status", value="Completed") | ||||
|             status = "Completed" | ||||
|  | ||||
|         except Exception as e: | ||||
|             _ = await update_job([job["id"]], field="status", value="Failed") | ||||
|             _ = await update_job([job["id"]], field="result", value=e) | ||||
|             LOG.error(f"Exception occurred: {e}\n{traceback.format_exc()}") | ||||
|             status = "Failed" | ||||
|         finally: | ||||
|             job["status"] = status | ||||
|             await post_job_complete( | ||||
|                 job, | ||||
|                 { | ||||
|                     "channel": NOTIFICATION_CHANNEL, | ||||
|                     "webhook_url": NOTIFICATION_WEBHOOK_URL, | ||||
|                     "scraperr_frontend_url": SCRAPERR_FRONTEND_URL, | ||||
|                     "email": EMAIL, | ||||
|                     "to": TO, | ||||
|                     "smtp_host": SMTP_HOST, | ||||
|                     "smtp_port": SMTP_PORT, | ||||
|                     "smtp_user": SMTP_USER, | ||||
|                     "smtp_password": SMTP_PASSWORD, | ||||
|                     "use_tls": USE_TLS, | ||||
|                 }, | ||||
|             ) | ||||
|  | ||||
|             if ffmpeg_proc: | ||||
|                 ffmpeg_proc.terminate() | ||||
|                 ffmpeg_proc.wait() | ||||
|  | ||||
|  | ||||
| async def main(): | ||||
|     LOG.info("Starting job worker...") | ||||
|  | ||||
|     init_database() | ||||
|  | ||||
|     RECORDINGS_DIR.mkdir(parents=True, exist_ok=True) | ||||
|  | ||||
|     while True: | ||||
|         await process_job() | ||||
|         await asyncio.sleep(5) | ||||
|   | ||||
							
								
								
									
api/backend/worker/logger.py (new file, 12 lines)
							| @@ -0,0 +1,12 @@ | ||||
| import logging | ||||
| import os | ||||
|  | ||||
| from api.backend.utils import get_log_level | ||||
|  | ||||
| logging.basicConfig( | ||||
|     level=get_log_level(os.getenv("LOG_LEVEL")), | ||||
|     format="%(levelname)s:     %(asctime)s - %(name)s - %(message)s", | ||||
|     handlers=[logging.StreamHandler()], | ||||
| ) | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
							
								
								
									
api/backend/worker/post_job_complete/discord_notification.py (new file, 56 lines)
							| @@ -0,0 +1,56 @@ | ||||
| import json | ||||
| from typing import Any | ||||
|  | ||||
| import requests | ||||
|  | ||||
| from api.backend.worker.logger import LOG | ||||
| from api.backend.worker.post_job_complete.models import ( | ||||
|     PostJobCompleteOptions, | ||||
|     JOB_COLOR_MAP, | ||||
| ) | ||||
|  | ||||
|  | ||||
| def discord_notification(job: dict[str, Any], options: PostJobCompleteOptions): | ||||
|     webhook_url = options["webhook_url"] | ||||
|     scraperr_frontend_url = options["scraperr_frontend_url"] | ||||
|  | ||||
|     LOG.info(f"Sending discord notification to {webhook_url}") | ||||
|  | ||||
|     embed = { | ||||
|         "title": "Job Completed", | ||||
|         "description": "Scraping job has been completed.", | ||||
|         "color": JOB_COLOR_MAP[job["status"]], | ||||
|         "url": f"{scraperr_frontend_url}/jobs?search={job['id']}&type=id", | ||||
|         "image": { | ||||
|             "url": "https://github.com/jaypyles/Scraperr/raw/master/docs/logo_picture.png", | ||||
|         }, | ||||
|         "author": { | ||||
|             "name": "Scraperr", | ||||
|             "url": "https://github.com/jaypyles/Scraperr", | ||||
|         }, | ||||
|         "fields": [ | ||||
|             { | ||||
|                 "name": "Status", | ||||
|                 "value": "Completed", | ||||
|                 "inline": True, | ||||
|             }, | ||||
|             { | ||||
|                 "name": "URL", | ||||
|                 "value": job["url"], | ||||
|                 "inline": True, | ||||
|             }, | ||||
|             { | ||||
|                 "name": "ID", | ||||
|                 "value": job["id"], | ||||
|                 "inline": False, | ||||
|             }, | ||||
|             { | ||||
|                 "name": "Options", | ||||
|                 "value": f"```json\n{json.dumps(job['job_options'], indent=4)}\n```", | ||||
|                 "inline": False, | ||||
|             }, | ||||
|         ], | ||||
|     } | ||||
|  | ||||
|     payload = {"embeds": [embed]} | ||||
|     requests.post(webhook_url, json=payload) | ||||
							
								
								
									
api/backend/worker/post_job_complete/email_notifcation.py (new file, 97 lines)
							| @@ -0,0 +1,97 @@ | ||||
| import smtplib | ||||
| import ssl | ||||
| from email.mime.text import MIMEText | ||||
| from email.mime.multipart import MIMEMultipart | ||||
| import json | ||||
| from typing import Any | ||||
|  | ||||
| from api.backend.worker.logger import LOG | ||||
|  | ||||
| from api.backend.worker.post_job_complete.models import ( | ||||
|     JOB_COLOR_MAP, | ||||
|     PostJobCompleteOptions, | ||||
| ) | ||||
|  | ||||
|  | ||||
| def send_job_complete_email( | ||||
|     job: dict[str, Any], | ||||
|     options: PostJobCompleteOptions, | ||||
| ): | ||||
|     status = job["status"] | ||||
|     status_color = JOB_COLOR_MAP.get(status, 0x808080) | ||||
|     job_url = job["url"] | ||||
|     job_id = job["id"] | ||||
|     job_options_json = json.dumps(job["job_options"], indent=4) | ||||
|     frontend_url = options["scraperr_frontend_url"] | ||||
|  | ||||
|     subject = "📦 Job Completed - Scraperr Notification" | ||||
|  | ||||
|     html = f""" | ||||
|     <html> | ||||
|       <body style="font-family: Arial, sans-serif;"> | ||||
|         <h2 style="color: #{status_color:06x};">✅ Job Completed</h2> | ||||
|         <p>Scraping job has been completed successfully.</p> | ||||
|  | ||||
|         <a href="{frontend_url}/jobs?search={job_id}&type=id" target="_blank"> | ||||
|           <img src="https://github.com/jaypyles/Scraperr/raw/master/docs/logo_picture.png" alt="Scraperr Logo" width="200"> | ||||
|         </a> | ||||
|  | ||||
|         <h3>Job Info:</h3> | ||||
|         <ul> | ||||
|           <li><strong>Status:</strong> {status}</li> | ||||
|           <li><strong>Job URL:</strong> <a href="{job_url}">{job_url}</a></li> | ||||
|           <li><strong>Job ID:</strong> {job_id}</li> | ||||
|         </ul> | ||||
|  | ||||
|         <h3>Options:</h3> | ||||
|         <pre style="background-color:#f4f4f4; padding:10px; border-radius:5px;"> | ||||
| {job_options_json} | ||||
|         </pre> | ||||
|  | ||||
|         <h3>View your job here:</h3> | ||||
|         <a href="{options['scraperr_frontend_url']}/jobs?search={job_id}&type=id">Scraperr Job</a> | ||||
|  | ||||
|         <p style="font-size: 12px; color: gray;"> | ||||
|           Sent by <a href="https://github.com/jaypyles/Scraperr" target="_blank">Scraperr</a> | ||||
|         </p> | ||||
|       </body> | ||||
|     </html> | ||||
|     """ | ||||
|  | ||||
|     # Create email | ||||
|     message = MIMEMultipart("alternative") | ||||
|     message["From"] = options["email"] | ||||
|     message["To"] = options["to"] | ||||
|     message["Subject"] = subject | ||||
|     message.attach( | ||||
|         MIMEText( | ||||
|             "Job completed. View this email in HTML format for full details.", "plain" | ||||
|         ) | ||||
|     ) | ||||
|     message.attach(MIMEText(html, "html")) | ||||
|  | ||||
|     context = ssl.create_default_context() | ||||
|  | ||||
|     try: | ||||
|         if options["use_tls"]: | ||||
|             with smtplib.SMTP(options["smtp_host"], options["smtp_port"]) as server: | ||||
|                 server.starttls(context=context) | ||||
|                 server.login(options["smtp_user"], options["smtp_password"]) | ||||
|                 server.sendmail( | ||||
|                     from_addr=options["email"], | ||||
|                     to_addrs=options["to"], | ||||
|                     msg=message.as_string(), | ||||
|                 ) | ||||
|         else: | ||||
|             with smtplib.SMTP_SSL( | ||||
|                 options["smtp_host"], options["smtp_port"], context=context | ||||
|             ) as server: | ||||
|                 server.login(options["smtp_user"], options["smtp_password"]) | ||||
|                 server.sendmail( | ||||
|                     from_addr=options["email"], | ||||
|                     to_addrs=options["to"], | ||||
|                     msg=message.as_string(), | ||||
|                 ) | ||||
|         LOG.info("✅ Email sent successfully!") | ||||
|     except Exception as e: | ||||
|         LOG.error(f"❌ Failed to send email: {e}") | ||||
							
								
								
									
api/backend/worker/post_job_complete/models.py (new file, 22 lines)
							| @@ -0,0 +1,22 @@ | ||||
| from typing import TypedDict | ||||
|  | ||||
|  | ||||
| class PostJobCompleteOptions(TypedDict): | ||||
|     channel: str | ||||
|     webhook_url: str | ||||
|     scraperr_frontend_url: str | ||||
|     email: str | ||||
|     to: str | ||||
|     smtp_host: str | ||||
|     smtp_port: int | ||||
|     smtp_user: str | ||||
|     smtp_password: str | ||||
|     use_tls: bool | ||||
|  | ||||
|  | ||||
| JOB_COLOR_MAP = { | ||||
|     "Queued": 0x0000FF, | ||||
|     "Scraping": 0x0000FF, | ||||
|     "Completed": 0x00FF00, | ||||
|     "Failed": 0xFF0000, | ||||
| } | ||||
							
								
								
									
api/backend/worker/post_job_complete/post_job_complete.py (new file, 24 lines)
							| @@ -0,0 +1,24 @@ | ||||
| from typing import Any | ||||
|  | ||||
| from api.backend.worker.post_job_complete.models import PostJobCompleteOptions | ||||
| from api.backend.worker.post_job_complete.email_notifcation import ( | ||||
|     send_job_complete_email, | ||||
| ) | ||||
| from api.backend.worker.post_job_complete.discord_notification import ( | ||||
|     discord_notification, | ||||
| ) | ||||
|  | ||||
|  | ||||
| async def post_job_complete(job: dict[str, Any], options: PostJobCompleteOptions): | ||||
|     if options["channel"] == "": | ||||
|         return | ||||
|  | ||||
|     if not options.values(): | ||||
|         return | ||||
|  | ||||
|     if options["channel"] == "discord": | ||||
|         discord_notification(job, options) | ||||
|     elif options["channel"] == "email": | ||||
|         send_job_complete_email(job, options) | ||||
|     else: | ||||
|         raise ValueError(f"Invalid channel: {options['channel']}") | ||||
							
								
								
									
cypress/e2e/authentication.cy.ts (new file, 60 lines)
							| @@ -0,0 +1,60 @@ | ||||
| describe("Authentication", () => { | ||||
|   it("should register", () => { | ||||
|     cy.intercept("POST", "/api/signup").as("signup"); | ||||
|  | ||||
|     cy.visit("/").then(() => { | ||||
|       cy.get("button").contains("Login").click(); | ||||
|       cy.url().should("include", "/login"); | ||||
|  | ||||
|       cy.get("form").should("be.visible"); | ||||
|       cy.get("button") | ||||
|         .contains("No Account? Sign up") | ||||
|         .should("be.visible") | ||||
|         .click(); | ||||
|  | ||||
|       cy.get("input[name='email']").type("test@test.com"); | ||||
|       cy.get("input[name='password']").type("password"); | ||||
|       cy.get("input[name='fullName']").type("John Doe"); | ||||
|       cy.get("button[type='submit']").contains("Signup").click(); | ||||
|  | ||||
|       cy.wait("@signup").then((interception) => { | ||||
|         if (!interception.response) { | ||||
|           cy.log("No response received!"); | ||||
|           throw new Error("signup request did not return a response"); | ||||
|         } | ||||
|  | ||||
|         cy.log("Response status: " + interception.response.statusCode); | ||||
|         cy.log("Response body: " + JSON.stringify(interception.response.body)); | ||||
|  | ||||
|         expect(interception.response.statusCode).to.eq(200); | ||||
|       }); | ||||
|     }); | ||||
|   }); | ||||
|  | ||||
|   it("should login", () => { | ||||
|     cy.intercept("POST", "/api/token").as("token"); | ||||
|  | ||||
|     cy.visit("/").then(() => { | ||||
|       cy.get("button") | ||||
|         .contains("Login") | ||||
|         .click() | ||||
|         .then(() => { | ||||
|           cy.get("input[name='email']").type("test@test.com"); | ||||
|           cy.get("input[name='password']").type("password"); | ||||
|           cy.get("button[type='submit']").contains("Login").click(); | ||||
|  | ||||
|           cy.wait("@token").then((interception) => { | ||||
|             if (!interception.response) { | ||||
|               cy.log("No response received!"); | ||||
|               throw new Error("token request did not return a response"); | ||||
|             } | ||||
|  | ||||
|             cy.log("Response status: " + interception.response.statusCode); | ||||
|             cy.log("Response body: " + JSON.stringify(interception.response.body)); | ||||
|  | ||||
|             expect(interception.response.statusCode).to.eq(200); | ||||
|           }); | ||||
|         }); | ||||
|     }); | ||||
|   }); | ||||
| }); | ||||
| @@ -1,19 +1,88 @@ | ||||
| describe("Job", () => { | ||||
| describe.only("Job", () => { | ||||
|   it("should create a job", () => { | ||||
|     cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob"); | ||||
|  | ||||
|     cy.visit("/"); | ||||
|  | ||||
|     const input = cy.get('[data-cy="url-input"]'); | ||||
|     input.type("https://example.com"); | ||||
|     cy.get('[data-cy="url-input"]').type("https://example.com"); | ||||
|     cy.get('[data-cy="name-field"]').type("example"); | ||||
|     cy.get('[data-cy="xpath-field"]').type("//body"); | ||||
|     cy.get('[data-cy="add-button"]').click(); | ||||
|  | ||||
|     const nameField = cy.get('[data-cy="name-field"]'); | ||||
|     const xPathField = cy.get('[data-cy="xpath-field"]'); | ||||
|     const addButton = cy.get('[data-cy="add-button"]'); | ||||
|     cy.contains("Submit").click(); | ||||
|  | ||||
|     nameField.type("example"); | ||||
|     xPathField.type("//body"); | ||||
|     addButton.click(); | ||||
|     cy.wait("@submitScrapeJob").then((interception) => { | ||||
|       if (!interception.response) { | ||||
|         cy.log("No response received!"); | ||||
|         cy.log("Request body: " + JSON.stringify(interception.request?.body)); | ||||
|         throw new Error("submitScrapeJob request did not return a response"); | ||||
|       } | ||||
|  | ||||
|     const submit = cy.contains("Submit"); | ||||
|     submit.click(); | ||||
|       cy.log("Response status: " + interception.response.statusCode); | ||||
|       cy.log("Response body: " + JSON.stringify(interception.response.body)); | ||||
|  | ||||
|       expect(interception.response.statusCode).to.eq(200); | ||||
|     }); | ||||
|  | ||||
|     cy.get("li").contains("Jobs").click(); | ||||
|  | ||||
|     cy.contains("div", "https://example.com", { timeout: 10000 }).should( | ||||
|       "exist" | ||||
|     ); | ||||
|     cy.contains("div", "Completed", { timeout: 20000 }).should("exist"); | ||||
|  | ||||
|     cy.get("tbody tr") | ||||
|       .first() | ||||
|       .within(() => { | ||||
|         cy.get('input[type="checkbox"]').click(); | ||||
|       }); | ||||
|  | ||||
|     cy.get("[data-testid='DeleteIcon']").click(); | ||||
|  | ||||
|     cy.contains("div", "https://example.com", { timeout: 10000 }).should( | ||||
|       "not.exist" | ||||
|     ); | ||||
|   }); | ||||
|  | ||||
|   it("should create a job with advanced options (media)", () => { | ||||
|     cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob"); | ||||
|  | ||||
|     cy.visit("/"); | ||||
|  | ||||
|     cy.get("button").contains("Advanced Job Options").click(); | ||||
|  | ||||
|     cy.get('[data-cy="collect-media-checkbox"]').click(); | ||||
|     cy.get("body").type("{esc}"); | ||||
|  | ||||
|     cy.get('[data-cy="url-input"]').type("https://books.toscrape.com"); | ||||
|     cy.get('[data-cy="name-field"]').type("example"); | ||||
|     cy.get('[data-cy="xpath-field"]').type("//body"); | ||||
|     cy.get('[data-cy="add-button"]').click(); | ||||
|  | ||||
|     cy.get("button").contains("Submit").click(); | ||||
|  | ||||
|     cy.get("li").contains("Jobs").click(); | ||||
|  | ||||
|     cy.contains("div", "https://books.toscrape.com", { timeout: 10000 }).should( | ||||
|       "exist" | ||||
|     ); | ||||
|  | ||||
|     cy.contains("div", "Completed", { timeout: 20000 }).should("exist"); | ||||
|     cy.get("li").contains("Media").click(); | ||||
|  | ||||
|     cy.get("div[id='select-job']").click(); | ||||
|     cy.get("li[role='option']").click(); | ||||
|  | ||||
|     cy.get("[data-testid='media-grid']", { timeout: 10000 }).should("exist"); | ||||
|  | ||||
|     cy.get("li").contains("Jobs").click(); | ||||
|  | ||||
|     cy.get("tbody tr") | ||||
|       .first() | ||||
|       .within(() => { | ||||
|         cy.get('input[type="checkbox"]').click(); | ||||
|       }); | ||||
|  | ||||
|     cy.get("[data-testid='DeleteIcon']").click(); | ||||
|   }); | ||||
| }); | ||||
|   | ||||
| @@ -34,4 +34,4 @@ | ||||
| //       visit(originalFn: CommandOriginalFn, url: string, options: Partial<VisitOptions>): Chainable<Element> | ||||
| //     } | ||||
| //   } | ||||
| // } | ||||
| // } | ||||
|   | ||||
| @@ -1,13 +1,10 @@ | ||||
| version: "3" | ||||
| services: | ||||
|   scraperr: | ||||
|     build: | ||||
|       context: . | ||||
|       dockerfile: docker/frontend/Dockerfile | ||||
|     command: ["npm", "run", "dev"] | ||||
|     labels: | ||||
|       - "traefik.enable=true" | ||||
|       - "traefik.http.routers.scraperr.rule=Host(`localhost`)" | ||||
|       - "traefik.http.routers.scraperr.entrypoints=web" | ||||
|       - "traefik.http.services.scraperr.loadbalancer.server.port=3000" | ||||
|       - "traefik.http.routers.scraperr.tls=false" | ||||
|     volumes: | ||||
|       - "$PWD/src:/app/src" | ||||
|       - "$PWD/public:/app/public" | ||||
| @@ -16,7 +13,12 @@ services: | ||||
|       - "$PWD/package-lock.json:/app/package-lock.json" | ||||
|       - "$PWD/tsconfig.json:/app/tsconfig.json" | ||||
|   scraperr_api: | ||||
|     ports: | ||||
|       - "8000:8000" | ||||
|     build: | ||||
|       context: . | ||||
|       dockerfile: docker/api/Dockerfile | ||||
|     environment: | ||||
|       - LOG_LEVEL=INFO | ||||
|     volumes: | ||||
|       - "$PWD/api:/project/api" | ||||
|       - "$PWD/api:/project/app/api" | ||||
|     ports: | ||||
|       - "5900:5900" | ||||
|   | ||||
| @@ -1,66 +1,28 @@ | ||||
| services: | ||||
|   scraperr: | ||||
|     image: jpyles0524/scraperr:latest | ||||
|     build: | ||||
|       context: . | ||||
|       dockerfile: docker/frontend/Dockerfile | ||||
|     container_name: scraperr | ||||
|     command: ["npm", "run", "start"] | ||||
|     labels: | ||||
|       - "traefik.enable=true" | ||||
|       - "traefik.http.routers.scraperr.rule=Host(`localhost`)" # change this to your domain, if not running on localhost | ||||
|       - "traefik.http.routers.scraperr.entrypoints=web" # websecure if using https | ||||
|       - "traefik.http.services.scraperr.loadbalancer.server.port=3000" | ||||
|     environment: | ||||
|       - NEXT_PUBLIC_API_URL=http://scraperr_api:8000 # your API URL | ||||
|       - SERVER_URL=http://scraperr_api:8000 # your docker container API URL | ||||
|     ports: | ||||
|       - 80:3000 | ||||
|     networks: | ||||
|       - web | ||||
|   scraperr_api: | ||||
|     init: True | ||||
|     image: jpyles0524/scraperr_api:latest | ||||
|     build: | ||||
|       context: . | ||||
|       dockerfile: docker/api/Dockerfile | ||||
|     environment: | ||||
|       - LOG_LEVEL=INFO | ||||
|       - OLLAMA_URL=http://ollama:11434 | ||||
|       - OLLAMA_MODEL=phi3 | ||||
|       - MONGODB_URI=mongodb://root:example@webscrape-mongo:27017 # used to access MongoDB | ||||
|       - SECRET_KEY=your_secret_key # used to encode authentication tokens (can be a random string) | ||||
|       - ALGORITHM=HS256 # authentication encoding algorithm | ||||
|       - ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token expire minutes | ||||
|     container_name: scraperr_api | ||||
|     volumes: | ||||
|       - /var/run/docker.sock:/var/run/docker.sock | ||||
|     labels: | ||||
|       - "traefik.enable=true" | ||||
|       - "traefik.http.routers.scraperr_api.rule=Host(`localhost`) && PathPrefix(`/api`)" # change this to your domain, if not running on localhost | ||||
|       - "traefik.http.routers.scraperr_api.entrypoints=web" # websecure if using https | ||||
|       - "traefik.http.middlewares.api-stripprefix.stripprefix.prefixes=/api" | ||||
|       - "traefik.http.routers.scraperr_api.middlewares=api-stripprefix" | ||||
|       - "traefik.http.services.scraperr_api.loadbalancer.server.port=8000" | ||||
|     networks: | ||||
|       - web | ||||
|   traefik: | ||||
|     image: traefik:latest | ||||
|     container_name: traefik | ||||
|     command: | ||||
|       - "--providers.docker=true" | ||||
|       - "--entrypoints.web.address=:80" | ||||
|       - "--entrypoints.websecure.address=:443" | ||||
|     ports: | ||||
|       - 80:80 | ||||
|       - 443:443 | ||||
|       - 8000:8000 | ||||
|     volumes: | ||||
|       - /var/run/docker.sock:/var/run/docker.sock:ro" | ||||
|     networks: | ||||
|       - web | ||||
|   mongo: | ||||
|     container_name: webscrape-mongo | ||||
|     image: mongo | ||||
|     restart: always | ||||
|     environment: | ||||
|       MONGO_INITDB_ROOT_USERNAME: root | ||||
|       MONGO_INITDB_ROOT_PASSWORD: example | ||||
|       - "$PWD/data:/project/app/data" | ||||
|       - "$PWD/media:/project/app/media" | ||||
|     networks: | ||||
|       - web | ||||
|  | ||||
| networks: | ||||
|   web: | ||||
|   | ||||
| @@ -1,36 +1,42 @@ | ||||
| # Build python dependencies | ||||
| FROM python:3.10.12-slim as pybuilder | ||||
|  | ||||
| RUN apt update && apt install -y uvicorn | ||||
| RUN apt-get update && \ | ||||
|     apt-get install -y curl && \ | ||||
|     apt-get install -y x11vnc xvfb uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 ffmpeg && \ | ||||
|     curl -LsSf https://astral.sh/uv/install.sh | sh && \ | ||||
|     apt-get remove -y curl && \ | ||||
|     apt-get autoremove -y && \ | ||||
|     rm -rf /var/lib/apt/lists/* | ||||
|  | ||||
| RUN python -m pip --no-cache-dir install pdm | ||||
| RUN pdm config python.use_venv false | ||||
|  | ||||
|  | ||||
| WORKDIR /project/app | ||||
| COPY pyproject.toml pdm.lock /project/app/ | ||||
| RUN pdm install | ||||
|  | ||||
| RUN pdm install -v --frozen-lockfile | ||||
|  | ||||
| RUN pdm run playwright install --with-deps | ||||
|  | ||||
| RUN pdm run camoufox fetch | ||||
|  | ||||
| COPY ./api/ /project/app/api | ||||
|  | ||||
| # Create final image | ||||
| FROM python:3.10.12-slim | ||||
|  | ||||
| RUN apt-get update | ||||
| RUN apt-get install -y wget gnupg supervisor | ||||
| RUN wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - | ||||
| RUN sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' | ||||
| RUN apt-get update | ||||
| RUN apt-get install -y google-chrome-stable | ||||
|  | ||||
| ENV PYTHONPATH=/project/pkgs | ||||
| COPY --from=pybuilder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages | ||||
| COPY --from=pybuilder /usr/local/bin /usr/local/bin | ||||
| COPY --from=pybuilder /project/app /project/ | ||||
|  | ||||
| COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf | ||||
|  | ||||
| EXPOSE 8000 | ||||
|  | ||||
| WORKDIR /project/ | ||||
| WORKDIR /project/app | ||||
|  | ||||
| RUN mkdir -p /project/app/media | ||||
| RUN mkdir -p /project/app/data | ||||
| RUN touch /project/app/data/database.db | ||||
|  | ||||
| EXPOSE 5900 | ||||
|  | ||||
| COPY start.sh /project/app/start.sh | ||||
|  | ||||
| CMD [ "supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf" ] | ||||
| @@ -1,10 +1,14 @@ | ||||
| # Build next dependencies | ||||
| FROM node:latest | ||||
| FROM node:23.1-slim | ||||
| WORKDIR /app | ||||
|  | ||||
| COPY package*.json ./ | ||||
| RUN npm install | ||||
| # Copy package files first to leverage Docker cache | ||||
| COPY package.json yarn.lock ./ | ||||
|  | ||||
| # Install dependencies in a separate layer | ||||
| RUN yarn install --frozen-lockfile | ||||
|  | ||||
| # Copy the rest of the application | ||||
| COPY tsconfig.json /app/tsconfig.json | ||||
| COPY tailwind.config.js /app/tailwind.config.js | ||||
| COPY next.config.mjs /app/next.config.mjs | ||||
| @@ -13,8 +17,7 @@ COPY postcss.config.js /app/postcss.config.js | ||||
| COPY public /app/public | ||||
| COPY src /app/src | ||||
|  | ||||
| RUN npm run build | ||||
| # Build the application | ||||
| RUN yarn build | ||||
|  | ||||
| EXPOSE 3000 | ||||
|  | ||||
| # CMD [ "npm", "run" ] | ||||
| EXPOSE 3000 | ||||
										
Binary file not shown (image changed: 46 KiB before, 48 KiB after).
| @@ -1,4 +0,0 @@ | ||||
| tls: | ||||
|   certificates: | ||||
|     - certFile: /etc/certs/ssl-cert.pem | ||||
|       keyFile: /etc/certs/ssl-cert.key | ||||
							
								
								
									
helm/.helmignore (23 lines, Normal file)
									
								
							| @@ -0,0 +1,23 @@ | ||||
| # Patterns to ignore when building packages. | ||||
| # This supports shell glob matching, relative path matching, and | ||||
| # negation (prefixed with !). Only one pattern per line. | ||||
| .DS_Store | ||||
| # Common VCS dirs | ||||
| .git/ | ||||
| .gitignore | ||||
| .bzr/ | ||||
| .bzrignore | ||||
| .hg/ | ||||
| .hgignore | ||||
| .svn/ | ||||
| # Common backup files | ||||
| *.swp | ||||
| *.bak | ||||
| *.tmp | ||||
| *.orig | ||||
| *~ | ||||
| # Various IDEs | ||||
| .project | ||||
| .idea/ | ||||
| *.tmproj | ||||
| .vscode/ | ||||
							
								
								
									
helm/Chart.yaml (24 lines, Normal file)
									
								
							| @@ -0,0 +1,24 @@ | ||||
| apiVersion: v2 | ||||
| name: scraperr | ||||
| description: A Helm chart for Kubernetes | ||||
|  | ||||
| # A chart can be either an 'application' or a 'library' chart. | ||||
| # | ||||
| # Application charts are a collection of templates that can be packaged into versioned archives | ||||
| # to be deployed. | ||||
| # | ||||
| # Library charts provide useful utilities or functions for the chart developer. They're included as | ||||
| # a dependency of application charts to inject those utilities and functions into the rendering | ||||
| # pipeline. Library charts do not define any templates and therefore cannot be deployed. | ||||
| type: application | ||||
|  | ||||
| # This is the chart version. This version number should be incremented each time you make changes | ||||
| # to the chart and its templates, including the app version. | ||||
| # Versions are expected to follow Semantic Versioning (https://semver.org/) | ||||
| version: 1.1.0 | ||||
|  | ||||
| # This is the version number of the application being deployed. This version number should be | ||||
| # incremented each time you make changes to the application. Versions are not expected to | ||||
| # follow Semantic Versioning. They should reflect the version the application is using. | ||||
| # It is recommended to use it with quotes. | ||||
| appVersion: "1.16.0" | ||||
							
								
								
									
helm/templates/deployment.yaml (56 lines, Normal file)
									
								
							| @@ -0,0 +1,56 @@ | ||||
| --- | ||||
| apiVersion: apps/v1 | ||||
| kind: Deployment | ||||
| metadata: | ||||
|   name: scraperr | ||||
| spec: | ||||
|   replicas: {{ .Values.replicaCount }} | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app: scraperr | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         app: scraperr | ||||
|     spec: | ||||
|       containers: | ||||
|         - name: scraperr | ||||
|           {{ if .Values.scraperr.image.repository }} | ||||
|           image: "{{ .Values.scraperr.image.repository }}:{{ .Values.scraperr.image.tag }}" | ||||
|           {{ else }} | ||||
|           image: "{{ .Chart.Name }}:{{ .Chart.Version }}" | ||||
|           {{ end }} | ||||
|           imagePullPolicy: {{ .Values.scraperr.image.pullPolicy }} | ||||
|           command: {{ .Values.scraperr.containerCommand | toJson }} | ||||
|           ports: | ||||
|             - containerPort: {{ .Values.scraperr.containerPort }} | ||||
|           env: {{ toYaml .Values.scraperr.env | nindent 12 }} | ||||
|  | ||||
| --- | ||||
| apiVersion: apps/v1 | ||||
| kind: Deployment | ||||
| metadata: | ||||
|   name: scraperr-api | ||||
| spec: | ||||
|   replicas: {{ .Values.replicaCount }} | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       app: scraperr-api | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         app: scraperr-api | ||||
|     spec: | ||||
|       containers: | ||||
|         - name: scraperr-api | ||||
|           {{ if .Values.scraperrApi.image.repository }} | ||||
|           image: "{{ .Values.scraperrApi.image.repository }}:{{ .Values.scraperrApi.image.tag }}" | ||||
|           {{ else }} | ||||
|           image: "{{ .Chart.Name }}:{{ .Chart.Version }}" | ||||
|           {{ end }} | ||||
|           imagePullPolicy: {{ .Values.scraperrApi.image.pullPolicy }} | ||||
|           ports: | ||||
|             - containerPort: {{ .Values.scraperrApi.containerPort }} | ||||
|           env: {{ toYaml .Values.scraperrApi.env | nindent 12 }} | ||||
|           volumeMounts: {{ toYaml .Values.scraperrApi.volumeMounts | nindent 12 }} | ||||
|       volumes: {{ toYaml .Values.scraperrApi.volumes | nindent 12 }} | ||||
							
								
								
									
helm/templates/service.yaml (37 lines, Normal file)
									
								
							| @@ -0,0 +1,37 @@ | ||||
| --- | ||||
| apiVersion: v1 | ||||
| kind: Service | ||||
| metadata: | ||||
|   name: scraperr | ||||
| spec: | ||||
|   type: {{ .Values.scraperr.serviceType }} | ||||
|   selector: | ||||
|     app: scraperr | ||||
|   ports: | ||||
|     {{- range .Values.scraperr.ports }} | ||||
|     - port: {{ .port }} | ||||
|       targetPort: {{ .targetPort }} | ||||
|       {{- if .nodePort }} | ||||
|       nodePort: {{ .nodePort }} | ||||
|       {{- end }} | ||||
|       protocol: {{ .protocol | default "TCP" }} | ||||
|     {{- end }} | ||||
|  | ||||
| --- | ||||
| apiVersion: v1 | ||||
| kind: Service | ||||
| metadata: | ||||
|   name: scraperr-api | ||||
| spec: | ||||
|   type: {{ .Values.scraperrApi.serviceType }} | ||||
|   selector: | ||||
|     app: scraperr-api | ||||
|   ports: | ||||
|     {{- range .Values.scraperrApi.ports }} | ||||
|     - port: {{ .port }} | ||||
|       targetPort: {{ .targetPort }} | ||||
|       {{- if .nodePort }} | ||||
|       nodePort: {{ .nodePort }} | ||||
|       {{- end }} | ||||
|       protocol: {{ .protocol | default "TCP" }} | ||||
|     {{- end }} | ||||
							
								
								
									
helm/values.yaml (47 lines, Normal file)
									
								
							| @@ -0,0 +1,47 @@ | ||||
| scraperr: | ||||
|   image: | ||||
|     repository: jpyles0524/scraperr | ||||
|     tag: latest | ||||
|     pullPolicy: IfNotPresent | ||||
|   containerCommand: ["npm", "run","start"] | ||||
|   containerPort: 3000 | ||||
|   serviceType: NodePort | ||||
|   ports: | ||||
|     - port: 80          | ||||
|       targetPort: 3000   | ||||
|       nodePort: 32300 | ||||
|       protocol: TCP | ||||
|   env: | ||||
|     - name: NEXT_PUBLIC_API_URL | ||||
|       value: "http://scraperr-api:8000" | ||||
|     - name: SERVER_URL | ||||
|       value: "http://scraperr-api:8000" | ||||
|  | ||||
| scraperrApi: | ||||
|   image: | ||||
|     repository: jpyles0524/scraperr_api | ||||
|     tag: latest | ||||
|     pullPolicy: IfNotPresent | ||||
|   containerPort: 8000 | ||||
|   serviceType: ClusterIP | ||||
|   ports: | ||||
|     - port: 8000 | ||||
|       targetPort: 8000 | ||||
|       protocol: TCP | ||||
|   env: | ||||
|     - name: LOG_LEVEL | ||||
|       value: "INFO" | ||||
|   volumeMounts: | ||||
|     - name: data | ||||
|       mountPath: /project/app/data | ||||
|     - name: media | ||||
|       mountPath: /project/app/media | ||||
|   volumes: | ||||
|     - name: data | ||||
|       hostPath: | ||||
|         path: /data/scraperr/data | ||||
|         type: DirectoryOrCreate | ||||
|     - name: media | ||||
|       hostPath: | ||||
|         path: /data/scraperr/media | ||||
| replicaCount: 1 | ||||
							
								
								
									
ipython.py (37 lines)
									
									
									
									
									
								
							| @@ -1,37 +0,0 @@ | ||||
| # STL | ||||
| import os | ||||
|  | ||||
| # PDM | ||||
| import boto3 | ||||
| from dotenv import load_dotenv | ||||
|  | ||||
| # Load environment variables from .env file | ||||
| load_dotenv() | ||||
|  | ||||
|  | ||||
| def test_insert_and_delete(): | ||||
|     # Get environment variables | ||||
|     region_name = os.getenv("AWS_REGION") | ||||
|     # Initialize DynamoDB resource | ||||
|     dynamodb = boto3.resource("dynamodb", region_name=region_name) | ||||
|     table = dynamodb.Table("scrape") | ||||
|  | ||||
|     # Item to insert | ||||
|     item = { | ||||
|         "id": "123",  # Replace with the appropriate id value | ||||
|         "attribute1": "value1", | ||||
|         "attribute2": "value2", | ||||
|         # Add more attributes as needed | ||||
|     } | ||||
|  | ||||
|     # Insert the item | ||||
|     table.put_item(Item=item) | ||||
|     print(f"Inserted item: {item}") | ||||
|  | ||||
|     # Delete the item | ||||
|     table.delete_item(Key={"id": "123"})  # Replace with the appropriate id value | ||||
|     print(f"Deleted item with id: {item['id']}") | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     test_insert_and_delete() | ||||
							
								
								
									
package-lock.json (23271 lines, generated): File diff suppressed because it is too large.
											
										
									
								
							
							
								
								
									
package.json (22 lines changed)
									
									
									
									
									
								
							| @@ -12,13 +12,16 @@ | ||||
|     "@minchat/react-chat-ui": "^0.16.2", | ||||
|     "@mui/icons-material": "^5.15.3", | ||||
|     "@mui/material": "^5.16.0", | ||||
|     "@reduxjs/toolkit": "^2.8.2", | ||||
|     "@testing-library/jest-dom": "^5.16.5", | ||||
|     "@testing-library/react": "^13.4.0", | ||||
|     "@testing-library/user-event": "^13.5.0", | ||||
|     "@types/react": "^18.3.21", | ||||
|     "axios": "^1.7.2", | ||||
|     "bootstrap": "^5.3.0", | ||||
|     "chart.js": "^4.4.3", | ||||
|     "cookie": "^0.6.0", | ||||
|     "dotenv": "^16.5.0", | ||||
|     "framer-motion": "^4.1.17", | ||||
|     "js-cookie": "^3.0.5", | ||||
|     "next": "^14.2.4", | ||||
| @@ -29,17 +32,18 @@ | ||||
|     "react-dom": "^18.3.1", | ||||
|     "react-markdown": "^9.0.0", | ||||
|     "react-modal-image": "^2.6.0", | ||||
|     "react-redux": "^9.2.0", | ||||
|     "react-router": "^6.14.1", | ||||
|     "react-router-dom": "^6.14.1", | ||||
|     "react-scripts": "^5.0.1", | ||||
|     "react-spinners": "^0.14.1", | ||||
|     "redux-persist": "^6.0.0", | ||||
|     "typescript": "^4.9.5", | ||||
|     "web-vitals": "^2.1.4" | ||||
|   }, | ||||
|   "scripts": { | ||||
|     "dev": "next dev", | ||||
|     "build": "next build", | ||||
|     "start": "next start", | ||||
|     "dev": "yarn next dev", | ||||
|     "build": "yarn next build", | ||||
|     "start": "yarn next start", | ||||
|     "serve": "serve -s ./dist", | ||||
|     "cy:open": "cypress open", | ||||
|     "cy:run": "cypress run" | ||||
| @@ -63,12 +67,18 @@ | ||||
|     ] | ||||
|   }, | ||||
|   "devDependencies": { | ||||
|     "@types/cypress": "^0.1.6", | ||||
|     "@types/cypress": "^1.1.6", | ||||
|     "@types/js-cookie": "^3.0.6", | ||||
|     "cypress": "^13.15.0", | ||||
|     "autoprefixer": "^10.4.21", | ||||
|     "cypress": "^13.17.0", | ||||
|     "eslint": "^9.26.0", | ||||
|     "postcss": "^8.5.3", | ||||
|     "tailwindcss": "^3.3.5" | ||||
|   }, | ||||
|   "overrides": { | ||||
|     "react-refresh": "0.11.0" | ||||
|   }, | ||||
|   "resolutions": { | ||||
|     "postcss": "^8.4.31" | ||||
|   } | ||||
| } | ||||
|   | ||||
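The dependency changes above add @reduxjs/toolkit, react-redux, and redux-persist. A minimal sketch of how those three packages are typically wired together; the slice name, state shape, and storage key below are illustrative assumptions, not taken from this repository.

// Sketch: a Redux Toolkit store with redux-persist, matching the dependencies
// added above. Slice and key names are placeholders.
import { configureStore, createSlice } from "@reduxjs/toolkit";
import { persistReducer, persistStore } from "redux-persist";
import storage from "redux-persist/lib/storage";

const settingsSlice = createSlice({
  name: "settings",
  initialState: { darkMode: false },
  reducers: {
    toggleDarkMode: (state) => {
      state.darkMode = !state.darkMode;
    },
  },
});

const persistedReducer = persistReducer(
  { key: "root", storage },
  settingsSlice.reducer
);

export const store = configureStore({
  reducer: { settings: persistedReducer },
  // redux-persist dispatches non-serializable actions during rehydration,
  // so the serializability check is relaxed here.
  middleware: (getDefault) => getDefault({ serializableCheck: false }),
});

export const persistor = persistStore(store);
export const { toggleDarkMode } = settingsSlice.actions;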
| @@ -2,9 +2,7 @@ | ||||
| name = "web-scrape" | ||||
| version = "0.1.0" | ||||
| description = "" | ||||
| authors = [ | ||||
|     {name = "Jayden Pyles", email = "jpylesbuisness@gmail.com"}, | ||||
| ] | ||||
| authors = [{ name = "Jayden Pyles", email = "jpylesbuisness@gmail.com" }] | ||||
| dependencies = [ | ||||
|     "uvicorn>=0.30.1", | ||||
|     "fastapi>=0.111.0", | ||||
| @@ -18,7 +16,6 @@ dependencies = [ | ||||
|     "lxml-stubs>=0.5.1", | ||||
|     "fake-useragent>=1.5.1", | ||||
|     "requests-html>=0.10.0", | ||||
|     "selenium>=4.22.0", | ||||
|     "webdriver-manager>=4.0.1", | ||||
|     "pydantic[email]>=2.9.2", | ||||
|     "pandas>=2.2.2", | ||||
| @@ -39,20 +36,22 @@ dependencies = [ | ||||
|     "exceptiongroup>=1.2.2", | ||||
|     "Faker>=30.6.0", | ||||
|     "pytest-asyncio>=0.24.0", | ||||
|     "python-multipart>=0.0.12", | ||||
|     "python-multipart>=0.0.1", | ||||
|     "bcrypt==4.0.1", | ||||
|     "apscheduler>=3.11.0", | ||||
|     "playwright>=1.52.0", | ||||
|     "camoufox>=0.4.11", | ||||
|     "html2text>=2025.4.15", | ||||
| ] | ||||
| requires-python = ">=3.10" | ||||
| readme = "README.md" | ||||
| license = {text = "MIT"} | ||||
| license = { text = "MIT" } | ||||
|  | ||||
| [tool.pdm] | ||||
| distribution = true | ||||
|  | ||||
| [tool.pdm.dev-dependencies] | ||||
| dev = [ | ||||
|     "ipython>=8.26.0", | ||||
|     "pytest>=8.3.3", | ||||
| ] | ||||
| dev = ["ipython>=8.26.0", "pytest>=8.3.3"] | ||||
| [tool.pyright] | ||||
| include = ["./api/backend/"] | ||||
| exclude = ["**/node_modules", "**/__pycache__"] | ||||
| @@ -60,14 +59,42 @@ ignore = [] | ||||
| defineConstant = { DEBUG = true } | ||||
| stubPath = "" | ||||
|  | ||||
| reportUnknownMemberType= false | ||||
| reportMissingImports = true | ||||
| reportMissingTypeStubs = false | ||||
| reportAny = false | ||||
| reportCallInDefaultInitializer = false | ||||
| # Type checking strictness | ||||
| typeCheckingMode = "strict"                        # Enables strict type checking mode | ||||
| reportPrivateUsage = "none" | ||||
| reportMissingTypeStubs = "none" | ||||
| reportUntypedFunctionDecorator = "error" | ||||
| reportUntypedClassDecorator = "error" | ||||
| reportUntypedBaseClass = "error" | ||||
| reportInvalidTypeVarUse = "error" | ||||
| reportUnnecessaryTypeIgnoreComment = "information" | ||||
| reportUnknownVariableType = "none" | ||||
| reportUnknownMemberType = "none" | ||||
| reportUnknownParameterType = "none" | ||||
|  | ||||
| pythonVersion = "3.9" | ||||
| pythonPlatform = "Linux" | ||||
| # Additional checks | ||||
| reportImplicitStringConcatenation = "error" | ||||
| reportInvalidStringEscapeSequence = "error" | ||||
| reportMissingImports = "error" | ||||
| reportMissingModuleSource = "error" | ||||
| reportOptionalCall = "error" | ||||
| reportOptionalIterable = "error" | ||||
| reportOptionalMemberAccess = "error" | ||||
| reportOptionalOperand = "error" | ||||
| reportOptionalSubscript = "error" | ||||
| reportTypedDictNotRequiredAccess = "error" | ||||
|  | ||||
| # Function return type checking | ||||
| reportIncompleteStub = "error" | ||||
| reportIncompatibleMethodOverride = "error" | ||||
| reportInvalidStubStatement = "error" | ||||
| reportInconsistentOverload = "error" | ||||
|  | ||||
| # Misc settings | ||||
| pythonVersion = "3.10"           # Matches your Python version from pyproject.toml | ||||
| strictListInference = true | ||||
| strictDictionaryInference = true | ||||
| strictSetInference = true | ||||
|  | ||||
|  | ||||
| [tool.isort] | ||||
|   | ||||
| @@ -1,17 +1,23 @@ | ||||
| import React, { useState, useEffect, Dispatch, useRef } from "react"; | ||||
| import React, { useState, Dispatch, useEffect } from "react"; | ||||
| import { Job } from "../../types"; | ||||
| import { fetchJobs } from "../../lib"; | ||||
| import Box from "@mui/material/Box"; | ||||
| import InputLabel from "@mui/material/InputLabel"; | ||||
| import FormControl from "@mui/material/FormControl"; | ||||
| import Select from "@mui/material/Select"; | ||||
| import Popover from "@mui/material/Popover"; | ||||
| import { Typography, MenuItem, useTheme } from "@mui/material"; | ||||
| import { | ||||
|   Typography, | ||||
|   MenuItem, | ||||
|   useTheme, | ||||
|   ClickAwayListener, | ||||
| } from "@mui/material"; | ||||
| import { SxProps } from "@mui/material"; | ||||
|  | ||||
| interface Props { | ||||
|   sxProps: SxProps; | ||||
|   setSelectedJob: Dispatch<React.SetStateAction<Job | null>>; | ||||
|   sxProps?: SxProps; | ||||
|   setSelectedJob: | ||||
|     | Dispatch<React.SetStateAction<Job | null>> | ||||
|     | ((job: Job) => void); | ||||
|   selectedJob: Job | null; | ||||
|   setJobs: Dispatch<React.SetStateAction<Job[]>>; | ||||
|   jobs: Job[]; | ||||
| @@ -28,10 +34,6 @@ export const JobSelector = ({ | ||||
|   const [popoverJob, setPopoverJob] = useState<Job | null>(null); | ||||
|   const theme = useTheme(); | ||||
|  | ||||
|   useEffect(() => { | ||||
|     fetchJobs(setJobs, { chat: true }); | ||||
|   }, []); | ||||
|  | ||||
|   const handlePopoverOpen = ( | ||||
|     event: React.MouseEvent<HTMLElement>, | ||||
|     job: Job | ||||
| @@ -47,6 +49,12 @@ export const JobSelector = ({ | ||||
|  | ||||
|   const open = Boolean(anchorEl); | ||||
|  | ||||
|   useEffect(() => { | ||||
|     if (!open) { | ||||
|       setAnchorEl(null); | ||||
|     } | ||||
|   }, [open]); | ||||
|  | ||||
|   return ( | ||||
|     <Box sx={sxProps}> | ||||
|       <FormControl fullWidth> | ||||
| @@ -59,9 +67,11 @@ export const JobSelector = ({ | ||||
|               value={selectedJob?.id || ""} | ||||
|               label="Job" | ||||
|               onChange={(e) => { | ||||
|                 setSelectedJob( | ||||
|                   jobs.find((job) => job.id === e.target.value) || null | ||||
|                 ); | ||||
|                 const job = jobs.find((job) => job.id === e.target.value); | ||||
|  | ||||
|                 if (job) { | ||||
|                   setSelectedJob(job); | ||||
|                 } | ||||
|               }} | ||||
|             > | ||||
|               {jobs.map((job) => ( | ||||
| @@ -81,55 +91,63 @@ export const JobSelector = ({ | ||||
|           </> | ||||
|         ) : null} | ||||
|       </FormControl> | ||||
|       <Popover | ||||
|         id="mouse-over-popover" | ||||
|         sx={{ | ||||
|           pointerEvents: "none", | ||||
|           padding: 0, | ||||
|         }} | ||||
|         open={open} | ||||
|         anchorEl={anchorEl} | ||||
|         anchorOrigin={{ | ||||
|           vertical: "bottom", | ||||
|           horizontal: "left", | ||||
|         }} | ||||
|         transformOrigin={{ | ||||
|           vertical: "top", | ||||
|           horizontal: "left", | ||||
|         }} | ||||
|         onClose={handlePopoverClose} | ||||
|       > | ||||
|         {popoverJob && ( | ||||
|           <Box | ||||
|  | ||||
|       {open && ( | ||||
|         <ClickAwayListener onClickAway={handlePopoverClose}> | ||||
|           <Popover | ||||
|             id="mouse-over-popover" | ||||
|             sx={{ | ||||
|               border: | ||||
|                 theme.palette.mode === "light" | ||||
|                   ? "2px solid black" | ||||
|                   : "2px solid white", | ||||
|               pointerEvents: "none", | ||||
|               padding: 0, | ||||
|             }} | ||||
|             open={open} | ||||
|             anchorEl={anchorEl} | ||||
|             anchorOrigin={{ | ||||
|               vertical: "bottom", | ||||
|               horizontal: "left", | ||||
|             }} | ||||
|             transformOrigin={{ | ||||
|               vertical: "top", | ||||
|               horizontal: "left", | ||||
|             }} | ||||
|             onClose={handlePopoverClose} | ||||
|           > | ||||
|             <Typography | ||||
|               variant="body1" | ||||
|               sx={{ paddingLeft: 1, paddingRight: 1 }} | ||||
|             > | ||||
|               {popoverJob.url} | ||||
|             </Typography> | ||||
|             <div className="flex flex-row w-full justify-end mb-1"> | ||||
|               <Typography | ||||
|                 variant="body2" | ||||
|             {popoverJob && ( | ||||
|               <Box | ||||
|                 sx={{ | ||||
|                   paddingLeft: 1, | ||||
|                   paddingRight: 1, | ||||
|                   color: theme.palette.mode === "dark" ? "#d3d7e6" : "#5b5d63", | ||||
|                   fontStyle: "italic", | ||||
|                   border: | ||||
|                     theme.palette.mode === "light" | ||||
|                       ? "2px solid black" | ||||
|                       : "2px solid white", | ||||
|                 }} | ||||
|               > | ||||
|                 {new Date(popoverJob.time_created).toLocaleString()} | ||||
|               </Typography> | ||||
|             </div> | ||||
|           </Box> | ||||
|         )} | ||||
|       </Popover> | ||||
|                 <Typography | ||||
|                   variant="body1" | ||||
|                   sx={{ paddingLeft: 1, paddingRight: 1 }} | ||||
|                 > | ||||
|                   {popoverJob.url} | ||||
|                 </Typography> | ||||
|                 <div className="flex flex-row w-full justify-end mb-1"> | ||||
|                   <Typography | ||||
|                     variant="body2" | ||||
|                     sx={{ | ||||
|                       paddingLeft: 1, | ||||
|                       paddingRight: 1, | ||||
|                       color: | ||||
|                         theme.palette.mode === "dark" ? "#d3d7e6" : "#5b5d63", | ||||
|                       fontStyle: "italic", | ||||
|                     }} | ||||
|                   > | ||||
|                     {popoverJob.time_created | ||||
|                       ? new Date(popoverJob.time_created).toLocaleString() | ||||
|                       : "Unknown"} | ||||
|                   </Typography> | ||||
|                 </div> | ||||
|               </Box> | ||||
|             )} | ||||
|           </Popover> | ||||
|         </ClickAwayListener> | ||||
|       )} | ||||
|     </Box> | ||||
|   ); | ||||
| }; | ||||
|   | ||||
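The JobSelector change above widens setSelectedJob from a React state setter to either a setter or a plain (job: Job) => void callback, and drops the internal fetchJobs effect, so the parent now owns job fetching. A small usage sketch of the widened prop; the parent component, its props, and the import path are assumptions for illustration.

import { Dispatch, SetStateAction } from "react";
import { JobSelector } from "./job-selector"; // path assumed for illustration
import { Job } from "../../types";

type Props = {
  jobs: Job[];
  setJobs: Dispatch<SetStateAction<Job[]>>;
};

// A plain callback is now accepted in place of a state setter.
export const ChatJobPicker = ({ jobs, setJobs }: Props) => {
  const handleSelect = (job: Job) => {
    console.log("selected job", job.id);
  };

  return (
    <JobSelector
      setSelectedJob={handleSelect}
      selectedJob={null}
      setJobs={setJobs}
      jobs={jobs}
    />
  );
};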
| @@ -0,0 +1,48 @@ | ||||
| import { Box, Link, Typography } from "@mui/material"; | ||||
| import { SetStateAction, Dispatch, useState } from "react"; | ||||
| import { AdvancedJobOptionsDialog } from "./dialog/advanced-job-options-dialog"; | ||||
| import { RawJobOptions } from "@/types"; | ||||
|  | ||||
| export type AdvancedJobOptionsProps = { | ||||
|   jobOptions: RawJobOptions; | ||||
|   setJobOptions: Dispatch<SetStateAction<RawJobOptions>>; | ||||
|   multiPageScrapeEnabled?: boolean; | ||||
| }; | ||||
|  | ||||
| export const AdvancedJobOptions = ({ | ||||
|   jobOptions, | ||||
|   setJobOptions, | ||||
|   multiPageScrapeEnabled = true, | ||||
| }: AdvancedJobOptionsProps) => { | ||||
|   const [open, setOpen] = useState(false); | ||||
|   return ( | ||||
|     <Box sx={{ mb: 2 }}> | ||||
|       <Link | ||||
|         component="button" | ||||
|         variant="body2" | ||||
|         onClick={() => setOpen(true)} | ||||
|         sx={{ | ||||
|           textDecoration: "none", | ||||
|           color: "primary.main", | ||||
|           "&:hover": { | ||||
|             color: "primary.dark", | ||||
|             textDecoration: "underline", | ||||
|           }, | ||||
|           paddingLeft: 1, | ||||
|           display: "inline-flex", | ||||
|           alignItems: "center", | ||||
|           gap: 0.5, | ||||
|         }} | ||||
|       > | ||||
|         <Typography variant="body2">Advanced Job Options</Typography> | ||||
|       </Link> | ||||
|       <AdvancedJobOptionsDialog | ||||
|         open={open} | ||||
|         onClose={() => setOpen(false)} | ||||
|         jobOptions={jobOptions} | ||||
|         setJobOptions={setJobOptions} | ||||
|         multiPageScrapeEnabled={multiPageScrapeEnabled} | ||||
|       /> | ||||
|     </Box> | ||||
|   ); | ||||
| }; | ||||
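A sketch of wiring the new AdvancedJobOptions component into a page. The RawJobOptions shape below is inferred from the fields this diff touches (multi_page_scrape, collect_media, proxies, custom_headers, custom_cookies) and may not match the real type exactly.

import { useState } from "react";
import { AdvancedJobOptions } from "@/components/common/advanced-job-options";
import { RawJobOptions } from "@/types";

export const SubmitJobOptions = () => {
  // Initial values are illustrative; the real defaults may differ.
  const [jobOptions, setJobOptions] = useState<RawJobOptions>({
    multi_page_scrape: false,
    collect_media: false,
    proxies: "",
    custom_headers: null,
    custom_cookies: null,
  } as RawJobOptions);

  return (
    <AdvancedJobOptions
      jobOptions={jobOptions}
      setJobOptions={setJobOptions}
      multiPageScrapeEnabled
    />
  );
};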
| @@ -0,0 +1,279 @@ | ||||
| import { | ||||
|   Accordion, | ||||
|   AccordionDetails, | ||||
|   AccordionSummary, | ||||
|   Box, | ||||
|   Checkbox, | ||||
|   Dialog, | ||||
|   DialogContent, | ||||
|   DialogTitle, | ||||
|   Divider, | ||||
|   FormControl, | ||||
|   FormControlLabel, | ||||
|   FormGroup, | ||||
|   IconButton, | ||||
|   TextField, | ||||
|   Tooltip, | ||||
|   Typography, | ||||
|   useTheme, | ||||
| } from "@mui/material"; | ||||
| import { | ||||
|   ExpandMore as ExpandMoreIcon, | ||||
|   InfoOutlined, | ||||
|   Code as CodeIcon, | ||||
|   Settings, | ||||
| } from "@mui/icons-material"; | ||||
| import { Dispatch, SetStateAction } from "react"; | ||||
| import { RawJobOptions } from "@/types"; | ||||
| import { ExpandedTableInput } from "../../expanded-table-input"; | ||||
|  | ||||
| export type AdvancedJobOptionsDialogProps = { | ||||
|   open: boolean; | ||||
|   onClose: () => void; | ||||
|   jobOptions: RawJobOptions; | ||||
|   setJobOptions: Dispatch<SetStateAction<RawJobOptions>>; | ||||
|   multiPageScrapeEnabled?: boolean; | ||||
| }; | ||||
|  | ||||
| export const AdvancedJobOptionsDialog = ({ | ||||
|   open, | ||||
|   onClose, | ||||
|   jobOptions, | ||||
|   setJobOptions, | ||||
|   multiPageScrapeEnabled = true, | ||||
| }: AdvancedJobOptionsDialogProps) => { | ||||
|   const theme = useTheme(); | ||||
|   const handleMultiPageScrapeChange = () => { | ||||
|     setJobOptions((prevJobOptions) => ({ | ||||
|       ...prevJobOptions, | ||||
|       multi_page_scrape: !prevJobOptions.multi_page_scrape, | ||||
|     })); | ||||
|   }; | ||||
|  | ||||
|   const handleProxiesChange = (e: React.ChangeEvent<HTMLInputElement>) => { | ||||
|     setJobOptions((prevJobOptions) => ({ | ||||
|       ...prevJobOptions, | ||||
|       proxies: e.target.value, | ||||
|     })); | ||||
|   }; | ||||
|  | ||||
|   const handleCollectMediaChange = () => { | ||||
|     setJobOptions((prevJobOptions) => ({ | ||||
|       ...prevJobOptions, | ||||
|       collect_media: !prevJobOptions.collect_media, | ||||
|     })); | ||||
|   }; | ||||
|  | ||||
|   return ( | ||||
|     <Dialog | ||||
|       open={open} | ||||
|       onClose={onClose} | ||||
|       maxWidth="md" | ||||
|       fullWidth | ||||
|       PaperProps={{ | ||||
|         sx: { | ||||
|           borderRadius: 2, | ||||
|           boxShadow: "0 8px 32px rgba(0, 0, 0, 0.1)", | ||||
|         }, | ||||
|       }} | ||||
|     > | ||||
|       <DialogTitle | ||||
|         sx={{ | ||||
|           borderBottom: `1px solid ${theme.palette.divider}`, | ||||
|           backgroundColor: theme.palette.background.default, | ||||
|           color: theme.palette.primary.contrastText, | ||||
|           borderRadius: 2, | ||||
|           display: "flex", | ||||
|           alignItems: "center", | ||||
|           justifyContent: "space-between", | ||||
|           padding: "1rem 2rem", | ||||
|           marginRight: 2, | ||||
|           marginLeft: 2, | ||||
|         }} | ||||
|       > | ||||
|         <Typography variant="h6" component="div"> | ||||
|           Advanced Job Options | ||||
|         </Typography> | ||||
|         <Settings | ||||
|           sx={{ | ||||
|             color: theme.palette.primary.contrastText, | ||||
|           }} | ||||
|         /> | ||||
|       </DialogTitle> | ||||
|  | ||||
|       <DialogContent | ||||
|         sx={{ padding: 3, overflowY: "auto", marginTop: 2, height: "60rem" }} | ||||
|       > | ||||
|         <FormControl fullWidth> | ||||
|           <Box sx={{ mb: 3 }}> | ||||
|             <Typography | ||||
|               variant="subtitle1" | ||||
|               sx={{ | ||||
|                 mb: 1, | ||||
|                 fontWeight: "bold", | ||||
|                 color: theme.palette.text.primary, | ||||
|               }} | ||||
|             > | ||||
|               Collection Options | ||||
|             </Typography> | ||||
|             <Divider sx={{ mb: 2, backgroundColor: theme.palette.divider }} /> | ||||
|  | ||||
|             <FormGroup row sx={{ gap: 4, mb: 1 }}> | ||||
|               <FormControlLabel | ||||
|                 control={ | ||||
|                   <Checkbox | ||||
|                     checked={jobOptions.multi_page_scrape} | ||||
|                     onChange={handleMultiPageScrapeChange} | ||||
|                     disabled={!multiPageScrapeEnabled} | ||||
|                   /> | ||||
|                 } | ||||
|                 label={ | ||||
|                   <Box sx={{ display: "flex", alignItems: "center" }}> | ||||
|                     <Typography>Multi Page Scrape</Typography> | ||||
|                     <Tooltip | ||||
|                       title={ | ||||
|                         multiPageScrapeEnabled | ||||
|                           ? "Enable crawling through multiple pages" | ||||
|                           : "Multi page scrape is disabled" | ||||
|                       } | ||||
|                     > | ||||
|                       <IconButton size="small"> | ||||
|                         <InfoOutlined fontSize="small" /> | ||||
|                       </IconButton> | ||||
|                     </Tooltip> | ||||
|                   </Box> | ||||
|                 } | ||||
|               /> | ||||
|               <FormControlLabel | ||||
|                 control={ | ||||
|                   <Checkbox | ||||
|                     checked={jobOptions.collect_media} | ||||
|                     onChange={handleCollectMediaChange} | ||||
|                     data-cy="collect-media-checkbox" | ||||
|                   /> | ||||
|                 } | ||||
|                 label={ | ||||
|                   <Box sx={{ display: "flex", alignItems: "center" }}> | ||||
|                     <Typography>Collect Media</Typography> | ||||
|                     <Tooltip title="Download images and other media"> | ||||
|                       <IconButton size="small"> | ||||
|                         <InfoOutlined fontSize="small" /> | ||||
|                       </IconButton> | ||||
|                     </Tooltip> | ||||
|                   </Box> | ||||
|                 } | ||||
|               /> | ||||
|             </FormGroup> | ||||
|           </Box> | ||||
|  | ||||
|           <Box sx={{ mb: 3 }}> | ||||
|             <Typography | ||||
|               variant="subtitle1" | ||||
|               sx={{ | ||||
|                 mb: 1, | ||||
|                 fontWeight: "bold", | ||||
|                 color: theme.palette.text.primary, | ||||
|               }} | ||||
|             > | ||||
|               Custom Options | ||||
|             </Typography> | ||||
|             <Divider sx={{ mb: 2, backgroundColor: theme.palette.divider }} /> | ||||
|  | ||||
|             {/* Proxies Section */} | ||||
|             <Accordion | ||||
|               defaultExpanded | ||||
|               elevation={0} | ||||
|               sx={{ | ||||
|                 mb: 2, | ||||
|                 border: `1px solid ${theme.palette.divider}`, | ||||
|                 "&:before": { display: "none" }, | ||||
|                 borderRadius: 1, | ||||
|                 overflow: "hidden", | ||||
|                 padding: 1, | ||||
|               }} | ||||
|             > | ||||
|               <AccordionSummary | ||||
|                 expandIcon={<ExpandMoreIcon />} | ||||
|                 sx={{ | ||||
|                   backgroundColor: theme.palette.background.paper, | ||||
|                   borderBottom: `1px solid ${theme.palette.divider}`, | ||||
|                   "&.Mui-expanded": { | ||||
|                     borderBottom: `1px solid ${theme.palette.divider}`, | ||||
|                   }, | ||||
|                 }} | ||||
|               > | ||||
|                 <Box sx={{ display: "flex", alignItems: "center" }}> | ||||
|                   <div | ||||
|                     style={{ | ||||
|                       display: "flex", | ||||
|                       alignItems: "center", | ||||
|                       gap: "0.5rem", | ||||
|                     }} | ||||
|                   > | ||||
|                     <Typography | ||||
|                       sx={{ | ||||
|                         fontWeight: 500, | ||||
|                         color: theme.palette.text.primary, | ||||
|                       }} | ||||
|                     > | ||||
|                       Proxies | ||||
|                     </Typography> | ||||
|  | ||||
|                     <Tooltip title="Comma separated list of proxies that should follow Playwright proxy format"> | ||||
|                       <InfoOutlined fontSize="small" /> | ||||
|                     </Tooltip> | ||||
|                   </div> | ||||
|                 </Box> | ||||
|               </AccordionSummary> | ||||
|               <AccordionDetails | ||||
|                 sx={{ p: 2, backgroundColor: theme.palette.background.default }} | ||||
|               > | ||||
|                 <TextField | ||||
|                   placeholder='Proxies ([{"server": "proxy.example.com:8080", "username": "username", "password": "password"}])' | ||||
|                   fullWidth | ||||
|                   variant="outlined" | ||||
|                   size="small" | ||||
|                   value={jobOptions.proxies} | ||||
|                   onChange={handleProxiesChange} | ||||
|                   InputProps={{ | ||||
|                     startAdornment: ( | ||||
|                       <CodeIcon | ||||
|                         sx={{ color: theme.palette.text.secondary, mr: 1 }} | ||||
|                       /> | ||||
|                     ), | ||||
|                   }} | ||||
|                 /> | ||||
|               </AccordionDetails> | ||||
|             </Accordion> | ||||
|  | ||||
|             {/* Custom Headers Section */} | ||||
|             <ExpandedTableInput | ||||
|               label="Custom Headers" | ||||
|               placeholder='{"User-Agent": "CustomAgent", "Accept": "*/*"}' | ||||
|               urlParam="custom_headers" | ||||
|               onChange={(value) => { | ||||
|                 setJobOptions((prevJobOptions) => ({ | ||||
|                   ...prevJobOptions, | ||||
|                   custom_headers: value, | ||||
|                 })); | ||||
|               }} | ||||
|             /> | ||||
|  | ||||
|             {/* Custom Cookies Section */} | ||||
|             <ExpandedTableInput | ||||
|               label="Custom Cookies" | ||||
|               placeholder='[{"name": "value", "name2": "value2"}]' | ||||
|               urlParam="custom_cookies" | ||||
|               onChange={(value) => { | ||||
|                 setJobOptions((prevJobOptions) => ({ | ||||
|                   ...prevJobOptions, | ||||
|                   custom_cookies: value, | ||||
|                 })); | ||||
|               }} | ||||
|             /> | ||||
|           </Box> | ||||
|         </FormControl> | ||||
|       </DialogContent> | ||||
|     </Dialog> | ||||
|   ); | ||||
| }; | ||||
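The Proxies tooltip above expects a JSON list in Playwright's proxy format. An example value for that field; the hostname and credentials are placeholders.

// Playwright proxies take a "server" plus optional "username"/"password".
// This string is what would be pasted into the Proxies field above.
const exampleProxies = JSON.stringify([
  {
    server: "http://proxy.example.com:8080",
    username: "proxy-user",
    password: "proxy-pass",
  },
]);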
| @@ -0,0 +1 @@ | ||||
| export * from "./advanced-job-options-dialog"; | ||||
							
								
								
									
src/components/common/advanced-job-options/index.ts (1 line, Normal file)
									
								
							| @@ -0,0 +1 @@ | ||||
| export * from "./advanced-job-options"; | ||||
							
								
								
									
src/components/common/csv-table/csv-table.tsx (166 lines, Normal file)
									
								
							| @@ -0,0 +1,166 @@ | ||||
| import React, { useState } from "react"; | ||||
| import { | ||||
|   Table, | ||||
|   TableBody, | ||||
|   TableCell, | ||||
|   TableContainer, | ||||
|   TableHead, | ||||
|   TableRow, | ||||
|   Paper, | ||||
|   Box, | ||||
|   Typography, | ||||
|   useTheme, | ||||
|   alpha, | ||||
| } from "@mui/material"; | ||||
|  | ||||
| export type CsvRow = { | ||||
|   [key: string]: string; | ||||
| }; | ||||
|  | ||||
| export type CsvTableProps = { | ||||
|   csv: { | ||||
|     rows: CsvRow[]; | ||||
|     headers: string[]; | ||||
|   }; | ||||
|   className?: string; | ||||
| }; | ||||
|  | ||||
| export const CsvTable: React.FC<CsvTableProps> = ({ csv, className }) => { | ||||
|   const [expandedRow, setExpandedRow] = useState<number | null>(null); | ||||
|   const theme = useTheme(); | ||||
|  | ||||
|   const handleRowClick = (rowIndex: number) => { | ||||
|     setExpandedRow((prevRow) => (prevRow === rowIndex ? null : rowIndex)); | ||||
|   }; | ||||
|  | ||||
|   return ( | ||||
|     <Box | ||||
|       sx={{ | ||||
|         height: "100%", | ||||
|         display: "flex", | ||||
|         flexDirection: "column", | ||||
|         overflow: "hidden", | ||||
|         width: "100%", | ||||
|       }} | ||||
|       className={className} | ||||
|     > | ||||
|       {csv.rows.length > 0 ? ( | ||||
|         <TableContainer | ||||
|           sx={{ | ||||
|             flex: 1, | ||||
|             overflow: "auto", | ||||
|             borderRadius: theme.shape.borderRadius, | ||||
|             boxShadow: theme.shadows[1], | ||||
|           }} | ||||
|         > | ||||
|           <Table stickyHeader size="small" aria-label="csv data table"> | ||||
|             <TableHead> | ||||
|               <TableRow> | ||||
|                 {csv.headers.map((header, idx) => ( | ||||
|                   <TableCell | ||||
|                     key={idx} | ||||
|                     sx={{ | ||||
|                       fontWeight: "bold", | ||||
|                       cursor: "pointer", | ||||
|                       whiteSpace: "nowrap", | ||||
|                       backgroundColor: theme.palette.background.paper, | ||||
|                       color: theme.palette.text.primary, | ||||
|                       "&:hover": { | ||||
|                         backgroundColor: alpha(theme.palette.primary.main, 0.1), | ||||
|                       }, | ||||
|                       p: { xs: 1, sm: 2 }, | ||||
|                     }} | ||||
|                   > | ||||
|                     {header} | ||||
|                   </TableCell> | ||||
|                 ))} | ||||
|               </TableRow> | ||||
|             </TableHead> | ||||
|             <TableBody> | ||||
|               {csv.rows.map((row, rowIndex) => ( | ||||
|                 <React.Fragment key={rowIndex}> | ||||
|                   <TableRow | ||||
|                     onClick={() => handleRowClick(rowIndex)} | ||||
|                     sx={{ | ||||
|                       "&:nth-of-type(odd)": { | ||||
|                         backgroundColor: alpha( | ||||
|                           theme.palette.primary.main, | ||||
|                           0.02 | ||||
|                         ), | ||||
|                       }, | ||||
|                       "&:hover": { | ||||
|                         backgroundColor: alpha( | ||||
|                           theme.palette.primary.main, | ||||
|                           0.04 | ||||
|                         ), | ||||
|                       }, | ||||
|                       cursor: "pointer", | ||||
|                     }} | ||||
|                   > | ||||
|                     {Object.values(row).map((col, colIndex) => ( | ||||
|                       <TableCell | ||||
|                         key={colIndex} | ||||
|                         sx={{ | ||||
|                           whiteSpace: "nowrap", | ||||
|                           maxWidth: { xs: "150px", sm: "200px", md: "200px" }, | ||||
|                           overflow: "hidden", | ||||
|                           textOverflow: "ellipsis", | ||||
|                           p: { xs: 1, sm: 2 }, | ||||
|                         }} | ||||
|                       > | ||||
|                         {col} | ||||
|                       </TableCell> | ||||
|                     ))} | ||||
|                   </TableRow> | ||||
|  | ||||
|                   {expandedRow === rowIndex && ( | ||||
|                     <TableRow> | ||||
|                       <TableCell | ||||
|                         colSpan={csv.headers.length} | ||||
|                         sx={{ padding: 2 }} | ||||
|                       > | ||||
|                         <Paper | ||||
|                           sx={{ | ||||
|                             padding: 2, | ||||
|                             backgroundColor: alpha( | ||||
|                               theme.palette.background.paper, | ||||
|                               0.5 | ||||
|                             ), | ||||
|                           }} | ||||
|                         > | ||||
|                           <Typography variant="body2" color="text.secondary"> | ||||
|                             {row.text | ||||
|                               ? row.text | ||||
|                                   .replace(/[\n\t\r]+/g, " ") | ||||
|                                   .replace(/\s+/g, " ") | ||||
|                                   .trim() | ||||
|                               : "No text available"} | ||||
|                           </Typography> | ||||
|                         </Paper> | ||||
|                       </TableCell> | ||||
|                     </TableRow> | ||||
|                   )} | ||||
|                 </React.Fragment> | ||||
|               ))} | ||||
|             </TableBody> | ||||
|           </Table> | ||||
|         </TableContainer> | ||||
|       ) : ( | ||||
|         <Paper | ||||
|           sx={{ | ||||
|             p: 4, | ||||
|             display: "flex", | ||||
|             justifyContent: "center", | ||||
|             alignItems: "center", | ||||
|             height: "100%", | ||||
|             borderRadius: theme.shape.borderRadius, | ||||
|             backgroundColor: alpha(theme.palette.background.paper, 0.5), | ||||
|             border: `1px dashed ${theme.palette.divider}`, | ||||
|           }} | ||||
|         > | ||||
|           <Typography color="text.secondary">No data available</Typography> | ||||
|         </Paper> | ||||
|       )} | ||||
|     </Box> | ||||
|   ); | ||||
| }; | ||||
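A usage sketch for the new CsvTable component with a small in-memory data set; the headers and rows are illustrative, and in the app the data comes from a scrape job.

import { CsvTable } from "@/components/common/csv-table";

const sample = {
  headers: ["url", "element", "text"],
  rows: [
    { url: "https://example.com", element: "h1", text: "Example Domain" },
    { url: "https://example.com", element: "p", text: "More sample text" },
  ],
};

// Rows expand on click to show the cleaned-up "text" field.
export const ResultsPreview = () => <CsvTable csv={sample} />;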
							
								
								
									
src/components/common/csv-table/index.ts (1 line, Normal file)
									
								
							| @@ -0,0 +1 @@ | ||||
| export * from "./csv-table"; | ||||
							
								
								
									
src/components/common/disabled/disabled.tsx (29 lines, Normal file)
									
								
							| @@ -0,0 +1,29 @@ | ||||
| import { Box } from "@mui/material"; | ||||
|  | ||||
| export type DisabledProps = { | ||||
|   message: string; | ||||
| }; | ||||
|  | ||||
| export const Disabled = ({ message }: DisabledProps) => { | ||||
|   return ( | ||||
|     <Box | ||||
|       bgcolor="background.default" | ||||
|       minHeight="100vh" | ||||
|       display="flex" | ||||
|       justifyContent="center" | ||||
|       alignItems="center" | ||||
|     > | ||||
|       <h4 | ||||
|         style={{ | ||||
|           color: "#fff", | ||||
|           padding: "20px", | ||||
|           borderRadius: "8px", | ||||
|           background: "rgba(0, 0, 0, 0.6)", | ||||
|           boxShadow: "0 4px 8px rgba(0, 0, 0, 0.2)", | ||||
|         }} | ||||
|       > | ||||
|         {message} | ||||
|       </h4> | ||||
|     </Box> | ||||
|   ); | ||||
| }; | ||||
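A one-line usage sketch for the new Disabled component; the page name and message are illustrative.

import { Disabled } from "@/components/common/disabled";

export const FeaturePage = () => (
  <Disabled message="This feature is currently disabled." />
);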
							
								
								
									
src/components/common/disabled/index.ts (1 line, Normal file)
									
								
							| @@ -0,0 +1 @@ | ||||
| export * from "./disabled"; | ||||
| @@ -0,0 +1,204 @@ | ||||
| import { | ||||
|   Accordion, | ||||
|   AccordionSummary, | ||||
|   TableCell, | ||||
|   TableRow, | ||||
|   Paper, | ||||
|   TableBody, | ||||
|   useTheme, | ||||
|   TextField, | ||||
|   Box, | ||||
|   Typography, | ||||
|   AccordionDetails, | ||||
|   TableHead, | ||||
|   TableContainer, | ||||
|   Table, | ||||
| } from "@mui/material"; | ||||
| import { useEffect, useState } from "react"; | ||||
| import ExpandMoreIcon from "@mui/icons-material/ExpandMore"; | ||||
| import { parseJsonToEntries } from "@/lib/helpers/parse-json-to-entries"; | ||||
|  | ||||
| export type ExpandedTableInputProps = { | ||||
|   label: string; | ||||
|   onChange: (value: any) => void; | ||||
|   placeholder: string; | ||||
|   urlParam: string; | ||||
| }; | ||||
|  | ||||
| export const ExpandedTableInput = ({ | ||||
|   label, | ||||
|   onChange, | ||||
|   placeholder, | ||||
|   urlParam, | ||||
| }: ExpandedTableInputProps) => { | ||||
|   const theme = useTheme(); | ||||
|   const [value, setValue] = useState(""); | ||||
|   const [parsedHeaders, setParsedHeaders] = useState<[string, string][] | null>( | ||||
|     null | ||||
|   ); | ||||
|  | ||||
|   const [jsonError, setJsonError] = useState<string | null>(null); | ||||
|  | ||||
|   const urlParams = new URLSearchParams(window.location.search); | ||||
|  | ||||
|   const validateAndParse = (val: string) => { | ||||
|     if (val.trim() === "") { | ||||
|       setParsedHeaders(null); | ||||
|       setJsonError(null); | ||||
|       return null; | ||||
|     } | ||||
|  | ||||
|     try { | ||||
|       const parsed = JSON.parse(val); | ||||
|       const entries = parseJsonToEntries(val); | ||||
|  | ||||
|       if (entries === null) { | ||||
|         setParsedHeaders(null); | ||||
|         setJsonError("Invalid JSON object"); | ||||
|         return null; | ||||
|       } else { | ||||
|         setParsedHeaders(entries); | ||||
|         setJsonError(null); | ||||
|         return parsed; | ||||
|       } | ||||
|     } catch (e) { | ||||
|       setParsedHeaders(null); | ||||
|       setJsonError("Invalid JSON format"); | ||||
|       return null; | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|   const handleChange = (e: React.ChangeEvent<HTMLInputElement>) => { | ||||
|     const val = e.target.value; | ||||
|     setValue(val); | ||||
|     const parsed = validateAndParse(val); | ||||
|     onChange(parsed); | ||||
|   }; | ||||
|  | ||||
|   useEffect(() => { | ||||
|     const jobOptions = urlParams.get("job_options"); | ||||
|  | ||||
|     if (!jobOptions) { | ||||
|       setParsedHeaders(null); | ||||
|       setJsonError(null); | ||||
|       return; | ||||
|     } | ||||
|  | ||||
|     const jobOptionsObject = JSON.parse(jobOptions || "{}"); | ||||
|     let val = jobOptionsObject[urlParam]; | ||||
|  | ||||
|     if (val.length === 0 || Object.keys(val).length === 0) { | ||||
|       setParsedHeaders(null); | ||||
|       setJsonError(null); | ||||
|       return; | ||||
|     } | ||||
|  | ||||
|     if (typeof val === "string") { | ||||
|       try { | ||||
|         val = JSON.parse(val); | ||||
|       } catch {} | ||||
|     } | ||||
|  | ||||
|     const finalVal = | ||||
|       typeof val === "string" ? val : val != null ? JSON.stringify(val) : ""; | ||||
|  | ||||
|     setValue(finalVal); | ||||
|     const parsed = validateAndParse(finalVal); | ||||
|     onChange(parsed); | ||||
|   }, [urlParam]); | ||||
|  | ||||
|   return ( | ||||
|     <Accordion | ||||
|       defaultExpanded | ||||
|       elevation={0} | ||||
|       sx={{ | ||||
|         mb: 2, | ||||
|         border: `1px solid ${theme.palette.divider}`, | ||||
|         "&:before": { display: "none" }, | ||||
|         borderRadius: 1, | ||||
|         overflow: "hidden", | ||||
|         padding: 1, | ||||
|       }} | ||||
|     > | ||||
|       <AccordionSummary | ||||
|         expandIcon={<ExpandMoreIcon />} | ||||
|         sx={{ | ||||
|           backgroundColor: theme.palette.background.paper, | ||||
|           borderBottom: `1px solid ${theme.palette.divider}`, | ||||
|           "&.Mui-expanded": { | ||||
|             borderBottom: `1px solid ${theme.palette.divider}`, | ||||
|           }, | ||||
|         }} | ||||
|       > | ||||
|         <Box sx={{ display: "flex", alignItems: "center" }}> | ||||
|           <Typography | ||||
|             sx={{ fontWeight: 500, color: theme.palette.text.primary }} | ||||
|           > | ||||
|             {label} | ||||
|           </Typography> | ||||
|         </Box> | ||||
|       </AccordionSummary> | ||||
|       <AccordionDetails | ||||
|         sx={{ p: 2, backgroundColor: theme.palette.background.default }} | ||||
|       > | ||||
|         <TextField | ||||
|           placeholder={placeholder} | ||||
|           value={value} | ||||
|           onChange={handleChange} | ||||
|           fullWidth | ||||
|           variant="outlined" | ||||
|           size="small" | ||||
|           error={jsonError !== null} | ||||
|           helperText={jsonError ?? ""} | ||||
|         /> | ||||
|  | ||||
|         {parsedHeaders && parsedHeaders.length > 0 && ( | ||||
|           <Paper | ||||
|             variant="outlined" | ||||
|             sx={{ | ||||
|               marginTop: 1, | ||||
|               border: `1px solid ${theme.palette.divider}`, | ||||
|               borderRadius: 1, | ||||
|               overflow: "hidden", | ||||
|               padding: 0, | ||||
|             }} | ||||
|           > | ||||
|             <TableContainer sx={{ maxHeight: 200 }}> | ||||
|               <Table size="small" stickyHeader> | ||||
|                 <TableHead> | ||||
|                   <TableRow | ||||
|                     sx={{ | ||||
|                       backgroundColor: theme.palette.background.paper, | ||||
|                     }} | ||||
|                   > | ||||
|                     <TableCell sx={{ fontWeight: "bold" }}>Header</TableCell> | ||||
|                     <TableCell sx={{ fontWeight: "bold" }}>Value</TableCell> | ||||
|                   </TableRow> | ||||
|                 </TableHead> | ||||
|                 <TableBody> | ||||
|                   {parsedHeaders.map(([key, val]) => ( | ||||
|                     <TableRow | ||||
|                       key={key} | ||||
|                       hover | ||||
|                       sx={{ | ||||
|                         "&:nth-of-type(odd)": { | ||||
|                           backgroundColor: | ||||
|                             theme.palette.mode === "light" | ||||
|                               ? "rgba(0, 0, 0, 0.02)" | ||||
|                               : "rgba(255, 255, 255, 0.02)", | ||||
|                         }, | ||||
|                       }} | ||||
|                     > | ||||
|                       <TableCell sx={{ fontWeight: 500 }}>{key}</TableCell> | ||||
|                       <TableCell>{val}</TableCell> | ||||
|                     </TableRow> | ||||
|                   ))} | ||||
|                 </TableBody> | ||||
|               </Table> | ||||
|             </TableContainer> | ||||
|           </Paper> | ||||
|         )} | ||||
|       </AccordionDetails> | ||||
|     </Accordion> | ||||
|   ); | ||||
| }; | ||||
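The useEffect above pre-populates the field from a job_options query parameter. A sketch of building such a URL for the Custom Headers case; the route and header values are illustrative.

// Produces a link that pre-fills the "Custom Headers" field via the
// "job_options" query parameter read by ExpandedTableInput.
const jobOptions = {
  custom_headers: { "User-Agent": "CustomAgent", Accept: "*/*" },
};

const prefillUrl = `/?job_options=${encodeURIComponent(
  JSON.stringify(jobOptions)
)}`;
// e.g. "/?job_options=%7B%22custom_headers%22%3A%7B...%7D%7D"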
							
								
								
									
src/components/common/expanded-table-input/index.ts (1 line, Normal file)
									
								
							| @@ -0,0 +1 @@ | ||||
| export * from "./expanded-table-input"; | ||||
							
								
								
									
src/components/common/job-download-dialog/index.ts (1 line, Normal file)
									
								
							| @@ -0,0 +1 @@ | ||||
| export * from "./job-download-dialog"; | ||||
| @@ -0,0 +1,95 @@ | ||||
| import { | ||||
|   Dialog, | ||||
|   DialogTitle, | ||||
|   DialogContent, | ||||
|   DialogActions, | ||||
|   Button, | ||||
|   FormControl, | ||||
|   RadioGroup, | ||||
|   FormControlLabel, | ||||
|   Radio, | ||||
|   FormLabel, | ||||
|   Typography, | ||||
|   Box, | ||||
| } from "@mui/material"; | ||||
| import { useState } from "react"; | ||||
|  | ||||
| export type JobDownloadDialogProps = { | ||||
|   open: boolean; | ||||
|   onClose: () => void; | ||||
|   ids: string[]; | ||||
| }; | ||||
|  | ||||
| export const JobDownloadDialog = ({ | ||||
|   open, | ||||
|   onClose, | ||||
|   ids, | ||||
| }: JobDownloadDialogProps) => { | ||||
|   const [jobFormat, setJobFormat] = useState<string>("csv"); | ||||
|   const handleDownload = async () => { | ||||
|     const response = await fetch("/api/download", { | ||||
|       method: "POST", | ||||
|       headers: { "Content-Type": "application/json" }, | ||||
|       body: JSON.stringify({ data: { ids: ids, job_format: jobFormat } }), | ||||
|     }); | ||||
|  | ||||
|     if (response.ok) { | ||||
|       const blob = await response.blob(); | ||||
|       const url = window.URL.createObjectURL(blob); | ||||
|       const a = document.createElement("a"); | ||||
|       a.style.display = "none"; | ||||
|       a.href = url; | ||||
|       a.download = `job_${ids[0]}.${jobFormat}`; | ||||
|       document.body.appendChild(a); | ||||
|       a.click(); | ||||
|       window.URL.revokeObjectURL(url); | ||||
|       document.body.removeChild(a); | ||||
|     } else { | ||||
|       console.error("Failed to download the file."); | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|   return ( | ||||
|     <Dialog open={open} onClose={onClose}> | ||||
|       <DialogTitle>Download Job</DialogTitle> | ||||
|       <DialogContent> | ||||
|         <FormControl> | ||||
|           <Typography variant="body1"> | ||||
|             You are about to download {ids.length} job(s). Please select the | ||||
|             format that you would like to download them in. | ||||
|           </Typography> | ||||
|           <br /> | ||||
|           <Box | ||||
|             sx={{ | ||||
|               display: "flex", | ||||
|               flexDirection: "column", | ||||
|               backgroundColor: "background.paper", | ||||
|               padding: 2, | ||||
|               border: "1px solid", | ||||
|             }} | ||||
|           > | ||||
|             <FormLabel>Format</FormLabel> | ||||
|             <hr style={{ width: "100%", margin: "10px 0" }} /> | ||||
|             <RadioGroup | ||||
|               aria-labelledby="job-download-format-radio-buttons" | ||||
|               name="job-download-format-radio-buttons" | ||||
|               value={jobFormat} | ||||
|               onChange={(e) => setJobFormat(e.target.value)} | ||||
|             > | ||||
|               <FormControlLabel value="csv" control={<Radio />} label="CSV" /> | ||||
|               <FormControlLabel | ||||
|                 value="md" | ||||
|                 control={<Radio />} | ||||
|                 label="Markdown" | ||||
|               /> | ||||
|             </RadioGroup> | ||||
|           </Box> | ||||
|           <br /> | ||||
|           <Button onClick={handleDownload} size="small"> | ||||
|             Download | ||||
|           </Button> | ||||
|         </FormControl> | ||||
|       </DialogContent> | ||||
|     </Dialog> | ||||
|   ); | ||||
| }; | ||||
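JobDownloadDialog is fully controlled through open, onClose, and ids, so a parent only has to track the open flag and the currently selected job ids. A minimal wiring sketch, as an editor's illustration rather than part of this diff; the JobsToolbar wrapper and the "@/components" import alias are assumptions.

import { useState } from "react";
import { Button } from "@mui/material";
import { JobDownloadDialog } from "@/components/common/job-download-dialog";

// Hypothetical toolbar: opens the dialog for whatever jobs are currently selected.
export const JobsToolbar = ({ selectedIds }: { selectedIds: string[] }) => {
  const [downloadOpen, setDownloadOpen] = useState(false);

  return (
    <>
      <Button
        size="small"
        disabled={selectedIds.length === 0}
        onClick={() => setDownloadOpen(true)}
      >
        Download selected
      </Button>
      <JobDownloadDialog
        open={downloadOpen}
        onClose={() => setDownloadOpen(false)}
        ids={selectedIds}
      />
    </>
  );
};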
							
								
								
									
40  src/components/common/media-viewer/audio/audio-viewer.tsx  Normal file
| @@ -0,0 +1,40 @@ | ||||
|  | ||||
| import { Box, Typography } from "@mui/material"; | ||||
|  | ||||
| interface AudioViewerProps { | ||||
|   mediaUrl: string; | ||||
|   selectedMedia: string; | ||||
|   onError: () => void; | ||||
| } | ||||
|  | ||||
| export const AudioViewer = ({ | ||||
|   mediaUrl, | ||||
|   selectedMedia, | ||||
|   onError, | ||||
| }: AudioViewerProps) => { | ||||
|   return ( | ||||
|     <Box | ||||
|       sx={{ | ||||
|         display: "flex", | ||||
|         justifyContent: "center", | ||||
|         alignItems: "center", | ||||
|         flexDirection: "column", | ||||
|         height: "100%", | ||||
|         gap: 2, | ||||
|       }} | ||||
|     > | ||||
|       <Typography variant="h6">{selectedMedia}</Typography> | ||||
|       <audio | ||||
|         controls | ||||
|         onError={onError} | ||||
|         style={{ | ||||
|           width: "80%", | ||||
|           maxWidth: "500px", | ||||
|         }} | ||||
|       > | ||||
|         <source src={mediaUrl} type="audio/mpeg" /> | ||||
|         Your browser does not support the audio element. | ||||
|       </audio> | ||||
|     </Box> | ||||
|   ); | ||||
| }; | ||||
							
								
								
									
1  src/components/common/media-viewer/audio/index.ts  Normal file
| @@ -0,0 +1 @@ | ||||
| export * from "./audio-viewer"; | ||||
							
								
								
									
36  src/components/common/media-viewer/image/image-viewer.tsx  Normal file
| @@ -0,0 +1,36 @@ | ||||
| import { Box, useTheme } from "@mui/material"; | ||||
|  | ||||
| export const ImageViewer = ({ | ||||
|   mediaUrl, | ||||
|   selectedMedia, | ||||
| }: { | ||||
|   mediaUrl: string; | ||||
|   selectedMedia: string; | ||||
| }) => { | ||||
|   const theme = useTheme(); | ||||
|   return ( | ||||
|     <Box | ||||
|       sx={{ | ||||
|         display: "flex", | ||||
|         justifyContent: "center", | ||||
|         alignItems: "center", | ||||
|         height: "100%", | ||||
|         width: "100%", | ||||
|         overflow: "hidden", | ||||
|         position: "relative", | ||||
|       }} | ||||
|     > | ||||
|       <img | ||||
|         src={mediaUrl} | ||||
|         alt={selectedMedia} | ||||
|         style={{ | ||||
|           maxHeight: "100%", | ||||
|           maxWidth: "100%", | ||||
|           objectFit: "contain", | ||||
|           borderRadius: "4px", | ||||
|           boxShadow: theme.shadows[4], | ||||
|         }} | ||||
|       /> | ||||
|     </Box> | ||||
|   ); | ||||
| }; | ||||
							
								
								
									
1  src/components/common/media-viewer/image/index.ts  Normal file
| @@ -0,0 +1 @@ | ||||
| export * from "./image-viewer"; | ||||
							
								
								
									
1  src/components/common/media-viewer/index.ts  Normal file
| @@ -0,0 +1 @@ | ||||
| export * from "./media-viewer"; | ||||
							
								
								
									
75  src/components/common/media-viewer/media-viewer.tsx  Normal file
| @@ -0,0 +1,75 @@ | ||||
| import { Box, Typography } from "@mui/material"; | ||||
| import { ImageViewer } from "./image"; | ||||
| import { VideoViewer } from "./video"; | ||||
| import { AudioViewer } from "./audio"; | ||||
| import { PDFViewer } from "./pdf-viewer"; | ||||
|  | ||||
| interface MediaViewerProps { | ||||
|   selectedMedia: string; | ||||
|   activeTab: string; | ||||
|   getMediaUrl: (fileName: string) => string; | ||||
|   onError: (error: string) => void; | ||||
| } | ||||
|  | ||||
| export const MediaViewer = ({ | ||||
|   selectedMedia, | ||||
|   activeTab, | ||||
|   getMediaUrl, | ||||
|   onError, | ||||
| }: MediaViewerProps) => { | ||||
|   if (!selectedMedia) { | ||||
|     return ( | ||||
|       <Box | ||||
|         sx={{ | ||||
|           display: "flex", | ||||
|           justifyContent: "center", | ||||
|           alignItems: "center", | ||||
|           height: "100%", | ||||
|         }} | ||||
|       > | ||||
|         <Typography variant="body1" color="textSecondary"> | ||||
|           Select a file to view | ||||
|         </Typography> | ||||
|       </Box> | ||||
|     ); | ||||
|   } | ||||
|  | ||||
|   const mediaUrl = getMediaUrl(selectedMedia); | ||||
|  | ||||
|   switch (activeTab) { | ||||
|     case "images": | ||||
|       return <ImageViewer mediaUrl={mediaUrl} selectedMedia={selectedMedia} />; | ||||
|     case "videos": | ||||
|       return ( | ||||
|         <VideoViewer | ||||
|           mediaUrl={mediaUrl} | ||||
|           onError={() => onError("Error loading video")} | ||||
|         /> | ||||
|       ); | ||||
|     case "audio": | ||||
|       return ( | ||||
|         <AudioViewer | ||||
|           mediaUrl={mediaUrl} | ||||
|           selectedMedia={selectedMedia} | ||||
|           onError={() => onError("Error loading audio")} | ||||
|         /> | ||||
|       ); | ||||
|     case "pdfs": | ||||
|       return <PDFViewer mediaUrl={mediaUrl} selectedMedia={selectedMedia} />; | ||||
|     default: | ||||
|       return ( | ||||
|         <Box | ||||
|           sx={{ | ||||
|             display: "flex", | ||||
|             justifyContent: "center", | ||||
|             alignItems: "center", | ||||
|             height: "100%", | ||||
|           }} | ||||
|         > | ||||
|           <Typography variant="body1"> | ||||
|             {selectedMedia} - Download this file to view it | ||||
|           </Typography> | ||||
|         </Box> | ||||
|       ); | ||||
|   } | ||||
| }; | ||||
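MediaViewer is driven entirely by its props: activeTab ("images", "videos", "audio", or "pdfs") selects the specific viewer, getMediaUrl maps a file name to a fetchable URL, and onError reports load failures. A minimal wiring sketch, as an editor's illustration rather than part of this diff; the MediaBrowser wrapper and the /api/media route are assumptions.

import { useState } from "react";
import { Typography } from "@mui/material";
import { MediaViewer } from "@/components/common/media-viewer";

// Hypothetical container: tab and file selection come from the parent,
// and the /api/media URL builder below does not appear in this diff.
export const MediaBrowser = ({
  activeTab,
  selectedMedia,
}: {
  activeTab: string;
  selectedMedia: string;
}) => {
  const [error, setError] = useState<string | null>(null);

  return (
    <>
      {error && <Typography color="error">{error}</Typography>}
      <MediaViewer
        selectedMedia={selectedMedia}
        activeTab={activeTab}
        getMediaUrl={(fileName) => `/api/media?file=${encodeURIComponent(fileName)}`}
        onError={(message) => setError(message)}
      />
    </>
  );
};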
Some files were not shown because too many files have changed in this diff.