Mirror of https://github.com/jaypyles/Scraperr.git (synced 2025-10-30 22:17:21 +00:00)

Compare commits: 53 commits
| SHA1 |
|---|
| 031572325f |
| 48d3bf9214 |
| e07abcd089 |
| 8a933b88a7 |
| 863dbcd044 |
| de40181a6f |
| 8703f706a1 |
| b40d378bbf |
| 8123e1f149 |
| 8cd30599fa |
| a58212b214 |
| a6ab6ec71d |
| c5c9427af4 |
| e8d80c1a77 |
| ee8047ac78 |
| e74c4f392c |
| 6b484952a3 |
| 2283808605 |
| ee5ada70f7 |
| 56cc457e6e |
| 21a38181de |
| 3063bc0d53 |
| f42e7ed531 |
| c197f2becd |
| a534129702 |
| 455ed049c9 |
| de4ccfbf3a |
| 3475d66995 |
| 186b4a0231 |
| 0af0ebf5b5 |
| ef35db00d7 |
| d65e600ec3 |
| 6fe145f649 |
| 563ca2245e |
| d54fdbd405 |
| 7169755cd2 |
| 15b56b5704 |
| bf6b740005 |
| c339e75e06 |
| b6ed40e6cf |
| 3085f9d31a |
| 7d80ff5c7f |
| 3a0762f1e3 |
| dc4d219205 |
| b3bf780eda |
| 1dfd3ca92a |
| fe51140a0e |
| dd6cec6679 |
| 2339ba1b77 |
| 1cdffd9006 |
| 266b91ed0e |
| 7dfa3ccfe9 |
| 86a1d32990 |
**.github/ISSUE_TEMPLATE/bug_report.md** (new file, vendored, 32 lines)
							| @@ -0,0 +1,32 @@ | ||||
| --- | ||||
| name: Bug report | ||||
| about: 'Bug reporting ' | ||||
| title: '' | ||||
| labels: '' | ||||
| assignees: '' | ||||
|  | ||||
| --- | ||||
|  | ||||
| **Describe the bug** | ||||
| A clear and concise description of what the bug is. | ||||
|  | ||||
| **To Reproduce** | ||||
| Steps to reproduce the behavior: | ||||
| 1. Go to '...' | ||||
| 2. Click on '....' | ||||
| 3. Scroll down to '....' | ||||
| 4. See error | ||||
|  | ||||
| **Expected behavior** | ||||
| A clear and concise description of what you expected to happen. | ||||
|  | ||||
| **Screenshots** | ||||
| If applicable, add screenshots to help explain your problem. | ||||
|  | ||||
| **Desktop (please complete the following information):** | ||||
|  - OS: [e.g. iOS] | ||||
|  - Browser [e.g. chrome, safari] | ||||
|  - Version [e.g. 22] | ||||
|  | ||||
| **Additional context** | ||||
| Add any other context about the problem here. | ||||
**.github/actions/run-cypress-tests/action.yaml** (new file, vendored, 58 lines)
							| @@ -0,0 +1,58 @@ | ||||
| name: Run Cypress Tests | ||||
|  | ||||
| description: Run Cypress tests | ||||
|  | ||||
| runs: | ||||
|   using: "composite" | ||||
|   steps: | ||||
|     - name: Checkout code | ||||
|       uses: actions/checkout@v4 | ||||
|  | ||||
|     - name: Setup Node | ||||
|       uses: actions/setup-node@v4 | ||||
|       with: | ||||
|         node-version: 22 | ||||
|  | ||||
|     - name: Setup Docker project | ||||
|       shell: bash | ||||
|       run: make build up-dev | ||||
|  | ||||
|     - name: Install dependencies | ||||
|       shell: bash | ||||
|       run: npm install | ||||
|  | ||||
|     - name: Wait for frontend to be ready | ||||
|       shell: bash | ||||
|       run: | | ||||
|         for i in {1..10}; do | ||||
|           curl -s http://127.0.0.1:80 && echo "Frontend is ready" && exit 0 | ||||
|           echo "Waiting for frontend to be ready... attempt $i" | ||||
|           sleep 1 | ||||
|         done | ||||
|         echo "Frontend failed to be ready after 10 retries" | ||||
|         exit 1 | ||||
|  | ||||
|     - name: Wait for backend to be ready | ||||
|       shell: bash | ||||
|       run: | | ||||
|         for i in {1..10}; do | ||||
|           curl -s http://127.0.0.1:8000 && echo "Backend is ready" && exit 0 | ||||
|           echo "Waiting for backend to be ready... attempt $i" | ||||
|           sleep 1 | ||||
|         done | ||||
|         echo "Backend failed to be ready after 10 retries" | ||||
|         exit 1 | ||||
|  | ||||
|     - name: Show backend logs on failure | ||||
|       if: failure() | ||||
|       shell: bash | ||||
|       run: | | ||||
|         echo "== Docker Containers ==" | ||||
|         docker ps -a | ||||
|         echo "== Backend Logs ==" | ||||
|         docker logs $(docker ps -a --filter "name=scraperr_api" --format "{{.Names}}") || echo "Could not get backend logs" | ||||
|  | ||||
|     - name: Run Cypress tests | ||||
|       shell: bash | ||||
|       run: npm run cy:run | ||||
|  | ||||
**.github/workflows/docker-image.yml** (vendored, 27 lines changed)
							| @@ -1,10 +1,14 @@ | ||||
| name: ci | ||||
| name: Docker Image | ||||
| on: | ||||
|   push: | ||||
|     branches: ["master"] | ||||
|   workflow_run: | ||||
|     workflows: ["Unit Tests"] | ||||
|     types: | ||||
|       - completed | ||||
|   workflow_dispatch: | ||||
|  | ||||
| jobs: | ||||
|   build: | ||||
|     if: ${{ github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.head_branch == 'master' }} | ||||
|     runs-on: ubuntu-latest | ||||
|     steps: | ||||
|       - name: Checkout | ||||
| @@ -34,3 +38,20 @@ jobs: | ||||
|           file: ./docker/api/Dockerfile | ||||
|           push: true | ||||
|           tags: ${{ secrets.DOCKERHUB_USERNAME }}/scraperr_api:latest | ||||
|  | ||||
|   success-message: | ||||
|     runs-on: ubuntu-latest | ||||
|     needs: | ||||
|       - build | ||||
|     steps: | ||||
|       - name: Send Discord Message | ||||
|         uses: jaypyles/discord-webhook-action@v1.0.0 | ||||
|         with: | ||||
|           webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }} | ||||
|           content: "Scraperr Successfully Built Docker Images" | ||||
|           username: "Scraperr CI" | ||||
|           embed-title: "✅ Deployment Status" | ||||
|           embed-description: "Scraperr successfully built docker images." | ||||
|           embed-color: 3066993 # Green | ||||
|           embed-footer-text: "Scraperr CI" | ||||
|           embed-timestamp: ${{ github.event.head_commit.timestamp }} | ||||
|   | ||||
**.github/workflows/unit-tests.yml** (new file, vendored, 57 lines)
							| @@ -0,0 +1,57 @@ | ||||
| name: Unit Tests | ||||
|  | ||||
| on: | ||||
|   push: | ||||
|     branches: | ||||
|       - master | ||||
|  | ||||
|   pull_request: | ||||
|     types: [opened, synchronize, reopened] | ||||
|  | ||||
|   workflow_dispatch: | ||||
|  | ||||
| jobs: | ||||
|   unit-tests: | ||||
|     runs-on: ubuntu-latest | ||||
|     steps: | ||||
|       - name: Checkout | ||||
|         uses: actions/checkout@v4 | ||||
|  | ||||
|       - name: Set env | ||||
|         run: echo "ENV=test" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: Install pdm | ||||
|         run: pip install pdm | ||||
|  | ||||
|       - name: Install project dependencies | ||||
|         run: pdm install | ||||
|  | ||||
|       - name: Install playwright | ||||
|         run: pdm run playwright install | ||||
|  | ||||
|       - name: Run tests | ||||
|         run: PYTHONPATH=. pdm run pytest api/backend/tests | ||||
|  | ||||
|   cypress-tests: | ||||
|     runs-on: ubuntu-latest | ||||
|     steps: | ||||
|       - uses: actions/checkout@v4 | ||||
|       - uses: ./.github/actions/run-cypress-tests | ||||
|  | ||||
|   success-message: | ||||
|     runs-on: ubuntu-latest | ||||
|     needs: | ||||
|       - unit-tests | ||||
|       - cypress-tests | ||||
|     steps: | ||||
|       - name: Send Discord Message | ||||
|         uses: jaypyles/discord-webhook-action@v1.0.0 | ||||
|         with: | ||||
|           webhook-url: ${{ secrets.DISCORD_WEBHOOK_URL }} | ||||
|           content: "Scraperr Successfully Passed Tests" | ||||
|           username: "Scraperr CI" | ||||
|           embed-title: "✅ Deployment Status" | ||||
|           embed-description: "Scraperr successfully passed all tests." | ||||
|           embed-color: 3066993 # Green | ||||
|           embed-footer-text: "Scraperr CI" | ||||
|           embed-timestamp: ${{ github.event.head_commit.timestamp }} | ||||
**.gitignore** (vendored, 2 lines changed)
							| @@ -187,3 +187,5 @@ cython_debug/ | ||||
| postgres_data | ||||
| .vscode | ||||
| ollama | ||||
| data | ||||
| media | ||||
**.python-version** (new file, 1 line)
							| @@ -0,0 +1 @@ | ||||
| 3.10.12 | ||||
**FUNDING.yml** (new file, 1 line)
							| @@ -0,0 +1 @@ | ||||
| custom: ["https://www.buymeacoffee.com/jaypyles"] | ||||
**README.md** (191 lines changed)
							| @@ -1,176 +1,59 @@ | ||||
|  | ||||
|  | ||||
| <div align="center"> | ||||
|   <img src="https://img.shields.io/badge/MongoDB-%234ea94b.svg?style=for-the-badge&logo=mongodb&logoColor=white" alt="MongoDB" /> | ||||
|   <img src="https://img.shields.io/badge/FastAPI-005571?style=for-the-badge&logo=fastapi" alt="FastAPI" /> | ||||
|   <img src="https://img.shields.io/badge/Next-black?style=for-the-badge&logo=next.js&logoColor=white" alt="Next JS" /> | ||||
|   <img src="https://img.shields.io/badge/tailwindcss-%2338B2AC.svg?style=for-the-badge&logo=tailwind-css&logoColor=white" alt="TailwindCSS" /> | ||||
|   <img src="https://github.com/jaypyles/www-scrape/blob/master/docs/logo_picture.png" alt="Scraperr Logo" width="250px"> | ||||
|    | ||||
|   **A powerful self-hosted web scraping solution** | ||||
|    | ||||
|   <div> | ||||
|     <img src="https://img.shields.io/badge/MongoDB-%234ea94b.svg?style=for-the-badge&logo=mongodb&logoColor=white" alt="MongoDB" /> | ||||
|     <img src="https://img.shields.io/badge/FastAPI-005571?style=for-the-badge&logo=fastapi" alt="FastAPI" /> | ||||
|     <img src="https://img.shields.io/badge/Next-black?style=for-the-badge&logo=next.js&logoColor=white" alt="Next JS" /> | ||||
|     <img src="https://img.shields.io/badge/tailwindcss-%2338B2AC.svg?style=for-the-badge&logo=tailwind-css&logoColor=white" alt="TailwindCSS" /> | ||||
|   </div> | ||||
| </div> | ||||
|  | ||||
| # Summary | ||||
| ## 📋 Overview | ||||
|  | ||||
| Scraperr is a self-hosted web application that allows users to scrape data from web pages by specifying elements via XPath. Users can submit URLs and the corresponding elements to be scraped, and the results will be displayed in a table. | ||||
| Scraperr enables you to extract data from websites with precision using XPath selectors. This self-hosted application provides a clean interface to manage scraping jobs, view results, and export data. | ||||
|  | ||||
| From the table, users can download an excel sheet of the job's results, along with an option to rerun the job. | ||||
| > 📚 **[Check out the docs](https://scraperr-docs.pages.dev)** for a comprehensive quickstart guide and detailed information. | ||||
|  | ||||
| ## Features | ||||
| <div align="center"> | ||||
|   <img src="https://github.com/jaypyles/www-scrape/blob/master/docs/main_page.png" alt="Scraperr Main Interface" width="800px"> | ||||
| </div> | ||||
|  | ||||
| ### Submitting URLs for Scraping | ||||
| ## ✨ Key Features | ||||
|  | ||||
| - Submit/Queue URLs for web scraping | ||||
| - Add and manage elements to scrape using XPath | ||||
| - Scrape all pages within same domain | ||||
| - Add custom json headers to send in requests to URLs | ||||
| - Display results of scraped data | ||||
| - **XPath-Based Extraction**: Precisely target page elements | ||||
| - **Queue Management**: Submit and manage multiple scraping jobs | ||||
| - **Domain Spidering**: Option to scrape all pages within the same domain | ||||
| - **Custom Headers**: Add JSON headers to your scraping requests | ||||
| - **Media Downloads**: Automatically download images, videos, and other media | ||||
| - **Results Visualization**: View scraped data in a structured table format | ||||
| - **Data Export**: Export your results in various formats | ||||
| - **Notification Channels**: Send completion notifications through various channels | ||||
|  | ||||
|  | ||||
| ## 🚀 Getting Started | ||||
|  | ||||
| ### Managing Previous Jobs | ||||
|  | ||||
| - Download csv containing results | ||||
| - Rerun jobs | ||||
| - View status of queued jobs | ||||
| - Favorite and view favorited jobs | ||||
|  | ||||
|  | ||||
|  | ||||
| ### User Management | ||||
|  | ||||
| - User login/signup to organize jobs (optional) | ||||
|  | ||||
|  | ||||
|  | ||||
| ### Log Viewing | ||||
|  | ||||
| - View app logs inside of web ui | ||||
|  | ||||
|  | ||||
|  | ||||
| ### Statistics View | ||||
|  | ||||
| - View a small statistics view of jobs ran | ||||
|  | ||||
|  | ||||
|  | ||||
| ### AI Integration | ||||
|  | ||||
| - Include the results of a selected job into the context of a conversation | ||||
| - Currently supports: | ||||
|  | ||||
| 1. Ollama | ||||
| 2. OpenAI | ||||
|  | ||||
|  | ||||
|  | ||||
| ## Installation | ||||
|  | ||||
| 1. Clone the repository: | ||||
|  | ||||
|    ```sh | ||||
|    git clone https://github.com/jaypyles/scraperr.git | ||||
|  | ||||
|    ``` | ||||
|  | ||||
| 2. Set environmental variables and labels in `docker-compose.yml`. | ||||
|  | ||||
| ```yaml | ||||
| scraperr: | ||||
|     labels: | ||||
|       - "traefik.enable=true" | ||||
|       - "traefik.http.routers.scraperr.rule=Host(`localhost`)" # change this to your domain, if not running on localhost | ||||
|       - "traefik.http.routers.scraperr.entrypoints=web" # websecure if using https | ||||
|       - "traefik.http.services.scraperr.loadbalancer.server.port=3000" | ||||
|  | ||||
| scraperr_api: | ||||
|  environment: | ||||
|       - LOG_LEVEL=INFO | ||||
|       - MONGODB_URI=mongodb://root:example@webscrape-mongo:27017 # used to access MongoDB | ||||
|       - SECRET_KEY=your_secret_key # used to encode authentication tokens (can be a random string) | ||||
|       - ALGORITHM=HS256 # authentication encoding algorithm | ||||
|       - ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token expire minutes | ||||
|   labels: | ||||
|         - "traefik.enable=true" | ||||
|         - "traefik.http.routers.scraperr_api.rule=Host(`localhost`) && PathPrefix(`/api`)" # change this to your domain, if not running on localhost | ||||
|         - "traefik.http.routers.scraperr_api.entrypoints=web" # websecure if using https | ||||
|         - "traefik.http.middlewares.api-stripprefix.stripprefix.prefixes=/api" | ||||
|         - "traefik.http.routers.scraperr_api.middlewares=api-stripprefix" | ||||
|         - "traefik.http.services.scraperr_api.loadbalancer.server.port=8000" | ||||
|  | ||||
| mongo: | ||||
|     environment: | ||||
|       MONGO_INITDB_ROOT_USERNAME: root | ||||
|       MONGO_INITDB_ROOT_PASSWORD: example | ||||
| ``` | ||||
|  | ||||
| Don't want to use `traefik`? This configuration works with other reverse proxies, as long as the API is proxied to `/api` of the frontend container. Scraperr currently cannot run without a reverse proxy, due to limitations around runtime client-side environment variables in `next.js`. | ||||
|  | ||||
| 3. Deploy | ||||
|  | ||||
| ```sh | ||||
| ```bash | ||||
| make up | ||||
| ``` | ||||
|  | ||||
| The app provides its own `traefik` configuration to use independently, but can easily be reverse-proxied by any other app, or your own reverse-proxy. | ||||
| ## ⚖️ Legal and Ethical Guidelines | ||||
|  | ||||
| ## Usage | ||||
| When using Scraperr, please remember to: | ||||
|  | ||||
| 1. Open the application in your browser at `http://localhost`. | ||||
| 2. Enter the URL you want to scrape in the URL field. | ||||
| 3. Add elements to scrape by specifying a name and the corresponding XPath. | ||||
| 4. Click the "Submit" button to queue URL to be scraped. | ||||
| 5. View queue in the "Previous Jobs" section. | ||||
| 1. **Respect `robots.txt`**: Always check a website's `robots.txt` file to verify which pages permit scraping | ||||
| 2. **Terms of Service**: Adhere to each website's Terms of Service regarding data extraction | ||||
| 3. **Rate Limiting**: Implement reasonable delays between requests to avoid overloading servers | ||||
|  | ||||
| ## API Endpoints | ||||
| > **Disclaimer**: Scraperr is intended for use only on websites that explicitly permit scraping. The creator accepts no responsibility for misuse of this tool. | ||||
|  | ||||
| Use this service as an API for your own projects. Because it is built with FastAPI, interactive API docs are available at `/docs`. | ||||
|  | ||||
|  | ||||
|  | ||||
| ## AI | ||||
|  | ||||
| Currently supports either an Ollama instance or OpenAI's ChatGPT, using your own API key. Setting up is easy as either setting the Ollama url or the OpenAI API key in the API's environmental variables in the `docker-compose.yml` file: | ||||
|  | ||||
| ```yaml | ||||
| scraperr_api: | ||||
|   environment: | ||||
|     - OLLAMA_URL=http://ollama:11434 | ||||
|     - OLLAMA_MODEL=llama3.1 | ||||
|     # or | ||||
|     - OPENAI_KEY=<your_key> | ||||
|     - OPENAI_MODEL=gpt3.5-turbo | ||||
| ``` | ||||
|  | ||||
| Model names are taken from the documentation of their respective providers. | ||||
|  | ||||
| ## Troubleshooting | ||||
|  | ||||
| Q: When running Scraperr, I'm met with "404 Page not found".   | ||||
| A: This is probably a MongoDB issue related to running Scraperr in a VM. You should see something like this in `make logs`: | ||||
|  | ||||
| ``` | ||||
| WARNING: MongoDB 5.0+ requires a CPU with AVX support, and your current system does not appear to have that! | ||||
| ``` | ||||
|  | ||||
| To resolve this, set the VM's CPU type to `host`. In Proxmox, this is under VM settings > Processor. [Related issue](https://github.com/jaypyles/Scraperr/issues/9). | ||||
|  | ||||
| ## Legal and Ethical Considerations | ||||
|  | ||||
| When using Scraperr, please ensure that you: | ||||
|  | ||||
| 1. **Check Robots.txt**: Verify allowed pages by reviewing the `robots.txt` file of the target website. | ||||
| 2. **Compliance**: Always comply with the website's Terms of Service (ToS) regarding web scraping. | ||||
|  | ||||
| **Disclaimer**: This tool is intended for use only on websites that permit scraping. The author is not responsible for any misuse of this tool. | ||||
|  | ||||
| ## License | ||||
| ## 📄 License | ||||
|  | ||||
| This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details. | ||||
|  | ||||
| ### Contributions | ||||
| ## 👏 Contributions | ||||
|  | ||||
| Development made easy by developing from [webapp template](https://github.com/jaypyles/webapp-template). View documentation for extra information. | ||||
| Development made easier with the [webapp template](https://github.com/jaypyles/webapp-template). | ||||
|  | ||||
| Start development server: | ||||
|  | ||||
| `make deps build up-dev` | ||||
| To get started, simply run `make build up-dev`. | ||||
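For readers unfamiliar with XPath-driven scraping, here is a minimal sketch of the idea the new README describes. This is not Scraperr's code; the URL and selector are placeholders.

```python
# Illustrative only: fetch a page and pull text out with an XPath selector,
# the same kind of selector a Scraperr job takes.
import requests
from lxml import html

page = html.fromstring(requests.get("https://example.com").text)
titles = page.xpath("//h1/text()")  # placeholder selector
print(titles)
```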
| @@ -1,3 +0,0 @@ | ||||
| github_repo: https://github.com/jaypyles/webapp-template.git | ||||
| deploy_path: /home/admin/site-test6 | ||||
| deploy_command: make pull up-prd | ||||
| @@ -1,10 +0,0 @@ | ||||
| - name: Deploy site | ||||
|   hosts: all | ||||
|   become: true | ||||
|   vars_files: | ||||
|     - ./config.yaml | ||||
|   tasks: | ||||
|     - name: Deploy | ||||
|       command: "{{deploy_command}}" | ||||
|       args: | ||||
|         chdir: "{{deploy_path}}" | ||||
| @@ -1,6 +0,0 @@ | ||||
| all: | ||||
|   hosts: | ||||
|     host1: | ||||
|       ansible_host: 192.168.0.1 | ||||
|       ansible_user: admin | ||||
|       ansible_ssh_private_key_file: private_key.pem | ||||
| @@ -1,54 +0,0 @@ | ||||
| - name: Install Docker and run make pull up | ||||
|   hosts: all | ||||
|   become: true | ||||
|   vars_files: | ||||
|     - ./config.yaml | ||||
|   tasks: | ||||
|     - name: Update apt cache | ||||
|       apt: | ||||
|         update_cache: yes | ||||
|     - name: Install required packages | ||||
|       apt: | ||||
|         name: | ||||
|           - apt-transport-https | ||||
|           - ca-certificates | ||||
|           - curl | ||||
|           - gnupg-agent | ||||
|           - software-properties-common | ||||
|           - rsync | ||||
|           - make | ||||
|         state: present | ||||
|     - name: Add Docker’s official GPG key | ||||
|       apt_key: | ||||
|         url: https://download.docker.com/linux/ubuntu/gpg | ||||
|         state: present | ||||
|     - name: Add Docker APT repository | ||||
|       apt_repository: | ||||
|         repo: deb [arch=amd64] https://download.docker.com/linux/ubuntu focal stable | ||||
|         state: present | ||||
|     - name: Update apt cache again after adding Docker repo | ||||
|       apt: | ||||
|         update_cache: yes | ||||
|     - name: Install Docker | ||||
|       apt: | ||||
|         name: docker-ce | ||||
|         state: present | ||||
|     - name: Start and enable Docker service | ||||
|       systemd: | ||||
|         name: docker | ||||
|         enabled: yes | ||||
|         state: started | ||||
|     - name: Install Docker Compose | ||||
|       apt: | ||||
|         name: docker-compose-plugin | ||||
|         state: present | ||||
|     - name: Verify Docker is installed | ||||
|       command: docker --version | ||||
|       register: docker_version | ||||
|     - name: Display Docker version | ||||
|       debug: | ||||
|         msg: "Docker version: {{ docker_version.stdout }}" | ||||
|     - name: Clone repo | ||||
|       ansible.builtin.git: | ||||
|         repo: "{{github_repo}}" | ||||
|         dest: "{{deploy_path}}" | ||||
| @@ -67,4 +67,4 @@ async def ai(c: AI): | ||||
|  | ||||
| @ai_router.get("/ai/check") | ||||
| async def check(): | ||||
|     return JSONResponse(content=bool(open_ai_key or llama_model)) | ||||
|     return JSONResponse(content={"ai_enabled": bool(open_ai_key or llama_model)}) | ||||
|   | ||||
| @@ -1,9 +1,13 @@ | ||||
| # STL | ||||
| import os | ||||
| import logging | ||||
| import apscheduler  # type: ignore | ||||
|  | ||||
| # PDM | ||||
| from fastapi import FastAPI | ||||
| import apscheduler.schedulers | ||||
| import apscheduler.schedulers.background | ||||
| from fastapi import FastAPI, Request, status | ||||
| from fastapi.exceptions import RequestValidationError | ||||
| from fastapi.middleware.cors import CORSMiddleware | ||||
|  | ||||
| # LOCAL | ||||
| @@ -11,8 +15,12 @@ from api.backend.ai.ai_router import ai_router | ||||
| from api.backend.auth.auth_router import auth_router | ||||
| from api.backend.utils import get_log_level | ||||
| from api.backend.routers.job_router import job_router | ||||
| from api.backend.routers.log_router import log_router | ||||
| from api.backend.routers.stats_router import stats_router | ||||
| from api.backend.database.startup import init_database | ||||
| from fastapi.responses import JSONResponse | ||||
|  | ||||
| from api.backend.job.cron_scheduling.cron_scheduling import start_cron_scheduler | ||||
| from api.backend.scheduler import scheduler | ||||
|  | ||||
| log_level = os.getenv("LOG_LEVEL") | ||||
| LOG_LEVEL = get_log_level(log_level) | ||||
| @@ -25,7 +33,7 @@ logging.basicConfig( | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
| app = FastAPI(title="api") | ||||
| app = FastAPI(title="api", root_path="/api") | ||||
|  | ||||
| app.add_middleware( | ||||
|     CORSMiddleware, | ||||
| @@ -39,5 +47,29 @@ app.add_middleware( | ||||
| app.include_router(auth_router) | ||||
| app.include_router(ai_router) | ||||
| app.include_router(job_router) | ||||
| app.include_router(log_router) | ||||
| app.include_router(stats_router) | ||||
|  | ||||
|  | ||||
| @app.on_event("startup") | ||||
| async def startup_event(): | ||||
|     start_cron_scheduler(scheduler) | ||||
|     scheduler.start() | ||||
|  | ||||
|     if os.getenv("ENV") != "test": | ||||
|         init_database() | ||||
|         LOG.info("Starting up...") | ||||
|  | ||||
|  | ||||
| @app.on_event("shutdown") | ||||
| def shutdown_scheduler(): | ||||
|     scheduler.shutdown(wait=False)  # Set wait=False to not block shutdown | ||||
|  | ||||
|  | ||||
| @app.exception_handler(RequestValidationError) | ||||
| async def validation_exception_handler(request: Request, exc: RequestValidationError): | ||||
|     exc_str = f"{exc}".replace("\n", " ").replace("   ", " ") | ||||
|     logging.error(f"{request}: {exc_str}") | ||||
|     content = {"status_code": 10422, "message": exc_str, "data": None} | ||||
|     return JSONResponse( | ||||
|         content=content, status_code=status.HTTP_422_UNPROCESSABLE_ENTITY | ||||
|     ) | ||||
|   | ||||
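To see what the validation handler above returns, here is a self-contained sketch (not the repo's file; the endpoint and payload are hypothetical) that reproduces the handler and exercises it with FastAPI's test client:

```python
from fastapi import FastAPI, Request, status
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse
from fastapi.testclient import TestClient
from pydantic import BaseModel

app = FastAPI()

class Job(BaseModel):
    url: str

@app.post("/jobs")
async def submit(job: Job):
    return {"ok": True}

@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
    # Same shape as the handler in the diff above.
    content = {"status_code": 10422, "message": str(exc).replace("\n", " "), "data": None}
    return JSONResponse(content=content, status_code=status.HTTP_422_UNPROCESSABLE_ENTITY)

client = TestClient(app)
resp = client.post("/jobs", json={})   # missing "url" triggers the handler
print(resp.status_code)                # 422
print(resp.json()["status_code"])      # 10422
```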
| @@ -1,5 +1,6 @@ | ||||
| # STL | ||||
| from datetime import timedelta | ||||
| import os | ||||
|  | ||||
| # PDM | ||||
| from fastapi import Depends, APIRouter, HTTPException, status | ||||
| @@ -7,7 +8,6 @@ from fastapi.security import OAuth2PasswordRequestForm | ||||
|  | ||||
| # LOCAL | ||||
| from api.backend.schemas import User, Token, UserCreate | ||||
| from api.backend.database import get_user_collection | ||||
| from api.backend.auth.auth_utils import ( | ||||
|     ACCESS_TOKEN_EXPIRE_MINUTES, | ||||
|     get_current_user, | ||||
| @@ -15,9 +15,14 @@ from api.backend.auth.auth_utils import ( | ||||
|     get_password_hash, | ||||
|     create_access_token, | ||||
| ) | ||||
| import logging | ||||
|  | ||||
| from api.backend.database.common import update | ||||
|  | ||||
| auth_router = APIRouter() | ||||
|  | ||||
| LOG = logging.getLogger("auth_router") | ||||
|  | ||||
|  | ||||
| @auth_router.post("/auth/token", response_model=Token) | ||||
| async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()): | ||||
| @@ -43,15 +48,22 @@ async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends( | ||||
|  | ||||
| @auth_router.post("/auth/signup", response_model=User) | ||||
| async def create_user(user: UserCreate): | ||||
|     users_collection = get_user_collection() | ||||
|     hashed_password = get_password_hash(user.password) | ||||
|     user_dict = user.model_dump() | ||||
|     user_dict["hashed_password"] = hashed_password | ||||
|     del user_dict["password"] | ||||
|     _ = await users_collection.insert_one(user_dict) | ||||
|  | ||||
|     query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)" | ||||
|     _ = update(query, (user_dict["email"], hashed_password, user_dict["full_name"])) | ||||
|  | ||||
|     return user_dict | ||||
|  | ||||
|  | ||||
| @auth_router.get("/auth/users/me", response_model=User) | ||||
| async def read_users_me(current_user: User = Depends(get_current_user)): | ||||
|     return current_user | ||||
|  | ||||
|  | ||||
| @auth_router.get("/auth/check") | ||||
| async def check_auth(): | ||||
|     return {"registration": os.environ.get("REGISTRATION_ENABLED", "True") == "True"} | ||||
|   | ||||
| @@ -1,9 +1,8 @@ | ||||
| # STL | ||||
| import os | ||||
| from gc import disable | ||||
| from queue import Empty | ||||
| from typing import Any, Optional | ||||
| from datetime import datetime, timedelta | ||||
| import logging | ||||
|  | ||||
| # PDM | ||||
| from jose import JWTError, jwt | ||||
| @@ -14,13 +13,16 @@ from fastapi.security import OAuth2PasswordBearer | ||||
|  | ||||
| # LOCAL | ||||
| from api.backend.schemas import User, UserInDB, TokenData | ||||
| from api.backend.database import get_user_collection | ||||
|  | ||||
| from api.backend.database.common import query | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
| _ = load_dotenv() | ||||
|  | ||||
| SECRET_KEY = os.getenv("SECRET_KEY") or "" | ||||
| ALGORITHM = os.getenv("ALGORITHM") or "" | ||||
| ACCESS_TOKEN_EXPIRE_MINUTES = os.getenv("ACCESS_TOKEN_EXPIRE_MINUTES") | ||||
| SECRET_KEY = os.getenv("SECRET_KEY") or "secret" | ||||
| ALGORITHM = os.getenv("ALGORITHM") or "HS256" | ||||
| ACCESS_TOKEN_EXPIRE_MINUTES = os.getenv("ACCESS_TOKEN_EXPIRE_MINUTES") or 600 | ||||
|  | ||||
| pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto") | ||||
| oauth2_scheme = OAuth2PasswordBearer(tokenUrl="auth/token") | ||||
| @@ -37,8 +39,8 @@ def get_password_hash(password: str): | ||||
|  | ||||
|  | ||||
| async def get_user(email: str): | ||||
|     user_collection = get_user_collection() | ||||
|     user = await user_collection.find_one({"email": email}) | ||||
|     user_query = "SELECT * FROM users WHERE email = ?" | ||||
|     user = query(user_query, (email,))[0] | ||||
|  | ||||
|     if not user: | ||||
|         return | ||||
| @@ -74,25 +76,49 @@ def create_access_token( | ||||
|  | ||||
|  | ||||
| async def get_current_user(token: str = Depends(oauth2_scheme)): | ||||
|     LOG.debug(f"Getting current user with token: {token}") | ||||
|  | ||||
|     if not token: | ||||
|         LOG.debug("No token provided") | ||||
|         return EMPTY_USER | ||||
|  | ||||
|     if len(token.split(".")) != 3: | ||||
|         LOG.error(f"Malformed token: {token}") | ||||
|         return EMPTY_USER | ||||
|  | ||||
|     try: | ||||
|         LOG.debug( | ||||
|             f"Decoding token: {token} with secret key: {SECRET_KEY} and algorithm: {ALGORITHM}" | ||||
|         ) | ||||
|  | ||||
|         if token.startswith("Bearer "): | ||||
|             token = token.split(" ")[1] | ||||
|  | ||||
|         payload: Optional[dict[str, Any]] = jwt.decode( | ||||
|             token, SECRET_KEY, algorithms=[ALGORITHM] | ||||
|         ) | ||||
|  | ||||
|         if not payload: | ||||
|             LOG.error("No payload found in token") | ||||
|             return EMPTY_USER | ||||
|  | ||||
|         email = payload.get("sub") | ||||
|  | ||||
|         if email is None: | ||||
|             LOG.error("No email found in payload") | ||||
|             return EMPTY_USER | ||||
|  | ||||
|         token_data = TokenData(email=email) | ||||
|  | ||||
|     except JWTError: | ||||
|     except JWTError as e: | ||||
|         LOG.error(f"JWTError occurred: {e}") | ||||
|         return EMPTY_USER | ||||
|  | ||||
|     except Exception as e: | ||||
|         LOG.error(f"Exception occurred: {e}") | ||||
|         return EMPTY_USER | ||||
|  | ||||
|     user = await get_user(email=token_data.email) | ||||
|  | ||||
|     if user is None: | ||||
|         return EMPTY_USER | ||||
|  | ||||
|   | ||||
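A minimal sketch of the token round-trip `get_current_user` expects, using `python-jose`. The key and claims are illustrative; the real values come from `SECRET_KEY` and the login flow.

```python
from datetime import datetime, timedelta, timezone
from jose import jwt

SECRET_KEY = "secret"   # placeholder; the app reads this from the environment
ALGORITHM = "HS256"

token = jwt.encode(
    {"sub": "user@example.com",
     "exp": datetime.now(timezone.utc) + timedelta(minutes=600)},
    SECRET_KEY,
    algorithm=ALGORITHM,
)
assert len(token.split(".")) == 3  # the structural check get_current_user performs

payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
print(payload["sub"])  # user@example.com
```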
**api/backend/constants.py** (new file, 1 line)
							| @@ -0,0 +1 @@ | ||||
| DATABASE_PATH = "data/database.db" | ||||
| @@ -1,23 +0,0 @@ | ||||
| # STL | ||||
| import os | ||||
| from typing import Any | ||||
|  | ||||
| # PDM | ||||
| from dotenv import load_dotenv | ||||
| from motor.motor_asyncio import AsyncIOMotorClient | ||||
|  | ||||
| _ = load_dotenv() | ||||
|  | ||||
| MONGODB_URI = os.getenv("MONGODB_URI") | ||||
|  | ||||
|  | ||||
| def get_user_collection(): | ||||
|     client: AsyncIOMotorClient[dict[str, Any]] = AsyncIOMotorClient(MONGODB_URI) | ||||
|     db = client["scrape"] | ||||
|     return db["users"] | ||||
|  | ||||
|  | ||||
| def get_job_collection(): | ||||
|     client: AsyncIOMotorClient[dict[str, Any]] = AsyncIOMotorClient(MONGODB_URI) | ||||
|     db = client["scrape"] | ||||
|     return db["jobs"] | ||||
**api/backend/database/__init__.py** (new file, 3 lines)
							| @@ -0,0 +1,3 @@ | ||||
| from .common import insert, QUERIES, update | ||||
|  | ||||
| __all__ = ["insert", "QUERIES", "update"] | ||||
**api/backend/database/common.py** (new file, 92 lines)
							| @@ -0,0 +1,92 @@ | ||||
| import sqlite3 | ||||
| from typing import Any, Optional | ||||
| from api.backend.constants import DATABASE_PATH | ||||
| from api.backend.utils import format_json, format_sql_row_to_python | ||||
| from api.backend.database.schema import INIT_QUERY | ||||
| from api.backend.database.queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY | ||||
| import logging | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| def connect(): | ||||
|     connection = sqlite3.connect(DATABASE_PATH) | ||||
|     connection.set_trace_callback(print) | ||||
|     cursor = connection.cursor() | ||||
|     return cursor | ||||
|  | ||||
|  | ||||
| def insert(query: str, values: tuple[Any, ...]): | ||||
|     connection = sqlite3.connect(DATABASE_PATH) | ||||
|     cursor = connection.cursor() | ||||
|     copy = list(values) | ||||
|     format_json(copy) | ||||
|  | ||||
|     try: | ||||
|         _ = cursor.execute(query, copy) | ||||
|         connection.commit() | ||||
|     except sqlite3.Error as e: | ||||
|         LOG.error(f"An error occurred: {e}") | ||||
|     finally: | ||||
|         cursor.close() | ||||
|         connection.close() | ||||
|  | ||||
|  | ||||
| def query(query: str, values: Optional[tuple[Any, ...]] = None): | ||||
|     connection = sqlite3.connect(DATABASE_PATH) | ||||
|     connection.row_factory = sqlite3.Row | ||||
|     cursor = connection.cursor() | ||||
|     rows = [] | ||||
|     try: | ||||
|         if values: | ||||
|             _ = cursor.execute(query, values) | ||||
|         else: | ||||
|             _ = cursor.execute(query) | ||||
|  | ||||
|         rows = cursor.fetchall() | ||||
|  | ||||
|     finally: | ||||
|         cursor.close() | ||||
|         connection.close() | ||||
|  | ||||
|     formatted_rows: list[dict[str, Any]] = [] | ||||
|  | ||||
|     for row in rows: | ||||
|         row = dict(row) | ||||
|         formatted_row = format_sql_row_to_python(row) | ||||
|         formatted_rows.append(formatted_row) | ||||
|  | ||||
|     return formatted_rows | ||||
|  | ||||
|  | ||||
| def update(query: str, values: Optional[tuple[Any, ...]] = None): | ||||
|     connection = sqlite3.connect(DATABASE_PATH) | ||||
|     cursor = connection.cursor() | ||||
|  | ||||
|     copy = None | ||||
|  | ||||
|     if values: | ||||
|         copy = list(values) | ||||
|         format_json(copy) | ||||
|  | ||||
|     try: | ||||
|         if copy: | ||||
|             res = cursor.execute(query, copy) | ||||
|         else: | ||||
|             res = cursor.execute(query) | ||||
|         connection.commit() | ||||
|         return res.rowcount | ||||
|     except sqlite3.Error as e: | ||||
|         LOG.error(f"An error occurred: {e}") | ||||
|     finally: | ||||
|         cursor.close() | ||||
|         connection.close() | ||||
|  | ||||
|     return 0 | ||||
|  | ||||
|  | ||||
| QUERIES = { | ||||
|     "init": INIT_QUERY, | ||||
|     "insert_job": JOB_INSERT_QUERY, | ||||
|     "delete_job": DELETE_JOB_QUERY, | ||||
| } | ||||
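A hypothetical usage sketch for the helpers above; the table and columns follow the schema added in `api/backend/database/schema/schema.py`, and the values are placeholders.

```python
from api.backend.database.common import insert, query, update

# Seed one user row (parameters are bound, never interpolated).
insert(
    "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)",
    ("user@example.com", "<bcrypt-hash>", "Example User"),
)

# Rows come back as plain dicts via the sqlite3.Row factory.
rows = query("SELECT * FROM users WHERE email = ?", ("user@example.com",))
print(rows[0]["full_name"])  # Example User

# update() returns the affected row count.
changed = update("UPDATE users SET disabled = ? WHERE email = ?",
                 (True, "user@example.com"))
print(changed)  # 1
```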
**api/backend/database/queries/__init__.py** (new file, 3 lines)
							| @@ -0,0 +1,3 @@ | ||||
| from .queries import JOB_INSERT_QUERY, DELETE_JOB_QUERY | ||||
|  | ||||
| __all__ = ["JOB_INSERT_QUERY", "DELETE_JOB_QUERY"] | ||||
**api/backend/database/queries/queries.py** (new file, 9 lines)
							| @@ -0,0 +1,9 @@ | ||||
| JOB_INSERT_QUERY = """ | ||||
| INSERT INTO jobs  | ||||
| (id, url, elements, user, time_created, result, status, chat, job_options) | ||||
| VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) | ||||
| """ | ||||
|  | ||||
| DELETE_JOB_QUERY = """ | ||||
| DELETE FROM jobs WHERE id IN () | ||||
| """ | ||||
**api/backend/database/schema/__init__.py** (new file, 3 lines)
							| @@ -0,0 +1,3 @@ | ||||
| from .schema import INIT_QUERY | ||||
|  | ||||
| __all__ = ["INIT_QUERY"] | ||||
**api/backend/database/schema/schema.py** (new file, 30 lines)
							| @@ -0,0 +1,30 @@ | ||||
| INIT_QUERY = """ | ||||
| CREATE TABLE IF NOT EXISTS jobs ( | ||||
|     id STRING PRIMARY KEY NOT NULL, | ||||
|     url STRING NOT NULL, | ||||
|     elements JSON NOT NULL, | ||||
|     user STRING, | ||||
|     time_created DATETIME NOT NULL, | ||||
|     result JSON NOT NULL, | ||||
|     status STRING NOT NULL, | ||||
|     chat JSON, | ||||
|     job_options JSON | ||||
| ); | ||||
|  | ||||
| CREATE TABLE IF NOT EXISTS users ( | ||||
|     email STRING PRIMARY KEY NOT NULL, | ||||
|     hashed_password STRING NOT NULL, | ||||
|     full_name STRING, | ||||
|     disabled BOOLEAN | ||||
| ); | ||||
|  | ||||
| CREATE TABLE IF NOT EXISTS cron_jobs ( | ||||
|     id STRING PRIMARY KEY NOT NULL, | ||||
|     user_email STRING NOT NULL, | ||||
|     job_id STRING NOT NULL, | ||||
|     cron_expression STRING NOT NULL, | ||||
|     time_created DATETIME NOT NULL, | ||||
|     time_updated DATETIME NOT NULL, | ||||
|     FOREIGN KEY (job_id) REFERENCES jobs(id) | ||||
| ); | ||||
| """ | ||||
**api/backend/database/startup.py** (new file, 43 lines)
							| @@ -0,0 +1,43 @@ | ||||
| import os | ||||
| from api.backend.database.common import connect, QUERIES, insert | ||||
| import logging | ||||
|  | ||||
| from api.backend.auth.auth_utils import get_password_hash | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| def init_database(): | ||||
|     cursor = connect() | ||||
|  | ||||
|     for query in QUERIES["init"].strip().split(";"): | ||||
|         if query.strip(): | ||||
|             LOG.info(f"Executing query: {query}") | ||||
|             _ = cursor.execute(query) | ||||
|  | ||||
|     if os.environ.get("REGISTRATION_ENABLED", "True") == "False": | ||||
|         default_user_email = os.environ.get("DEFAULT_USER_EMAIL") | ||||
|         default_user_password = os.environ.get("DEFAULT_USER_PASSWORD") | ||||
|         default_user_full_name = os.environ.get("DEFAULT_USER_FULL_NAME") | ||||
|  | ||||
|         if ( | ||||
|             not default_user_email | ||||
|             or not default_user_password | ||||
|             or not default_user_full_name | ||||
|         ): | ||||
|             LOG.error( | ||||
|                 "DEFAULT_USER_EMAIL, DEFAULT_USER_PASSWORD, or DEFAULT_USER_FULL_NAME is not set!" | ||||
|             ) | ||||
|             exit(1) | ||||
|  | ||||
|         query = "INSERT INTO users (email, hashed_password, full_name) VALUES (?, ?, ?)" | ||||
|         _ = insert( | ||||
|             query, | ||||
|             ( | ||||
|                 default_user_email, | ||||
|                 get_password_hash(default_user_password), | ||||
|                 default_user_full_name, | ||||
|             ), | ||||
|         ) | ||||
|  | ||||
|     cursor.close() | ||||
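A sketch of the environment the registration-disabled branch above expects; the values are placeholders, and in practice they would be set in `docker-compose.yml` or a `.env` file rather than in code.

```python
import os

# Placeholder values for the default-user seeding path.
os.environ["REGISTRATION_ENABLED"] = "False"
os.environ["DEFAULT_USER_EMAIL"] = "admin@example.com"
os.environ["DEFAULT_USER_PASSWORD"] = "change-me"
os.environ["DEFAULT_USER_FULL_NAME"] = "Admin"

from api.backend.database.startup import init_database

init_database()  # creates the tables and seeds the default user
```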
| @@ -1,119 +0,0 @@ | ||||
| # STL | ||||
| import logging | ||||
| from typing import Any, Optional | ||||
|  | ||||
| # PDM | ||||
| from pymongo import DESCENDING | ||||
|  | ||||
| # LOCAL | ||||
| from api.backend.models import FetchOptions | ||||
| from api.backend.database import get_job_collection | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| async def insert(item: dict[str, Any]) -> None: | ||||
|     collection = get_job_collection() | ||||
|     i = await collection.insert_one(item) | ||||
|     LOG.info(f"Inserted item: {i}") | ||||
|  | ||||
|  | ||||
| async def get_queued_job(): | ||||
|     collection = get_job_collection() | ||||
|     return await collection.find_one( | ||||
|         {"status": "Queued"}, sort=[("created_at", DESCENDING)] | ||||
|     ) | ||||
|  | ||||
|  | ||||
| async def query( | ||||
|     filter: dict[str, Any], fetch_options: Optional[FetchOptions] = None | ||||
| ) -> list[dict[str, Any]]: | ||||
|     collection = get_job_collection() | ||||
|     cursor = collection.find(filter) | ||||
|     results: list[dict[str, Any]] = [] | ||||
|  | ||||
|     async for document in cursor: | ||||
|         del document["_id"] | ||||
|  | ||||
|         if fetch_options and not fetch_options.chat and document.get("chat"): | ||||
|             del document["chat"] | ||||
|  | ||||
|         results.append(document) | ||||
|  | ||||
|     return results | ||||
|  | ||||
|  | ||||
| async def update_job(ids: list[str], field: str, value: Any): | ||||
|     collection = get_job_collection() | ||||
|     for id in ids: | ||||
|         _ = await collection.update_one( | ||||
|             {"id": id}, | ||||
|             {"$set": {field: value}}, | ||||
|         ) | ||||
|  | ||||
|  | ||||
| async def delete_jobs(jobs: list[str]): | ||||
|     collection = get_job_collection() | ||||
|     result = await collection.delete_many({"id": {"$in": jobs}}) | ||||
|     LOG.info(f"{result.deleted_count} documents deleted") | ||||
|  | ||||
|     return True if result.deleted_count > 0 else False | ||||
|  | ||||
|  | ||||
| async def average_elements_per_link(user: str): | ||||
|     collection = get_job_collection() | ||||
|     pipeline = [ | ||||
|         {"$match": {"status": "Completed", "user": user}}, | ||||
|         { | ||||
|             "$project": { | ||||
|                 "date": { | ||||
|                     "$dateToString": {"format": "%Y-%m-%d", "date": "$time_created"} | ||||
|                 }, | ||||
|                 "num_elements": {"$size": "$elements"}, | ||||
|             } | ||||
|         }, | ||||
|         { | ||||
|             "$group": { | ||||
|                 "_id": "$date", | ||||
|                 "average_elements": {"$avg": "$num_elements"}, | ||||
|                 "count": {"$sum": 1}, | ||||
|             } | ||||
|         }, | ||||
|         {"$sort": {"_id": 1}}, | ||||
|     ] | ||||
|     cursor = collection.aggregate(pipeline) | ||||
|     results: list[dict[str, Any]] = [] | ||||
|  | ||||
|     async for document in cursor: | ||||
|         results.append( | ||||
|             { | ||||
|                 "date": document["_id"], | ||||
|                 "average_elements": document["average_elements"], | ||||
|                 "count": document["count"], | ||||
|             } | ||||
|         ) | ||||
|  | ||||
|     return results | ||||
|  | ||||
|  | ||||
| async def get_jobs_per_day(user: str): | ||||
|     collection = get_job_collection() | ||||
|     pipeline = [ | ||||
|         {"$match": {"status": "Completed", "user": user}}, | ||||
|         { | ||||
|             "$project": { | ||||
|                 "date": { | ||||
|                     "$dateToString": {"format": "%Y-%m-%d", "date": "$time_created"} | ||||
|                 } | ||||
|             } | ||||
|         }, | ||||
|         {"$group": {"_id": "$date", "job_count": {"$sum": 1}}}, | ||||
|         {"$sort": {"_id": 1}}, | ||||
|     ] | ||||
|     cursor = collection.aggregate(pipeline) | ||||
|  | ||||
|     results: list[dict[str, Any]] = [] | ||||
|     async for document in cursor: | ||||
|         results.append({"date": document["_id"], "job_count": document["job_count"]}) | ||||
|  | ||||
|     return results | ||||
**api/backend/job/__init__.py** (new file, 17 lines)
							| @@ -0,0 +1,17 @@ | ||||
| from .job import ( | ||||
|     insert, | ||||
|     update_job, | ||||
|     delete_jobs, | ||||
|     get_jobs_per_day, | ||||
|     get_queued_job, | ||||
|     average_elements_per_link, | ||||
| ) | ||||
|  | ||||
| __all__ = [ | ||||
|     "insert", | ||||
|     "update_job", | ||||
|     "delete_jobs", | ||||
|     "get_jobs_per_day", | ||||
|     "get_queued_job", | ||||
|     "average_elements_per_link", | ||||
| ] | ||||
**api/backend/job/cron_scheduling/cron_scheduling.py** (new file, 100 lines)
							| @@ -0,0 +1,100 @@ | ||||
| import datetime | ||||
| from typing import Any | ||||
| import uuid | ||||
| from api.backend.database.common import insert, query | ||||
| from api.backend.models import CronJob | ||||
| from apscheduler.schedulers.background import BackgroundScheduler  # type: ignore | ||||
| from apscheduler.triggers.cron import CronTrigger  # type: ignore | ||||
|  | ||||
| from api.backend.job import insert as insert_job | ||||
| import logging | ||||
|  | ||||
| LOG = logging.getLogger("Cron Scheduler") | ||||
|  | ||||
|  | ||||
| def insert_cron_job(cron_job: CronJob): | ||||
|     query = """ | ||||
|     INSERT INTO cron_jobs (id, user_email, job_id, cron_expression, time_created, time_updated) | ||||
|     VALUES (?, ?, ?, ?, ?, ?) | ||||
|     """ | ||||
|     values = ( | ||||
|         cron_job.id, | ||||
|         cron_job.user_email, | ||||
|         cron_job.job_id, | ||||
|         cron_job.cron_expression, | ||||
|         cron_job.time_created, | ||||
|         cron_job.time_updated, | ||||
|     ) | ||||
|  | ||||
|     insert(query, values) | ||||
|  | ||||
|     return True | ||||
|  | ||||
|  | ||||
| def delete_cron_job(id: str, user_email: str): | ||||
|     query = """ | ||||
|     DELETE FROM cron_jobs | ||||
|     WHERE id = ? AND user_email = ? | ||||
|     """ | ||||
|     values = (id, user_email) | ||||
|     insert(query, values) | ||||
|  | ||||
|     return True | ||||
|  | ||||
|  | ||||
| def get_cron_jobs(user_email: str): | ||||
|     cron_jobs = query("SELECT * FROM cron_jobs WHERE user_email = ?", (user_email,)) | ||||
|  | ||||
|     return cron_jobs | ||||
|  | ||||
|  | ||||
| def get_all_cron_jobs(): | ||||
|     cron_jobs = query("SELECT * FROM cron_jobs") | ||||
|  | ||||
|     return cron_jobs | ||||
|  | ||||
|  | ||||
| def insert_job_from_cron_job(job: dict[str, Any]): | ||||
|     insert_job( | ||||
|         { | ||||
|             **job, | ||||
|             "id": uuid.uuid4().hex, | ||||
|             "status": "Queued", | ||||
|             "result": "", | ||||
|             "chat": None, | ||||
|             "time_created": datetime.datetime.now(), | ||||
|             "time_updated": datetime.datetime.now(), | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def get_cron_job_trigger(cron_expression: str): | ||||
|     expression_parts = cron_expression.split() | ||||
|  | ||||
|     if len(expression_parts) != 5: | ||||
|         print(f"Invalid cron expression: {cron_expression}") | ||||
|         return None | ||||
|  | ||||
|     minute, hour, day, month, day_of_week = expression_parts | ||||
|  | ||||
|     return CronTrigger( | ||||
|         minute=minute, hour=hour, day=day, month=month, day_of_week=day_of_week | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def start_cron_scheduler(scheduler: BackgroundScheduler): | ||||
|     cron_jobs = get_all_cron_jobs() | ||||
|  | ||||
|     LOG.info(f"Cron jobs: {cron_jobs}") | ||||
|  | ||||
|     for job in cron_jobs: | ||||
|         queried_job = query("SELECT * FROM jobs WHERE id = ?", (job["job_id"],)) | ||||
|  | ||||
|         LOG.info(f"Adding job: {queried_job}") | ||||
|  | ||||
|         scheduler.add_job( | ||||
|             insert_job_from_cron_job, | ||||
|             get_cron_job_trigger(job["cron_expression"]), | ||||
|             id=job["id"], | ||||
|             args=[queried_job[0]], | ||||
|         ) | ||||
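An illustrative sketch of registering a cron job with the pieces above. The `CronJob` fields are inferred from the insert query's value tuple, and the IDs are placeholders.

```python
import datetime
import uuid

from api.backend.models import CronJob
from api.backend.job.cron_scheduling.cron_scheduling import (
    get_cron_job_trigger,
    insert_cron_job,
)

cron_job = CronJob(
    id=uuid.uuid4().hex,
    user_email="user@example.com",
    job_id="<existing-job-id>",        # must reference a row in jobs
    cron_expression="0 6 * * *",       # every day at 06:00
    time_created=datetime.datetime.now(),
    time_updated=datetime.datetime.now(),
)
insert_cron_job(cron_job)
print(get_cron_job_trigger("0 6 * * *"))  # CronTrigger for daily 06:00
```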
**api/backend/job/job.py** (new file, 97 lines)
							| @@ -0,0 +1,97 @@ | ||||
| # STL | ||||
| import logging | ||||
| from typing import Any | ||||
|  | ||||
| # LOCAL | ||||
| from api.backend.utils import format_list_for_query | ||||
| from api.backend.database.common import ( | ||||
|     insert as common_insert, | ||||
|     query as common_query, | ||||
|     QUERIES, | ||||
|     update as common_update, | ||||
| ) | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| def insert(item: dict[str, Any]) -> None: | ||||
|     common_insert( | ||||
|         QUERIES["insert_job"], | ||||
|         ( | ||||
|             item["id"], | ||||
|             item["url"], | ||||
|             item["elements"], | ||||
|             item["user"], | ||||
|             item["time_created"], | ||||
|             item["result"], | ||||
|             item["status"], | ||||
|             item["chat"], | ||||
|             item["job_options"], | ||||
|         ), | ||||
|     ) | ||||
|     LOG.info(f"Inserted item: {item}") | ||||
|  | ||||
|  | ||||
| async def get_queued_job(): | ||||
|     query = ( | ||||
|         "SELECT * FROM jobs WHERE status = 'Queued' ORDER BY time_created DESC LIMIT 1" | ||||
|     ) | ||||
|     res = common_query(query) | ||||
|     LOG.info(f"Got queued job: {res}") | ||||
|     return res[0] if res else None | ||||
|  | ||||
|  | ||||
| async def update_job(ids: list[str], field: str, value: Any): | ||||
|     query = f"UPDATE jobs SET {field} = ? WHERE id IN {format_list_for_query(ids)}" | ||||
|     res = common_update(query, tuple([value] + ids)) | ||||
|     LOG.info(f"Updated job: {res}") | ||||
|  | ||||
|  | ||||
| async def delete_jobs(jobs: list[str]): | ||||
|     if not jobs: | ||||
|         LOG.info("No jobs to delete.") | ||||
|         return False | ||||
|  | ||||
|     query = f"DELETE FROM jobs WHERE id IN {format_list_for_query(jobs)}" | ||||
|     res = common_update(query, tuple(jobs)) | ||||
|  | ||||
|     return res > 0 | ||||
|  | ||||
|  | ||||
| async def average_elements_per_link(user: str): | ||||
|     job_query = """ | ||||
|     SELECT  | ||||
|         DATE(time_created) AS date, | ||||
|         AVG(json_array_length(elements)) AS average_elements, | ||||
|         COUNT(*) AS count | ||||
|     FROM  | ||||
|         jobs | ||||
|     WHERE  | ||||
|         status = 'Completed' AND user = ? | ||||
|     GROUP BY  | ||||
|         DATE(time_created) | ||||
|     ORDER BY  | ||||
|         date ASC; | ||||
|     """ | ||||
|     results = common_query(job_query, (user,)) | ||||
|  | ||||
|     return results | ||||
|  | ||||
|  | ||||
| async def get_jobs_per_day(user: str): | ||||
|     job_query = """ | ||||
|     SELECT  | ||||
|         DATE(time_created) AS date, | ||||
|         COUNT(*) AS job_count | ||||
|     FROM  | ||||
|         jobs | ||||
|     WHERE  | ||||
|         status = 'Completed' AND user = ? | ||||
|     GROUP BY  | ||||
|         DATE(time_created) | ||||
|     ORDER BY  | ||||
|         date ASC; | ||||
|     """ | ||||
|     results = common_query(job_query, (user,)) | ||||
|  | ||||
|     return results | ||||
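`format_list_for_query` comes from `api.backend.utils` and is not shown in this diff; for the parameterized `IN` clauses above to line up with the value tuples, it presumably builds a placeholder group like this (an assumption, not the repo's code):

```python
# Assumed shape: one "?" placeholder per id, so values stay bound parameters.
def format_list_for_query(ids: list[str]) -> str:
    return f"({','.join('?' for _ in ids)})"

print(format_list_for_query(["a", "b", "c"]))  # (?,?,?)
```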
**api/backend/job/models/__init__.py** (new file, 3 lines)
							| @@ -0,0 +1,3 @@ | ||||
| from .job_options import JobOptions | ||||
|  | ||||
| __all__ = ["JobOptions"] | ||||
**api/backend/job/models/job_options.py** (new file, 15 lines)
							| @@ -0,0 +1,15 @@ | ||||
| from pydantic import BaseModel | ||||
| from typing import Any, Optional | ||||
| from api.backend.job.models.site_map import SiteMap | ||||
|  | ||||
|  | ||||
| class FetchOptions(BaseModel): | ||||
|     chat: Optional[bool] = None | ||||
|  | ||||
|  | ||||
| class JobOptions(BaseModel): | ||||
|     multi_page_scrape: bool = False | ||||
|     custom_headers: dict[str, Any] = {} | ||||
|     proxies: list[str] = [] | ||||
|     site_map: Optional[SiteMap] = None | ||||
|     collect_media: bool = False | ||||
api/backend/job/models/site_map.py (new file, 14 lines)
							| @@ -0,0 +1,14 @@ | ||||
| from pydantic import BaseModel | ||||
| from typing import Literal | ||||
|  | ||||
|  | ||||
| class Action(BaseModel): | ||||
|     type: Literal["click", "input"] | ||||
|     xpath: str | ||||
|     name: str | ||||
|     input: str = "" | ||||
|     do_once: bool = True | ||||
|  | ||||
|  | ||||
| class SiteMap(BaseModel): | ||||
|     actions: list[Action] | ||||
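
These two new modules are plain Pydantic models, so a payload can be validated by constructing them directly. A small sketch with hypothetical values (invalid input raises `pydantic.ValidationError`):

```python
# Sketch: building the models above from raw values.
from api.backend.job.models.job_options import JobOptions
from api.backend.job.models.site_map import Action, SiteMap

site_map = SiteMap(
    actions=[
        Action(type="input", xpath="//input[@name='q']", name="search", input="laptops"),
        Action(type="click", xpath="//button[@type='submit']", name="go", do_once=False),
    ]
)

options = JobOptions(
    multi_page_scrape=False,
    custom_headers={"User-Agent": "scraperr-test"},  # hypothetical header
    site_map=site_map,
    collect_media=True,
)

# model_dump() serializes nested models to plain dicts (Pydantic v2).
print(options.model_dump()["site_map"]["actions"][0]["type"])  # -> "input"
```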
api/backend/job/scraping/collect_media.py (new file, 99 lines)
							| @@ -0,0 +1,99 @@ | ||||
| import os | ||||
| from pathlib import Path | ||||
| from urllib.parse import urlparse | ||||
| from typing import Dict, List | ||||
|  | ||||
| import aiohttp | ||||
| from playwright.async_api import Page | ||||
|  | ||||
| from api.backend.utils import LOG | ||||
|  | ||||
|  | ||||
| async def collect_media(page: Page) -> dict[str, list[dict[str, str]]]: | ||||
|     media_types = { | ||||
|         "images": "img", | ||||
|         "videos": "video", | ||||
|         "audio": "audio", | ||||
|         "pdfs": 'a[href$=".pdf"]', | ||||
|         "documents": 'a[href$=".doc"], a[href$=".docx"], a[href$=".txt"], a[href$=".rtf"]', | ||||
|         "presentations": 'a[href$=".ppt"], a[href$=".pptx"]', | ||||
|         "spreadsheets": 'a[href$=".xls"], a[href$=".xlsx"], a[href$=".csv"]', | ||||
|     } | ||||
|  | ||||
|     base_dir = Path("media") | ||||
|     base_dir.mkdir(exist_ok=True) | ||||
|  | ||||
|     media_urls = {} | ||||
|  | ||||
|     async with aiohttp.ClientSession() as session: | ||||
|         for media_type, selector in media_types.items(): | ||||
|             elements = await page.query_selector_all(selector) | ||||
|             urls: List[Dict[str, str]] = [] | ||||
|  | ||||
|             media_dir = base_dir / media_type | ||||
|             media_dir.mkdir(exist_ok=True) | ||||
|  | ||||
|             for element in elements: | ||||
|                 if media_type == "images": | ||||
|                     url = await element.get_attribute("src") | ||||
|                 elif media_type == "videos": | ||||
|                     url = await element.get_attribute( | ||||
|                         "src" | ||||
|                     ) or await element.get_attribute("data-src") | ||||
|                 else: | ||||
|                     url = await element.get_attribute("href") | ||||
|  | ||||
|                 if url and url.startswith("/"): | ||||
|                     root_url = urlparse(page.url) | ||||
|                     root_domain = f"{root_url.scheme}://{root_url.netloc}" | ||||
|                     url = f"{root_domain}{url}" | ||||
|  | ||||
|                 if url and url.startswith(("http://", "https://")): | ||||
|                     try: | ||||
|                         parsed = urlparse(url) | ||||
|                         filename = ( | ||||
|                             os.path.basename(parsed.path) or f"{media_type}_{len(urls)}" | ||||
|                         ) | ||||
|  | ||||
|                         if "." not in filename: | ||||
|                             ext = { | ||||
|                                 "images": ".jpg", | ||||
|                                 "videos": ".mp4", | ||||
|                                 "audio": ".mp3", | ||||
|                                 "pdfs": ".pdf", | ||||
|                                 "documents": ".doc", | ||||
|                                 "presentations": ".ppt", | ||||
|                                 "spreadsheets": ".xls", | ||||
|                             }.get(media_type, "") | ||||
|                             filename += ext | ||||
|  | ||||
|                         file_path = media_dir / filename | ||||
|  | ||||
|                         async with session.get(url) as response: | ||||
|                             response.raise_for_status() | ||||
|                             with open(file_path, "wb") as f: | ||||
|                                 while True: | ||||
|                                     chunk = await response.content.read(8192) | ||||
|                                     if not chunk: | ||||
|                                         break | ||||
|                                     f.write(chunk) | ||||
|  | ||||
|                         urls.append({"url": url, "local_path": str(file_path)}) | ||||
|                         LOG.info(f"Downloaded {filename} to {file_path}") | ||||
|  | ||||
|                     except Exception as e: | ||||
|                         LOG.error(f"Error downloading {url}: {str(e)}") | ||||
|                         continue | ||||
|  | ||||
|             media_urls[media_type] = urls | ||||
|  | ||||
|     # Write summary | ||||
|     with open(base_dir / "download_summary.txt", "w") as f: | ||||
|         for media_type, downloads in media_urls.items(): | ||||
|             if downloads: | ||||
|                 f.write(f"\n=== {media_type.upper()} ===\n") | ||||
|                 for download in downloads: | ||||
|                     f.write(f"URL: {download['url']}\n") | ||||
|                     f.write(f"Saved to: {download['local_path']}\n\n") | ||||
|  | ||||
|     return media_urls | ||||
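
For reference, `collect_media` can be driven with a standalone Playwright page. A hedged sketch (assumes `playwright install chromium` has been run; downloads land under `./media` relative to the working directory):

```python
# Sketch: exercising collect_media outside the scraper.
import asyncio

from playwright.async_api import async_playwright

from api.backend.job.scraping.collect_media import collect_media


async def main() -> None:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto("https://example.com")

        # Returns {"images": [...], "videos": [...], ...} with local paths.
        media = await collect_media(page)
        for media_type, downloads in media.items():
            print(media_type, len(downloads))

        await browser.close()


asyncio.run(main())
```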
api/backend/job/scraping/scraping_utils.py (new file, 32 lines)
							| @@ -0,0 +1,32 @@ | ||||
| import asyncio | ||||
| from typing import Set, Tuple | ||||
| from playwright.async_api import Page | ||||
|  | ||||
| from api.backend.utils import LOG | ||||
|  | ||||
| from api.backend.job.scraping.collect_media import collect_media as collect_media_utils | ||||
|  | ||||
|  | ||||
| async def scrape_content( | ||||
|     page: Page, pages: Set[Tuple[str, str]], collect_media: bool | ||||
| ) -> str: | ||||
|     last_height = await page.evaluate("document.body.scrollHeight") | ||||
|  | ||||
|     while True: | ||||
|         await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") | ||||
|         await asyncio.sleep(3) | ||||
|         new_height = await page.evaluate("document.body.scrollHeight") | ||||
|  | ||||
|         if new_height == last_height: | ||||
|             break | ||||
|  | ||||
|         last_height = new_height | ||||
|  | ||||
|     html = await page.content() | ||||
|     pages.add((html, page.url)) | ||||
|  | ||||
|     if collect_media: | ||||
|         LOG.info("Collecting media") | ||||
|         await collect_media_utils(page) | ||||
|  | ||||
|     return html | ||||
api/backend/job/site_mapping/__init__.py (new file, 0 lines)
api/backend/job/site_mapping/site_mapping.py (new file, 78 lines)
							| @@ -0,0 +1,78 @@ | ||||
| import logging | ||||
| import asyncio | ||||
| from copy import deepcopy | ||||
| from typing import Any | ||||
|  | ||||
| from playwright.async_api import Page | ||||
|  | ||||
| from api.backend.job.models.site_map import Action, SiteMap | ||||
| from api.backend.job.scraping.scraping_utils import scrape_content | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| def clear_done_actions(site_map: dict[str, Any]) -> dict[str, Any]: | ||||
|     """Clear all actions that have been clicked.""" | ||||
|     cleared_site_map = deepcopy(site_map) | ||||
|     cleared_site_map["actions"] = [ | ||||
|         action for action in cleared_site_map["actions"] if not action["do_once"] | ||||
|     ] | ||||
|  | ||||
|     return cleared_site_map | ||||
|  | ||||
|  | ||||
| async def handle_input(action: Action, page: Page) -> bool: | ||||
|     try: | ||||
|         element = page.locator(f"xpath={action.xpath}") | ||||
|         await element.wait_for(state="visible", timeout=10000) | ||||
|         LOG.info(f"Sending keys: {action.input} to element: {action.xpath}") | ||||
|         await element.fill(action.input) | ||||
|         return True | ||||
|     except Exception as e: | ||||
|         LOG.warning(f"Error handling input for xpath '{action.xpath}': {e}") | ||||
|         return False | ||||
|  | ||||
|  | ||||
| async def handle_click(action: Action, page: Page) -> bool: | ||||
|     try: | ||||
|         element = page.locator(f"xpath={action.xpath}") | ||||
|         await element.wait_for(state="visible", timeout=10000) | ||||
|         LOG.info(f"Clicking element: {action.xpath}") | ||||
|         await element.click() | ||||
|         return True | ||||
|     except Exception as e: | ||||
|         LOG.warning(f"Error clicking element at xpath '{action.xpath}': {e}") | ||||
|         return False | ||||
|  | ||||
|  | ||||
| ACTION_MAP = { | ||||
|     "click": handle_click, | ||||
|     "input": handle_input, | ||||
| } | ||||
|  | ||||
|  | ||||
| async def handle_site_mapping( | ||||
|     site_map_dict: dict[str, Any], | ||||
|     page: Page, | ||||
|     pages: set[tuple[str, str]], | ||||
|     collect_media: bool = False, | ||||
| ): | ||||
|     site_map = SiteMap(**site_map_dict) | ||||
|  | ||||
|     for action in site_map.actions: | ||||
|         action_handler = ACTION_MAP[action.type] | ||||
|         success = await action_handler(action, page) | ||||
|  | ||||
|         if not success: | ||||
|             return | ||||
|  | ||||
|         await asyncio.sleep(2) | ||||
|  | ||||
|     await scrape_content(page, pages, collect_media=collect_media) | ||||
|  | ||||
|     cleared_site_map_dict = clear_done_actions(site_map_dict) | ||||
|  | ||||
|     if cleared_site_map_dict["actions"]: | ||||
|         await handle_site_mapping( | ||||
|             cleared_site_map_dict, page, pages, collect_media=collect_media | ||||
|         ) | ||||
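
The recursion above makes progress because `clear_done_actions` strips every `do_once` action after the first pass; only repeatable actions (for example a pagination "next" click) survive, and the recursion stops once an action handler fails or no repeatable actions remain. A small illustration with a hypothetical site map:

```python
# Sketch: why handle_site_mapping's recursion winds down.
# do_once actions are removed after the first pass; repeatable ones
# remain until their handler fails (e.g. the "next" button disappears).
from api.backend.job.site_mapping.site_mapping import clear_done_actions

site_map = {
    "actions": [
        {"type": "input", "xpath": "//input", "name": "q", "input": "a", "do_once": True},
        {"type": "click", "xpath": "//button[@id='next']", "name": "next", "input": "", "do_once": False},
    ]
}

first_pass = clear_done_actions(site_map)
print([a["name"] for a in first_pass["actions"]])  # ['next'] -- only the repeatable click remains
```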
| @@ -2,14 +2,13 @@ | ||||
| from typing import Any, Optional, Union | ||||
| from datetime import datetime | ||||
|  | ||||
| # LOCAL | ||||
| from api.backend.job.models.job_options import JobOptions | ||||
|  | ||||
| # PDM | ||||
| import pydantic | ||||
|  | ||||
|  | ||||
| class FetchOptions(pydantic.BaseModel): | ||||
|     chat: Optional[bool] = None | ||||
|  | ||||
|  | ||||
| class Element(pydantic.BaseModel): | ||||
|     name: str | ||||
|     xpath: str | ||||
| @@ -22,11 +21,6 @@ class CapturedElement(pydantic.BaseModel): | ||||
|     name: str | ||||
|  | ||||
|  | ||||
| class JobOptions(pydantic.BaseModel): | ||||
|     multi_page_scrape: bool | ||||
|     custom_headers: Optional[dict[str, Any]] | ||||
|  | ||||
|  | ||||
| class RetrieveScrapeJobs(pydantic.BaseModel): | ||||
|     user: str | ||||
|  | ||||
| @@ -63,3 +57,17 @@ class Job(pydantic.BaseModel): | ||||
|     job_options: JobOptions | ||||
|     status: str = "Queued" | ||||
|     chat: Optional[str] = None | ||||
|  | ||||
|  | ||||
| class CronJob(pydantic.BaseModel): | ||||
|     id: Optional[str] = None | ||||
|     user_email: str | ||||
|     job_id: str | ||||
|     cron_expression: str | ||||
|     time_created: Optional[Union[datetime, str]] = None | ||||
|     time_updated: Optional[Union[datetime, str]] = None | ||||
|  | ||||
|  | ||||
| class DeleteCronJob(pydantic.BaseModel): | ||||
|     id: str | ||||
|     user_email: str | ||||
|   | ||||
| @@ -1,4 +1,5 @@ | ||||
| # STL | ||||
| import datetime | ||||
| import uuid | ||||
| import traceback | ||||
| from io import StringIO | ||||
| @@ -10,24 +11,33 @@ import random | ||||
| from fastapi import Depends, APIRouter | ||||
| from fastapi.encoders import jsonable_encoder | ||||
| from fastapi.responses import JSONResponse, StreamingResponse | ||||
| from api.backend.scheduler import scheduler | ||||
| from apscheduler.triggers.cron import CronTrigger  # type: ignore | ||||
|  | ||||
| # LOCAL | ||||
| from api.backend.job import ( | ||||
|     query, | ||||
|     insert, | ||||
|     update_job, | ||||
|     delete_jobs, | ||||
| ) | ||||
| from api.backend.job import insert, update_job, delete_jobs | ||||
| from api.backend.models import ( | ||||
|     DeleteCronJob, | ||||
|     UpdateJobs, | ||||
|     DownloadJob, | ||||
|     FetchOptions, | ||||
|     DeleteScrapeJobs, | ||||
|     Job, | ||||
|     CronJob, | ||||
| ) | ||||
| from api.backend.schemas import User | ||||
| from api.backend.auth.auth_utils import get_current_user | ||||
| from api.backend.utils import clean_text | ||||
| from api.backend.utils import clean_text, format_list_for_query | ||||
| from api.backend.job.models.job_options import FetchOptions | ||||
|  | ||||
| from api.backend.database.common import query | ||||
|  | ||||
| from api.backend.job.cron_scheduling.cron_scheduling import ( | ||||
|     delete_cron_job, | ||||
|     get_cron_job_trigger, | ||||
|     insert_cron_job, | ||||
|     get_cron_jobs, | ||||
|     insert_job_from_cron_job, | ||||
| ) | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
| @@ -47,10 +57,11 @@ async def submit_scrape_job(job: Job): | ||||
|         job.id = uuid.uuid4().hex | ||||
|  | ||||
|         job_dict = job.model_dump() | ||||
|         await insert(job_dict) | ||||
|         insert(job_dict) | ||||
|  | ||||
|         return JSONResponse(content=f"Job queued for scraping: {job.id}") | ||||
|         return JSONResponse(content={"id": job.id}) | ||||
|     except Exception as e: | ||||
|         LOG.error(f"Exception occurred: {traceback.format_exc()}") | ||||
|         return JSONResponse(content={"error": str(e)}, status_code=500) | ||||
|  | ||||
|  | ||||
| @@ -59,8 +70,11 @@ async def retrieve_scrape_jobs( | ||||
|     fetch_options: FetchOptions, user: User = Depends(get_current_user) | ||||
| ): | ||||
|     LOG.info(f"Retrieving jobs for account: {user.email}") | ||||
|     ATTRIBUTES = "chat" if fetch_options.chat else "*" | ||||
|  | ||||
|     try: | ||||
|         results = await query({"user": user.email}, fetch_options=fetch_options) | ||||
|         job_query = f"SELECT {ATTRIBUTES} FROM jobs WHERE user = ?" | ||||
|         results = query(job_query, (user.email,)) | ||||
|         return JSONResponse(content=jsonable_encoder(results[::-1])) | ||||
|     except Exception as e: | ||||
|         LOG.error(f"Exception occurred: {e}") | ||||
| @@ -70,9 +84,10 @@ async def retrieve_scrape_jobs( | ||||
| @job_router.get("/job/{id}") | ||||
| async def job(id: str, user: User = Depends(get_current_user)): | ||||
|     LOG.info(f"Retrieving jobs for account: {user.email}") | ||||
|  | ||||
|     try: | ||||
|         filter = {"user": user.email, "id": id} | ||||
|         results = await query(filter) | ||||
|         job_query = "SELECT * FROM jobs WHERE user = ? AND id = ?" | ||||
|         results = query(job_query, (user.email, id)) | ||||
|         return JSONResponse(content=jsonable_encoder(results)) | ||||
|     except Exception as e: | ||||
|         LOG.error(f"Exception occurred: {e}") | ||||
| @@ -84,7 +99,10 @@ async def download(download_job: DownloadJob): | ||||
|     LOG.info(f"Downloading job with ids: {download_job.ids}") | ||||
|  | ||||
|     try: | ||||
|         results = await query({"id": {"$in": download_job.ids}}) | ||||
|         job_query = ( | ||||
|             f"SELECT * FROM jobs WHERE id IN {format_list_for_query(download_job.ids)}" | ||||
|         ) | ||||
|         results = query(job_query, tuple(download_job.ids)) | ||||
|  | ||||
|         csv_buffer = StringIO() | ||||
|         csv_writer = csv.writer(csv_buffer, quotechar='"', quoting=csv.QUOTE_ALL) | ||||
| @@ -135,3 +153,47 @@ async def delete(delete_scrape_jobs: DeleteScrapeJobs): | ||||
|         if result | ||||
|         else JSONResponse({"error": "Jobs not deleted."}) | ||||
|     ) | ||||
|  | ||||
|  | ||||
| @job_router.post("/schedule-cron-job") | ||||
| async def schedule_cron_job(cron_job: CronJob): | ||||
|     if not cron_job.id: | ||||
|         cron_job.id = uuid.uuid4().hex | ||||
|  | ||||
|     if not cron_job.time_created: | ||||
|         cron_job.time_created = datetime.datetime.now() | ||||
|  | ||||
|     if not cron_job.time_updated: | ||||
|         cron_job.time_updated = datetime.datetime.now() | ||||
|  | ||||
|     insert_cron_job(cron_job) | ||||
|  | ||||
|     queried_job = query("SELECT * FROM jobs WHERE id = ?", (cron_job.job_id,)) | ||||
|  | ||||
|     scheduler.add_job( | ||||
|         insert_job_from_cron_job, | ||||
|         get_cron_job_trigger(cron_job.cron_expression), | ||||
|         id=cron_job.id, | ||||
|         args=[queried_job[0]], | ||||
|     ) | ||||
|  | ||||
|     return JSONResponse(content={"message": "Cron job scheduled successfully."}) | ||||
|  | ||||
|  | ||||
| @job_router.post("/delete-cron-job") | ||||
| async def delete_cron_job_request(request: DeleteCronJob): | ||||
|     if not request.id: | ||||
|         return JSONResponse( | ||||
|             content={"error": "Cron job id is required."}, status_code=400 | ||||
|         ) | ||||
|  | ||||
|     delete_cron_job(request.id, request.user_email) | ||||
|     scheduler.remove_job(request.id) | ||||
|  | ||||
|     return JSONResponse(content={"message": "Cron job deleted successfully."}) | ||||
|  | ||||
|  | ||||
| @job_router.get("/cron-jobs") | ||||
| async def get_cron_jobs_request(user: User = Depends(get_current_user)): | ||||
|     cron_jobs = get_cron_jobs(user.email) | ||||
|     return JSONResponse(content=jsonable_encoder(cron_jobs)) | ||||
|   | ||||
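
The new cron endpoints can be exercised directly over HTTP. A hedged sketch using `requests` (assumes the API from the compose file is reachable on `localhost:8000` with no router prefix, and that the job id placeholder is replaced with an existing row; `/schedule-cron-job` takes no auth dependency in the code above):

```python
# Sketch: scheduling a recurring job through the API.
import requests

payload = {
    "user_email": "test@test.com",                 # hypothetical account
    "job_id": "REPLACE_WITH_EXISTING_JOB_ID",      # must reference a real job row
    "cron_expression": "0 8 * * *",                # every day at 08:00
}

resp = requests.post("http://localhost:8000/schedule-cron-job", json=payload, timeout=10)
resp.raise_for_status()
print(resp.json())  # {"message": "Cron job scheduled successfully."}
```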
| @@ -1,46 +0,0 @@ | ||||
| # STL | ||||
| import logging | ||||
| import docker | ||||
|  | ||||
| # PDM | ||||
| from fastapi import APIRouter, HTTPException | ||||
| from fastapi.responses import JSONResponse, StreamingResponse | ||||
|  | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
| log_router = APIRouter() | ||||
|  | ||||
| client = docker.from_env() | ||||
|  | ||||
|  | ||||
| @log_router.get("/initial_logs") | ||||
| async def get_initial_logs(): | ||||
|     container_id = "scraperr_api" | ||||
|  | ||||
|     try: | ||||
|         container = client.containers.get(container_id) | ||||
|         log_stream = container.logs(stream=False).decode("utf-8") | ||||
|         return JSONResponse(content={"logs": log_stream}) | ||||
|     except Exception as e: | ||||
|         raise HTTPException(status_code=500, detail=f"Unexpected error: {e}") | ||||
|  | ||||
|  | ||||
| @log_router.get("/logs") | ||||
| async def get_own_logs(): | ||||
|     container_id = "scraperr_api" | ||||
|  | ||||
|     try: | ||||
|         container = client.containers.get(container_id) | ||||
|         log_stream = container.logs(stream=True, follow=True) | ||||
|  | ||||
|         def log_generator(): | ||||
|             try: | ||||
|                 for log in log_stream: | ||||
|                     yield f"data: {log.decode('utf-8')}\n\n" | ||||
|             except Exception as e: | ||||
|                 yield f"data: {str(e)}\n\n" | ||||
|  | ||||
|         return StreamingResponse(log_generator(), media_type="text/event-stream") | ||||
|     except Exception as e: | ||||
|         raise HTTPException(status_code=500, detail=str(e)) | ||||
api/backend/scheduler.py (new file, 3 lines)
							| @@ -0,0 +1,3 @@ | ||||
| from apscheduler.schedulers.background import BackgroundScheduler  # type: ignore | ||||
|  | ||||
| scheduler = BackgroundScheduler() | ||||
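
This module just exposes one shared `BackgroundScheduler` instance. A sketch of how it is consumed (the `tick` job is hypothetical; the application is assumed to call `scheduler.start()` once at startup):

```python
# Sketch: registering a job on the shared scheduler.
# CronTrigger.from_crontab parses a standard five-field cron expression.
from apscheduler.triggers.cron import CronTrigger  # type: ignore

from api.backend.scheduler import scheduler


def tick() -> None:
    print("running scheduled job")


scheduler.add_job(tick, CronTrigger.from_crontab("*/5 * * * *"), id="demo")
scheduler.start()  # assumed to happen once, at application startup
```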
| @@ -1,28 +1,20 @@ | ||||
| import logging | ||||
| from typing import Any, Optional | ||||
| import time | ||||
| import random | ||||
| from typing import Any, Optional, cast | ||||
|  | ||||
| from bs4 import BeautifulSoup | ||||
| from bs4 import BeautifulSoup, Tag | ||||
| from lxml import etree | ||||
| from seleniumwire import webdriver | ||||
| from lxml.etree import _Element  # type: ignore [reportPrivateImport] | ||||
| from fake_useragent import UserAgent | ||||
| from webdriver_manager.chrome import ChromeDriverManager | ||||
| from selenium.webdriver.support import expected_conditions as EC | ||||
| from selenium.webdriver.common.by import By | ||||
| from selenium.webdriver.support.ui import WebDriverWait | ||||
| from selenium.webdriver.chrome.options import Options as ChromeOptions | ||||
| from selenium.webdriver.chrome.service import Service | ||||
| from camoufox import AsyncCamoufox | ||||
| from playwright.async_api import Page | ||||
| from urllib.parse import urlparse, urljoin | ||||
|  | ||||
| from api.backend.models import Element, CapturedElement | ||||
| from api.backend.job.scraping.scraping_utils import scrape_content | ||||
| from api.backend.job.site_mapping.site_mapping import handle_site_mapping | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| class HtmlElement(_Element): ... | ||||
|  | ||||
|  | ||||
| def is_same_domain(url: str, original_url: str) -> bool: | ||||
|     parsed_url = urlparse(url) | ||||
|     parsed_original_url = urlparse(original_url) | ||||
| @@ -31,46 +23,15 @@ def is_same_domain(url: str, original_url: str) -> bool: | ||||
|  | ||||
| def clean_xpath(xpath: str) -> str: | ||||
|     parts = xpath.split("/") | ||||
|     clean_parts: list[str] = [] | ||||
|     for part in parts: | ||||
|         if part == "": | ||||
|             clean_parts.append("/") | ||||
|         else: | ||||
|             clean_parts.append(part) | ||||
|     clean_xpath = "//".join(clean_parts).replace("////", "//") | ||||
|     clean_xpath = clean_xpath.replace("'", "\\'") | ||||
|     clean_parts = ["/" if part == "" else part for part in parts] | ||||
|     clean_xpath = "//".join(clean_parts).replace("////", "//").replace("'", "\\'") | ||||
|     LOG.info(f"Cleaned xpath: {clean_xpath}") | ||||
|  | ||||
|     return clean_xpath | ||||
|  | ||||
|  | ||||
| def sxpath(context: _Element, xpath: str) -> list[HtmlElement]: | ||||
|     return context.xpath(xpath)  # pyright: ignore [reportReturnType] | ||||
|  | ||||
|  | ||||
| def interceptor(headers: dict[str, Any]): | ||||
|     def _interceptor(request: Any): | ||||
|         for key, val in headers.items(): | ||||
|             if request.headers.get(key): | ||||
|                 del request.headers[key] | ||||
|             request.headers[key] = val | ||||
|         if "sec-ch-ua" in request.headers: | ||||
|             original_value = request.headers["sec-ch-ua"] | ||||
|             del request.headers["sec-ch-ua"] | ||||
|             modified_value = original_value.replace("HeadlessChrome", "Chrome") | ||||
|             request.headers["sec-ch-ua"] = modified_value | ||||
|  | ||||
|     return _interceptor | ||||
|  | ||||
|  | ||||
| def create_driver(): | ||||
|     ua = UserAgent() | ||||
|     chrome_options = ChromeOptions() | ||||
|     chrome_options.add_argument("--headless") | ||||
|     chrome_options.add_argument("--no-sandbox") | ||||
|     chrome_options.add_argument("--disable-dev-shm-usage") | ||||
|     chrome_options.add_argument(f"user-agent={ua.random}") | ||||
|  | ||||
|     return webdriver.Chrome(options=chrome_options) | ||||
| def sxpath(context: etree._Element, xpath: str): | ||||
|     return context.xpath(xpath) | ||||
|  | ||||
|  | ||||
| async def make_site_request( | ||||
| @@ -80,92 +41,105 @@ async def make_site_request( | ||||
|     visited_urls: set[str] = set(), | ||||
|     pages: set[tuple[str, str]] = set(), | ||||
|     original_url: str = "", | ||||
| ) -> None: | ||||
|     """Make basic `GET` request to site using Selenium.""" | ||||
|     # Check if URL has already been visited | ||||
|     proxies: Optional[list[str]] = None, | ||||
|     site_map: Optional[dict[str, Any]] = None, | ||||
|     collect_media: bool = False, | ||||
| ): | ||||
|     if url in visited_urls: | ||||
|         return | ||||
|  | ||||
|     driver = create_driver() | ||||
|     driver.implicitly_wait(10) | ||||
|     proxy = None | ||||
|     if proxies: | ||||
|         proxy = random.choice(proxies) | ||||
|         LOG.info(f"Using proxy: {proxy}") | ||||
|  | ||||
|     if headers: | ||||
|         driver.request_interceptor = interceptor(headers) | ||||
|     async with AsyncCamoufox(headless=True, proxy=proxy) as browser: | ||||
|         page: Page = await browser.new_page() | ||||
|  | ||||
|         if headers: | ||||
|             await page.set_extra_http_headers(headers) | ||||
|  | ||||
|     try: | ||||
|         LOG.info(f"Visiting URL: {url}") | ||||
|         driver.get(url) | ||||
|         final_url = driver.current_url | ||||
|         visited_urls.add(url) | ||||
|         visited_urls.add(final_url) | ||||
|         _ = WebDriverWait(driver, 10).until( | ||||
|             EC.presence_of_element_located((By.TAG_NAME, "body")) | ||||
|         ) | ||||
|  | ||||
|         last_height = driver.execute_script("return document.body.scrollHeight") | ||||
|         while True: | ||||
|             driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") | ||||
|         try: | ||||
|             await page.goto(url, timeout=60000) | ||||
|             await page.wait_for_load_state("networkidle", timeout=10000) | ||||
|  | ||||
|             time.sleep(2)  # Wait for the page to load | ||||
|             new_height = driver.execute_script("return document.body.scrollHeight") | ||||
|             final_url = page.url | ||||
|  | ||||
|             if new_height == last_height: | ||||
|                 break | ||||
|             visited_urls.add(url) | ||||
|             visited_urls.add(final_url) | ||||
|  | ||||
|             last_height = new_height | ||||
|             html_content = await scrape_content(page, pages, collect_media) | ||||
|  | ||||
|         driver.execute_script("return document.body.scrollHeight") | ||||
|         page_source = driver.page_source | ||||
|             html_content = await page.content() | ||||
|             pages.add((html_content, final_url)) | ||||
|  | ||||
|         LOG.debug(f"Page source for url: {url}\n{page_source}") | ||||
|         pages.add((page_source, final_url)) | ||||
|     finally: | ||||
|         driver.quit() | ||||
|             if site_map: | ||||
|                 await handle_site_mapping( | ||||
|                     site_map, page, pages, collect_media=collect_media | ||||
|                 ) | ||||
|  | ||||
|         finally: | ||||
|             await page.close() | ||||
|             await browser.close() | ||||
|  | ||||
|     if not multi_page_scrape: | ||||
|         return | ||||
|  | ||||
|     soup = BeautifulSoup(page_source, "html.parser") | ||||
|     soup = BeautifulSoup(html_content, "html.parser") | ||||
|  | ||||
|     for a_tag in soup.find_all("a"): | ||||
|         link = a_tag.get("href") | ||||
|         if not isinstance(a_tag, Tag): | ||||
|             continue | ||||
|  | ||||
|         if link: | ||||
|             if not urlparse(link).netloc: | ||||
|                 base_url = "{0.scheme}://{0.netloc}".format(urlparse(final_url)) | ||||
|                 link = urljoin(base_url, link) | ||||
|         link = cast(str, a_tag.get("href", "")) | ||||
|  | ||||
|             if link not in visited_urls and is_same_domain(link, original_url): | ||||
|                 await make_site_request( | ||||
|                     link, | ||||
|                     headers=headers, | ||||
|                     multi_page_scrape=multi_page_scrape, | ||||
|                     visited_urls=visited_urls, | ||||
|                     pages=pages, | ||||
|                     original_url=original_url, | ||||
|                 ) | ||||
|         if not link: | ||||
|             continue | ||||
|  | ||||
|         if not urlparse(link).netloc: | ||||
|             base_url = "{0.scheme}://{0.netloc}".format(urlparse(final_url)) | ||||
|             link = urljoin(base_url, link) | ||||
|  | ||||
|         if link not in visited_urls and is_same_domain(link, original_url): | ||||
|             await make_site_request( | ||||
|                 link, | ||||
|                 headers=headers, | ||||
|                 multi_page_scrape=multi_page_scrape, | ||||
|                 visited_urls=visited_urls, | ||||
|                 pages=pages, | ||||
|                 original_url=original_url, | ||||
|                 proxies=proxies, | ||||
|                 site_map=site_map, | ||||
|                 collect_media=collect_media, | ||||
|             ) | ||||
|  | ||||
|  | ||||
| async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element]): | ||||
|     soup = BeautifulSoup(page[0], "lxml") | ||||
|     root = etree.HTML(str(soup)) | ||||
|  | ||||
|     elements: dict[str, list[CapturedElement]] = dict() | ||||
|     elements: dict[str, list[CapturedElement]] = {} | ||||
|  | ||||
|     for elem in xpaths: | ||||
|         el = sxpath(root, elem.xpath) | ||||
|  | ||||
|         for e in el: | ||||
|             text = "\t".join(str(t) for t in e.itertext()) | ||||
|         for e in el:  # type: ignore | ||||
|             text = ( | ||||
|                 "\t".join(str(t) for t in e.itertext()) | ||||
|                 if isinstance(e, etree._Element) | ||||
|                 else str(e)  # type: ignore | ||||
|             ) | ||||
|  | ||||
|             captured_element = CapturedElement( | ||||
|                 xpath=elem.xpath, text=text, name=elem.name | ||||
|             ) | ||||
|  | ||||
|             if elem.name in elements: | ||||
|                 elements[elem.name].append(captured_element) | ||||
|                 continue | ||||
|  | ||||
|             elements[elem.name] = [captured_element] | ||||
|             else: | ||||
|                 elements[elem.name] = [captured_element] | ||||
|  | ||||
|     return {page[1]: elements} | ||||
|  | ||||
| @@ -173,22 +147,28 @@ async def collect_scraped_elements(page: tuple[str, str], xpaths: list[Element]) | ||||
| async def scrape( | ||||
|     url: str, | ||||
|     xpaths: list[Element], | ||||
|     headers: Optional[dict[str, Any]], | ||||
|     headers: Optional[dict[str, Any]] = None, | ||||
|     multi_page_scrape: bool = False, | ||||
|     proxies: Optional[list[str]] = None, | ||||
|     site_map: Optional[dict[str, Any]] = None, | ||||
|     collect_media: bool = False, | ||||
| ): | ||||
|     visited_urls: set[str] = set() | ||||
|     pages: set[tuple[str, str]] = set() | ||||
|  | ||||
|     _ = await make_site_request( | ||||
|     await make_site_request( | ||||
|         url, | ||||
|         headers, | ||||
|         headers=headers, | ||||
|         multi_page_scrape=multi_page_scrape, | ||||
|         visited_urls=visited_urls, | ||||
|         pages=pages, | ||||
|         original_url=url, | ||||
|         proxies=proxies, | ||||
|         site_map=site_map, | ||||
|         collect_media=collect_media, | ||||
|     ) | ||||
|  | ||||
|     elements: list[dict[str, dict[str, list[CapturedElement]]]] = list() | ||||
|     elements: list[dict[str, dict[str, list[CapturedElement]]]] = [] | ||||
|  | ||||
|     for page in pages: | ||||
|         elements.append(await collect_scraped_elements(page, xpaths)) | ||||
|   | ||||
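
With the Selenium driver replaced by Camoufox, the rewritten `scrape` can be invoked end to end. A hedged sketch (hypothetical target and xpaths; Camoufox and Playwright must be installed):

```python
# Sketch: calling the new scrape() entry point.
import asyncio

from api.backend.models import Element
from api.backend.scraping import scrape


async def main() -> None:
    results = await scrape(
        url="https://example.com",
        xpaths=[Element(name="title", xpath="//h1")],
        multi_page_scrape=False,
        collect_media=False,
    )
    # One entry per scraped page: {page_url: {element_name: [CapturedElement, ...]}}
    print(results)


asyncio.run(main())
```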
| @@ -5,12 +5,14 @@ from faker import Faker | ||||
| fake = Faker() | ||||
|  | ||||
|  | ||||
| def create_job(): | ||||
| def create_job( | ||||
|     job_options: JobOptions = JobOptions(multi_page_scrape=False, custom_headers={}) | ||||
| ): | ||||
|     return Job( | ||||
|         id=uuid.uuid4().hex, | ||||
|         url="https://example.com", | ||||
|         elements=[Element(name="test", xpath="xpath")], | ||||
|         job_options=JobOptions(multi_page_scrape=False, custom_headers={}), | ||||
|         job_options=job_options, | ||||
|     ) | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -9,12 +9,18 @@ client = TestClient(app) | ||||
|  | ||||
| mocked_job = create_completed_job().model_dump() | ||||
| mock_results = [mocked_job] | ||||
| mocked_random_int = 123456 | ||||
|  | ||||
|  | ||||
| @pytest.mark.asyncio | ||||
| @patch("api.backend.app.query") | ||||
| async def test_download(mock_query: AsyncMock): | ||||
| @patch("api.backend.routers.job_router.query") | ||||
| @patch("api.backend.routers.job_router.random.randint") | ||||
| async def test_download(mock_randint: AsyncMock, mock_query: AsyncMock): | ||||
|     # Ensure the mock returns immediately | ||||
|     mock_query.return_value = mock_results | ||||
|     mock_randint.return_value = mocked_random_int | ||||
|  | ||||
|     # Create a DownloadJob instance | ||||
|     download_job = DownloadJob(ids=[mocked_job["id"]]) | ||||
|  | ||||
|     # Make a POST request to the /download endpoint | ||||
| @@ -26,5 +32,9 @@ async def test_download(mock_query: AsyncMock): | ||||
|  | ||||
|     # Check the content of the CSV | ||||
|     csv_content = response.content.decode("utf-8") | ||||
|     expected_csv = f"id,url,element_name,xpath,text,user,time_created\r\n{mocked_job['id']},https://example.com,element_name,//div,example,{mocked_job['user']},{mocked_job['time_created']}\r\n" | ||||
|     expected_csv = ( | ||||
|         f'"id","url","element_name","xpath","text","user","time_created"\r\n' | ||||
|         f'"{mocked_job["id"]}-{mocked_random_int}","https://example.com","element_name","//div","example",' | ||||
|         f'"{mocked_job["user"]}","{mocked_job["time_created"]}"\r\n' | ||||
|     ) | ||||
|     assert csv_content == expected_csv | ||||
|   | ||||
api/backend/tests/scraping/__init__.py (new file, 0 lines)
api/backend/tests/scraping/test_scraping.py (new file, 25 lines)
							| @@ -0,0 +1,25 @@ | ||||
| import pytest | ||||
| import logging | ||||
| from playwright.async_api import async_playwright, Error | ||||
|  | ||||
| logging.basicConfig(level=logging.DEBUG) | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| @pytest.mark.asyncio | ||||
| async def test_proxy(): | ||||
|     proxy = "127.0.0.1:8080" | ||||
|  | ||||
|     async with async_playwright() as p: | ||||
|         browser = await p.firefox.launch( | ||||
|             headless=True, proxy={"server": f"http://{proxy}"} | ||||
|         ) | ||||
|         context = await browser.new_context() | ||||
|         page = await context.new_page() | ||||
|  | ||||
|         with pytest.raises(Error) as excinfo: | ||||
|             await page.goto("http://example.com") | ||||
|  | ||||
|         assert "NS_ERROR_PROXY_CONNECTION_REFUSED" in str(excinfo.value) | ||||
|  | ||||
|         await browser.close() | ||||
| @@ -1,5 +1,8 @@ | ||||
| from typing import Optional | ||||
| from typing import Any, Optional | ||||
| import logging | ||||
| import json | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| def clean_text(text: str): | ||||
| @@ -17,3 +20,30 @@ def get_log_level(level_name: Optional[str]) -> int: | ||||
|         level = getattr(logging, level_name, logging.INFO) | ||||
|  | ||||
|     return level | ||||
|  | ||||
|  | ||||
| def format_list_for_query(ids: list[str]): | ||||
|     return f"({','.join('?' for _ in ids)})"  # SQL placeholders, e.g. "(?,?,?)" | ||||
|  | ||||
|  | ||||
| def format_sql_row_to_python(row: dict[str, Any]): | ||||
|     new_row: dict[str, Any] = {} | ||||
|     for key, value in row.items(): | ||||
|         if isinstance(value, str): | ||||
|             try: | ||||
|                 new_row[key] = json.loads(value) | ||||
|             except json.JSONDecodeError: | ||||
|                 new_row[key] = value | ||||
|         else: | ||||
|             new_row[key] = value | ||||
|  | ||||
|     return new_row | ||||
|  | ||||
|  | ||||
| def format_json(items: list[Any]): | ||||
|     for idx, item in enumerate(items): | ||||
|         if isinstance(item, (dict, list)): | ||||
|             formatted_item = json.dumps(item) | ||||
|             items[idx] = formatted_item | ||||
|   | ||||
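
The two SQL helpers are pure functions, so their behavior is easy to pin down in isolation:

```python
# Sketch: the helpers above applied to a raw SQLite row.
from api.backend.utils import format_list_for_query, format_sql_row_to_python

print(format_list_for_query(["a", "b", "c"]))  # (?,?,?)

row = {"id": "abc", "elements": '[{"name": "t", "xpath": "//h1"}]', "status": "Queued"}
decoded = format_sql_row_to_python(row)
print(decoded["elements"][0]["xpath"])  # //h1 -- JSON strings are parsed back to Python
```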
| @@ -1,19 +1,35 @@ | ||||
| import os | ||||
|  | ||||
| from api.backend.job import get_queued_job, update_job | ||||
| from api.backend.scraping import scrape | ||||
| from api.backend.models import Element | ||||
| from fastapi.encoders import jsonable_encoder | ||||
|  | ||||
| import asyncio | ||||
| import logging | ||||
| import sys | ||||
| import traceback | ||||
|  | ||||
| logging.basicConfig(stream=sys.stdout, level=logging.INFO) | ||||
| LOG = logging.getLogger(__name__) | ||||
| from api.backend.database.startup import init_database | ||||
|  | ||||
| from api.backend.worker.post_job_complete.post_job_complete import post_job_complete | ||||
| from api.backend.worker.logger import LOG | ||||
|  | ||||
|  | ||||
| NOTIFICATION_CHANNEL = os.getenv("NOTIFICATION_CHANNEL", "") | ||||
| NOTIFICATION_WEBHOOK_URL = os.getenv("NOTIFICATION_WEBHOOK_URL", "") | ||||
| SCRAPERR_FRONTEND_URL = os.getenv("SCRAPERR_FRONTEND_URL", "") | ||||
| EMAIL = os.getenv("EMAIL", "") | ||||
| TO = os.getenv("TO", "") | ||||
| SMTP_HOST = os.getenv("SMTP_HOST", "") | ||||
| SMTP_PORT = int(os.getenv("SMTP_PORT", 587)) | ||||
| SMTP_USER = os.getenv("SMTP_USER", "") | ||||
| SMTP_PASSWORD = os.getenv("SMTP_PASSWORD", "") | ||||
| USE_TLS = os.getenv("USE_TLS", "false").lower() == "true" | ||||
|  | ||||
|  | ||||
| async def process_job(): | ||||
|     job = await get_queued_job() | ||||
|     status = "Queued" | ||||
|  | ||||
|     if job: | ||||
|         LOG.info(f"Beginning processing job: {job}.") | ||||
|         try: | ||||
| @@ -23,6 +39,9 @@ async def process_job(): | ||||
|                 [Element(**j) for j in job["elements"]], | ||||
|                 job["job_options"]["custom_headers"], | ||||
|                 job["job_options"]["multi_page_scrape"], | ||||
|                 job["job_options"]["proxies"], | ||||
|                 job["job_options"]["site_map"], | ||||
|                 job["job_options"]["collect_media"], | ||||
|             ) | ||||
|             LOG.info( | ||||
|                 f"Scraped result for url: {job['url']}, with elements: {job['elements']}\n{scraped}" | ||||
| @@ -31,14 +50,37 @@ async def process_job(): | ||||
|                 [job["id"]], field="result", value=jsonable_encoder(scraped) | ||||
|             ) | ||||
|             _ = await update_job([job["id"]], field="status", value="Completed") | ||||
|             status = "Completed" | ||||
|  | ||||
|         except Exception as e: | ||||
|             _ = await update_job([job["id"]], field="status", value="Failed") | ||||
|             _ = await update_job([job["id"]], field="result", value=e) | ||||
|             LOG.error(f"Exception as occured: {e}\n{traceback.print_exc()}") | ||||
|             status = "Failed" | ||||
|         finally: | ||||
|             job["status"] = status | ||||
|             await post_job_complete( | ||||
|                 job, | ||||
|                 { | ||||
|                     "channel": NOTIFICATION_CHANNEL, | ||||
|                     "webhook_url": NOTIFICATION_WEBHOOK_URL, | ||||
|                     "scraperr_frontend_url": SCRAPERR_FRONTEND_URL, | ||||
|                     "email": EMAIL, | ||||
|                     "to": TO, | ||||
|                     "smtp_host": SMTP_HOST, | ||||
|                     "smtp_port": SMTP_PORT, | ||||
|                     "smtp_user": SMTP_USER, | ||||
|                     "smtp_password": SMTP_PASSWORD, | ||||
|                     "use_tls": USE_TLS, | ||||
|                 }, | ||||
|             ) | ||||
|  | ||||
|  | ||||
| async def main(): | ||||
|     LOG.info("Starting job worker...") | ||||
|  | ||||
|     init_database() | ||||
|  | ||||
|     while True: | ||||
|         await process_job() | ||||
|         await asyncio.sleep(5) | ||||
|   | ||||
api/backend/worker/logger.py (new file, 12 lines)
							| @@ -0,0 +1,12 @@ | ||||
| import logging | ||||
| import os | ||||
|  | ||||
| from api.backend.utils import get_log_level | ||||
|  | ||||
| logging.basicConfig( | ||||
|     level=get_log_level(os.getenv("LOG_LEVEL")), | ||||
|     format="%(levelname)s:     %(asctime)s - %(name)s - %(message)s", | ||||
|     handlers=[logging.StreamHandler()], | ||||
| ) | ||||
|  | ||||
| LOG = logging.getLogger(__name__) | ||||
api/backend/worker/post_job_complete/discord_notification.py (new file, 56 lines)
							| @@ -0,0 +1,56 @@ | ||||
| import json | ||||
| from typing import Any | ||||
|  | ||||
| import requests | ||||
|  | ||||
| from api.backend.worker.logger import LOG | ||||
| from api.backend.worker.post_job_complete.models import ( | ||||
|     PostJobCompleteOptions, | ||||
|     JOB_COLOR_MAP, | ||||
| ) | ||||
|  | ||||
|  | ||||
| def discord_notification(job: dict[str, Any], options: PostJobCompleteOptions): | ||||
|     webhook_url = options["webhook_url"] | ||||
|     scraperr_frontend_url = options["scraperr_frontend_url"] | ||||
|  | ||||
|     LOG.info(f"Sending discord notification to {webhook_url}") | ||||
|  | ||||
|     embed = { | ||||
|         "title": "Job Completed", | ||||
|         "description": "Scraping job has been completed.", | ||||
|         "color": JOB_COLOR_MAP[job["status"]], | ||||
|         "url": f"{scraperr_frontend_url}/jobs?search={job['id']}&type=id", | ||||
|         "image": { | ||||
|             "url": "https://github.com/jaypyles/Scraperr/raw/master/docs/logo_picture.png", | ||||
|         }, | ||||
|         "author": { | ||||
|             "name": "Scraperr", | ||||
|             "url": "https://github.com/jaypyles/Scraperr", | ||||
|         }, | ||||
|         "fields": [ | ||||
|             { | ||||
|                 "name": "Status", | ||||
|                 "value": "Completed", | ||||
|                 "inline": True, | ||||
|             }, | ||||
|             { | ||||
|                 "name": "URL", | ||||
|                 "value": job["url"], | ||||
|                 "inline": True, | ||||
|             }, | ||||
|             { | ||||
|                 "name": "ID", | ||||
|                 "value": job["id"], | ||||
|                 "inline": False, | ||||
|             }, | ||||
|             { | ||||
|                 "name": "Options", | ||||
|                 "value": f"```json\n{json.dumps(job['job_options'], indent=4)}\n```", | ||||
|                 "inline": False, | ||||
|             }, | ||||
|         ], | ||||
|     } | ||||
|  | ||||
|     payload = {"embeds": [embed]} | ||||
|     requests.post(webhook_url, json=payload) | ||||
api/backend/worker/post_job_complete/email_notifcation.py (new file, 97 lines)
							| @@ -0,0 +1,97 @@ | ||||
| import smtplib | ||||
| import ssl | ||||
| from email.mime.text import MIMEText | ||||
| from email.mime.multipart import MIMEMultipart | ||||
| import json | ||||
| from typing import Any | ||||
|  | ||||
| from api.backend.worker.logger import LOG | ||||
|  | ||||
| from api.backend.worker.post_job_complete.models import ( | ||||
|     JOB_COLOR_MAP, | ||||
|     PostJobCompleteOptions, | ||||
| ) | ||||
|  | ||||
|  | ||||
| def send_job_complete_email( | ||||
|     job: dict[str, Any], | ||||
|     options: PostJobCompleteOptions, | ||||
| ): | ||||
|     status = job["status"] | ||||
|     status_color = JOB_COLOR_MAP.get(status, 0x808080) | ||||
|     job_url = job["url"] | ||||
|     job_id = job["id"] | ||||
|     job_options_json = json.dumps(job["job_options"], indent=4) | ||||
|     frontend_url = options["scraperr_frontend_url"] | ||||
|  | ||||
|     subject = "📦 Job Completed - Scraperr Notification" | ||||
|  | ||||
|     html = f""" | ||||
|     <html> | ||||
|       <body style="font-family: Arial, sans-serif;"> | ||||
|         <h2 style="color: #{status_color:06x};">✅ Job Completed</h2> | ||||
|         <p>Scraping job has been completed successfully.</p> | ||||
|  | ||||
|         <a href="{frontend_url}/jobs?search={job_id}&type=id" target="_blank"> | ||||
|           <img src="https://github.com/jaypyles/Scraperr/raw/master/docs/logo_picture.png" alt="Scraperr Logo" width="200"> | ||||
|         </a> | ||||
|  | ||||
|         <h3>Job Info:</h3> | ||||
|         <ul> | ||||
|           <li><strong>Status:</strong> {status}</li> | ||||
|           <li><strong>Job URL:</strong> <a href="{job_url}">{job_url}</a></li> | ||||
|           <li><strong>Job ID:</strong> {job_id}</li> | ||||
|         </ul> | ||||
|  | ||||
|         <h3>Options:</h3> | ||||
|         <pre style="background-color:#f4f4f4; padding:10px; border-radius:5px;"> | ||||
| {job_options_json} | ||||
|         </pre> | ||||
|  | ||||
|         <h3>View your job here:</h3> | ||||
|         <a href="{options['scraperr_frontend_url']}/jobs?search={job_id}&type=id">Scraperr Job</a> | ||||
|  | ||||
|         <p style="font-size: 12px; color: gray;"> | ||||
|           Sent by <a href="https://github.com/jaypyles/Scraperr" target="_blank">Scraperr</a> | ||||
|         </p> | ||||
|       </body> | ||||
|     </html> | ||||
|     """ | ||||
|  | ||||
|     # Create email | ||||
|     message = MIMEMultipart("alternative") | ||||
|     message["From"] = options["email"] | ||||
|     message["To"] = options["to"] | ||||
|     message["Subject"] = subject | ||||
|     message.attach( | ||||
|         MIMEText( | ||||
|             "Job completed. View this email in HTML format for full details.", "plain" | ||||
|         ) | ||||
|     ) | ||||
|     message.attach(MIMEText(html, "html")) | ||||
|  | ||||
|     context = ssl.create_default_context() | ||||
|  | ||||
|     try: | ||||
|         if options["use_tls"]: | ||||
|             with smtplib.SMTP(options["smtp_host"], options["smtp_port"]) as server: | ||||
|                 server.starttls(context=context) | ||||
|                 server.login(options["smtp_user"], options["smtp_password"]) | ||||
|                 server.sendmail( | ||||
|                     from_addr=options["email"], | ||||
|                     to_addrs=options["to"], | ||||
|                     msg=message.as_string(), | ||||
|                 ) | ||||
|         else: | ||||
|             with smtplib.SMTP_SSL( | ||||
|                 options["smtp_host"], options["smtp_port"], context=context | ||||
|             ) as server: | ||||
|                 server.login(options["smtp_user"], options["smtp_password"]) | ||||
|                 server.sendmail( | ||||
|                     from_addr=options["email"], | ||||
|                     to_addrs=options["to"], | ||||
|                     msg=message.as_string(), | ||||
|                 ) | ||||
|         LOG.info("✅ Email sent successfully!") | ||||
|     except Exception as e: | ||||
|         LOG.error(f"❌ Failed to send email: {e}") | ||||
api/backend/worker/post_job_complete/models.py (new file, 22 lines)
							| @@ -0,0 +1,22 @@ | ||||
| from typing import TypedDict | ||||
|  | ||||
|  | ||||
| class PostJobCompleteOptions(TypedDict): | ||||
|     channel: str | ||||
|     webhook_url: str | ||||
|     scraperr_frontend_url: str | ||||
|     email: str | ||||
|     to: str | ||||
|     smtp_host: str | ||||
|     smtp_port: int | ||||
|     smtp_user: str | ||||
|     smtp_password: str | ||||
|     use_tls: bool | ||||
|  | ||||
|  | ||||
| JOB_COLOR_MAP = { | ||||
|     "Queued": 0x0000FF, | ||||
|     "Scraping": 0x0000FF, | ||||
|     "Completed": 0x00FF00, | ||||
|     "Failed": 0xFF0000, | ||||
| } | ||||
api/backend/worker/post_job_complete/post_job_complete.py (new file, 24 lines)
							| @@ -0,0 +1,24 @@ | ||||
| from typing import Any | ||||
|  | ||||
| from api.backend.worker.post_job_complete.models import PostJobCompleteOptions | ||||
| from api.backend.worker.post_job_complete.email_notifcation import ( | ||||
|     send_job_complete_email, | ||||
| ) | ||||
| from api.backend.worker.post_job_complete.discord_notification import ( | ||||
|     discord_notification, | ||||
| ) | ||||
|  | ||||
|  | ||||
| async def post_job_complete(job: dict[str, Any], options: PostJobCompleteOptions): | ||||
|     if options["channel"] == "": | ||||
|         return | ||||
|  | ||||
|     if not options.values(): | ||||
|         return | ||||
|  | ||||
|     if options["channel"] == "discord": | ||||
|         discord_notification(job, options) | ||||
|     elif options["channel"] == "email": | ||||
|         send_job_complete_email(job, options) | ||||
|     else: | ||||
|         raise ValueError(f"Invalid channel: {options['channel']}") | ||||
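
The dispatcher can be fired by hand to test a notification channel. A sketch with placeholder values (the webhook URL is hypothetical; with `channel=""` the call is a no-op):

```python
# Sketch: triggering a completion notification manually.
import asyncio

from api.backend.worker.post_job_complete.post_job_complete import post_job_complete

job = {
    "id": "abc123",
    "url": "https://example.com",
    "status": "Completed",
    "job_options": {"multi_page_scrape": False},
}

options = {
    "channel": "discord",
    "webhook_url": "https://discord.com/api/webhooks/REPLACE_ME",  # placeholder
    "scraperr_frontend_url": "http://localhost",
    "email": "", "to": "", "smtp_host": "", "smtp_port": 587,
    "smtp_user": "", "smtp_password": "", "use_tls": False,
}

asyncio.run(post_job_complete(job, options))
```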
cypress/e2e/authentication.cy.ts (new file, 60 lines)
							| @@ -0,0 +1,60 @@ | ||||
| describe("Authentication", () => { | ||||
|   it("should register", () => { | ||||
|     cy.intercept("POST", "/api/signup").as("signup"); | ||||
|  | ||||
|     cy.visit("/").then(() => { | ||||
|       cy.get("button").contains("Login").click(); | ||||
|       cy.url().should("include", "/login"); | ||||
|  | ||||
|       cy.get("form").should("be.visible"); | ||||
|       cy.get("button") | ||||
|         .contains("No Account? Sign up") | ||||
|         .should("be.visible") | ||||
|         .click(); | ||||
|  | ||||
|       cy.get("input[name='email']").type("test@test.com"); | ||||
|       cy.get("input[name='password']").type("password"); | ||||
|       cy.get("input[name='fullName']").type("John Doe"); | ||||
|       cy.get("button[type='submit']").contains("Signup").click(); | ||||
|  | ||||
|       cy.wait("@signup").then((interception) => { | ||||
|         if (!interception.response) { | ||||
|           cy.log("No response received!"); | ||||
|           throw new Error("signup request did not return a response"); | ||||
|         } | ||||
|  | ||||
|         cy.log("Response status: " + interception.response.statusCode); | ||||
|         cy.log("Response body: " + JSON.stringify(interception.response.body)); | ||||
|  | ||||
|         expect(interception.response.statusCode).to.eq(200); | ||||
|       }); | ||||
|     }); | ||||
|   }); | ||||
|  | ||||
|   it("should login", () => { | ||||
|     cy.intercept("POST", "/api/token").as("token"); | ||||
|  | ||||
|     cy.visit("/").then(() => { | ||||
|       cy.get("button") | ||||
|         .contains("Login") | ||||
|         .click() | ||||
|         .then(() => { | ||||
|           cy.get("input[name='email']").type("test@test.com"); | ||||
|           cy.get("input[name='password']").type("password"); | ||||
|           cy.get("button[type='submit']").contains("Login").click(); | ||||
|  | ||||
|           cy.wait("@token").then((interception) => { | ||||
|             if (!interception.response) { | ||||
|               cy.log("No response received!"); | ||||
|               throw new Error("token request did not return a response"); | ||||
|             } | ||||
|  | ||||
|             cy.log("Response status: " + interception.response.statusCode); | ||||
|             cy.log("Response body: " + JSON.stringify(interception.response.body)); | ||||
|  | ||||
|             expect(interception.response.statusCode).to.eq(200); | ||||
|           }); | ||||
|         }); | ||||
|     }); | ||||
|   }); | ||||
| }); | ||||
| @@ -1,19 +1,34 @@ | ||||
| describe("Job", () => { | ||||
| describe.only("Job", () => { | ||||
|   it("should create a job", () => { | ||||
|     cy.intercept("POST", "/api/submit-scrape-job").as("submitScrapeJob"); | ||||
|  | ||||
|     cy.visit("/"); | ||||
|  | ||||
|     const input = cy.get('[data-cy="url-input"]'); | ||||
|     input.type("https://example.com"); | ||||
|     cy.get('[data-cy="url-input"]').type("https://example.com"); | ||||
|     cy.get('[data-cy="name-field"]').type("example"); | ||||
|     cy.get('[data-cy="xpath-field"]').type("//body"); | ||||
|     cy.get('[data-cy="add-button"]').click(); | ||||
|  | ||||
|     const nameField = cy.get('[data-cy="name-field"]'); | ||||
|     const xPathField = cy.get('[data-cy="xpath-field"]'); | ||||
|     const addButton = cy.get('[data-cy="add-button"]'); | ||||
|     cy.contains("Submit").click(); | ||||
|  | ||||
|     nameField.type("example"); | ||||
|     xPathField.type("//body"); | ||||
|     addButton.click(); | ||||
|     cy.wait("@submitScrapeJob").then((interception) => { | ||||
|       if (!interception.response) { | ||||
|         cy.log("No response received!"); | ||||
|         cy.log("Request body: " + JSON.stringify(interception.request?.body)); | ||||
|         throw new Error("submitScrapeJob request did not return a response"); | ||||
|       } | ||||
|  | ||||
|     const submit = cy.contains("Submit"); | ||||
|     submit.click(); | ||||
|       cy.log("Response status: " + interception.response.statusCode); | ||||
|       cy.log("Response body: " + JSON.stringify(interception.response.body)); | ||||
|  | ||||
|       expect(interception.response.statusCode).to.eq(200); | ||||
|     }); | ||||
|  | ||||
|     cy.get("li").contains("Jobs").click(); | ||||
|  | ||||
|     cy.contains("div", "https://example.com", { timeout: 10000 }).should( | ||||
|       "exist" | ||||
|     ); | ||||
|     cy.contains("div", "Completed", { timeout: 20000 }).should("exist"); | ||||
|   }); | ||||
| }); | ||||
|   | ||||
| @@ -34,4 +34,4 @@ | ||||
| //       visit(originalFn: CommandOriginalFn, url: string, options: Partial<VisitOptions>): Chainable<Element> | ||||
| //     } | ||||
| //   } | ||||
| // } | ||||
| // } | ||||
|   | ||||
| @@ -2,12 +2,6 @@ version: "3" | ||||
| services: | ||||
|   scraperr: | ||||
|     command: ["npm", "run", "dev"] | ||||
|     labels: | ||||
|       - "traefik.enable=true" | ||||
|       - "traefik.http.routers.scraperr.rule=Host(`localhost`)" | ||||
|       - "traefik.http.routers.scraperr.entrypoints=web" | ||||
|       - "traefik.http.services.scraperr.loadbalancer.server.port=3000" | ||||
|       - "traefik.http.routers.scraperr.tls=false" | ||||
|     volumes: | ||||
|       - "$PWD/src:/app/src" | ||||
|       - "$PWD/public:/app/public" | ||||
| @@ -16,7 +10,7 @@ services: | ||||
|       - "$PWD/package-lock.json:/app/package-lock.json" | ||||
|       - "$PWD/tsconfig.json:/app/tsconfig.json" | ||||
|   scraperr_api: | ||||
|     ports: | ||||
|       - "8000:8000" | ||||
|     environment: | ||||
|       - LOG_LEVEL=INFO | ||||
|     volumes: | ||||
|       - "$PWD/api:/project/app/api" | ||||
|   | ||||
| @@ -1,16 +1,18 @@ | ||||
| services: | ||||
|   scraperr: | ||||
|     depends_on: | ||||
|       - scraperr_api | ||||
|     image: jpyles0524/scraperr:latest | ||||
|     build: | ||||
|       context: . | ||||
|       dockerfile: docker/frontend/Dockerfile | ||||
|     container_name: scraperr | ||||
|     command: ["npm", "run", "start"] | ||||
|     labels: | ||||
|       - "traefik.enable=true" | ||||
|       - "traefik.http.routers.scraperr.rule=Host(`localhost`)" # change this to your domain, if not running on localhost | ||||
|       - "traefik.http.routers.scraperr.entrypoints=web" # websecure if using https | ||||
|       - "traefik.http.services.scraperr.loadbalancer.server.port=3000" | ||||
|     environment: | ||||
|       - NEXT_PUBLIC_API_URL=http://scraperr_api:8000 # your API URL | ||||
|       - SERVER_URL=http://scraperr_api:8000 # your docker container API URL | ||||
|     ports: | ||||
|       - 80:3000 | ||||
|     networks: | ||||
|       - web | ||||
|   scraperr_api: | ||||
| @@ -21,46 +23,14 @@ services: | ||||
|       dockerfile: docker/api/Dockerfile | ||||
|     environment: | ||||
|       - LOG_LEVEL=INFO | ||||
|       - OLLAMA_URL=http://ollama:11434 | ||||
|       - OLLAMA_MODEL=phi3 | ||||
|       - MONGODB_URI=mongodb://root:example@webscrape-mongo:27017 # used to access MongoDB | ||||
|       - SECRET_KEY=your_secret_key # used to encode authentication tokens (can be a random string) | ||||
|       - ALGORITHM=HS256 # authentication encoding algorithm | ||||
|       - ACCESS_TOKEN_EXPIRE_MINUTES=600 # access token expire minutes | ||||
|     container_name: scraperr_api | ||||
|     volumes: | ||||
|       - /var/run/docker.sock:/var/run/docker.sock | ||||
|     labels: | ||||
|       - "traefik.enable=true" | ||||
|       - "traefik.http.routers.scraperr_api.rule=Host(`localhost`) && PathPrefix(`/api`)" # change this to your domain, if not running on localhost | ||||
|       - "traefik.http.routers.scraperr_api.entrypoints=web" # websecure if using https | ||||
|       - "traefik.http.middlewares.api-stripprefix.stripprefix.prefixes=/api" | ||||
|       - "traefik.http.routers.scraperr_api.middlewares=api-stripprefix" | ||||
|       - "traefik.http.services.scraperr_api.loadbalancer.server.port=8000" | ||||
|     networks: | ||||
|       - web | ||||
|   traefik: | ||||
|     image: traefik:latest | ||||
|     container_name: traefik | ||||
|     command: | ||||
|       - "--providers.docker=true" | ||||
|       - "--entrypoints.web.address=:80" | ||||
|       - "--entrypoints.websecure.address=:443" | ||||
|     ports: | ||||
|       - 80:80 | ||||
|       - 443:443 | ||||
|       - 8000:8000 | ||||
|     volumes: | ||||
|       - /var/run/docker.sock:/var/run/docker.sock:ro | ||||
|     networks: | ||||
|       - web | ||||
|   mongo: | ||||
|     container_name: webscrape-mongo | ||||
|     image: mongo | ||||
|     restart: always | ||||
|     environment: | ||||
|       MONGO_INITDB_ROOT_USERNAME: root | ||||
|       MONGO_INITDB_ROOT_PASSWORD: example | ||||
|       - "$PWD/data:/project/app/data" | ||||
|       - "$PWD/media:/project/app/media" | ||||
|     networks: | ||||
|       - web | ||||
|  | ||||
| networks: | ||||
|   web: | ||||
|   | ||||
| @@ -1,36 +1,33 @@ | ||||
| # Build python dependencies | ||||
| FROM python:3.10.12-slim as pybuilder | ||||
|  | ||||
| RUN apt update && apt install -y uvicorn | ||||
| RUN apt-get update && \ | ||||
|     apt-get install -y curl && \ | ||||
|     apt-get install -y uvicorn wget gnupg supervisor libgl1 libglx-mesa0 libglx0 vainfo libva-dev libva-glx2 libva-drm2 && \ | ||||
|     curl -LsSf https://astral.sh/uv/install.sh | sh && \ | ||||
|     apt-get remove -y curl && \ | ||||
|     apt-get autoremove -y && \ | ||||
|     rm -rf /var/lib/apt/lists/* | ||||
|  | ||||
| RUN python -m pip --no-cache-dir install pdm | ||||
| RUN pdm config python.use_venv false | ||||
|  | ||||
|  | ||||
| WORKDIR /project/app | ||||
| COPY pyproject.toml pdm.lock /project/app/ | ||||
| RUN pdm install | ||||
|  | ||||
| RUN pdm run playwright install --with-deps | ||||
|  | ||||
| RUN pdm run camoufox fetch | ||||
|  | ||||
| COPY ./api/ /project/app/api | ||||
|  | ||||
| # Create final image | ||||
| FROM python:3.10.12-slim | ||||
|  | ||||
| RUN apt-get update | ||||
| RUN apt-get install -y wget gnupg supervisor | ||||
| RUN wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - | ||||
| RUN sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' | ||||
| RUN apt-get update | ||||
| RUN apt-get install -y google-chrome-stable | ||||
|  | ||||
| ENV PYTHONPATH=/project/pkgs | ||||
| COPY --from=pybuilder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages | ||||
| COPY --from=pybuilder /usr/local/bin /usr/local/bin | ||||
| COPY --from=pybuilder /project/app /project/ | ||||
|  | ||||
| COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf | ||||
|  | ||||
| EXPOSE 8000 | ||||
|  | ||||
| WORKDIR /project/ | ||||
| WORKDIR /project/app | ||||
|  | ||||
| CMD [ "supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf" ] | ||||
| @@ -1,5 +1,5 @@ | ||||
| # Build next dependencies | ||||
| FROM node:latest | ||||
| FROM node:23.1 | ||||
| WORKDIR /app | ||||
|  | ||||
| COPY package*.json ./ | ||||
| @@ -15,6 +15,4 @@ COPY src /app/src | ||||
|  | ||||
| RUN npm run build | ||||
|  | ||||
| EXPOSE 3000 | ||||
|  | ||||
| # CMD [ "npm", "run" ] | ||||
| EXPOSE 3000 | ||||
Binary file not shown (image changed: 46 KiB before, 47 KiB after)
| @@ -1,4 +0,0 @@ | ||||
| tls: | ||||
|   certificates: | ||||
|     - certFile: /etc/certs/ssl-cert.pem | ||||
|       keyFile: /etc/certs/ssl-cert.key | ||||
ipython.py (deleted, 37 lines)
							| @@ -1,37 +0,0 @@ | ||||
| # STL | ||||
| import os | ||||
|  | ||||
| # PDM | ||||
| import boto3 | ||||
| from dotenv import load_dotenv | ||||
|  | ||||
| # Load environment variables from .env file | ||||
| load_dotenv() | ||||
|  | ||||
|  | ||||
| def test_insert_and_delete(): | ||||
|     # Get environment variables | ||||
|     region_name = os.getenv("AWS_REGION") | ||||
|     # Initialize DynamoDB resource | ||||
|     dynamodb = boto3.resource("dynamodb", region_name=region_name) | ||||
|     table = dynamodb.Table("scrape") | ||||
|  | ||||
|     # Item to insert | ||||
|     item = { | ||||
|         "id": "123",  # Replace with the appropriate id value | ||||
|         "attribute1": "value1", | ||||
|         "attribute2": "value2", | ||||
|         # Add more attributes as needed | ||||
|     } | ||||
|  | ||||
|     # Insert the item | ||||
|     table.put_item(Item=item) | ||||
|     print(f"Inserted item: {item}") | ||||
|  | ||||
|     # Delete the item | ||||
|     table.delete_item(Key={"id": "123"})  # Replace with the appropriate id value | ||||
|     print(f"Deleted item with id: {item['id']}") | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     test_insert_and_delete() | ||||
package-lock.json (generated, 15660 lines; diff suppressed because it is too large)
package.json (12 lines changed)
							| @@ -19,6 +19,7 @@ | ||||
|     "bootstrap": "^5.3.0", | ||||
|     "chart.js": "^4.4.3", | ||||
|     "cookie": "^0.6.0", | ||||
|     "dotenv": "^16.5.0", | ||||
|     "framer-motion": "^4.1.17", | ||||
|     "js-cookie": "^3.0.5", | ||||
|     "next": "^14.2.4", | ||||
| @@ -31,7 +32,6 @@ | ||||
|     "react-modal-image": "^2.6.0", | ||||
|     "react-router": "^6.14.1", | ||||
|     "react-router-dom": "^6.14.1", | ||||
|     "react-scripts": "^5.0.1", | ||||
|     "react-spinners": "^0.14.1", | ||||
|     "typescript": "^4.9.5", | ||||
|     "web-vitals": "^2.1.4" | ||||
| @@ -63,12 +63,18 @@ | ||||
|     ] | ||||
|   }, | ||||
|   "devDependencies": { | ||||
|     "@types/cypress": "^0.1.6", | ||||
|     "@types/cypress": "^1.1.6", | ||||
|     "@types/js-cookie": "^3.0.6", | ||||
|     "cypress": "^13.15.0", | ||||
|     "autoprefixer": "^10.4.21", | ||||
|     "cypress": "^13.17.0", | ||||
|     "eslint": "^9.26.0", | ||||
|     "postcss": "^8.5.3", | ||||
|     "tailwindcss": "^3.3.5" | ||||
|   }, | ||||
|   "overrides": { | ||||
|     "react-refresh": "0.11.0" | ||||
|   }, | ||||
|   "resolutions": { | ||||
|     "postcss": "^8.4.31" | ||||
|   } | ||||
| } | ||||
|   | ||||
| @@ -2,9 +2,7 @@ | ||||
| name = "web-scrape" | ||||
| version = "0.1.0" | ||||
| description = "" | ||||
| authors = [ | ||||
|     {name = "Jayden Pyles", email = "jpylesbuisness@gmail.com"}, | ||||
| ] | ||||
| authors = [{ name = "Jayden Pyles", email = "jpylesbuisness@gmail.com" }] | ||||
| dependencies = [ | ||||
|     "uvicorn>=0.30.1", | ||||
|     "fastapi>=0.111.0", | ||||
| @@ -18,7 +16,6 @@ dependencies = [ | ||||
|     "lxml-stubs>=0.5.1", | ||||
|     "fake-useragent>=1.5.1", | ||||
|     "requests-html>=0.10.0", | ||||
|     "selenium>=4.22.0", | ||||
|     "webdriver-manager>=4.0.1", | ||||
|     "pydantic[email]>=2.9.2", | ||||
|     "pandas>=2.2.2", | ||||
| @@ -39,20 +36,21 @@ dependencies = [ | ||||
|     "exceptiongroup>=1.2.2", | ||||
|     "Faker>=30.6.0", | ||||
|     "pytest-asyncio>=0.24.0", | ||||
|     "python-multipart>=0.0.12", | ||||
|     "python-multipart>=0.0.1", | ||||
|     "bcrypt==4.0.1", | ||||
|     "apscheduler>=3.11.0", | ||||
|     "playwright>=1.52.0", | ||||
|     "camoufox>=0.4.11", | ||||
| ] | ||||
| requires-python = ">=3.10" | ||||
| readme = "README.md" | ||||
| license = {text = "MIT"} | ||||
| license = { text = "MIT" } | ||||
|  | ||||
| [tool.pdm] | ||||
| distribution = true | ||||
|  | ||||
| [tool.pdm.dev-dependencies] | ||||
| dev = [ | ||||
|     "ipython>=8.26.0", | ||||
|     "pytest>=8.3.3", | ||||
| ] | ||||
| dev = ["ipython>=8.26.0", "pytest>=8.3.3"] | ||||
| [tool.pyright] | ||||
| include = ["./api/backend/"] | ||||
| exclude = ["**/node_modules", "**/__pycache__"] | ||||
| @@ -60,14 +58,42 @@ ignore = [] | ||||
| defineConstant = { DEBUG = true } | ||||
| stubPath = "" | ||||
|  | ||||
| reportUnknownMemberType= false | ||||
| reportMissingImports = true | ||||
| reportMissingTypeStubs = false | ||||
| reportAny = false | ||||
| reportCallInDefaultInitializer = false | ||||
| # Type checking strictness | ||||
| typeCheckingMode = "strict"                        # Enables strict type checking mode | ||||
| reportPrivateUsage = "none" | ||||
| reportMissingTypeStubs = "none" | ||||
| reportUntypedFunctionDecorator = "error" | ||||
| reportUntypedClassDecorator = "error" | ||||
| reportUntypedBaseClass = "error" | ||||
| reportInvalidTypeVarUse = "error" | ||||
| reportUnnecessaryTypeIgnoreComment = "information" | ||||
| reportUnknownVariableType = "none" | ||||
| reportUnknownMemberType = "none" | ||||
| reportUnknownParameterType = "none" | ||||
|  | ||||
| pythonVersion = "3.9" | ||||
| pythonPlatform = "Linux" | ||||
| # Additional checks | ||||
| reportImplicitStringConcatenation = "error" | ||||
| reportInvalidStringEscapeSequence = "error" | ||||
| reportMissingImports = "error" | ||||
| reportMissingModuleSource = "error" | ||||
| reportOptionalCall = "error" | ||||
| reportOptionalIterable = "error" | ||||
| reportOptionalMemberAccess = "error" | ||||
| reportOptionalOperand = "error" | ||||
| reportOptionalSubscript = "error" | ||||
| reportTypedDictNotRequiredAccess = "error" | ||||
|  | ||||
| # Function return type checking | ||||
| reportIncompleteStub = "error" | ||||
| reportIncompatibleMethodOverride = "error" | ||||
| reportInvalidStubStatement = "error" | ||||
| reportInconsistentOverload = "error" | ||||
|  | ||||
| # Misc settings | ||||
| pythonVersion = "3.10"           # Matches your Python version from pyproject.toml | ||||
| strictListInference = true | ||||
| strictDictionaryInference = true | ||||
| strictSetInference = true | ||||
|  | ||||
|  | ||||
| [tool.isort] | ||||
|   | ||||
| @@ -28,10 +28,6 @@ export const JobSelector = ({ | ||||
|   const [popoverJob, setPopoverJob] = useState<Job | null>(null); | ||||
|   const theme = useTheme(); | ||||
|  | ||||
|   useEffect(() => { | ||||
|     fetchJobs(setJobs, { chat: true }); | ||||
|   }, []); | ||||
|  | ||||
|   const handlePopoverOpen = ( | ||||
|     event: React.MouseEvent<HTMLElement>, | ||||
|     job: Job | ||||
| @@ -124,7 +120,9 @@ export const JobSelector = ({ | ||||
|                   fontStyle: "italic", | ||||
|                 }} | ||||
|               > | ||||
|                 {new Date(popoverJob.time_created).toLocaleString()} | ||||
|                 {popoverJob.time_created | ||||
|                   ? new Date(popoverJob.time_created).toLocaleString() | ||||
|                   : "Unknown"} | ||||
|               </Typography> | ||||
|             </div> | ||||
|           </Box> | ||||
|   | ||||
| @@ -2,7 +2,7 @@ | ||||
|  | ||||
| import React from "react"; | ||||
| import { useAuth } from "../../../contexts/AuthContext"; | ||||
| import { Box, Drawer, Divider } from "@mui/material"; | ||||
| import { Box, Drawer } from "@mui/material"; | ||||
|  | ||||
| import { QuickSettings } from "../../nav/quick-settings"; | ||||
| import { NavItems } from "./nav-items/nav-items"; | ||||
|   | ||||
| @@ -7,6 +7,7 @@ import TerminalIcon from "@mui/icons-material/Terminal"; | ||||
| import BarChart from "@mui/icons-material/BarChart"; | ||||
| import AutoAwesomeIcon from "@mui/icons-material/AutoAwesome"; | ||||
| import { List } from "@mui/material"; | ||||
| import { Schedule } from "@mui/icons-material"; | ||||
|  | ||||
| const items = [ | ||||
|   { | ||||
| @@ -16,7 +17,7 @@ const items = [ | ||||
|   }, | ||||
|   { | ||||
|     icon: <HttpIcon />, | ||||
|     text: "Previous Jobs", | ||||
|     text: "Jobs", | ||||
|     href: "/jobs", | ||||
|   }, | ||||
|   { | ||||
| @@ -30,9 +31,9 @@ const items = [ | ||||
|     href: "/statistics", | ||||
|   }, | ||||
|   { | ||||
|     icon: <TerminalIcon />, | ||||
|     text: "View App Logs", | ||||
|     href: "/logs", | ||||
|     icon: <Schedule />, | ||||
|     text: "Cron Jobs", | ||||
|     href: "/cron-jobs", | ||||
|   }, | ||||
| ]; | ||||
|  | ||||
|   | ||||
| @@ -15,6 +15,7 @@ import { | ||||
|   Button, | ||||
|   Tooltip, | ||||
|   IconButton, | ||||
|   TableContainer, | ||||
| } from "@mui/material"; | ||||
| import ExpandMoreIcon from "@mui/icons-material/ExpandMore"; | ||||
| import StarIcon from "@mui/icons-material/Star"; | ||||
| @@ -52,145 +53,155 @@ export const JobQueue = ({ | ||||
|   const router = useRouter(); | ||||
|  | ||||
|   return ( | ||||
|     <Table sx={{ tableLayout: "fixed", width: "100%" }}> | ||||
|       <TableHead> | ||||
|         <TableRow> | ||||
|           <TableCell>Select</TableCell> | ||||
|           <TableCell>Id</TableCell> | ||||
|           <TableCell>Url</TableCell> | ||||
|           <TableCell>Elements</TableCell> | ||||
|           <TableCell>Result</TableCell> | ||||
|           <TableCell>Time Created</TableCell> | ||||
|           <TableCell>Status</TableCell> | ||||
|           <TableCell>Actions</TableCell> | ||||
|         </TableRow> | ||||
|       </TableHead> | ||||
|       <TableBody> | ||||
|         {filteredJobs.map((row, index) => ( | ||||
|           <TableRow key={index}> | ||||
|             <TableCell padding="checkbox"> | ||||
|               <Checkbox | ||||
|                 checked={selectedJobs.has(row.id)} | ||||
|                 onChange={() => onSelectJob(row.id)} | ||||
|               /> | ||||
|               <Tooltip title="Chat with AI"> | ||||
|                 <span> | ||||
|                   <IconButton | ||||
|                     onClick={() => { | ||||
|                       router.push({ | ||||
|                         pathname: "/chat", | ||||
|                         query: { | ||||
|                           job: row.id, | ||||
|                         }, | ||||
|                       }); | ||||
|                     }} | ||||
|                   > | ||||
|                     <AutoAwesome /> | ||||
|                   </IconButton> | ||||
|                 </span> | ||||
|               </Tooltip> | ||||
|               <Tooltip title="Favorite Job"> | ||||
|                 <span> | ||||
|                   <IconButton | ||||
|                     color={row.favorite ? "warning" : "default"} | ||||
|                     onClick={() => { | ||||
|                       onFavorite([row.id], "favorite", !row.favorite); | ||||
|                       row.favorite = !row.favorite; | ||||
|                     }} | ||||
|                   > | ||||
|                     <StarIcon /> | ||||
|                   </IconButton> | ||||
|                 </span> | ||||
|               </Tooltip> | ||||
|             </TableCell> | ||||
|             <TableCell sx={{ maxWidth: 100, overflow: "auto" }}> | ||||
|               <Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.id}</Box> | ||||
|             </TableCell> | ||||
|             <TableCell sx={{ maxWidth: 200, overflow: "auto" }}> | ||||
|               <Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.url}</Box> | ||||
|             </TableCell> | ||||
|             <TableCell sx={{ maxWidth: 150, overflow: "auto" }}> | ||||
|               <Box sx={{ maxHeight: 100, overflow: "auto" }}> | ||||
|                 {JSON.stringify(row.elements)} | ||||
|               </Box> | ||||
|             </TableCell> | ||||
|             <TableCell sx={{ maxWidth: 150, overflow: "auto", padding: 0 }}> | ||||
|               <Accordion sx={{ margin: 0, padding: 0.5 }}> | ||||
|                 <AccordionSummary | ||||
|                   expandIcon={<ExpandMoreIcon />} | ||||
|                   aria-controls="panel1a-content" | ||||
|                   id="panel1a-header" | ||||
|                   sx={{ | ||||
|                     minHeight: 0, | ||||
|                     "&.Mui-expanded": { minHeight: 0 }, | ||||
|                   }} | ||||
|                 > | ||||
|                   <Box | ||||
|                     sx={{ | ||||
|                       maxHeight: 150, | ||||
|                       overflow: "auto", | ||||
|                       width: "100%", | ||||
|                     }} | ||||
|                   > | ||||
|                     <Typography sx={{ fontSize: "0.875rem" }}> | ||||
|                       Show Result | ||||
|                     </Typography> | ||||
|                   </Box> | ||||
|                 </AccordionSummary> | ||||
|                 <AccordionDetails sx={{ padding: 1 }}> | ||||
|                   <Box sx={{ maxHeight: 200, overflow: "auto" }}> | ||||
|                     <Typography | ||||
|                       sx={{ | ||||
|                         fontSize: "0.875rem", | ||||
|                         whiteSpace: "pre-wrap", | ||||
|     <TableContainer component={Box} sx={{ maxHeight: "90dvh" }}> | ||||
|       <Table sx={{ tableLayout: "fixed", width: "100%" }}> | ||||
|         <TableHead> | ||||
|           <TableRow> | ||||
|             <TableCell>Select</TableCell> | ||||
|             <TableCell>Id</TableCell> | ||||
|             <TableCell>Url</TableCell> | ||||
|             <TableCell>Elements</TableCell> | ||||
|             <TableCell>Result</TableCell> | ||||
|             <TableCell>Time Created</TableCell> | ||||
|             <TableCell>Status</TableCell> | ||||
|             <TableCell>Actions</TableCell> | ||||
|           </TableRow> | ||||
|         </TableHead> | ||||
|         <TableBody sx={{ overflow: "auto" }}> | ||||
|           {filteredJobs.map((row, index) => ( | ||||
|             <TableRow key={index}> | ||||
|               <TableCell padding="checkbox"> | ||||
|                 <Checkbox | ||||
|                   checked={selectedJobs.has(row.id)} | ||||
|                   onChange={() => onSelectJob(row.id)} | ||||
|                 /> | ||||
|                 <Tooltip title="Chat with AI"> | ||||
|                   <span> | ||||
|                     <IconButton | ||||
|                       onClick={() => { | ||||
|                         router.push({ | ||||
|                           pathname: "/chat", | ||||
|                           query: { | ||||
|                             job: row.id, | ||||
|                           }, | ||||
|                         }); | ||||
|                       }} | ||||
|                     > | ||||
|                       {JSON.stringify(row.result, null, 2)} | ||||
|                     </Typography> | ||||
|                   </Box> | ||||
|                 </AccordionDetails> | ||||
|               </Accordion> | ||||
|             </TableCell> | ||||
|             <TableCell sx={{ maxWidth: 150, overflow: "auto" }}> | ||||
|               <Box sx={{ maxHeight: 100, overflow: "auto" }}> | ||||
|                 {new Date(row.time_created).toLocaleString()} | ||||
|               </Box> | ||||
|             </TableCell> | ||||
|             <TableCell sx={{ maxWidth: 50, overflow: "auto" }}> | ||||
|               <Box sx={{ maxHeight: 100, overflow: "auto" }}> | ||||
|                 <Box | ||||
|                   className="rounded-md p-2 text-center" | ||||
|                   sx={{ bgcolor: colors[row.status] }} | ||||
|                 > | ||||
|                   {row.status} | ||||
|                       <AutoAwesome /> | ||||
|                     </IconButton> | ||||
|                   </span> | ||||
|                 </Tooltip> | ||||
|                 <Tooltip title="Favorite Job"> | ||||
|                   <span> | ||||
|                     <IconButton | ||||
|                       color={row.favorite ? "warning" : "default"} | ||||
|                       onClick={() => { | ||||
|                         onFavorite([row.id], "favorite", !row.favorite); | ||||
|                         row.favorite = !row.favorite; | ||||
|                       }} | ||||
|                     > | ||||
|                       <StarIcon /> | ||||
|                     </IconButton> | ||||
|                   </span> | ||||
|                 </Tooltip> | ||||
|               </TableCell> | ||||
|               <TableCell sx={{ maxWidth: 100, overflow: "auto" }}> | ||||
|                 <Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.id}</Box> | ||||
|               </TableCell> | ||||
|               <TableCell sx={{ maxWidth: 200, overflow: "auto" }}> | ||||
|                 <Box sx={{ maxHeight: 100, overflow: "auto" }}>{row.url}</Box> | ||||
|               </TableCell> | ||||
|               <TableCell sx={{ maxWidth: 150, overflow: "auto" }}> | ||||
|                 <Box sx={{ maxHeight: 100, overflow: "auto" }}> | ||||
|                   {JSON.stringify(row.elements)} | ||||
|                 </Box> | ||||
|               </Box> | ||||
|             </TableCell> | ||||
|             <TableCell sx={{ maxWidth: 150, overflow: "auto" }}> | ||||
|               <Box sx={{ display: "flex", gap: 1 }}> | ||||
|                 <Button | ||||
|                   onClick={() => { | ||||
|                     onDownload([row.id]); | ||||
|                   }} | ||||
|                   size="small" | ||||
|                   sx={{ minWidth: 0, padding: "4px 8px" }} | ||||
|                 > | ||||
|                   Download | ||||
|                 </Button> | ||||
|                 <Button | ||||
|                   onClick={() => | ||||
|                     onNavigate(row.elements, row.url, row.job_options) | ||||
|                   } | ||||
|                   size="small" | ||||
|                   sx={{ minWidth: 0, padding: "4px 8px" }} | ||||
|                 > | ||||
|                   Rerun | ||||
|                 </Button> | ||||
|               </Box> | ||||
|             </TableCell> | ||||
|           </TableRow> | ||||
|         ))} | ||||
|       </TableBody> | ||||
|     </Table> | ||||
|               </TableCell> | ||||
|               <TableCell sx={{ maxWidth: 150, overflow: "auto", padding: 0 }}> | ||||
|                 <Accordion sx={{ margin: 0, padding: 0.5 }}> | ||||
|                   <AccordionSummary | ||||
|                     expandIcon={<ExpandMoreIcon />} | ||||
|                     aria-controls="panel1a-content" | ||||
|                     id="panel1a-header" | ||||
|                     sx={{ | ||||
|                       minHeight: 0, | ||||
|                       "&.Mui-expanded": { minHeight: 0 }, | ||||
|                     }} | ||||
|                   > | ||||
|                     <Box | ||||
|                       sx={{ | ||||
|                         maxHeight: 150, | ||||
|                         overflow: "auto", | ||||
|                         width: "100%", | ||||
|                       }} | ||||
|                     > | ||||
|                       <Typography sx={{ fontSize: "0.875rem" }}> | ||||
|                         Show Result | ||||
|                       </Typography> | ||||
|                     </Box> | ||||
|                   </AccordionSummary> | ||||
|                   <AccordionDetails sx={{ padding: 1 }}> | ||||
|                     <Box sx={{ maxHeight: 200, overflow: "auto" }}> | ||||
|                       <Typography | ||||
|                         sx={{ | ||||
|                           fontSize: "0.875rem", | ||||
|                           whiteSpace: "pre-wrap", | ||||
|                         }} | ||||
|                       > | ||||
|                         {JSON.stringify(row.result, null, 2)} | ||||
|                       </Typography> | ||||
|                     </Box> | ||||
|                   </AccordionDetails> | ||||
|                 </Accordion> | ||||
|               </TableCell> | ||||
|               <TableCell sx={{ maxWidth: 150, overflow: "auto" }}> | ||||
|                 <Box sx={{ maxHeight: 100, overflow: "auto" }}> | ||||
|                   {new Date(row.time_created).toLocaleString()} | ||||
|                 </Box> | ||||
|               </TableCell> | ||||
|               <TableCell sx={{ maxWidth: 50, overflow: "auto" }}> | ||||
|                 <Box sx={{ maxHeight: 100, overflow: "auto" }}> | ||||
|                   <Box | ||||
|                     className="rounded-md p-2 text-center" | ||||
|                     sx={{ bgcolor: colors[row.status] }} | ||||
|                   > | ||||
|                     {row.status} | ||||
|                   </Box> | ||||
|                 </Box> | ||||
|               </TableCell> | ||||
|               <TableCell sx={{ maxWidth: 150, overflow: "auto" }}> | ||||
|                 <Box sx={{ display: "flex", gap: 1 }}> | ||||
|                   <Button | ||||
|                     onClick={() => { | ||||
|                       onDownload([row.id]); | ||||
|                     }} | ||||
|                     size="small" | ||||
|                     sx={{ | ||||
|                       minWidth: 0, | ||||
|                       padding: "4px 8px", | ||||
|                       fontSize: "0.625rem", | ||||
|                     }} | ||||
|                   > | ||||
|                     Download | ||||
|                   </Button> | ||||
|                   <Button | ||||
|                     onClick={() => | ||||
|                       onNavigate(row.elements, row.url, row.job_options) | ||||
|                     } | ||||
|                     size="small" | ||||
|                     sx={{ | ||||
|                       minWidth: 0, | ||||
|                       padding: "4px 8px", | ||||
|                       fontSize: "0.625rem", | ||||
|                     }} | ||||
|                   > | ||||
|                     Rerun | ||||
|                   </Button> | ||||
|                 </Box> | ||||
|               </TableCell> | ||||
|             </TableRow> | ||||
|           ))} | ||||
|         </TableBody> | ||||
|       </Table> | ||||
|     </TableContainer> | ||||
|   ); | ||||
| }; | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
| import React, { Dispatch, SetStateAction, useState } from "react"; | ||||
| import React, { SetStateAction, useState } from "react"; | ||||
| import { | ||||
|   IconButton, | ||||
|   Box, | ||||
| @@ -18,8 +18,8 @@ import StarIcon from "@mui/icons-material/Star"; | ||||
| import { useRouter } from "next/router"; | ||||
| import { Favorites, JobQueue } from "."; | ||||
| import { Job } from "../../types"; | ||||
| import { Constants } from "../../lib"; | ||||
| import Cookies from "js-cookie"; | ||||
| import { useSearchParams } from "next/navigation"; | ||||
|  | ||||
| interface JobTableProps { | ||||
|   jobs: Job[]; | ||||
| @@ -38,20 +38,24 @@ const COLOR_MAP: ColorMap = { | ||||
| }; | ||||
|  | ||||
| export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => { | ||||
|   const searchParams = useSearchParams(); | ||||
|   const search = searchParams.get("search"); | ||||
|   const type = searchParams.get("type"); | ||||
|  | ||||
|   const [selectedJobs, setSelectedJobs] = useState<Set<string>>(new Set()); | ||||
|   const [allSelected, setAllSelected] = useState(false); | ||||
|   const [searchQuery, setSearchQuery] = useState<string>(""); | ||||
|   const [searchMode, setSearchMode] = useState<string>("url"); | ||||
|   const [searchQuery, setSearchQuery] = useState<string>(search || ""); | ||||
|   const [searchMode, setSearchMode] = useState<string>(type || "url"); | ||||
|   const [favoriteView, setFavoriteView] = useState<boolean>(false); | ||||
|  | ||||
|   const token = Cookies.get("token"); | ||||
|   const router = useRouter(); | ||||
|  | ||||
|   const handleDownload = async (ids: string[]) => { | ||||
|     const response = await fetch(`${Constants.DOMAIN}/api/download`, { | ||||
|     const response = await fetch("/api/download", { | ||||
|       method: "POST", | ||||
|       headers: { "Content-Type": "application/json" }, | ||||
|       body: JSON.stringify({ ids: ids }), | ||||
|       body: JSON.stringify({ data: { ids: ids } }), | ||||
|     }); | ||||
|  | ||||
|     if (response.ok) { | ||||
| @@ -104,10 +108,10 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => { | ||||
|   }; | ||||
|  | ||||
|   const handleDeleteSelected = async () => { | ||||
|     const response = await fetch(`${Constants.DOMAIN}/api/delete-scrape-jobs`, { | ||||
|     const response = await fetch("/api/delete", { | ||||
|       method: "POST", | ||||
|       headers: { "Content-Type": "application/json" }, | ||||
|       body: JSON.stringify({ ids: Array.from(selectedJobs) }), | ||||
|       body: JSON.stringify({ data: { ids: Array.from(selectedJobs) } }), | ||||
|     }); | ||||
|  | ||||
|     if (response.ok) { | ||||
| @@ -142,16 +146,36 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => { | ||||
|       value: value, | ||||
|     }; | ||||
|  | ||||
|     await fetch(`${Constants.DOMAIN}/api/update`, { | ||||
|     await fetch("/api/update", { | ||||
|       method: "POST", | ||||
|       headers: { | ||||
|         "Content-Type": "application/json", | ||||
|         Authorization: `Bearer ${token}`, | ||||
|       }, | ||||
|       body: JSON.stringify(postBody), | ||||
|       body: JSON.stringify({ data: postBody }), | ||||
|     }); | ||||
|   }; | ||||
|  | ||||
|   const scrollbarStyles = { | ||||
|     "&::-webkit-scrollbar": { | ||||
|       width: "8px", | ||||
|       height: "8px", | ||||
|     }, | ||||
|     "&::-webkit-scrollbar-track": { | ||||
|       backgroundColor: "rgba(0,0,0,0.05)", | ||||
|       borderRadius: "8px", | ||||
|     }, | ||||
|     "&::-webkit-scrollbar-thumb": { | ||||
|       backgroundColor: "rgba(0,0,0,0.2)", | ||||
|       borderRadius: "8px", | ||||
|       "&:hover": { | ||||
|         backgroundColor: "rgba(0,0,0,0.3)", | ||||
|       }, | ||||
|     }, | ||||
|     scrollbarWidth: "thin", | ||||
|     scrollbarColor: "rgba(0,0,0,0.2) rgba(0,0,0,0.05)", | ||||
|   }; | ||||
|  | ||||
|   return ( | ||||
|     <Box | ||||
|       width="100%" | ||||
| @@ -166,6 +190,7 @@ export const JobTable: React.FC<JobTableProps> = ({ jobs, setJobs }) => { | ||||
|         maxWidth="100%" | ||||
|         bgcolor="background.default" | ||||
|         overflow="auto" | ||||
|         sx={scrollbarStyles} | ||||
|       > | ||||
|         <Box | ||||
|           className="flex flex-row justify-between p-2 w-full" | ||||
|   | ||||
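
The fetches in this component now target relative `/api/...` paths and wrap payloads in a `{ data: ... }` envelope instead of building `${Constants.DOMAIN}` URLs, which suggests Next.js API routes proxying to the FastAPI backend. A minimal sketch of such a proxy is below, assuming the `SERVER_URL` variable from docker-compose; the file name, envelope handling, and absence of auth forwarding are simplifications, not the actual implementation:

// pages/api/download.ts (hypothetical proxy sketch)
import type { NextApiRequest, NextApiResponse } from "next";

export default async function handler(
  req: NextApiRequest,
  res: NextApiResponse
) {
  // Unwrap the { data: ... } envelope the client now sends and forward
  // it to the backend API (SERVER_URL is set in docker-compose.yml).
  const backend = process.env.SERVER_URL ?? "http://scraperr_api:8000";
  const upstream = await fetch(`${backend}/api/download`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(req.body.data),
  });
  res.status(upstream.status);
  res.send(Buffer.from(await upstream.arrayBuffer()));
}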
| @@ -1 +0,0 @@ | ||||
| export * from "./log-container"; | ||||
| @@ -1,3 +0,0 @@ | ||||
| .logContainer { | ||||
|   max-width: none !important; | ||||
| } | ||||
| @@ -1,98 +0,0 @@ | ||||
| import React, { useState, useEffect, useRef } from "react"; | ||||
| import { Container, IconButton } from "@mui/material"; | ||||
| import { ArrowUpward, ArrowDownward } from "@mui/icons-material"; | ||||
| import { Constants } from "../../../lib/constants"; | ||||
|  | ||||
| import classes from "./log-container.module.css"; | ||||
|  | ||||
| interface LogContainerProps { | ||||
|   initialLogs: string; | ||||
| } | ||||
|  | ||||
| export const LogContainer: React.FC<LogContainerProps> = ({ initialLogs }) => { | ||||
|   const [logs, setLogs] = useState<string>(initialLogs); | ||||
|   const logsContainerRef = useRef<HTMLDivElement | null>(null); | ||||
|  | ||||
|   useEffect(() => { | ||||
|     const eventSource = new EventSource(`${Constants.DOMAIN}/api/logs`); | ||||
|  | ||||
|     setLogs(""); | ||||
|  | ||||
|     eventSource.onmessage = (event) => { | ||||
|       setLogs((prevLogs) => prevLogs + event.data + "\n"); | ||||
|       if (logsContainerRef.current) { | ||||
|         logsContainerRef.current.scrollTop = | ||||
|           logsContainerRef.current.scrollHeight; | ||||
|       } | ||||
|     }; | ||||
|  | ||||
|     eventSource.onerror = () => { | ||||
|       eventSource.close(); | ||||
|     }; | ||||
|  | ||||
|     return () => { | ||||
|       eventSource.close(); | ||||
|     }; | ||||
|   }, []); | ||||
|  | ||||
|   const scrollToTop = () => { | ||||
|     if (logsContainerRef.current) { | ||||
|       logsContainerRef.current.scrollTop = 0; | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|   const scrollToBottom = () => { | ||||
|     if (logsContainerRef.current) { | ||||
|       logsContainerRef.current.scrollTop = | ||||
|         logsContainerRef.current.scrollHeight; | ||||
|     } | ||||
|   }; | ||||
|   return ( | ||||
|     <Container | ||||
|       sx={{ | ||||
|         position: "relative", | ||||
|         backgroundColor: "black", | ||||
|         color: "white", | ||||
|         padding: "10px", | ||||
|         overflowY: "scroll", | ||||
|         whiteSpace: "pre-wrap", | ||||
|         overflowWrap: "normal", | ||||
|         maxHeight: "95vh", | ||||
|       }} | ||||
|       className={classes.logContainer} | ||||
|       ref={logsContainerRef} | ||||
|     > | ||||
|       <pre | ||||
|         style={{ | ||||
|           whiteSpace: "pre-wrap", | ||||
|           wordWrap: "break-word", | ||||
|           margin: 0, | ||||
|         }} | ||||
|       > | ||||
|         {logs} | ||||
|       </pre> | ||||
|       <IconButton | ||||
|         sx={{ | ||||
|           position: "fixed", | ||||
|           top: 20, | ||||
|           right: 20, | ||||
|           backgroundColor: "rgba(255, 255, 255, 0.1)", | ||||
|         }} | ||||
|         onClick={scrollToTop} | ||||
|       > | ||||
|         <ArrowUpward style={{ color: "white" }} /> | ||||
|       </IconButton> | ||||
|       <IconButton | ||||
|         sx={{ | ||||
|           position: "fixed", | ||||
|           bottom: 20, | ||||
|           right: 20, | ||||
|           backgroundColor: "rgba(255, 255, 255, 0.1)", | ||||
|         }} | ||||
|         onClick={scrollToBottom} | ||||
|       > | ||||
|         <ArrowDownward style={{ color: "white" }} /> | ||||
|       </IconButton> | ||||
|     </Container> | ||||
|   ); | ||||
| }; | ||||
src/components/pages/chat/chat.tsx (new file, 351 lines)
							| @@ -0,0 +1,351 @@ | ||||
| import React, { useEffect, useRef, useState } from "react"; | ||||
| import { | ||||
|   Box, | ||||
|   TextField, | ||||
|   Typography, | ||||
|   Paper, | ||||
|   useTheme, | ||||
|   IconButton, | ||||
|   Tooltip, | ||||
| } from "@mui/material"; | ||||
| import { JobSelector } from "../../ai"; | ||||
| import { Job, Message } from "../../../types"; | ||||
| import { useSearchParams } from "next/navigation"; | ||||
| import { checkAI, fetchJob, fetchJobs, updateJob } from "../../../lib"; | ||||
| import SendIcon from "@mui/icons-material/Send"; | ||||
| import EditNoteIcon from "@mui/icons-material/EditNote"; | ||||
|  | ||||
| export const AI: React.FC = () => { | ||||
|   const theme = useTheme(); | ||||
|   const [currentMessage, setCurrentMessage] = useState<string>(""); | ||||
|   const [selectedJob, setSelectedJob] = useState<Job | null>(null); | ||||
|   const [messages, setMessages] = useState<Message[]>([]); | ||||
|   const [aiEnabled, setAiEnabled] = useState<boolean>(false); | ||||
|   const [jobs, setJobs] = useState<Job[]>([]); | ||||
|   const [thinking, setThinking] = useState<boolean>(false); | ||||
|  | ||||
|   const searchParams = useSearchParams(); | ||||
|  | ||||
|   const getJobFromParam = async () => { | ||||
|     const jobId = searchParams.get("job"); | ||||
|  | ||||
|     if (jobId) { | ||||
|       const job = await fetchJob(jobId); | ||||
|  | ||||
|       if (job.length) { | ||||
|         setSelectedJob(job[0]); | ||||
|         if (job[0].chat) { | ||||
|           setMessages(job[0].chat); | ||||
|         } | ||||
|       } | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|   useEffect(() => { | ||||
|     checkAI(setAiEnabled); | ||||
|     getJobFromParam(); | ||||
|   }, []); | ||||
|  | ||||
|   useEffect(() => { | ||||
|     if (selectedJob?.chat) { | ||||
|       setMessages(selectedJob?.chat); | ||||
|       return; | ||||
|     } | ||||
|  | ||||
|     setMessages([]); | ||||
|   }, [selectedJob]); | ||||
|  | ||||
|   const handleMessageSend = async (msg: string) => { | ||||
|     if (!selectedJob) { | ||||
|       throw Error("Job is not currently selected, but should be."); | ||||
|     } | ||||
|  | ||||
|     const updatedMessages = await sendMessage(msg); | ||||
|     await updateJob([selectedJob?.id], "chat", updatedMessages); | ||||
|   }; | ||||
|  | ||||
|   const sendMessage = async (msg: string) => { | ||||
|     const newMessage = { | ||||
|       content: msg, | ||||
|       role: "user", | ||||
|     }; | ||||
|  | ||||
|     setMessages((prevMessages) => [...prevMessages, newMessage]); | ||||
|     setCurrentMessage(""); | ||||
|     setThinking(true); | ||||
|  | ||||
|     const jobMessage = { | ||||
|       role: "system", | ||||
|       content: `Here is the content returned from a scraping job: ${JSON.stringify( | ||||
|         selectedJob?.result | ||||
|       )} for the url: ${ | ||||
|         selectedJob?.url | ||||
|       }. The following messages will pertain to the content of the scraped job.`, | ||||
|     }; | ||||
|  | ||||
|     const response = await fetch("/api/ai", { | ||||
|       method: "POST", | ||||
|       headers: { | ||||
|         "Content-Type": "application/json", | ||||
|       }, | ||||
|       body: JSON.stringify({ | ||||
|         data: { messages: [jobMessage, ...messages, newMessage] }, | ||||
|       }), | ||||
|     }); | ||||
|  | ||||
|     const updatedMessages = [...messages, newMessage]; | ||||
|  | ||||
|     const reader = response.body?.getReader(); | ||||
|     const decoder = new TextDecoder("utf-8"); | ||||
|  | ||||
|     let aiResponse = ""; | ||||
|     if (reader) { | ||||
|       setThinking(false); | ||||
|       while (true) { | ||||
|         const { done, value } = await reader.read(); | ||||
|         if (done) break; | ||||
|         const chunk = decoder.decode(value, { stream: true }); | ||||
|         aiResponse += chunk; | ||||
|  | ||||
|         setMessages((prevMessages) => { | ||||
|           const lastMessage = prevMessages[prevMessages.length - 1]; | ||||
|           if (lastMessage && lastMessage.role === "assistant") { | ||||
|             return [ | ||||
|               ...prevMessages.slice(0, -1), | ||||
|               { ...lastMessage, content: aiResponse }, | ||||
|             ]; | ||||
|           } else { | ||||
|             return [ | ||||
|               ...prevMessages, | ||||
|               { | ||||
|                 content: aiResponse, | ||||
|                 role: "assistant", | ||||
|               }, | ||||
|             ]; | ||||
|           } | ||||
|         }); | ||||
|       } | ||||
|     } | ||||
|     return [...updatedMessages, { role: "assistant", content: aiResponse }]; | ||||
|   }; | ||||
|  | ||||
|   const handleNewChat = (selectedJob: Job) => { | ||||
|     updateJob([selectedJob.id], "chat", []); | ||||
|     setMessages([]); | ||||
|   }; | ||||
|  | ||||
|   useEffect(() => { | ||||
|     fetchJobs(setJobs); | ||||
|   }, []); | ||||
|  | ||||
|   return ( | ||||
|     <Box | ||||
|       sx={{ | ||||
|         display: "flex", | ||||
|         flexDirection: "column", | ||||
|         height: "95vh", | ||||
|         maxWidth: "100%", | ||||
|         paddingLeft: 0, | ||||
|         paddingRight: 0, | ||||
|         borderRadius: "8px", | ||||
|         border: | ||||
|           theme.palette.mode === "light" ? "solid white" : "solid #4b5057", | ||||
|         boxShadow: "0 4px 8px rgba(0, 0, 0, 0.1)", | ||||
|         overflow: "hidden", | ||||
|       }} | ||||
|     > | ||||
|       {aiEnabled ? ( | ||||
|         <> | ||||
|           <Paper | ||||
|             elevation={3} | ||||
|             sx={{ | ||||
|               p: 2, | ||||
|               textAlign: "center", | ||||
|               fontSize: "1.2em", | ||||
|               position: "relative", | ||||
|               borderRadius: "8px 8px 0 0", | ||||
|               borderBottom: `2px solid ${theme.palette.divider}`, | ||||
|             }} | ||||
|           > | ||||
|             <Box | ||||
|               sx={{ | ||||
|                 display: "flex", | ||||
|                 justifyContent: "center", | ||||
|                 alignItems: "center", | ||||
|                 position: "relative", | ||||
|                 padding: theme.spacing(1), | ||||
|               }} | ||||
|             > | ||||
|               <Typography | ||||
|                 sx={{ | ||||
|                   flex: 1, | ||||
|                   textAlign: "center", | ||||
|                 }} | ||||
|               > | ||||
|                 Chat with AI | ||||
|               </Typography> | ||||
|               <JobSelector | ||||
|                 selectedJob={selectedJob} | ||||
|                 setSelectedJob={setSelectedJob} | ||||
|                 setJobs={setJobs} | ||||
|                 jobs={jobs} | ||||
|                 sxProps={{ | ||||
|                   position: "absolute", | ||||
|                   right: theme.spacing(2), | ||||
|                   width: "25%", | ||||
|                 }} | ||||
|               /> | ||||
|             </Box> | ||||
|           </Paper> | ||||
|           <Box | ||||
|             sx={{ | ||||
|               position: "relative", | ||||
|               flex: 1, | ||||
|               p: 2, | ||||
|               overflowY: "auto", | ||||
|               maxHeight: "100%", | ||||
|             }} | ||||
|           > | ||||
|             {!selectedJob ? ( | ||||
|               <Box | ||||
|                 sx={{ | ||||
|                   position: "absolute", | ||||
|                   top: 0, | ||||
|                   left: "50%", | ||||
|                   transform: "translateX(-50%)", | ||||
|                   padding: 2, | ||||
|                   bgcolor: "rgba(128,128,128,0.1)", | ||||
|                   mt: 1, | ||||
|                   borderRadius: "8px", | ||||
|                 }} | ||||
|                 className="rounded-md" | ||||
|               > | ||||
|                 <Typography variant="body1"> | ||||
|                   Select a Job to Begin Chatting | ||||
|                 </Typography> | ||||
|               </Box> | ||||
|             ) : ( | ||||
|               <> | ||||
|                 {messages && | ||||
|                   messages.map((message, index) => ( | ||||
|                     <Box | ||||
|                       key={index} | ||||
|                       sx={{ | ||||
|                         my: 2, | ||||
|                         p: 1, | ||||
|                         borderRadius: "8px", | ||||
|                         boxShadow: "0 2px 4px rgba(0, 0, 0, 0.1)", | ||||
|                         bgcolor: | ||||
|                           message.role === "user" | ||||
|                             ? theme.palette.UserMessage.main | ||||
|                             : theme.palette.AIMessage.main, | ||||
|                         marginLeft: message.role === "user" ? "auto" : "", | ||||
|                         maxWidth: "40%", | ||||
|                       }} | ||||
|                     > | ||||
|                       <Typography variant="body1" sx={{ color: "white" }}> | ||||
|                         {message.content} | ||||
|                       </Typography> | ||||
|                     </Box> | ||||
|                   ))} | ||||
|                 {thinking && ( | ||||
|                   <Box | ||||
|                     sx={{ | ||||
|                       width: "full", | ||||
|                       display: "flex", | ||||
|                       flexDirection: "column", | ||||
|                       justifyContent: "start", | ||||
|                     }} | ||||
|                   > | ||||
|                     <Typography | ||||
|                       sx={{ | ||||
|                         bgcolor: "rgba(128,128,128,0.1)", | ||||
|                         maxWidth: "20%", | ||||
|                         my: 2, | ||||
|                         p: 1, | ||||
|                         borderRadius: "8px", | ||||
|                         boxShadow: "0 2px 4px rgba(0, 0, 0, 0.1)", | ||||
|                       }} | ||||
|                       variant="body1" | ||||
|                     > | ||||
|                       AI is thinking... | ||||
|                     </Typography> | ||||
|                   </Box> | ||||
|                 )} | ||||
|               </> | ||||
|             )} | ||||
|           </Box> | ||||
|           <Box | ||||
|             sx={{ | ||||
|               display: "flex", | ||||
|               p: 2, | ||||
|               borderTop: `1px solid ${theme.palette.divider}`, | ||||
|             }} | ||||
|           > | ||||
|             <Tooltip title="New Chat" placement="top"> | ||||
|               <IconButton | ||||
|                 disabled={!(messages.length > 0)} | ||||
|                 sx={{ marginRight: 2 }} | ||||
|                 size="medium" | ||||
|                 onClick={() => { | ||||
|                   if (!selectedJob) { | ||||
|                     throw new Error("Selected job must be present but isn't."); | ||||
|                   } | ||||
|                   handleNewChat(selectedJob); | ||||
|                 }} | ||||
|               > | ||||
|                 <EditNoteIcon fontSize="medium" /> | ||||
|               </IconButton> | ||||
|             </Tooltip> | ||||
|             <TextField | ||||
|               fullWidth | ||||
|               placeholder="Type your message here..." | ||||
|               disabled={!selectedJob} | ||||
|               value={currentMessage} | ||||
|               onChange={(e) => setCurrentMessage(e.target.value)} | ||||
|               onKeyDown={(e) => { | ||||
|                 if (e.key === "Enter") { | ||||
|                   handleMessageSend(currentMessage); | ||||
|                 } | ||||
|               }} | ||||
|               sx={{ borderRadius: "8px" }} | ||||
|             /> | ||||
|  | ||||
|             <Tooltip title="Send" placement="top"> | ||||
|               <IconButton | ||||
|                 color="primary" | ||||
|                 sx={{ ml: 2 }} | ||||
|                 disabled={!selectedJob} | ||||
|                 onClick={() => { | ||||
|                   handleMessageSend(currentMessage); | ||||
|                 }} | ||||
|               > | ||||
|                 <SendIcon /> | ||||
|               </IconButton> | ||||
|             </Tooltip> | ||||
|           </Box> | ||||
|         </> | ||||
|       ) : ( | ||||
|         <Box | ||||
|           bgcolor="background.default" | ||||
|           minHeight="100vh" | ||||
|           display="flex" | ||||
|           justifyContent="center" | ||||
|           alignItems="center" | ||||
|         > | ||||
|           <h4 | ||||
|             style={{ | ||||
|               color: "#fff", | ||||
|               padding: "20px", | ||||
|               borderRadius: "8px", | ||||
|               background: "rgba(0, 0, 0, 0.6)", | ||||
|               boxShadow: "0 4px 8px rgba(0, 0, 0, 0.2)", | ||||
|             }} | ||||
|           > | ||||
|             Must set either OPENAI_KEY or OLLAMA_MODEL to use AI features. | ||||
|           </h4> | ||||
|         </Box> | ||||
|       )} | ||||
|     </Box> | ||||
|   ); | ||||
| }; | ||||
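
`sendMessage` above consumes `/api/ai` as a raw text stream through `response.body.getReader()`, so whatever sits behind that route has to relay chunks as they arrive rather than buffering the whole completion. A sketch of such a relay follows; the route path mirrors the fetch above, but everything else (backend endpoint, env variable, lack of auth) is assumed:

// pages/api/ai.ts (hypothetical streaming relay sketch)
import type { NextApiRequest, NextApiResponse } from "next";

export default async function handler(
  req: NextApiRequest,
  res: NextApiResponse
) {
  const backend = process.env.SERVER_URL ?? "http://scraperr_api:8000"; // assumed env var
  const upstream = await fetch(`${backend}/api/ai`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(req.body),
  });

  res.status(upstream.status);
  const reader = upstream.body?.getReader();
  if (!reader) {
    res.end();
    return;
  }

  // Relay each chunk immediately so the client's TextDecoder loop can
  // render tokens incrementally instead of waiting for the full reply.
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    res.write(Buffer.from(value));
  }
  res.end();
}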
| @@ -0,0 +1,182 @@ | ||||
| import { Job } from "@/types"; | ||||
| import { | ||||
|   Button, | ||||
|   Dialog, | ||||
|   DialogTitle, | ||||
|   DialogContent, | ||||
|   TextField, | ||||
|   Snackbar, | ||||
|   Alert, | ||||
| } from "@mui/material"; | ||||
| import Cookies from "js-cookie"; | ||||
| import { useState } from "react"; | ||||
|  | ||||
| export type CreateCronJobsProps = { | ||||
|   availableJobs: Job[]; | ||||
|   user: any; | ||||
| }; | ||||
|  | ||||
| export const CreateCronJobs = ({ | ||||
|   availableJobs, | ||||
|   user, | ||||
| }: CreateCronJobsProps) => { | ||||
|   const [open, setOpen] = useState(false); | ||||
|  | ||||
|   return ( | ||||
|     <> | ||||
|       <Button | ||||
|         variant="contained" | ||||
|         color="primary" | ||||
|         onClick={() => setOpen(true)} | ||||
|         sx={{ borderRadius: 2 }} | ||||
|       > | ||||
|         Create Cron Job | ||||
|       </Button> | ||||
|       <CreateCronJobDialog | ||||
|         open={open} | ||||
|         onClose={() => setOpen(false)} | ||||
|         availableJobs={availableJobs} | ||||
|         user={user} | ||||
|       /> | ||||
|     </> | ||||
|   ); | ||||
| }; | ||||
|  | ||||
| const CreateCronJobDialog = ({ | ||||
|   open, | ||||
|   onClose, | ||||
|   availableJobs, | ||||
|   user, | ||||
| }: { | ||||
|   open: boolean; | ||||
|   onClose: () => void; | ||||
|   availableJobs: Job[]; | ||||
|   user: any; | ||||
| }) => { | ||||
|   const [cronExpression, setCronExpression] = useState(""); | ||||
|   const [jobId, setJobId] = useState(""); | ||||
|   const [successOpen, setSuccessOpen] = useState(false); | ||||
|   const [isSubmitting, setIsSubmitting] = useState(false); | ||||
|   const [error, setError] = useState(""); | ||||
|  | ||||
|   const handleSubmit = async () => { | ||||
|     if (!cronExpression || !jobId) { | ||||
|       setError("Please fill in all fields"); | ||||
|       return; | ||||
|     } | ||||
|  | ||||
|     setIsSubmitting(true); | ||||
|     const token = Cookies.get("token"); | ||||
|  | ||||
|     try { | ||||
|       const response = await fetch("/api/schedule-cron-job", { | ||||
|         method: "POST", | ||||
|         headers: { | ||||
|           "Content-Type": "application/json", | ||||
|           Authorization: `Bearer ${token}`, | ||||
|         }, | ||||
|         body: JSON.stringify({ | ||||
|           data: { | ||||
|             cron_expression: cronExpression, | ||||
|             job_id: jobId, | ||||
|             user_email: user.email, | ||||
|           }, | ||||
|         }), | ||||
|       }); | ||||
|  | ||||
|       if (!response.ok) { | ||||
|         throw new Error("Failed to schedule job"); | ||||
|       } | ||||
|  | ||||
|       setSuccessOpen(true); | ||||
|       setCronExpression(""); | ||||
|       setJobId(""); | ||||
|       setTimeout(() => { | ||||
|         onClose(); | ||||
|       }, 1500); | ||||
|       window.location.reload(); | ||||
|     } catch (error) { | ||||
|       console.error(error); | ||||
|       setError("Failed to create cron job"); | ||||
|     } finally { | ||||
|       setIsSubmitting(false); | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|   const handleClose = () => { | ||||
|     setSuccessOpen(false); | ||||
|   }; | ||||
|  | ||||
|   return ( | ||||
|     <> | ||||
|       <Dialog | ||||
|         open={open} | ||||
|         onClose={onClose} | ||||
|         PaperProps={{ | ||||
|           sx: { borderRadius: 2, minWidth: "400px" }, | ||||
|         }} | ||||
|       > | ||||
|         <DialogTitle sx={{ fontWeight: 500 }}>Create Cron Job</DialogTitle> | ||||
|         <DialogContent> | ||||
|           <div className="flex flex-col gap-1 mt-0"> | ||||
|             <TextField | ||||
|               label="Cron Expression" | ||||
|               fullWidth | ||||
|               value={cronExpression} | ||||
|               onChange={(e) => setCronExpression(e.target.value)} | ||||
|               variant="outlined" | ||||
|               placeholder="* * * * *" | ||||
|               margin="normal" | ||||
|               helperText="Format: minute hour day month day-of-week" | ||||
|             /> | ||||
|  | ||||
|             <TextField | ||||
|               label="Job ID" | ||||
|               fullWidth | ||||
|               value={jobId} | ||||
|               onChange={(e) => setJobId(e.target.value)} | ||||
|               variant="outlined" | ||||
|               margin="normal" | ||||
|             /> | ||||
|  | ||||
|             {error && ( | ||||
|               <Alert severity="error" sx={{ mt: 2 }}> | ||||
|                 {error} | ||||
|               </Alert> | ||||
|             )} | ||||
|  | ||||
|             <div className="flex justify-end gap-2 mt-4"> | ||||
|               <Button | ||||
|                 variant="outlined" | ||||
|                 onClick={onClose} | ||||
|                 sx={{ borderRadius: 2 }} | ||||
|               > | ||||
|                 Cancel | ||||
|               </Button> | ||||
|               <Button | ||||
|                 variant="contained" | ||||
|                 color="primary" | ||||
|                 onClick={handleSubmit} | ||||
|                 disabled={isSubmitting} | ||||
|                 sx={{ borderRadius: 2 }} | ||||
|               > | ||||
|                 {isSubmitting ? "Submitting..." : "Create Job"} | ||||
|               </Button> | ||||
|             </div> | ||||
|           </div> | ||||
|         </DialogContent> | ||||
|       </Dialog> | ||||
|  | ||||
|       <Snackbar | ||||
|         open={successOpen} | ||||
|         autoHideDuration={4000} | ||||
|         onClose={handleClose} | ||||
|         anchorOrigin={{ vertical: "bottom", horizontal: "right" }} | ||||
|       > | ||||
|         <Alert onClose={handleClose} severity="success" sx={{ width: "100%" }}> | ||||
|           Cron job created successfully! | ||||
|         </Alert> | ||||
|       </Snackbar> | ||||
|     </> | ||||
|   ); | ||||
| }; | ||||
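Note: the dialog above only checks that the cron expression field is non-empty, so malformed expressions are caught server-side, if at all. A loose client-side shape check could sit next to handleSubmit. This is a sketch only, not part of this diff; a library such as cron-parser would be more thorough:

// Loose five-field cron shape check (minute hour day month day-of-week).
// Validates structure only, not value ranges; server-side validation stays authoritative.
const CRON_FIELD = /^(\*|\d+(-\d+)?)(\/\d+)?(,(\*|\d+(-\d+)?)(\/\d+)?)*$/;

export function looksLikeCron(expression: string): boolean {
  const fields = expression.trim().split(/\s+/);
  return fields.length === 5 && fields.every((f) => CRON_FIELD.test(f));
}

// looksLikeCron("* * * * *")        -> true
// looksLikeCron("*/5 0 1,15 * 1-5") -> true
// looksLikeCron("every minute")     -> false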
src/components/pages/cron-jobs/create-cron-jobs/index.ts (new file, 1 line)
| @@ -0,0 +1 @@ | ||||
| export * from "./create-cron-jobs"; | ||||
src/components/pages/cron-jobs/cron-jobs.module.css (new file, empty)
src/components/pages/cron-jobs/cron-jobs.tsx (new file, 124 lines)
| @@ -0,0 +1,124 @@ | ||||
| import { Job, CronJob } from "@/types/job"; | ||||
| import { useState, useEffect } from "react"; | ||||
| import { CreateCronJobs } from "./create-cron-jobs"; | ||||
| import { | ||||
|   Table, | ||||
|   TableHead, | ||||
|   TableRow, | ||||
|   TableCell, | ||||
|   TableBody, | ||||
|   Button, | ||||
|   Box, | ||||
|   Typography, | ||||
|   useTheme, | ||||
| } from "@mui/material"; | ||||
| import Cookies from "js-cookie"; | ||||
|  | ||||
| export type CronJobsProps = { | ||||
|   initialJobs: Job[]; | ||||
|   initialCronJobs: CronJob[]; | ||||
|   initialUser: any; | ||||
| }; | ||||
|  | ||||
| export const CronJobs = ({ | ||||
|   initialJobs, | ||||
|   initialCronJobs, | ||||
|   initialUser, | ||||
| }: CronJobsProps) => { | ||||
|   const [jobs, setJobs] = useState<Job[]>(initialJobs); | ||||
|   const [cronJobs, setCronJobs] = useState<CronJob[]>(initialCronJobs); | ||||
|   const [user, setUser] = useState<any>(initialUser); | ||||
|   const theme = useTheme(); | ||||
|  | ||||
|   useEffect(() => { | ||||
|     setJobs(initialJobs); | ||||
|     setCronJobs(initialCronJobs); | ||||
|     setUser(initialUser); | ||||
|   }, [initialJobs, initialCronJobs, initialUser]); | ||||
|  | ||||
|   const handleDeleteCronJob = async (id: string) => { | ||||
|     const token = Cookies.get("token"); | ||||
|     const response = await fetch("/api/delete-cron-job", { | ||||
|       method: "POST", | ||||
|       headers: { | ||||
|         "Content-Type": "application/json", | ||||
|         Authorization: `Bearer ${token}`, | ||||
|       }, | ||||
|       body: JSON.stringify({ data: { id, user_email: user.email } }), | ||||
|     }); | ||||
|  | ||||
|     if (response.ok) { | ||||
|       console.log("Cron job deleted successfully"); | ||||
|       setCronJobs(cronJobs.filter((cronJob) => cronJob.id !== id)); | ||||
|     } else { | ||||
|       console.error("Failed to delete cron job"); | ||||
|     } | ||||
|   }; | ||||
|  | ||||
|   if (!user) { | ||||
|     return ( | ||||
|       <Box | ||||
|         sx={{ | ||||
|           display: "flex", | ||||
|           justifyContent: "center", | ||||
|           alignItems: "center", | ||||
|           height: "100%", | ||||
|           borderRadius: "8px", | ||||
|           border: | ||||
|             theme.palette.mode === "light" ? "solid white" : "solid #4b5057", | ||||
|           boxShadow: "0 4px 8px rgba(0, 0, 0, 0.1)", | ||||
|         }} | ||||
|       > | ||||
|         <h4 | ||||
|           style={{ | ||||
|             color: "#fff", | ||||
|             padding: "20px", | ||||
|             borderRadius: "8px", | ||||
|             background: "rgba(0, 0, 0, 0.6)", | ||||
|           }} | ||||
|         > | ||||
|           Please login to view your cron jobs | ||||
|         </h4> | ||||
|       </Box> | ||||
|     ); | ||||
|   } | ||||
|  | ||||
|   return ( | ||||
|     <div> | ||||
|       <CreateCronJobs availableJobs={jobs} user={user} /> | ||||
|  | ||||
|       <Table> | ||||
|         <TableHead> | ||||
|           <TableRow> | ||||
|             <TableCell>Cron Expression</TableCell> | ||||
|             <TableCell>Job ID</TableCell> | ||||
|             <TableCell>User Email</TableCell> | ||||
|             <TableCell>Created At</TableCell> | ||||
|             <TableCell>Updated At</TableCell> | ||||
|             <TableCell>Actions</TableCell> | ||||
|           </TableRow> | ||||
|         </TableHead> | ||||
|         <TableBody> | ||||
|           {cronJobs.map((cronJob) => ( | ||||
|             <TableRow key={cronJob.id}> | ||||
|               <TableCell>{cronJob.cron_expression}</TableCell> | ||||
|               <TableCell>{cronJob.job_id}</TableCell> | ||||
|               <TableCell>{cronJob.user_email}</TableCell> | ||||
|               <TableCell> | ||||
|                 {new Date(cronJob.time_created).toLocaleString()} | ||||
|               </TableCell> | ||||
|               <TableCell> | ||||
|                 {new Date(cronJob.time_updated).toLocaleString()} | ||||
|               </TableCell> | ||||
|               <TableCell> | ||||
|                 <Button onClick={() => handleDeleteCronJob(cronJob.id)}> | ||||
|                   Delete | ||||
|                 </Button> | ||||
|               </TableCell> | ||||
|             </TableRow> | ||||
|           ))} | ||||
|         </TableBody> | ||||
|       </Table> | ||||
|     </div> | ||||
|   ); | ||||
| }; | ||||
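For reference, the table above reads six fields off each record. Inferred from that usage (the authoritative definition is the CronJob type in @/types/job, which is not shown in this diff), the shape is roughly:

// Sketch inferred from the fields cron-jobs.tsx renders; the repo's own type is authoritative.
export type CronJob = {
  id: string;
  cron_expression: string;
  job_id: string;
  user_email: string;
  time_created: string; // ISO timestamp; rendered via new Date(...).toLocaleString()
  time_updated: string; // ISO timestamp
};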
src/components/pages/cron-jobs/get-server-side-props.ts (new file, 62 lines)
| @@ -0,0 +1,62 @@ | ||||
| import axios from "axios"; | ||||
| import { GetServerSideProps } from "next"; | ||||
| import { parseCookies } from "nookies"; | ||||
| import { CronJob, Job } from "../../../types"; | ||||
|  | ||||
| export const getServerSideProps: GetServerSideProps = async (context) => { | ||||
|   const { req } = context; | ||||
|   const cookies = parseCookies({ req }); | ||||
|   const token = cookies.token; | ||||
|   let user = null; | ||||
|   let initialJobs: Job[] = []; | ||||
|   let initialCronJobs: CronJob[] = []; | ||||
|   if (token) { | ||||
|     try { | ||||
|       const userResponse = await axios.get( | ||||
|         `${process.env.NEXT_PUBLIC_API_URL}/api/auth/users/me`, | ||||
|         { | ||||
|           headers: { Authorization: `Bearer ${token}` }, | ||||
|         } | ||||
|       ); | ||||
|  | ||||
|       user = userResponse.data; | ||||
|  | ||||
|       const jobsResponse = await fetch( | ||||
|         `${process.env.NEXT_PUBLIC_API_URL}/api/retrieve-scrape-jobs`, | ||||
|         { | ||||
|           method: "POST", | ||||
|           body: JSON.stringify({ user: user.email }), | ||||
|           headers: { | ||||
|             "content-type": "application/json", | ||||
|             Authorization: `Bearer ${token}`, | ||||
|           }, | ||||
|         } | ||||
|       ); | ||||
|  | ||||
|       initialJobs = await jobsResponse.json(); | ||||
|  | ||||
|       const cronJobsResponse = await fetch( | ||||
|         `${process.env.NEXT_PUBLIC_API_URL}/api/cron-jobs`, | ||||
|         { | ||||
|           headers: { | ||||
|             "content-type": "application/json", | ||||
|             Authorization: `Bearer ${token}`, | ||||
|           }, | ||||
|         } | ||||
|       ); | ||||
|  | ||||
|       initialCronJobs = await cronJobsResponse.json(); | ||||
|     } catch (error) { | ||||
|       console.error("Error fetching user or jobs:", error); | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   return { | ||||
|     props: { | ||||
|       initialJobs, | ||||
|       initialUser: user, | ||||
|       initialCronJobs, | ||||
|     }, | ||||
|   }; | ||||
| }; | ||||
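A page module would re-export this loader and hand its props straight to the component. A sketch of the assumed wiring (the actual pages/ file is outside this excerpt):

// pages/cron-jobs.tsx: hypothetical wiring, assuming the Next.js pages router.
import { CronJobs } from "@/components/pages/cron-jobs";
export { getServerSideProps } from "@/components/pages/cron-jobs/get-server-side-props";

// CronJobs receives initialJobs, initialCronJobs, and initialUser as props.
export default CronJobs;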
src/components/pages/cron-jobs/index.ts (new file, 1 line)
| @@ -0,0 +1 @@ | ||||
| export { CronJobs } from "./cron-jobs"; | ||||
src/components/pages/home/home.tsx (new file, 107 lines)
| @@ -0,0 +1,107 @@ | ||||
| "use client"; | ||||
|  | ||||
| import React, { useState, useEffect, useRef } from "react"; | ||||
| import { Button, Container, Box, Snackbar, Alert } from "@mui/material"; | ||||
| import { useRouter } from "next/router"; | ||||
| import { Element, Result } from "@/types"; | ||||
| import { ElementTable, JobSubmitter } from "@/components/submit/job-submitter"; | ||||
| import { useJobSubmitterProvider } from "@/components/submit/job-submitter/provider"; | ||||
|  | ||||
| export const Home = () => { | ||||
|   const { | ||||
|     submittedURL, | ||||
|     setSubmittedURL, | ||||
|     rows, | ||||
|     setRows, | ||||
|     results, | ||||
|     snackbarOpen, | ||||
|     setSnackbarOpen, | ||||
|     snackbarMessage, | ||||
|     snackbarSeverity, | ||||
|   } = useJobSubmitterProvider(); | ||||
|   const router = useRouter(); | ||||
|   const { elements, url } = router.query; | ||||
|  | ||||
|   const resultsRef = useRef<HTMLTableElement | null>(null); | ||||
|  | ||||
|   useEffect(() => { | ||||
|     if (elements) { | ||||
|       setRows(JSON.parse(elements as string)); | ||||
|     } | ||||
|     if (url) { | ||||
|       setSubmittedURL(url as string); | ||||
|     } | ||||
|   }, [elements, url]); | ||||
|  | ||||
|   useEffect(() => { | ||||
|     if (results && resultsRef.current) { | ||||
|       resultsRef.current.scrollIntoView({ behavior: "smooth" }); | ||||
|     } | ||||
|   }, [results]); | ||||
|  | ||||
|   const handleCloseSnackbar = () => { | ||||
|     setSnackbarOpen(false); | ||||
|   }; | ||||
|  | ||||
|   const ErrorSnackbar = () => { | ||||
|     return ( | ||||
|       <Snackbar | ||||
|         open={snackbarOpen} | ||||
|         autoHideDuration={6000} | ||||
|         onClose={handleCloseSnackbar} | ||||
|       > | ||||
|         <Alert onClose={handleCloseSnackbar} severity="error"> | ||||
|           {snackbarMessage} | ||||
|         </Alert> | ||||
|       </Snackbar> | ||||
|     ); | ||||
|   }; | ||||
|  | ||||
|   const NotifySnackbar = () => { | ||||
|     const goTo = () => { | ||||
|       router.push("/jobs"); | ||||
|     }; | ||||
|  | ||||
|     const action = ( | ||||
|       <Button color="inherit" size="small" onClick={goTo}> | ||||
|         Go To Job | ||||
|       </Button> | ||||
|     ); | ||||
|  | ||||
|     return ( | ||||
|       <Snackbar | ||||
|         open={snackbarOpen} | ||||
|         autoHideDuration={6000} | ||||
|         onClose={handleCloseSnackbar} | ||||
|       > | ||||
|         <Alert onClose={handleCloseSnackbar} severity="info" action={action}> | ||||
|           {snackbarMessage} | ||||
|         </Alert> | ||||
|       </Snackbar> | ||||
|     ); | ||||
|   }; | ||||
|  | ||||
|   return ( | ||||
|     <Box | ||||
|       bgcolor="background.default" | ||||
|       display="flex" | ||||
|       flexDirection="column" | ||||
|       justifyContent="center" | ||||
|       alignItems="center" | ||||
|       height="100%" | ||||
|       py={4} | ||||
|     > | ||||
|       <Container maxWidth="lg" className="overflow-y-auto max-h-full"> | ||||
|         <JobSubmitter /> | ||||
|         {submittedURL.length ? ( | ||||
|           <ElementTable | ||||
|             rows={rows} | ||||
|             setRows={setRows} | ||||
|             submittedURL={submittedURL} | ||||
|           /> | ||||
|         ) : null} | ||||
|       </Container> | ||||
|       {snackbarSeverity === "info" ? <NotifySnackbar /> : <ErrorSnackbar />} | ||||
|     </Box> | ||||
|   ); | ||||
| }; | ||||
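The first useEffect above rehydrates the form from router.query, so any page can deep-link into the submitter by serializing its rows. The producing side is not shown in this diff; a hypothetical caller would look like:

// Hypothetical helper that re-opens a job in the submitter with prefilled rows.
import { useRouter } from "next/router";
import { Element } from "@/types";

export const useOpenInSubmitter = () => {
  const router = useRouter();
  return (url: string, rows: Element[]) =>
    router.push({
      pathname: "/",
      query: { url, elements: JSON.stringify(rows) }, // parsed back by Home's useEffect
    });
};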
src/components/pages/home/index.ts (new file, 1 line)
| @@ -0,0 +1 @@ | ||||
| export * from "./home"; | ||||
| @@ -1,2 +1 @@ | ||||
| export * from "./ElementTable"; | ||||
| export * from "./job-submitter"; | ||||
|   | ||||
| @@ -15,9 +15,11 @@ import { | ||||
|   IconButton, | ||||
|   Tooltip, | ||||
|   useTheme, | ||||
|   Divider, | ||||
| } from "@mui/material"; | ||||
| import AddIcon from "@mui/icons-material/Add"; | ||||
| import { Element } from "../../types"; | ||||
| import { Element } from "@/types"; | ||||
| import { SiteMap } from "../site-map"; | ||||
|  | ||||
| interface Props { | ||||
|   rows: Element[]; | ||||
| @@ -169,6 +171,13 @@ export const ElementTable = ({ rows, setRows, submittedURL }: Props) => { | ||||
|           </div> | ||||
|         </TableContainer> | ||||
|       </Box> | ||||
|       <Divider | ||||
|         sx={{ | ||||
|           borderColor: theme.palette.mode === "dark" ? "#ffffff" : "#000000", | ||||
|           marginBottom: 2, | ||||
|         }} | ||||
|       /> | ||||
|       <SiteMap /> | ||||
|     </Box> | ||||
|   ); | ||||
| }; | ||||
| @@ -0,0 +1 @@ | ||||
| export { ElementTable } from "./element-table"; | ||||
| @@ -1 +1,2 @@ | ||||
| export { JobSubmitter } from "./job-submitter"; | ||||
| export { ElementTable } from "./element-table"; | ||||
|   | ||||
| @@ -1,26 +1,20 @@ | ||||
| import React, { Dispatch } from "react"; | ||||
| import React from "react"; | ||||
| import { TextField, Button, CircularProgress } from "@mui/material"; | ||||
| import { Element } from "@/types"; | ||||
| import { useJobSubmitterProvider } from "../provider"; | ||||
|  | ||||
| export type JobSubmitterInputProps = { | ||||
|   submittedURL: string; | ||||
|   setSubmittedURL: Dispatch<React.SetStateAction<string>>; | ||||
|   isValidURL: boolean; | ||||
|   urlError: string | null; | ||||
|   handleSubmit: () => void; | ||||
|   loading: boolean; | ||||
|   rows: Element[]; | ||||
| }; | ||||
|  | ||||
| export const JobSubmitterInput = ({ | ||||
|   submittedURL, | ||||
|   setSubmittedURL, | ||||
|   isValidURL, | ||||
|   urlError, | ||||
|   handleSubmit, | ||||
|   loading, | ||||
|   rows, | ||||
|   urlError, | ||||
| }: JobSubmitterInputProps) => { | ||||
|   const { submittedURL, setSubmittedURL, isValidURL, rows } = | ||||
|     useJobSubmitterProvider(); | ||||
|   return ( | ||||
|     <div className="flex flex-row space-x-4 items-center mb-2"> | ||||
|       <TextField | ||||
| @@ -40,7 +34,7 @@ export const JobSubmitterInput = ({ | ||||
|         size="small" | ||||
|         onClick={handleSubmit} | ||||
|         disabled={!(rows.length > 0) || loading} | ||||
|         className={`bg-gradient-to-r from-[#034efc] to-gray-500 text-white font-semibold rounded-md  | ||||
|         className={`bg-[#034efc] text-white font-semibold rounded-md  | ||||
|                     transition-transform transform hover:scale-105 disabled:opacity-50`} | ||||
|       > | ||||
|         {loading ? <CircularProgress size={24} color="inherit" /> : "Submit"} | ||||
|   | ||||
| @@ -1,38 +1,91 @@ | ||||
| import { RawJobOptions } from "@/types/job"; | ||||
| import { Box, FormControlLabel, Checkbox, TextField } from "@mui/material"; | ||||
| import { Dispatch, SetStateAction } from "react"; | ||||
|  | ||||
| import { JobOptions } from "@/types/job"; | ||||
|  | ||||
| export type JobSubmitterOptionsProps = { | ||||
|   jobOptions: JobOptions; | ||||
|   setJobOptions: Dispatch<SetStateAction<JobOptions>>; | ||||
|   jobOptions: RawJobOptions; | ||||
|   setJobOptions: Dispatch<SetStateAction<RawJobOptions>>; | ||||
|   customJSONSelected: boolean; | ||||
|   setCustomJSONSelected: Dispatch<SetStateAction<boolean>>; | ||||
|   handleSelectProxies: () => void; | ||||
|   proxiesSelected: boolean; | ||||
| }; | ||||
|  | ||||
| export const JobSubmitterOptions = ({ | ||||
|   jobOptions, | ||||
|   setJobOptions, | ||||
|   handleSelectProxies, | ||||
|   customJSONSelected, | ||||
|   setCustomJSONSelected, | ||||
|   proxiesSelected, | ||||
| }: JobSubmitterOptionsProps) => { | ||||
|   const handleMultiPageScrapeChange = () => { | ||||
|     setJobOptions((prevJobOptions) => ({ | ||||
|       ...prevJobOptions, | ||||
|       multi_page_scrape: !prevJobOptions.multi_page_scrape, | ||||
|     })); | ||||
|   }; | ||||
|  | ||||
|   const handleProxiesChange = (e: React.ChangeEvent<HTMLInputElement>) => { | ||||
|     setJobOptions((prevJobOptions) => ({ | ||||
|       ...prevJobOptions, | ||||
|       proxies: e.target.value, | ||||
|     })); | ||||
|   }; | ||||
|  | ||||
|   const handleCustomHeadersChange = ( | ||||
|     e: React.ChangeEvent<HTMLInputElement> | ||||
|   ) => { | ||||
|     setJobOptions((prevJobOptions) => ({ | ||||
|       ...prevJobOptions, | ||||
|       custom_headers: e.target.value, | ||||
|     })); | ||||
|   }; | ||||
|  | ||||
|   const handleCollectMediaChange = () => { | ||||
|     setJobOptions((prevJobOptions) => ({ | ||||
|       ...prevJobOptions, | ||||
|       collect_media: !prevJobOptions.collect_media, | ||||
|     })); | ||||
|   }; | ||||
|  | ||||
|   return ( | ||||
|     <Box bgcolor="background.paper" className="flex flex-col mb-2 rounded-md"> | ||||
|       <div id="options" className="p-2 flex flex-row space-x-2"> | ||||
|         <FormControlLabel | ||||
|           label="Multi-Page Scrape" | ||||
|           className="mr-0" | ||||
|           control={ | ||||
|             <Checkbox | ||||
|               checked={jobOptions.multi_page_scrape} | ||||
|               onChange={() => | ||||
|                 setJobOptions((prevJobOptions) => ({ | ||||
|                   ...prevJobOptions, | ||||
|                   multi_page_scrape: !prevJobOptions.multi_page_scrape, | ||||
|                 })) | ||||
|               } | ||||
|               onChange={handleMultiPageScrapeChange} | ||||
|             /> | ||||
|           } | ||||
|         ></FormControlLabel> | ||||
|         <FormControlLabel | ||||
|           label="Proxies" | ||||
|           control={ | ||||
|             <Checkbox | ||||
|               checked={proxiesSelected} | ||||
|               onChange={handleSelectProxies} | ||||
|             /> | ||||
|           } | ||||
|         ></FormControlLabel> | ||||
|         {proxiesSelected ? ( | ||||
|           <div id="proxies"> | ||||
|             <TextField | ||||
|               InputLabelProps={{ shrink: false }} | ||||
|               fullWidth | ||||
|               multiline={false} | ||||
|               variant="outlined" | ||||
|               value={jobOptions.proxies || ""} | ||||
|               onChange={handleProxiesChange} | ||||
|               inputProps={{ | ||||
|                 style: { whiteSpace: "nowrap", overflowX: "auto" }, | ||||
|               }} | ||||
|             /> | ||||
|           </div> | ||||
|         ) : null} | ||||
|         <FormControlLabel | ||||
|           label="Custom Headers (JSON)" | ||||
|           control={ | ||||
| @@ -48,6 +101,15 @@ export const JobSubmitterOptions = ({ | ||||
|             /> | ||||
|           } | ||||
|         ></FormControlLabel> | ||||
|         <FormControlLabel | ||||
|           label="Collect Media" | ||||
|           control={ | ||||
|             <Checkbox | ||||
|               checked={jobOptions.collect_media} | ||||
|               onChange={handleCollectMediaChange} | ||||
|             /> | ||||
|           } | ||||
|         /> | ||||
|       </div> | ||||
|       {customJSONSelected ? ( | ||||
|         <div id="custom-json" className="pl-2 pr-2 pb-2"> | ||||
| @@ -58,14 +120,8 @@ export const JobSubmitterOptions = ({ | ||||
|             minRows={4} | ||||
|             variant="outlined" | ||||
|             value={jobOptions.custom_headers || ""} | ||||
|             onChange={(e) => | ||||
|               setJobOptions((prevJobOptions) => ({ | ||||
|                 ...prevJobOptions, | ||||
|                 custom_headers: e.target.value, | ||||
|               })) | ||||
|             } | ||||
|             onChange={handleCustomHeadersChange} | ||||
|             style={{ maxHeight: "20vh", overflow: "auto" }} | ||||
|             className="mt-2" | ||||
|           /> | ||||
|         </div> | ||||
|       ) : null} | ||||
|   | ||||
| @@ -1,70 +1,51 @@ | ||||
| "use client"; | ||||
|  | ||||
| import React, { useEffect, useState, Dispatch } from "react"; | ||||
| import { Element } from "@/types"; | ||||
| import React, { useEffect, useState } from "react"; | ||||
| import { useAuth } from "@/contexts/AuthContext"; | ||||
| import { useRouter } from "next/router"; | ||||
| import { Constants } from "@/lib"; | ||||
|  | ||||
| import { RawJobOptions } from "@/types/job"; | ||||
| import { parseJobOptions, validateURL } from "@/lib"; | ||||
| import { JobSubmitterHeader } from "./job-submitter-header"; | ||||
| import { JobSubmitterInput } from "./job-submitter-input"; | ||||
| import { JobSubmitterOptions } from "./job-submitter-options"; | ||||
| import { ApiService } from "@/services"; | ||||
| import { useJobSubmitterProvider } from "./provider"; | ||||
|  | ||||
| interface StateProps { | ||||
|   submittedURL: string; | ||||
|   setSubmittedURL: Dispatch<React.SetStateAction<string>>; | ||||
|   rows: Element[]; | ||||
|   isValidURL: boolean; | ||||
|   setIsValidUrl: Dispatch<React.SetStateAction<boolean>>; | ||||
|   setSnackbarMessage: Dispatch<React.SetStateAction<string>>; | ||||
|   setSnackbarOpen: Dispatch<React.SetStateAction<boolean>>; | ||||
|   setSnackbarSeverity: Dispatch<React.SetStateAction<string>>; | ||||
| } | ||||
| const initialJobOptions: RawJobOptions = { | ||||
|   multi_page_scrape: false, | ||||
|   custom_headers: null, | ||||
|   proxies: null, | ||||
|   collect_media: false, | ||||
| }; | ||||
|  | ||||
| interface Props { | ||||
|   stateProps: StateProps; | ||||
| } | ||||
|  | ||||
| interface JobOptions { | ||||
|   multi_page_scrape: boolean; | ||||
|   custom_headers: null | string; | ||||
| } | ||||
|  | ||||
| export const JobSubmitter = ({ stateProps }: Props) => { | ||||
| export const JobSubmitter = () => { | ||||
|   const { user } = useAuth(); | ||||
|   const router = useRouter(); | ||||
|  | ||||
|   const { job_options } = router.query; | ||||
|  | ||||
|   const { | ||||
|     submittedURL, | ||||
|     setSubmittedURL, | ||||
|     rows, | ||||
|     isValidURL, | ||||
|     siteMap, | ||||
|     setIsValidUrl, | ||||
|     setSnackbarMessage, | ||||
|     setSnackbarOpen, | ||||
|     setSnackbarSeverity, | ||||
|   } = stateProps; | ||||
|     setSiteMap, | ||||
|   } = useJobSubmitterProvider(); | ||||
|  | ||||
|   const [urlError, setUrlError] = useState<string | null>(null); | ||||
|   const [loading, setLoading] = useState<boolean>(false); | ||||
|   const [jobOptions, setJobOptions] = useState<JobOptions>({ | ||||
|     multi_page_scrape: false, | ||||
|     custom_headers: null, | ||||
|   }); | ||||
|   const [jobOptions, setJobOptions] = | ||||
|     useState<RawJobOptions>(initialJobOptions); | ||||
|   const [customJSONSelected, setCustomJSONSelected] = useState<boolean>(false); | ||||
|   const [proxiesSelected, setProxiesSelected] = useState<boolean>(false); | ||||
|  | ||||
|   function validateURL(url: string): boolean { | ||||
|     try { | ||||
|       new URL(url); | ||||
|       return true; | ||||
|     } catch (_) { | ||||
|       return false; | ||||
|     } | ||||
|   } | ||||
|   const handleSelectProxies = () => { | ||||
|     setProxiesSelected(!proxiesSelected); | ||||
|   }; | ||||
|  | ||||
|   const handleSubmit = () => { | ||||
|   const handleSubmit = async () => { | ||||
|     if (!validateURL(submittedURL)) { | ||||
|       setIsValidUrl(false); | ||||
|       setUrlError("Please enter a valid URL."); | ||||
| @@ -76,6 +57,7 @@ export const JobSubmitter = ({ stateProps }: Props) => { | ||||
|     setLoading(true); | ||||
|  | ||||
|     let customHeaders; | ||||
|  | ||||
|     try { | ||||
|       customHeaders = jobOptions.custom_headers | ||||
|         ? JSON.parse(jobOptions.custom_headers) | ||||
| @@ -88,21 +70,15 @@ export const JobSubmitter = ({ stateProps }: Props) => { | ||||
|       return; | ||||
|     } | ||||
|  | ||||
|     fetch(`${Constants.DOMAIN}/api/submit-scrape-job`, { | ||||
|       method: "POST", | ||||
|       headers: { "content-type": "application/json" }, | ||||
|       body: JSON.stringify({ | ||||
|         url: submittedURL, | ||||
|         elements: rows, | ||||
|         user: user?.email, | ||||
|         time_created: new Date().toISOString(), | ||||
|         job_options: { | ||||
|           ...jobOptions, | ||||
|           custom_headers: customHeaders, | ||||
|         }, | ||||
|       }), | ||||
|     }) | ||||
|       .then((response) => { | ||||
|     await ApiService.submitJob( | ||||
|       submittedURL, | ||||
|       rows, | ||||
|       user, | ||||
|       jobOptions, | ||||
|       customHeaders, | ||||
|       siteMap | ||||
|     ) | ||||
|       .then(async (response) => { | ||||
|         if (!response.ok) { | ||||
|           return response.json().then((error) => { | ||||
|             throw new Error(error.error); | ||||
| @@ -111,7 +87,10 @@ export const JobSubmitter = ({ stateProps }: Props) => { | ||||
|         return response.json(); | ||||
|       }) | ||||
|       .then((data) => { | ||||
|         setSnackbarMessage(data || "Job submitted successfully."); | ||||
|         setSnackbarMessage( | ||||
|           data?.id | ||||
|             ? `Job: ${data.id} submitted successfully.` | ||||
|             : "Job submitted successfully." | ||||
|         ); | ||||
|         setSnackbarSeverity("info"); | ||||
|         setSnackbarOpen(true); | ||||
|       }) | ||||
| @@ -123,47 +102,35 @@ export const JobSubmitter = ({ stateProps }: Props) => { | ||||
|       .finally(() => setLoading(false)); | ||||
|   }; | ||||
|  | ||||
|   // Parse the job options from the query string | ||||
|   useEffect(() => { | ||||
|     if (job_options) { | ||||
|       const jsonOptions = JSON.parse(job_options as string); | ||||
|       const newJobOptions: JobOptions = { | ||||
|         multi_page_scrape: false, | ||||
|         custom_headers: null, | ||||
|       }; | ||||
|  | ||||
|       if ( | ||||
|         jsonOptions.custom_headers && | ||||
|         Object.keys(jsonOptions.custom_headers).length | ||||
|       ) { | ||||
|         setCustomJSONSelected(true); | ||||
|         newJobOptions.custom_headers = JSON.stringify( | ||||
|           jsonOptions.custom_headers | ||||
|         ); | ||||
|       } | ||||
|  | ||||
|       newJobOptions.multi_page_scrape = jsonOptions.multi_page_scrape; | ||||
|       setJobOptions(newJobOptions); | ||||
|       parseJobOptions( | ||||
|         job_options as string, | ||||
|         setCustomJSONSelected, | ||||
|         setProxiesSelected, | ||||
|         setJobOptions, | ||||
|         setSiteMap | ||||
|       ); | ||||
|     } | ||||
|   }, [job_options]); | ||||
|  | ||||
|   return ( | ||||
|     <> | ||||
|       <div> | ||||
|         <JobSubmitterHeader /> | ||||
|         <JobSubmitterInput | ||||
|           {...stateProps} | ||||
|           urlError={urlError} | ||||
|           handleSubmit={handleSubmit} | ||||
|           loading={loading} | ||||
|         /> | ||||
|         <JobSubmitterOptions | ||||
|           {...stateProps} | ||||
|           jobOptions={jobOptions} | ||||
|           setJobOptions={setJobOptions} | ||||
|           customJSONSelected={customJSONSelected} | ||||
|           setCustomJSONSelected={setCustomJSONSelected} | ||||
|         /> | ||||
|       </div> | ||||
|     </> | ||||
|     <div> | ||||
|       <JobSubmitterHeader /> | ||||
|       <JobSubmitterInput | ||||
|         urlError={urlError} | ||||
|         handleSubmit={handleSubmit} | ||||
|         loading={loading} | ||||
|       /> | ||||
|       <JobSubmitterOptions | ||||
|         jobOptions={jobOptions} | ||||
|         setJobOptions={setJobOptions} | ||||
|         customJSONSelected={customJSONSelected} | ||||
|         setCustomJSONSelected={setCustomJSONSelected} | ||||
|         handleSelectProxies={handleSelectProxies} | ||||
|         proxiesSelected={proxiesSelected} | ||||
|       /> | ||||
|     </div> | ||||
|   ); | ||||
| }; | ||||
|   | ||||
src/components/submit/job-submitter/provider.tsx (new file, 84 lines)
| @@ -0,0 +1,84 @@ | ||||
| import React, { | ||||
|   createContext, | ||||
|   PropsWithChildren, | ||||
|   useContext, | ||||
|   useState, | ||||
|   Dispatch, | ||||
|   useMemo, | ||||
| } from "react"; | ||||
| import { Element, Result, SiteMap } from "@/types"; | ||||
|  | ||||
| type JobSubmitterProviderType = { | ||||
|   submittedURL: string; | ||||
|   setSubmittedURL: Dispatch<React.SetStateAction<string>>; | ||||
|   rows: Element[]; | ||||
|   setRows: Dispatch<React.SetStateAction<Element[]>>; | ||||
|   results: Result; | ||||
|   setResults: Dispatch<React.SetStateAction<Result>>; | ||||
|   snackbarOpen: boolean; | ||||
|   setSnackbarOpen: Dispatch<React.SetStateAction<boolean>>; | ||||
|   snackbarMessage: string; | ||||
|   setSnackbarMessage: Dispatch<React.SetStateAction<string>>; | ||||
|   snackbarSeverity: string; | ||||
|   setSnackbarSeverity: Dispatch<React.SetStateAction<string>>; | ||||
|   isValidURL: boolean; | ||||
|   setIsValidUrl: Dispatch<React.SetStateAction<boolean>>; | ||||
|   siteMap: SiteMap | null; | ||||
|   setSiteMap: Dispatch<React.SetStateAction<SiteMap | null>>; | ||||
| }; | ||||
|  | ||||
| const JobSubmitterProvider = createContext<JobSubmitterProviderType>( | ||||
|   {} as JobSubmitterProviderType | ||||
| ); | ||||
|  | ||||
| export const Provider = ({ children }: PropsWithChildren) => { | ||||
|   const [submittedURL, setSubmittedURL] = useState<string>(""); | ||||
|   const [rows, setRows] = useState<Element[]>([]); | ||||
|   const [results, setResults] = useState<Result>({}); | ||||
|   const [snackbarOpen, setSnackbarOpen] = useState<boolean>(false); | ||||
|   const [snackbarMessage, setSnackbarMessage] = useState<string>(""); | ||||
|   const [snackbarSeverity, setSnackbarSeverity] = useState<string>("error"); | ||||
|   const [isValidURL, setIsValidUrl] = useState<boolean>(true); | ||||
|   const [siteMap, setSiteMap] = useState<SiteMap | null>(null); | ||||
|  | ||||
|   const value: JobSubmitterProviderType = useMemo( | ||||
|     () => ({ | ||||
|       submittedURL, | ||||
|       setSubmittedURL, | ||||
|       rows, | ||||
|       setRows, | ||||
|       results, | ||||
|       setResults, | ||||
|       snackbarOpen, | ||||
|       setSnackbarOpen, | ||||
|       snackbarMessage, | ||||
|       setSnackbarMessage, | ||||
|       snackbarSeverity, | ||||
|       setSnackbarSeverity, | ||||
|       isValidURL, | ||||
|       setIsValidUrl, | ||||
|       siteMap, | ||||
|       setSiteMap, | ||||
|     }), | ||||
|     [ | ||||
|       submittedURL, | ||||
|       rows, | ||||
|       results, | ||||
|       snackbarOpen, | ||||
|       snackbarMessage, | ||||
|       snackbarSeverity, | ||||
|       isValidURL, | ||||
|       siteMap, | ||||
|     ] | ||||
|   ); | ||||
|  | ||||
|   return ( | ||||
|     <JobSubmitterProvider.Provider value={value}> | ||||
|       {children} | ||||
|     </JobSubmitterProvider.Provider> | ||||
|   ); | ||||
| }; | ||||
|  | ||||
| export const useJobSubmitterProvider = () => { | ||||
|   return useContext(JobSubmitterProvider); | ||||
| }; | ||||
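Because the context default is an empty object cast to the provider type, calling useJobSubmitterProvider outside a <Provider> fails silently with undefined fields rather than throwing, so consumers need the wrapper. A usage sketch (the exact mount point in the app is assumed):

// Sketch: wrapping the Home page so JobSubmitter, ElementTable, and SiteMap share state.
import { Provider } from "@/components/submit/job-submitter/provider";
import { Home } from "@/components/pages/home";

export default function HomePage() {
  return (
    <Provider>
      <Home />
    </Provider>
  );
}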
src/components/submit/job-submitter/site-map/index.ts (new file, 1 line)
| @@ -0,0 +1 @@ | ||||
| export * from "./site-map"; | ||||
| @@ -0,0 +1 @@ | ||||
| export * from "./site-map-input"; | ||||
| @@ -0,0 +1,22 @@ | ||||
| .button { | ||||
|   height: 3rem; | ||||
|   width: 2rem; | ||||
|  | ||||
|   color: #ffffff; | ||||
|   font-weight: 600; | ||||
|   border-radius: 0.375rem; | ||||
|   transition: transform 0.2s ease-in-out; | ||||
|   transform: scale(1); | ||||
| } | ||||
|  | ||||
| .button:hover { | ||||
|   transform: scale(1.05); | ||||
| } | ||||
|  | ||||
| .remove { | ||||
|   background-color: var(--delete-red) !important; | ||||
| } | ||||
|  | ||||
| .remove:hover { | ||||
|   background-color: var(--delete-red-hover) !important; | ||||
| } | ||||
| @@ -0,0 +1,135 @@ | ||||
| import { useState } from "react"; | ||||
| import { useJobSubmitterProvider } from "../../provider"; | ||||
| import { | ||||
|   MenuItem, | ||||
|   Select, | ||||
|   TextField, | ||||
|   FormControl, | ||||
|   Button, | ||||
|   Checkbox, | ||||
|   FormControlLabel, | ||||
| } from "@mui/material"; | ||||
| import { ActionOption } from "@/types/job"; | ||||
| import classes from "./site-map-input.module.css"; | ||||
| import { clsx } from "clsx"; | ||||
|  | ||||
| export type SiteMapInputProps = { | ||||
|   disabled?: boolean; | ||||
|   xpath?: string; | ||||
|   option?: ActionOption; | ||||
|   clickOnce?: boolean; | ||||
|   input?: string; | ||||
| }; | ||||
|  | ||||
| export const SiteMapInput = ({ | ||||
|   disabled, | ||||
|   xpath, | ||||
|   option, | ||||
|   clickOnce, | ||||
|   input, | ||||
| }: SiteMapInputProps) => { | ||||
|   const [optionState, setOptionState] = useState<ActionOption>( | ||||
|     option || "click" | ||||
|   ); | ||||
|   const [xpathState, setXpathState] = useState<string>(xpath || ""); | ||||
|   const [clickOnceState, setClickOnceState] = useState<boolean>( | ||||
|     clickOnce || false | ||||
|   ); | ||||
|   const [inputState, setInputState] = useState<string>(input || ""); | ||||
|  | ||||
|   const { siteMap, setSiteMap } = useJobSubmitterProvider(); | ||||
|  | ||||
|   const handleAdd = () => { | ||||
|     if (!siteMap) return; | ||||
|  | ||||
|     setSiteMap((prevSiteMap) => ({ | ||||
|       ...prevSiteMap, | ||||
|       actions: [ | ||||
|         { | ||||
|           type: optionState, | ||||
|           xpath: xpathState, | ||||
|           name: "", | ||||
|           do_once: clickOnceState, | ||||
|           input: inputState, | ||||
|         }, | ||||
|         ...(prevSiteMap?.actions || []), | ||||
|       ], | ||||
|     })); | ||||
|  | ||||
|     setXpathState(""); | ||||
|   }; | ||||
|  | ||||
|   const handleRemove = () => { | ||||
|     if (!siteMap) return; | ||||
|  | ||||
|     setSiteMap((prevSiteMap) => ({ | ||||
|       ...prevSiteMap, | ||||
|       actions: (prevSiteMap?.actions || []).slice(0, -1), | ||||
|     })); | ||||
|   }; | ||||
|  | ||||
|   return ( | ||||
|     <div className="flex flex-col gap-2 w-full"> | ||||
|       <div className="flex gap-2 items-center"> | ||||
|         <FormControl className="w-1/4"> | ||||
|           <Select | ||||
|             disabled={disabled} | ||||
|             displayEmpty | ||||
|             value={optionState} | ||||
|             onChange={(e) => setOptionState(e.target.value as ActionOption)} | ||||
|           > | ||||
|             <MenuItem value="click">Click</MenuItem> | ||||
|             <MenuItem value="input">Input</MenuItem> | ||||
|           </Select> | ||||
|         </FormControl> | ||||
|         {optionState === "input" && ( | ||||
|           <TextField | ||||
|             label="Input Text" | ||||
|             fullWidth | ||||
|             value={inputState} | ||||
|             onChange={(e) => setInputState(e.target.value)} | ||||
|             disabled={disabled} | ||||
|           /> | ||||
|         )} | ||||
|         <TextField | ||||
|           label="XPath Selector" | ||||
|           fullWidth | ||||
|           value={xpathState} | ||||
|           onChange={(e) => setXpathState(e.target.value)} | ||||
|           disabled={disabled} | ||||
|         /> | ||||
|         {disabled ? ( | ||||
|           <Button | ||||
|             onClick={handleRemove} | ||||
|             className={clsx(classes.button, classes.remove)} | ||||
|           > | ||||
|             Delete | ||||
|           </Button> | ||||
|         ) : ( | ||||
|           <Button | ||||
|             onClick={handleAdd} | ||||
|             disabled={!xpathState} | ||||
|             className={clsx(classes.button, classes.add)} | ||||
|           > | ||||
|             Add | ||||
|           </Button> | ||||
|         )} | ||||
|       </div> | ||||
|       {!disabled && ( | ||||
|         <FormControlLabel | ||||
|           label="Do Once" | ||||
|           control={ | ||||
|             <Checkbox | ||||
|               checked={clickOnceState} | ||||
|               disabled={disabled} | ||||
|               onChange={() => setClickOnceState(!clickOnceState)} | ||||
|             /> | ||||
|           } | ||||
|         /> | ||||
|       )} | ||||
|     </div> | ||||
|   ); | ||||
| }; | ||||
src/components/submit/job-submitter/site-map/site-map.tsx (new file, 70 lines)
| @@ -0,0 +1,70 @@ | ||||
| import { useEffect, useState } from "react"; | ||||
| import { useJobSubmitterProvider } from "../provider"; | ||||
| import { Button, Divider, Typography, useTheme } from "@mui/material"; | ||||
| import { SiteMapInput } from "./site-map-input"; | ||||
|  | ||||
| export const SiteMap = () => { | ||||
|   const { siteMap, setSiteMap } = useJobSubmitterProvider(); | ||||
|   const [showSiteMap, setShowSiteMap] = useState<boolean>(false); | ||||
|   const theme = useTheme(); | ||||
|  | ||||
|   const handleCreateSiteMap = () => { | ||||
|     setSiteMap({ actions: [] }); | ||||
|     setShowSiteMap(true); | ||||
|   }; | ||||
|  | ||||
|   const handleClearSiteMap = () => { | ||||
|     setSiteMap(null); | ||||
|     setShowSiteMap(false); | ||||
|   }; | ||||
|  | ||||
|   useEffect(() => { | ||||
|     if (siteMap) { | ||||
|       setShowSiteMap(true); | ||||
|     } | ||||
|   }, [siteMap]); | ||||
|  | ||||
|   return ( | ||||
|     <div className="flex flex-col gap-4"> | ||||
|       {siteMap ? ( | ||||
|         <Button onClick={handleClearSiteMap}>Clear Site Map</Button> | ||||
|       ) : ( | ||||
|         <Button onClick={handleCreateSiteMap}>Create Site Map</Button> | ||||
|       )} | ||||
|       {showSiteMap && ( | ||||
|         <div className="flex flex-col gap-4"> | ||||
|           <SiteMapInput /> | ||||
|           {siteMap?.actions && siteMap?.actions.length > 0 && ( | ||||
|             <> | ||||
|               <Divider | ||||
|                 sx={{ | ||||
|                   borderColor: | ||||
|                     theme.palette.mode === "dark" ? "#ffffff" : "#000000", | ||||
|                 }} | ||||
|               /> | ||||
|               <Typography className="w-full text-center" variant="h5"> | ||||
|                 Site Map Actions | ||||
|               </Typography> | ||||
|             </> | ||||
|           )} | ||||
|           <ul className="flex flex-col gap-4"> | ||||
|             {[...(siteMap?.actions ?? [])].reverse().map((action, index) => ( | ||||
|               <li key={action.xpath} className="flex w-full items-center"> | ||||
|                 <Typography variant="h6" className="w-[10%] mr-2"> | ||||
|                   Action {index + 1}: | ||||
|                 </Typography> | ||||
|                 <SiteMapInput | ||||
|                   disabled={Boolean(siteMap)} | ||||
|                   xpath={action.xpath} | ||||
|                   option={action.type} | ||||
|                   clickOnce={action.do_once} | ||||
|                   input={action.input} | ||||
|                 /> | ||||
|               </li> | ||||
|             ))} | ||||
|           </ul> | ||||
|         </div> | ||||
|       )} | ||||
|     </div> | ||||
|   ); | ||||
| }; | ||||
| @@ -1,6 +1,5 @@ | ||||
| import React, { createContext, useContext, useState, useEffect } from "react"; | ||||
| import axios from "axios"; | ||||
| import { Constants } from "../lib"; | ||||
| import Cookies from "js-cookie"; | ||||
|  | ||||
| interface AuthContextProps { | ||||
| @@ -25,7 +24,7 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => { | ||||
|     const token = Cookies.get("token"); | ||||
|     if (token) { | ||||
|       axios | ||||
|         .get(`${Constants.DOMAIN}/api/auth/users/me`, { | ||||
|         .get(`/api/me`, { | ||||
|           headers: { Authorization: `Bearer ${token}` }, | ||||
|         }) | ||||
|         .then((response) => { | ||||
| @@ -42,10 +41,8 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => { | ||||
|     const params = new URLSearchParams(); | ||||
|     params.append("username", email); | ||||
|     params.append("password", password); | ||||
|     const response = await axios.post( | ||||
|       `${Constants.DOMAIN}/api/auth/token`, | ||||
|       params | ||||
|     ); | ||||
|     const response = await axios.post(`/api/token`, params); | ||||
|  | ||||
|     Cookies.set("token", response.data.access_token, { | ||||
|       expires: 7, | ||||
|       path: "/", | ||||
| @@ -53,12 +50,11 @@ export const AuthProvider: React.FC<AuthProps> = ({ children }) => { | ||||
|       secure: false, | ||||
|       sameSite: "Lax", | ||||
|     }); | ||||
|     const userResponse = await axios.get( | ||||
|       `${Constants.DOMAIN}/api/auth/users/me`, | ||||
|       { | ||||
|         headers: { Authorization: `Bearer ${response.data.access_token}` }, | ||||
|       } | ||||
|     ); | ||||
|  | ||||
|     const userResponse = await axios.get(`/api/me`, { | ||||
|       headers: { Authorization: `Bearer ${response.data.access_token}` }, | ||||
|     }); | ||||
|  | ||||
|     setUser(userResponse.data); | ||||
|     setIsAuthenticated(true); | ||||
|   }; | ||||
|   | ||||
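This hunk moves the auth calls off the backend origin (Constants.DOMAIN) onto same-origin Next.js routes (/api/me, /api/token), which avoids CORS and keeps the token on one origin. The route handlers themselves are not part of this diff; a minimal proxy of the assumed kind would be:

// pages/api/me.ts: hypothetical proxy sketch forwarding to the backend.
// Assumes NEXT_PUBLIC_API_URL points at the backend, as in get-server-side-props.ts above.
import type { NextApiRequest, NextApiResponse } from "next";

export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  const upstream = await fetch(
    `${process.env.NEXT_PUBLIC_API_URL}/api/auth/users/me`,
    { headers: { Authorization: req.headers.authorization ?? "" } }
  );
  res.status(upstream.status).json(await upstream.json());
}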
src/lib/helpers/index.ts (new file, 2 lines)
| @@ -0,0 +1,2 @@ | ||||
| export * from "./parse-job-options"; | ||||
| export * from "./validate-url"; | ||||
src/lib/helpers/parse-job-options.ts (new file, 42 lines)
| @@ -0,0 +1,42 @@ | ||||
| import { Dispatch, SetStateAction } from "react"; | ||||
|  | ||||
| import { RawJobOptions, SiteMap } from "@/types"; | ||||
|  | ||||
| export const parseJobOptions = ( | ||||
|   job_options: string, | ||||
|   setCustomJSONSelected: Dispatch<SetStateAction<boolean>>, | ||||
|   setProxiesSelected: Dispatch<SetStateAction<boolean>>, | ||||
|   setJobOptions: Dispatch<SetStateAction<RawJobOptions>>, | ||||
|   setSiteMap: Dispatch<SetStateAction<any>> | ||||
| ) => { | ||||
|   if (job_options) { | ||||
|     const jsonOptions = JSON.parse(job_options as string); | ||||
|     const newJobOptions: RawJobOptions = { | ||||
|       multi_page_scrape: false, | ||||
|       custom_headers: null, | ||||
|       proxies: null, | ||||
|       collect_media: false, | ||||
|     }; | ||||
|  | ||||
|     if ( | ||||
|       jsonOptions.custom_headers && | ||||
|       Object.keys(jsonOptions.custom_headers).length | ||||
|     ) { | ||||
|       setCustomJSONSelected(true); | ||||
|       newJobOptions.custom_headers = JSON.stringify(jsonOptions.custom_headers); | ||||
|     } | ||||
|  | ||||
|     newJobOptions.multi_page_scrape = jsonOptions.multi_page_scrape; | ||||
|  | ||||
|     if (jsonOptions.proxies && jsonOptions.proxies.length > 0) { | ||||
|       setProxiesSelected(true); | ||||
|       newJobOptions.proxies = jsonOptions.proxies.join(","); | ||||
|     } | ||||
|  | ||||
|     if (jsonOptions.site_map) { | ||||
|       setSiteMap(jsonOptions.site_map); | ||||
|     } | ||||
|  | ||||
|     setJobOptions(newJobOptions); | ||||
|   } | ||||
| }; | ||||
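For reference, parseJobOptions consumes the JSON that the jobs page serializes into the job_options query parameter. An illustrative input that exercises every branch (values are made up; key names match what the helper reads):

// Illustrative input only; keys mirror what parse-job-options.ts inspects.
const job_options = JSON.stringify({
  multi_page_scrape: true,
  custom_headers: { "User-Agent": "scraperr" }, // triggers setCustomJSONSelected(true)
  proxies: ["http://proxy-1:8080"],             // triggers setProxiesSelected(true), joined with ","
  site_map: { actions: [] },                    // handed to setSiteMap
});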
src/lib/helpers/validate-url.ts (new file, 8 lines)
| @@ -0,0 +1,8 @@ | ||||
| export function validateURL(url: string): boolean { | ||||
|   try { | ||||
|     new URL(url); | ||||
|     return true; | ||||
|   } catch (_) { | ||||
|     return false; | ||||
|   } | ||||
| } | ||||
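Note that new URL(...) only accepts absolute URLs, so a bare hostname fails this validation:

validateURL("https://example.com"); // true
validateURL("example.com");         // false: the URL constructor throws without a scheme
validateURL("not a url");           // false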
Some files were not shown because too many files have changed in this diff.