mirror of
https://github.com/jaypyles/Scraperr.git
synced 2025-12-10 17:55:45 +00:00
feat: convert to self-hosted version
This commit is contained in:
@@ -1,49 +0,0 @@
|
||||
# STL
|
||||
import os
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
# PDM
|
||||
import boto3
|
||||
from mypy_boto3_dynamodb.service_resource import Table, DynamoDBServiceResource
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def connect_to_dynamo() -> Table:
|
||||
region_name = os.getenv("AWS_REGION")
|
||||
dynamodb: DynamoDBServiceResource = boto3.resource(
|
||||
"dynamodb", region_name=region_name
|
||||
)
|
||||
return dynamodb.Table("scrape")
|
||||
|
||||
|
||||
def insert(table: Table, item: dict[str, Any]) -> None:
|
||||
i = table.put_item(Item=item)
|
||||
LOG.info(f"Inserted item: {i}")
|
||||
|
||||
|
||||
def query(table: Table, index_name: str, key_condition: Any) -> list[Any]:
|
||||
try:
|
||||
response = table.query(
|
||||
IndexName=index_name, KeyConditionExpression=key_condition
|
||||
)
|
||||
items = response.get("Items", [])
|
||||
for item in items:
|
||||
LOG.info(f"Queried item: {item}")
|
||||
return items
|
||||
except Exception as e:
|
||||
LOG.error(f"Failed to query table: {e}")
|
||||
raise
|
||||
|
||||
|
||||
def query_by_id(table: Table, key_condition: Any) -> list[Any]:
|
||||
try:
|
||||
response = table.query(KeyConditionExpression=key_condition)
|
||||
items = response.get("Items", [])
|
||||
for item in items:
|
||||
LOG.info(f"Queried item: {item}")
|
||||
return items
|
||||
except Exception as e:
|
||||
LOG.error(f"Failed to query table: {e}")
|
||||
raise
|
||||
@@ -5,27 +5,28 @@ from io import StringIO
|
||||
|
||||
# PDM
|
||||
import pandas as pd
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from fastapi import FastAPI
|
||||
from fastapi.encoders import jsonable_encoder
|
||||
from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from boto3.dynamodb.conditions import Key
|
||||
|
||||
# LOCAL
|
||||
from api.backend.amazon import query, insert, query_by_id, connect_to_dynamo
|
||||
from api.backend.job import query, insert
|
||||
from api.backend.models import DownloadJob, SubmitScrapeJob, RetrieveScrapeJobs
|
||||
from api.backend.scraping import scrape
|
||||
from api.backend.auth.auth_router import auth_router
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="[%(levelname)s] %(asctime)s - %(name)s - %(message)s",
|
||||
format="%(levelname)s: %(asctime)s - %(name)s - %(message)s",
|
||||
handlers=[logging.StreamHandler()],
|
||||
)
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
app = FastAPI(title="api")
|
||||
app.include_router(auth_router)
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
@@ -54,10 +55,9 @@ async def submit_scrape_job(job: SubmitScrapeJob):
|
||||
)
|
||||
|
||||
json_scraped = jsonable_encoder(scraped)
|
||||
table = connect_to_dynamo()
|
||||
job.result = json_scraped
|
||||
job.id = uuid.uuid4().hex
|
||||
insert(table, jsonable_encoder(job))
|
||||
await insert(jsonable_encoder(job))
|
||||
return JSONResponse(content=json_scraped)
|
||||
except Exception as e:
|
||||
return JSONResponse(content={"error": str(e)}, status_code=500)
|
||||
@@ -67,8 +67,7 @@ async def submit_scrape_job(job: SubmitScrapeJob):
|
||||
async def retrieve_scrape_jobs(retrieve: RetrieveScrapeJobs):
|
||||
LOG.info(f"Retrieving jobs for account: {retrieve.user}")
|
||||
try:
|
||||
table = connect_to_dynamo()
|
||||
results = query(table, "user", Key("user").eq(retrieve.user))
|
||||
results = await query({"user": retrieve.user})
|
||||
return JSONResponse(content=results)
|
||||
except Exception as e:
|
||||
LOG.error(f"Exception occurred: {e}")
|
||||
@@ -79,8 +78,7 @@ async def retrieve_scrape_jobs(retrieve: RetrieveScrapeJobs):
|
||||
async def download(download_job: DownloadJob):
|
||||
LOG.info(f"Downloading job with id: {download_job.id}")
|
||||
try:
|
||||
table = connect_to_dynamo()
|
||||
results = query_by_id(table, Key("id").eq(download_job.id))
|
||||
results = await query({"id": download_job.id})
|
||||
|
||||
df = pd.DataFrame(results)
|
||||
|
||||
|
||||
0
api/backend/auth/__init__.py
Normal file
0
api/backend/auth/__init__.py
Normal file
57
api/backend/auth/auth_router.py
Normal file
57
api/backend/auth/auth_router.py
Normal file
@@ -0,0 +1,57 @@
|
||||
# STL
|
||||
from datetime import timedelta
|
||||
|
||||
# PDM
|
||||
from fastapi import Depends, APIRouter, HTTPException, status
|
||||
from fastapi.security import OAuth2PasswordRequestForm
|
||||
|
||||
# LOCAL
|
||||
from api.backend.schemas import User, Token, UserCreate
|
||||
from api.backend.database import get_user_collection
|
||||
from api.backend.auth.auth_utils import (
|
||||
ACCESS_TOKEN_EXPIRE_MINUTES,
|
||||
get_current_user,
|
||||
authenticate_user,
|
||||
get_password_hash,
|
||||
create_access_token,
|
||||
)
|
||||
|
||||
auth_router = APIRouter()
|
||||
|
||||
|
||||
@auth_router.post("/api/auth/token", response_model=Token)
|
||||
async def login_for_access_token(form_data: OAuth2PasswordRequestForm = Depends()):
|
||||
user = await authenticate_user(form_data.username, form_data.password)
|
||||
if not user:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_401_UNAUTHORIZED,
|
||||
detail="Incorrect username or password",
|
||||
headers={"WWW-Authenticate": "Bearer"},
|
||||
)
|
||||
|
||||
expire_minutes = (
|
||||
int(ACCESS_TOKEN_EXPIRE_MINUTES) if ACCESS_TOKEN_EXPIRE_MINUTES else 60
|
||||
)
|
||||
|
||||
access_token_expires = timedelta(minutes=expire_minutes)
|
||||
access_token = create_access_token(
|
||||
data={"sub": user.email}, expires_delta=access_token_expires
|
||||
)
|
||||
|
||||
return {"access_token": access_token, "token_type": "bearer"}
|
||||
|
||||
|
||||
@auth_router.post("/api/auth/signup", response_model=User)
|
||||
async def create_user(user: UserCreate):
|
||||
users_collection = get_user_collection()
|
||||
hashed_password = get_password_hash(user.password)
|
||||
user_dict = user.model_dump()
|
||||
user_dict["hashed_password"] = hashed_password
|
||||
del user_dict["password"]
|
||||
_ = await users_collection.insert_one(user_dict)
|
||||
return user_dict
|
||||
|
||||
|
||||
@auth_router.get("/api/auth/users/me", response_model=User)
|
||||
async def read_users_me(current_user: User = Depends(get_current_user)):
|
||||
return current_user
|
||||
101
api/backend/auth/auth_utils.py
Normal file
101
api/backend/auth/auth_utils.py
Normal file
@@ -0,0 +1,101 @@
|
||||
# STL
|
||||
import os
|
||||
from typing import Any, Optional
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
# PDM
|
||||
from jose import JWTError, jwt
|
||||
from dotenv import load_dotenv
|
||||
from fastapi import Depends, HTTPException, status
|
||||
from passlib.context import CryptContext
|
||||
from fastapi.security import OAuth2PasswordBearer
|
||||
|
||||
# LOCAL
|
||||
from api.backend.schemas import User, UserInDB, TokenData
|
||||
from api.backend.database import get_user_collection
|
||||
|
||||
_ = load_dotenv()
|
||||
|
||||
SECRET_KEY = os.getenv("SECRET_KEY") or ""
|
||||
ALGORITHM = os.getenv("ALGORITHM") or ""
|
||||
ACCESS_TOKEN_EXPIRE_MINUTES = os.getenv("ACCESS_TOKEN_EXPIRE_MINUTES")
|
||||
|
||||
pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
|
||||
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="auth/token")
|
||||
|
||||
|
||||
def verify_password(plain_password: str, hashed_password: str):
|
||||
return pwd_context.verify(plain_password, hashed_password)
|
||||
|
||||
|
||||
def get_password_hash(password: str):
|
||||
return pwd_context.hash(password)
|
||||
|
||||
|
||||
async def get_user(email: str):
|
||||
user_collection = get_user_collection()
|
||||
user = await user_collection.find_one({"email": email})
|
||||
|
||||
if not user:
|
||||
return
|
||||
|
||||
return UserInDB(**user)
|
||||
|
||||
|
||||
async def authenticate_user(email: str, password: str):
|
||||
user = await get_user(email)
|
||||
|
||||
if not user:
|
||||
return False
|
||||
|
||||
if not verify_password(password, user.hashed_password):
|
||||
return False
|
||||
|
||||
return user
|
||||
|
||||
|
||||
def create_access_token(
|
||||
data: dict[str, Any], expires_delta: Optional[timedelta] = None
|
||||
):
|
||||
to_encode = data.copy()
|
||||
|
||||
if expires_delta:
|
||||
expire = datetime.now() + expires_delta
|
||||
else:
|
||||
expire = datetime.now() + timedelta(minutes=15)
|
||||
to_encode.update({"exp": expire})
|
||||
|
||||
encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
|
||||
return encoded_jwt
|
||||
|
||||
|
||||
async def get_current_user(token: str = Depends(oauth2_scheme)):
|
||||
credentials_exception = HTTPException(
|
||||
status_code=status.HTTP_401_UNAUTHORIZED,
|
||||
detail="Could not validate credentials",
|
||||
headers={"WWW-Authenticate": "Bearer"},
|
||||
)
|
||||
|
||||
try:
|
||||
payload: Optional[dict[str, Any]] = jwt.decode(
|
||||
token, SECRET_KEY, algorithms=[ALGORITHM]
|
||||
)
|
||||
if not payload:
|
||||
raise credentials_exception
|
||||
|
||||
email = payload.get("sub")
|
||||
|
||||
if email is None:
|
||||
raise credentials_exception
|
||||
|
||||
token_data = TokenData(email=email)
|
||||
|
||||
except JWTError:
|
||||
raise credentials_exception
|
||||
|
||||
user = await get_user(email=token_data.email)
|
||||
|
||||
if user is None:
|
||||
raise credentials_exception
|
||||
|
||||
return user
|
||||
23
api/backend/database.py
Normal file
23
api/backend/database.py
Normal file
@@ -0,0 +1,23 @@
|
||||
# STL
|
||||
import os
|
||||
from typing import Any
|
||||
|
||||
# PDM
|
||||
from dotenv import load_dotenv
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
|
||||
_ = load_dotenv()
|
||||
|
||||
MONGODB_URI = os.getenv("MONGODB_URI")
|
||||
|
||||
|
||||
def get_user_collection():
|
||||
client: AsyncIOMotorClient[dict[str, Any]] = AsyncIOMotorClient(MONGODB_URI)
|
||||
db = client["scrape"]
|
||||
return db["users"]
|
||||
|
||||
|
||||
def get_job_collection():
|
||||
client: AsyncIOMotorClient[dict[str, Any]] = AsyncIOMotorClient(MONGODB_URI)
|
||||
db = client["scrape"]
|
||||
return db["jobs"]
|
||||
26
api/backend/job.py
Normal file
26
api/backend/job.py
Normal file
@@ -0,0 +1,26 @@
|
||||
# STL
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
# LOCAL
|
||||
from api.backend.database import get_job_collection
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def insert(item: dict[str, Any]) -> None:
|
||||
collection = get_job_collection()
|
||||
i = await collection.insert_one(item)
|
||||
LOG.info(f"Inserted item: {i}")
|
||||
|
||||
|
||||
async def query(filter: dict[str, Any]) -> list[dict[str, Any]]:
|
||||
collection = get_job_collection()
|
||||
cursor = collection.find(filter)
|
||||
results: list[dict[str, Any]] = []
|
||||
|
||||
async for document in cursor:
|
||||
del document["_id"]
|
||||
results.append(document)
|
||||
|
||||
return results
|
||||
30
api/backend/schemas.py
Normal file
30
api/backend/schemas.py
Normal file
@@ -0,0 +1,30 @@
|
||||
# STL
|
||||
from typing import Optional
|
||||
|
||||
# PDM
|
||||
from pydantic import EmailStr, BaseModel
|
||||
|
||||
|
||||
class Token(BaseModel):
|
||||
access_token: str
|
||||
token_type: str
|
||||
|
||||
|
||||
class TokenData(BaseModel):
|
||||
email: Optional[str] = None
|
||||
|
||||
|
||||
class User(BaseModel):
|
||||
email: EmailStr
|
||||
full_name: Optional[str] = None
|
||||
disabled: Optional[bool] = None
|
||||
|
||||
|
||||
class UserInDB(User):
|
||||
hashed_password: str
|
||||
|
||||
|
||||
class UserCreate(BaseModel):
|
||||
email: EmailStr
|
||||
password: str
|
||||
full_name: Optional[str] = None
|
||||
Reference in New Issue
Block a user