"""Document ingestion routes: upload PDF and track job status.""" from fastapi import APIRouter, UploadFile, File, HTTPException, status, Depends, BackgroundTasks from fastapi import Query from typing import Optional from core.security import get_current_user from domain.models import UploadJobResponse, JobStatusResponse, ErrorResponse from services.task_service import task_registry from services import ingestion_service from config.settings import settings router = APIRouter(prefix="/projects/{id_project}/documents", tags=["Documents"]) def _validate_pdf(file: UploadFile) -> None: filename = file.filename or "" content_type = (file.content_type or "").lower() if not filename.lower().endswith(".pdf") and content_type != "application/pdf": raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail="Only PDF files are supported at the moment." ) @router.post( "", response_model=UploadJobResponse, responses={ 400: {"model": ErrorResponse}, 401: {"model": ErrorResponse}, 500: {"model": ErrorResponse}, 202: {"description": "Accepted: returns job id while processing continues in background"}, } ) async def upload_project_document( id_project: str, background_tasks: BackgroundTasks, current_user: dict = Depends(get_current_user), file: UploadFile = File(..., description="PDF document to ingest"), index_name: str = Query("projects", description="Logical index (settings.vector_indexes key)"), ) -> UploadJobResponse: """ Start background ingestion for a project PDF (Mistral OCR -> chunk -> embed -> index). Returns a `job_id` for status polling. """ _validate_pdf(file) # Size validation file.file.seek(0, 2) file_size = file.file.tell() file.file.seek(0) max_bytes = settings.max_upload_mb_pdf * 1024 * 1024 if file_size > max_bytes: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail=f"File too large. Max {settings.max_upload_mb_pdf} MB." ) content = await file.read() uploaded_by: Optional[str] = ( current_user.get("sub") or current_user.get("user_id") or current_user.get("email") ) # Create job and enqueue background task job = task_registry.create_job(project_id=id_project, filename=file.filename, uploaded_by=uploaded_by) background_tasks.add_task( ingestion_service.run_job, job.job_id, id_project, file.filename, content, uploaded_by, index_name ) return UploadJobResponse(job_id=job.job_id, status=job.status, created_at=job.created_at) @router.get( "/jobs/{job_id}", response_model=JobStatusResponse, responses={ 404: {"model": ErrorResponse}, 401: {"model": ErrorResponse}, } ) async def get_job_status( id_project: str, job_id: str, current_user: dict = Depends(get_current_user) ) -> JobStatusResponse: """ Get the current status/progress of a background ingestion job. """ snapshot = task_registry.status_snapshot(job_id) if not snapshot: raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found") # Optional: ensure job belongs to the requested project job = task_registry.get(job_id) if job and job.project_id != id_project: raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found for project") return JobStatusResponse(**snapshot)