Spaces:

ChambreAgriculturePaysLoire
/

routeur_ia_api

Running

App Files Files Community

Cyril Dupland committed on May 5

Commit

c0cbc50

1 Parent(s): 33e0e5a

Add support for .docx file ingestion: Update requirements to include python-docx, modify document type detection to support .docx, and implement text extraction and indexing for Word documents in the ingestion service.

Browse files

Files changed (3) hide show

api/routes/documents.py +16 -4
requirements.txt +3 -0
services/ingestion_service.py +94 -1

api/routes/documents.py CHANGED Viewed

@@ -15,7 +15,13 @@ from config.settings import settings
 router = APIRouter(prefix="/projects/{id_project}/documents", tags=["Documents"])
-SupportedDocumentType = Literal["pdf", "txt"]
 def _detect_document_type(file: UploadFile) -> SupportedDocumentType:
@@ -25,6 +31,7 @@ def _detect_document_type(file: UploadFile) -> SupportedDocumentType:
     Currently supports:
     - PDF: .pdf extension or application/pdf
     - TXT: .txt extension or text/plain
     """
     filename = (file.filename or "").lower()
     content_type = (file.content_type or "").lower()
@@ -33,13 +40,17 @@ def _detect_document_type(file: UploadFile) -> SupportedDocumentType:
     if is_pdf:
         return "pdf"
     is_txt = filename.endswith(".txt") or content_type == "text/plain"
     if is_txt:
         return "txt"
     raise HTTPException(
         status_code=status.HTTP_400_BAD_REQUEST,
-        detail="Only PDF and plain text (.txt) files are supported at the moment.",
     )
@@ -58,7 +69,7 @@ async def upload_project_document(
     background_tasks: BackgroundTasks,
     current_user: dict = Depends(get_current_user),
     id_document: str = Form(..., description="UUID du document (obligatoire)"),
-    file: UploadFile = File(..., description="Project document to ingest (PDF or plain text)"),
     index_name: str = Query("projects", description="Logical index (settings.vector_indexes key)"),
 ) -> UploadJobResponse:
     """
@@ -66,10 +77,11 @@ async def upload_project_document(
     Body fields (multipart/form-data):
     - id_document (UUID, required): stable identifier of the logical document
-    - file (PDF or .txt): binary content
     Supported formats:
     - PDF: processed via Mistral OCR -> chunk -> embed -> index
     - Plain text (.txt): direct text -> chunk -> embed -> index (no OCR)
     Returns a `job_id` for status polling.

 router = APIRouter(prefix="/projects/{id_project}/documents", tags=["Documents"])
+SupportedDocumentType = Literal["pdf", "txt", "docx"]
+_DOCX_CONTENT_TYPES = {
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    "application/msword",
+}
 def _detect_document_type(file: UploadFile) -> SupportedDocumentType:
     Currently supports:
     - PDF: .pdf extension or application/pdf
     - TXT: .txt extension or text/plain
+    - DOCX: .docx extension or Word OpenXML content type
     """
     filename = (file.filename or "").lower()
     content_type = (file.content_type or "").lower()
     if is_pdf:
         return "pdf"
+    is_docx = filename.endswith(".docx") or content_type in _DOCX_CONTENT_TYPES
+    if is_docx:
+        return "docx"
     is_txt = filename.endswith(".txt") or content_type == "text/plain"
     if is_txt:
         return "txt"
     raise HTTPException(
         status_code=status.HTTP_400_BAD_REQUEST,
+        detail="Only PDF, Word (.docx) and plain text (.txt) files are supported at the moment.",
     )
     background_tasks: BackgroundTasks,
     current_user: dict = Depends(get_current_user),
     id_document: str = Form(..., description="UUID du document (obligatoire)"),
+    file: UploadFile = File(..., description="Project document to ingest (PDF, .docx or plain text)"),
     index_name: str = Query("projects", description="Logical index (settings.vector_indexes key)"),
 ) -> UploadJobResponse:
     """
     Body fields (multipart/form-data):
     - id_document (UUID, required): stable identifier of the logical document
+    - file (PDF, .docx or .txt): binary content
     Supported formats:
     - PDF: processed via Mistral OCR -> chunk -> embed -> index
+    - DOCX (Word): text extraction -> chunk -> embed -> index (no OCR)
     - Plain text (.txt): direct text -> chunk -> embed -> index (no OCR)
     Returns a `job_id` for status polling.

requirements.txt CHANGED Viewed

@@ -51,6 +51,9 @@ reportlab>=4.4.4,<5
 markdown>=3.7,<4
 markdown-pdf>=1.13.0
 # Ecologits Carbon Footprint
 ecologits>=0.8.2,<1

 markdown>=3.7,<4
 markdown-pdf>=1.13.0
+# Ingestion .docx (Microsoft Word)
+python-docx>=1.2.0,<2
 # Ecologits Carbon Footprint
 ecologits>=0.8.2,<1

services/ingestion_service.py CHANGED Viewed

@@ -5,6 +5,7 @@ through the shared TaskRegistry.
 """
 from __future__ import annotations
 from typing import List, Literal
 from langchain_core.documents import Document
@@ -38,7 +39,41 @@ def _chunk_text(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
     return chunks
-DocumentFileType = Literal["pdf", "txt"]
 def _run_pdf_pipeline(
@@ -165,6 +200,62 @@ def _run_text_pipeline(
     task_registry.set_done(job_id, inserted_count=inserted_count)
 def run_job(
     job_id: str,
     project_id: str,
@@ -179,6 +270,8 @@ def run_job(
     try:
         if file_type == "txt":
             _run_text_pipeline(job_id, project_id, filename, content_bytes, uploaded_by, index_name, document_id)
         else:
             _run_pdf_pipeline(job_id, project_id, filename, content_bytes, uploaded_by, index_name, document_id)
     except Exception as exc:

 """
 from __future__ import annotations
+import io
 from typing import List, Literal
 from langchain_core.documents import Document
     return chunks
+DocumentFileType = Literal["pdf", "txt", "docx"]
+def _extract_text_from_docx(content_bytes: bytes) -> str:
+    """Extract textual content from a .docx (Word OpenXML) byte stream.
+    Concatenates paragraphs and table cells in document order, separated
+    by line breaks. Headers/footers are intentionally skipped to keep
+    chunks aligned with the visible body content.
+    """
+    try:
+        from docx import Document as _DocxDocument  # type: ignore[import-not-found]
+    except ImportError as exc:
+        raise RuntimeError(
+            "python-docx is required to ingest .docx files. "
+            "Install it with `pip install python-docx`."
+        ) from exc
+    document = _DocxDocument(io.BytesIO(content_bytes))
+    parts: List[str] = []
+    for paragraph in document.paragraphs:
+        text = (paragraph.text or "").strip()
+        if text:
+            parts.append(text)
+    for table in document.tables:
+        for row in table.rows:
+            row_cells = [(cell.text or "").strip() for cell in row.cells]
+            row_text = " | ".join(cell for cell in row_cells if cell)
+            if row_text:
+                parts.append(row_text)
+    return "\n".join(parts)
 def _run_pdf_pipeline(
     task_registry.set_done(job_id, inserted_count=inserted_count)
def _run_docx_pipeline(
    job_id: str,
    project_id: str,
    filename: str,
    content_bytes: bytes,
    uploaded_by: str | None,
    index_name: str | None,
    document_id: str | None,
) -> None:
    """Ingest a Word document: extract its text, chunk it, then index it.

    Stage changes and chunk progress are reported through the task registry
    under ``job_id``. The function returns without doing anything when the
    registry has no job for that id.

    Raises:
        RuntimeError: If the text of the .docx payload cannot be extracted.
    """
    job = task_registry.get(job_id)
    if not job:
        return

    task_registry.set_running(job_id)

    # Stage 1: pull the raw text out of the Word document.
    task_registry.set_stage(job_id, "parse")
    job.log(f"Extracting text from Word document '{filename}'")
    try:
        extracted = _extract_text_from_docx(content_bytes)
    except Exception as exc:
        raise RuntimeError(f"Failed to read docx file '{filename}': {exc}") from exc

    # Stage 2: split into chunks and wrap each one with its metadata.
    task_registry.set_stage(job_id, "chunk")
    pieces = _chunk_text(
        extracted,
        chunk_size=settings.chunk_size,
        chunk_overlap=settings.chunk_overlap,
    )

    documents: List[Document] = []
    task_registry.set_progress(job_id, chunks_total=len(pieces), chunks_done=0)
    for done, piece in enumerate(pieces, start=1):
        documents.append(
            Document(
                page_content=piece,
                metadata={
                    "source": filename,
                    "page_number": 1,
                    "type": f"{settings.doc_default_type}_docx",
                    "project_id": project_id,
                    "uploaded_by": uploaded_by,
                    "chunk_index": done - 1,
                    "document_id": document_id,
                },
            )
        )
        task_registry.set_progress(job_id, chunks_done=done)

    # Stage 3: embed and index, then mark the job as finished.
    task_registry.set_stage(job_id, "index")
    job.log(f"Indexing {len(documents)} docx chunks to Supabase")
    inserted_ids = add_documents(documents, index_name=index_name)
    task_registry.set_done(job_id, inserted_count=len(inserted_ids))
 def run_job(
     job_id: str,
     project_id: str,
     try:
         if file_type == "txt":
             _run_text_pipeline(job_id, project_id, filename, content_bytes, uploaded_by, index_name, document_id)
+        elif file_type == "docx":
+            _run_docx_pipeline(job_id, project_id, filename, content_bytes, uploaded_by, index_name, document_id)
         else:
             _run_pdf_pipeline(job_id, project_id, filename, content_bytes, uploaded_by, index_name, document_id)
     except Exception as exc: