Cyril Dupland committed on
Commit ·
c0cbc50
1
Parent(s): 33e0e5a
Add support for .docx file ingestion: Update requirements to include python-docx, modify document type detection to support .docx, and implement text extraction and indexing for Word documents in the ingestion service.
Browse files
- api/routes/documents.py +16 -4
- requirements.txt +3 -0
- services/ingestion_service.py +94 -1
api/routes/documents.py
CHANGED
|
@@ -15,7 +15,13 @@ from config.settings import settings
|
|
| 15 |
router = APIRouter(prefix="/projects/{id_project}/documents", tags=["Documents"])
|
| 16 |
|
| 17 |
|
| 18 |
-
SupportedDocumentType = Literal["pdf", "txt"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
def _detect_document_type(file: UploadFile) -> SupportedDocumentType:
|
|
@@ -25,6 +31,7 @@ def _detect_document_type(file: UploadFile) -> SupportedDocumentType:
|
|
| 25 |
Currently supports:
|
| 26 |
- PDF: .pdf extension or application/pdf
|
| 27 |
- TXT: .txt extension or text/plain
|
|
|
|
| 28 |
"""
|
| 29 |
filename = (file.filename or "").lower()
|
| 30 |
content_type = (file.content_type or "").lower()
|
|
@@ -33,13 +40,17 @@ def _detect_document_type(file: UploadFile) -> SupportedDocumentType:
|
|
| 33 |
if is_pdf:
|
| 34 |
return "pdf"
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
is_txt = filename.endswith(".txt") or content_type == "text/plain"
|
| 37 |
if is_txt:
|
| 38 |
return "txt"
|
| 39 |
|
| 40 |
raise HTTPException(
|
| 41 |
status_code=status.HTTP_400_BAD_REQUEST,
|
| 42 |
-
detail="Only PDF and plain text (.txt) files are supported at the moment.",
|
| 43 |
)
|
| 44 |
|
| 45 |
|
|
@@ -58,7 +69,7 @@ async def upload_project_document(
|
|
| 58 |
background_tasks: BackgroundTasks,
|
| 59 |
current_user: dict = Depends(get_current_user),
|
| 60 |
id_document: str = Form(..., description="UUID du document (obligatoire)"),
|
| 61 |
-
file: UploadFile = File(..., description="Project document to ingest (PDF or plain text)"),
|
| 62 |
index_name: str = Query("projects", description="Logical index (settings.vector_indexes key)"),
|
| 63 |
) -> UploadJobResponse:
|
| 64 |
"""
|
|
@@ -66,10 +77,11 @@ async def upload_project_document(
|
|
| 66 |
|
| 67 |
Body fields (multipart/form-data):
|
| 68 |
- id_document (UUID, required): stable identifier of the logical document
|
| 69 |
-
- file (PDF or .txt): binary content
|
| 70 |
|
| 71 |
Supported formats:
|
| 72 |
- PDF: processed via Mistral OCR -> chunk -> embed -> index
|
|
|
|
| 73 |
- Plain text (.txt): direct text -> chunk -> embed -> index (no OCR)
|
| 74 |
|
| 75 |
Returns a `job_id` for status polling.
|
|
|
|
| 15 |
router = APIRouter(prefix="/projects/{id_project}/documents", tags=["Documents"])
|
| 16 |
|
| 17 |
|
| 18 |
+
SupportedDocumentType = Literal["pdf", "txt", "docx"]
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# MIME types accepted as Word OpenXML (.docx) uploads.
# "application/msword" is deliberately NOT listed: it designates the legacy
# binary .doc format, which python-docx cannot open, so accepting it would
# only defer the failure to the background ingestion job instead of
# rejecting the upload up front with a 400.
_DOCX_CONTENT_TYPES = {
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
}
|
| 25 |
|
| 26 |
|
| 27 |
def _detect_document_type(file: UploadFile) -> SupportedDocumentType:
|
|
|
|
| 31 |
Currently supports:
|
| 32 |
- PDF: .pdf extension or application/pdf
|
| 33 |
- TXT: .txt extension or text/plain
|
| 34 |
+
- DOCX: .docx extension or Word OpenXML content type
|
| 35 |
"""
|
| 36 |
filename = (file.filename or "").lower()
|
| 37 |
content_type = (file.content_type or "").lower()
|
|
|
|
| 40 |
if is_pdf:
|
| 41 |
return "pdf"
|
| 42 |
|
| 43 |
+
is_docx = filename.endswith(".docx") or content_type in _DOCX_CONTENT_TYPES
|
| 44 |
+
if is_docx:
|
| 45 |
+
return "docx"
|
| 46 |
+
|
| 47 |
is_txt = filename.endswith(".txt") or content_type == "text/plain"
|
| 48 |
if is_txt:
|
| 49 |
return "txt"
|
| 50 |
|
| 51 |
raise HTTPException(
|
| 52 |
status_code=status.HTTP_400_BAD_REQUEST,
|
| 53 |
+
detail="Only PDF, Word (.docx) and plain text (.txt) files are supported at the moment.",
|
| 54 |
)
|
| 55 |
|
| 56 |
|
|
|
|
| 69 |
background_tasks: BackgroundTasks,
|
| 70 |
current_user: dict = Depends(get_current_user),
|
| 71 |
id_document: str = Form(..., description="UUID du document (obligatoire)"),
|
| 72 |
+
file: UploadFile = File(..., description="Project document to ingest (PDF, .docx or plain text)"),
|
| 73 |
index_name: str = Query("projects", description="Logical index (settings.vector_indexes key)"),
|
| 74 |
) -> UploadJobResponse:
|
| 75 |
"""
|
|
|
|
| 77 |
|
| 78 |
Body fields (multipart/form-data):
|
| 79 |
- id_document (UUID, required): stable identifier of the logical document
|
| 80 |
+
- file (PDF, .docx or .txt): binary content
|
| 81 |
|
| 82 |
Supported formats:
|
| 83 |
- PDF: processed via Mistral OCR -> chunk -> embed -> index
|
| 84 |
+
- DOCX (Word): text extraction -> chunk -> embed -> index (no OCR)
|
| 85 |
- Plain text (.txt): direct text -> chunk -> embed -> index (no OCR)
|
| 86 |
|
| 87 |
Returns a `job_id` for status polling.
|
requirements.txt
CHANGED
|
@@ -51,6 +51,9 @@ reportlab>=4.4.4,<5
|
|
| 51 |
markdown>=3.7,<4
|
| 52 |
markdown-pdf>=1.13.0
|
| 53 |
|
|
|
|
|
|
|
|
|
|
| 54 |
# Ecologits Carbon Footprint
|
| 55 |
ecologits>=0.8.2,<1
|
| 56 |
|
|
|
|
| 51 |
markdown>=3.7,<4
|
| 52 |
markdown-pdf>=1.13.0
|
| 53 |
|
| 54 |
+
# Ingestion .docx (Microsoft Word)
|
| 55 |
+
python-docx>=1.2.0,<2
|
| 56 |
+
|
| 57 |
# Ecologits Carbon Footprint
|
| 58 |
ecologits>=0.8.2,<1
|
| 59 |
|
services/ingestion_service.py
CHANGED
|
@@ -5,6 +5,7 @@ through the shared TaskRegistry.
|
|
| 5 |
"""
|
| 6 |
from __future__ import annotations
|
| 7 |
|
|
|
|
| 8 |
from typing import List, Literal
|
| 9 |
|
| 10 |
from langchain_core.documents import Document
|
|
@@ -38,7 +39,41 @@ def _chunk_text(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
|
|
| 38 |
return chunks
|
| 39 |
|
| 40 |
|
| 41 |
-
DocumentFileType = Literal["pdf", "txt"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
|
| 44 |
def _run_pdf_pipeline(
|
|
@@ -165,6 +200,62 @@ def _run_text_pipeline(
|
|
| 165 |
task_registry.set_done(job_id, inserted_count=inserted_count)
|
| 166 |
|
| 167 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
def run_job(
|
| 169 |
job_id: str,
|
| 170 |
project_id: str,
|
|
@@ -179,6 +270,8 @@ def run_job(
|
|
| 179 |
try:
|
| 180 |
if file_type == "txt":
|
| 181 |
_run_text_pipeline(job_id, project_id, filename, content_bytes, uploaded_by, index_name, document_id)
|
|
|
|
|
|
|
| 182 |
else:
|
| 183 |
_run_pdf_pipeline(job_id, project_id, filename, content_bytes, uploaded_by, index_name, document_id)
|
| 184 |
except Exception as exc:
|
|
|
|
| 5 |
"""
|
| 6 |
from __future__ import annotations
|
| 7 |
|
| 8 |
+
import io
|
| 9 |
from typing import List, Literal
|
| 10 |
|
| 11 |
from langchain_core.documents import Document
|
|
|
|
| 39 |
return chunks
|
| 40 |
|
| 41 |
|
| 42 |
+
DocumentFileType = Literal["pdf", "txt", "docx"]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _extract_text_from_docx(content_bytes: bytes) -> str:
    """Extract textual content from a .docx (Word OpenXML) byte stream.

    Top-level paragraphs and tables are emitted in the order in which they
    appear in the document body, one entry per line. Each table row becomes
    a single line whose non-empty cells are joined with " | ".
    Headers/footers are intentionally skipped to keep chunks aligned with
    the visible body content.

    Args:
        content_bytes: Raw bytes of the uploaded .docx file.

    Returns:
        The extracted text with entries separated by line breaks; an empty
        string when the body holds no non-blank paragraph or table cell.

    Raises:
        RuntimeError: If python-docx is not installed.
    """
    try:
        from docx import Document as _DocxDocument  # type: ignore[import-not-found]
        from docx.table import Table as _DocxTable  # type: ignore[import-not-found]
    except ImportError as exc:
        raise RuntimeError(
            "python-docx is required to ingest .docx files. "
            "Install it with `pip install python-docx`."
        ) from exc

    document = _DocxDocument(io.BytesIO(content_bytes))

    # `iter_inner_content()` yields paragraphs and tables interleaved in body
    # order, which keeps a table next to the text that introduces it. Older
    # python-docx releases lack it, so fall back to "all paragraphs, then all
    # tables" rather than failing.
    iter_blocks = getattr(document, "iter_inner_content", None)
    if iter_blocks is not None:
        blocks = iter_blocks()
    else:
        blocks = [*document.paragraphs, *document.tables]

    parts: List[str] = []
    for block in blocks:
        if isinstance(block, _DocxTable):
            for row in block.rows:
                row_cells = [(cell.text or "").strip() for cell in row.cells]
                row_text = " | ".join(cell for cell in row_cells if cell)
                if row_text:
                    parts.append(row_text)
            continue

        text = (block.text or "").strip()
        if text:
            parts.append(text)

    return "\n".join(parts)
|
| 77 |
|
| 78 |
|
| 79 |
def _run_pdf_pipeline(
|
|
|
|
| 200 |
task_registry.set_done(job_id, inserted_count=inserted_count)
|
| 201 |
|
| 202 |
|
| 203 |
+
def _run_docx_pipeline(
    job_id: str,
    project_id: str,
    filename: str,
    content_bytes: bytes,
    uploaded_by: str | None,
    index_name: str | None,
    document_id: str | None,
) -> None:
    """Ingest a Word document: extract its text, chunk it, then index the chunks.

    Returns immediately when the task registry holds no job for ``job_id``.
    Otherwise the job is marked running, moved through the "parse", "chunk"
    and "index" stages, and finally marked done with the number of ids
    returned by the indexing call. Any failure while extracting text is
    re-raised as ``RuntimeError`` naming the file.
    """
    job = task_registry.get(job_id)
    if not job:
        return

    task_registry.set_running(job_id)

    # Stage 1: pull the raw text out of the .docx payload.
    task_registry.set_stage(job_id, "parse")
    job.log(f"Extracting text from Word document '{filename}'")
    try:
        raw_text = _extract_text_from_docx(content_bytes)
    except Exception as exc:
        raise RuntimeError(f"Failed to read docx file '{filename}': {exc}") from exc

    # Stage 2: split the text using the configured chunking parameters.
    task_registry.set_stage(job_id, "chunk")
    pieces = _chunk_text(
        raw_text,
        chunk_size=settings.chunk_size,
        chunk_overlap=settings.chunk_overlap,
    )
    task_registry.set_progress(job_id, chunks_total=len(pieces), chunks_done=0)

    prepared: List[Document] = []
    for done, piece in enumerate(pieces, start=1):
        prepared.append(
            Document(
                page_content=piece,
                metadata={
                    "source": filename,
                    # The extracted text carries no page information, so
                    # every chunk is reported as page 1.
                    "page_number": 1,
                    "type": f"{settings.doc_default_type}_docx",
                    "project_id": project_id,
                    "uploaded_by": uploaded_by,
                    "chunk_index": done - 1,
                    "document_id": document_id,
                },
            )
        )
        task_registry.set_progress(job_id, chunks_done=done)

    # Stage 3: embed and index the prepared chunks.
    task_registry.set_stage(job_id, "index")
    job.log(f"Indexing {len(prepared)} docx chunks to Supabase")
    inserted_ids = add_documents(prepared, index_name=index_name)

    task_registry.set_done(job_id, inserted_count=len(inserted_ids))
|
| 257 |
+
|
| 258 |
+
|
| 259 |
def run_job(
|
| 260 |
job_id: str,
|
| 261 |
project_id: str,
|
|
|
|
| 270 |
try:
|
| 271 |
if file_type == "txt":
|
| 272 |
_run_text_pipeline(job_id, project_id, filename, content_bytes, uploaded_by, index_name, document_id)
|
| 273 |
+
elif file_type == "docx":
|
| 274 |
+
_run_docx_pipeline(job_id, project_id, filename, content_bytes, uploaded_by, index_name, document_id)
|
| 275 |
else:
|
| 276 |
_run_pdf_pipeline(job_id, project_id, filename, content_bytes, uploaded_by, index_name, document_id)
|
| 277 |
except Exception as exc:
|