Cyril Dupland committed on
Commit
c0cbc50
·
1 Parent(s): 33e0e5a

Add support for .docx file ingestion: Update requirements to include python-docx, modify document type detection to support .docx, and implement text extraction and indexing for Word documents in the ingestion service.

Browse files
api/routes/documents.py CHANGED
@@ -15,7 +15,13 @@ from config.settings import settings
15
  router = APIRouter(prefix="/projects/{id_project}/documents", tags=["Documents"])
16
 
17
 
18
- SupportedDocumentType = Literal["pdf", "txt"]
 
 
 
 
 
 
19
 
20
 
21
  def _detect_document_type(file: UploadFile) -> SupportedDocumentType:
@@ -25,6 +31,7 @@ def _detect_document_type(file: UploadFile) -> SupportedDocumentType:
25
  Currently supports:
26
  - PDF: .pdf extension or application/pdf
27
  - TXT: .txt extension or text/plain
 
28
  """
29
  filename = (file.filename or "").lower()
30
  content_type = (file.content_type or "").lower()
@@ -33,13 +40,17 @@ def _detect_document_type(file: UploadFile) -> SupportedDocumentType:
33
  if is_pdf:
34
  return "pdf"
35
 
 
 
 
 
36
  is_txt = filename.endswith(".txt") or content_type == "text/plain"
37
  if is_txt:
38
  return "txt"
39
 
40
  raise HTTPException(
41
  status_code=status.HTTP_400_BAD_REQUEST,
42
- detail="Only PDF and plain text (.txt) files are supported at the moment.",
43
  )
44
 
45
 
@@ -58,7 +69,7 @@ async def upload_project_document(
58
  background_tasks: BackgroundTasks,
59
  current_user: dict = Depends(get_current_user),
60
  id_document: str = Form(..., description="UUID du document (obligatoire)"),
61
- file: UploadFile = File(..., description="Project document to ingest (PDF or plain text)"),
62
  index_name: str = Query("projects", description="Logical index (settings.vector_indexes key)"),
63
  ) -> UploadJobResponse:
64
  """
@@ -66,10 +77,11 @@ async def upload_project_document(
66
 
67
  Body fields (multipart/form-data):
68
  - id_document (UUID, required): stable identifier of the logical document
69
- - file (PDF or .txt): binary content
70
 
71
  Supported formats:
72
  - PDF: processed via Mistral OCR -> chunk -> embed -> index
 
73
  - Plain text (.txt): direct text -> chunk -> embed -> index (no OCR)
74
 
75
  Returns a `job_id` for status polling.
 
15
  router = APIRouter(prefix="/projects/{id_project}/documents", tags=["Documents"])
16
 
17
 
18
+ SupportedDocumentType = Literal["pdf", "txt", "docx"]
19
+
20
+
21
+ _DOCX_CONTENT_TYPES = {
22
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
23
+ "application/msword",
24
+ }
25
 
26
 
27
  def _detect_document_type(file: UploadFile) -> SupportedDocumentType:
 
31
  Currently supports:
32
  - PDF: .pdf extension or application/pdf
33
  - TXT: .txt extension or text/plain
34
+ - DOCX: .docx extension, or the Word OpenXML / application/msword content types
35
  """
36
  filename = (file.filename or "").lower()
37
  content_type = (file.content_type or "").lower()
 
40
  if is_pdf:
41
  return "pdf"
42
 
43
+ is_docx = filename.endswith(".docx") or content_type in _DOCX_CONTENT_TYPES
44
+ if is_docx:
45
+ return "docx"
46
+
47
  is_txt = filename.endswith(".txt") or content_type == "text/plain"
48
  if is_txt:
49
  return "txt"
50
 
51
  raise HTTPException(
52
  status_code=status.HTTP_400_BAD_REQUEST,
53
+ detail="Only PDF, Word (.docx) and plain text (.txt) files are supported at the moment.",
54
  )
55
 
56
 
 
69
  background_tasks: BackgroundTasks,
70
  current_user: dict = Depends(get_current_user),
71
  id_document: str = Form(..., description="UUID du document (obligatoire)"),
72
+ file: UploadFile = File(..., description="Project document to ingest (PDF, .docx or plain text)"),
73
  index_name: str = Query("projects", description="Logical index (settings.vector_indexes key)"),
74
  ) -> UploadJobResponse:
75
  """
 
77
 
78
  Body fields (multipart/form-data):
79
  - id_document (UUID, required): stable identifier of the logical document
80
+ - file (PDF, .docx or .txt): binary content
81
 
82
  Supported formats:
83
  - PDF: processed via Mistral OCR -> chunk -> embed -> index
84
+ - DOCX (Word): text extraction -> chunk -> embed -> index (no OCR)
85
  - Plain text (.txt): direct text -> chunk -> embed -> index (no OCR)
86
 
87
  Returns a `job_id` for status polling.
requirements.txt CHANGED
@@ -51,6 +51,9 @@ reportlab>=4.4.4,<5
51
  markdown>=3.7,<4
52
  markdown-pdf>=1.13.0
53
 
 
 
 
54
  # Ecologits Carbon Footprint
55
  ecologits>=0.8.2,<1
56
 
 
51
  markdown>=3.7,<4
52
  markdown-pdf>=1.13.0
53
 
54
+ # Ingestion .docx (Microsoft Word)
55
+ python-docx>=1.2.0,<2
56
+
57
  # Ecologits Carbon Footprint
58
  ecologits>=0.8.2,<1
59
 
services/ingestion_service.py CHANGED
@@ -5,6 +5,7 @@ through the shared TaskRegistry.
5
  """
6
  from __future__ import annotations
7
 
 
8
  from typing import List, Literal
9
 
10
  from langchain_core.documents import Document
@@ -38,7 +39,41 @@ def _chunk_text(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
38
  return chunks
39
 
40
 
41
- DocumentFileType = Literal["pdf", "txt"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
 
44
  def _run_pdf_pipeline(
@@ -165,6 +200,62 @@ def _run_text_pipeline(
165
  task_registry.set_done(job_id, inserted_count=inserted_count)
166
 
167
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  def run_job(
169
  job_id: str,
170
  project_id: str,
@@ -179,6 +270,8 @@ def run_job(
179
  try:
180
  if file_type == "txt":
181
  _run_text_pipeline(job_id, project_id, filename, content_bytes, uploaded_by, index_name, document_id)
 
 
182
  else:
183
  _run_pdf_pipeline(job_id, project_id, filename, content_bytes, uploaded_by, index_name, document_id)
184
  except Exception as exc:
 
5
  """
6
  from __future__ import annotations
7
 
8
+ import io
9
  from typing import List, Literal
10
 
11
  from langchain_core.documents import Document
 
39
  return chunks
40
 
41
 
42
+ DocumentFileType = Literal["pdf", "txt", "docx"]
43
+
44
+
45
+ def _extract_text_from_docx(content_bytes: bytes) -> str:
46
+ """Extract textual content from a .docx (Word OpenXML) byte stream.
47
+
48
+ Concatenates all body paragraphs first, then the rows of every table
49
+ (non-empty cells joined with " | "), one entry per line. Headers/footers
50
+ are intentionally skipped to keep chunks aligned with the visible body content.
51
+ """
52
+ try:
53
+ from docx import Document as _DocxDocument # type: ignore[import-not-found]
54
+ except ImportError as exc:
55
+ raise RuntimeError(
56
+ "python-docx is required to ingest .docx files. "
57
+ "Install it with `pip install python-docx`."
58
+ ) from exc
59
+
60
+ document = _DocxDocument(io.BytesIO(content_bytes))
61
+
62
+ parts: List[str] = []
63
+
64
+ for paragraph in document.paragraphs:
65
+ text = (paragraph.text or "").strip()
66
+ if text:
67
+ parts.append(text)
68
+
69
+ for table in document.tables:
70
+ for row in table.rows:
71
+ row_cells = [(cell.text or "").strip() for cell in row.cells]
72
+ row_text = " | ".join(cell for cell in row_cells if cell)
73
+ if row_text:
74
+ parts.append(row_text)
75
+
76
+ return "\n".join(parts)
77
 
78
 
79
  def _run_pdf_pipeline(
 
200
  task_registry.set_done(job_id, inserted_count=inserted_count)
201
 
202
 
203
+ def _run_docx_pipeline(
204
+ job_id: str,
205
+ project_id: str,
206
+ filename: str,
207
+ content_bytes: bytes,
208
+ uploaded_by: str | None,
209
+ index_name: str | None,
210
+ document_id: str | None,
211
+ ) -> None:
212
+ """Execute the .docx ingestion pipeline (Word -> text -> chunk -> index)."""
213
+ job = task_registry.get(job_id)
214
+ if not job:
215
+ return
216
+
217
+ task_registry.set_running(job_id)
218
+
219
+ # 1) Parse / extract text from the Word document
220
+ task_registry.set_stage(job_id, "parse")
221
+ job.log(f"Extracting text from Word document '{filename}'")
222
+ try:
223
+ text = _extract_text_from_docx(content_bytes)
224
+ except Exception as exc:
225
+ raise RuntimeError(f"Failed to read docx file '{filename}': {exc}") from exc
226
+
227
+ # 2) Chunk
228
+ task_registry.set_stage(job_id, "chunk")
229
+ chunk_size = settings.chunk_size
230
+ chunk_overlap = settings.chunk_overlap
231
+ chunks = _chunk_text(text, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
232
+ documents: List[Document] = []
233
+
234
+ chunks_total = len(chunks)
235
+ task_registry.set_progress(job_id, chunks_total=chunks_total, chunks_done=0)
236
+
237
+ for i, chunk in enumerate(chunks):
238
+ metadata = {
239
+ "source": filename,
240
+ "page_number": 1,
241
+ "type": f"{settings.doc_default_type}_docx",
242
+ "project_id": project_id,
243
+ "uploaded_by": uploaded_by,
244
+ "chunk_index": i,
245
+ "document_id": document_id,
246
+ }
247
+ documents.append(Document(page_content=chunk, metadata=metadata))
248
+ task_registry.set_progress(job_id, chunks_done=i + 1)
249
+
250
+ # 3) Embed + index
251
+ task_registry.set_stage(job_id, "index")
252
+ job.log(f"Indexing {len(documents)} docx chunks to Supabase")
253
+ ids = add_documents(documents, index_name=index_name)
254
+ inserted_count = len(ids)
255
+
256
+ task_registry.set_done(job_id, inserted_count=inserted_count)
257
+
258
+
259
  def run_job(
260
  job_id: str,
261
  project_id: str,
 
270
  try:
271
  if file_type == "txt":
272
  _run_text_pipeline(job_id, project_id, filename, content_bytes, uploaded_by, index_name, document_id)
273
+ elif file_type == "docx":
274
+ _run_docx_pipeline(job_id, project_id, filename, content_bytes, uploaded_by, index_name, document_id)
275
  else:
276
  _run_pdf_pipeline(job_id, project_id, filename, content_bytes, uploaded_by, index_name, document_id)
277
  except Exception as exc: