{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "06bb4dd3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from dotenv import load_dotenv\n", "load_dotenv() # ch" ] }, { "cell_type": "code", "execution_count": 2, "id": "f4ff743f", "metadata": {}, "outputs": [], "source": [ "import os, sys, json\n", "from pathlib import Path\n", "\n", "# Ensure project root on path (notebook is under 'knowledge/')\n", "project_root = Path('..').resolve()\n", "if str(project_root) not in sys.path:\n", " sys.path.append(str(project_root))\n", "\n", "from services import mistral_service\n" ] }, { "cell_type": "markdown", "id": "ab8a16e9", "metadata": {}, "source": [ "# Building File Mistral" ] }, { "cell_type": "code", "execution_count": 6, "id": "b20b23e0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Resolved catalog path: C:\\Users\\cd\\Documents\\CAPL\\ROUTEUR\\dev\\routeur_ia_api\\knowledge\\docs\\2025_2026_Catalogue_formations.pdf\n", "PDF size (bytes): 7341273\n", "Signed URL prefix: https://mistralaifilesapiprodswe.blob.core.windows.net/fine-...\n", "{'type': 'document_url', 'document_url': 'https://mistralaifilesapiprodswe.blob.core.windows.net/fine-tune/20ee85df-97f2-4acc-90e8-d3419e046f02/3c69bf03-8111-497d-93a9-538e56ce1bb6/ea616308685a46a7868d37097680f149.pdf?se=2026-01-21T11%3A58%3A31Z&sp=r&sv=2025-01-05&sr=b&sig=b6VDM1pmgEn7Yedzvomm5cuEUxuy0o/QwlCs6f7DXLI%3D'}\n" ] } ], "source": [ "# Locate the catalog PDF\n", "catalog_filename = '2025_2026_Catalogue_formations.pdf'\n", "# Primary: relative to notebook dir (knowledge/docs)\n", "catalog_path = Path('docs') / catalog_filename\n", "if not catalog_path.exists():\n", " # Fallback: if executed from project root\n", " alt_path = Path('knowledge') / 'docs' / catalog_filename\n", " if alt_path.exists():\n", " catalog_path = alt_path\n", " else:\n", " raise FileNotFoundError(f\"Catalog not found at {catalog_path} or {alt_path}\")\n", "\n", "print(\"Resolved catalog path:\", catalog_path.resolve())\n", "\n", "# Read PDF bytes\n", "pdf_bytes = catalog_path.read_bytes()\n", "print(\"PDF size (bytes):\", len(pdf_bytes))\n", "\n", "# Upload and build document source for OCR/completion (REST-based)\n", "source_or_url = mistral_service.upload_pdf(pdf_bytes, catalog_path.name)\n", "if source_or_url.startswith(\"data:\"):\n", " # It's a data URI; use as document_url directly\n", " document_source = mistral_service.build_document_url(source_or_url)\n", " print(\"Using data URI as document source (length):\", len(source_or_url))\n", "else:\n", " print(\"Signed URL prefix:\", str(source_or_url)[:60] + \"...\")\n", " document_source = mistral_service.build_document_url(source_or_url)\n", "print(document_source)\n" ] }, { "cell_type": "markdown", "id": "8f4c33c3", "metadata": {}, "source": [ "## OCR Processing avec Mistral\n", "\n", "Utilisation de l'OCR de Mistral pour extraire le contenu structuré du PDF.\n" ] }, { "cell_type": "code", "execution_count": 7, "id": "b29f622d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Exécution de l'OCR avec Mistral...\n", "Résultat OCR sauvegardé dans formation_ocr_result_2526.json\n", "Nombre de pages extraites : 112\n" ] } ], "source": [ "# Chemin pour stocker le résultat OCR brut (structure complète de Mistral)\n", "ocr_json_path = Path('formation_ocr_result_2526.json')\n", "\n", "# Vérifier si l'OCR a déjà été fait pour éviter de retraiter\n", "if ocr_json_path.exists():\n", " print(f\"OCR déjà effectué. Chargement depuis {ocr_json_path}\")\n", " with open(ocr_json_path, 'r', encoding='utf-8') as f:\n", " ocr_result = json.load(f)\n", "else:\n", " print(\"Exécution de l'OCR avec Mistral...\")\n", " # Appeler l'OCR de Mistral\n", " ocr_result = mistral_service.process_ocr(\n", " document_source=document_source,\n", " include_image_base64=False # Pas besoin des images base64 pour le markdown\n", " )\n", " \n", " # Sauvegarder la structure complète retournée par Mistral\n", " with open(ocr_json_path, 'w', encoding='utf-8') as f:\n", " json.dump(ocr_result, f, ensure_ascii=False, indent=2)\n", " print(f\"Résultat OCR sauvegardé dans {ocr_json_path}\")\n", "\n", "# Afficher quelques informations sur le résultat\n", "num_pages = len(ocr_result.get('pages', []))\n", "print(f\"Nombre de pages extraites : {num_pages}\")\n" ] }, { "cell_type": "markdown", "id": "d7891aac", "metadata": {}, "source": [ "# OCR to Markdown" ] }, { "cell_type": "code", "execution_count": 8, "id": "460f43d2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Contenu Markdown sauvegardé dans formation_catalog_ocr2526.md\n", "Taille du markdown : 176153 caractères\n", "\n", "Premiers 500 caractères :\n", "# 2025_2026_Catalogue_formations.pdf\n", "\n", "**Document traité par OCR Mistral** \n", "**Nombre de pages :** 112 \n", "**Date de traitement :** 1768910347.1584508\n", "\n", "---\n", "\n", "\n", "\n", "---\n", "\n", "# Page 1\n", "\n", "# PROZGRI \n", "\n", "POUR VOUS. AUJOURD'HUI. ET DEMAIN\n", "![img-0.jpeg](img-0.jpeg)\n", "\n", "## FORMATIONS\n", "\n", "AGRICULTEURS $\\cdot$ AGRICULTRICES SALARIÉ.E.S AGRICOLES\n", "\n", "## Se former pour gagner en performance\n", "\n", "2025 - 2026\n", "\n", "## FORMATIONS\n", "\n", "pays-de-la-loire.chambres-agriculture.fr\n", "(1)\n", "\n", "\n", "---\n", "\n", "# Page 2\n", "\n", "# La formation : notre outil pour transformer les dé...\n" ] } ], "source": [ "# Convertir le résultat OCR en Markdown\n", "markdown_content = mistral_service.ocr_response_to_markdown(ocr_result)\n", "\n", "# Ajouter un en-tête avec des métadonnées\n", "header = f\"\"\"# {catalog_filename}\n", "\n", "**Document traité par OCR Mistral** \n", "**Nombre de pages :** {num_pages} \n", "**Date de traitement :** {Path(ocr_json_path).stat().st_mtime if ocr_json_path.exists() else 'N/A'}\n", "\n", "---\n", "\n", "\"\"\"\n", "\n", "full_markdown = header + markdown_content\n", "\n", "# Sauvegarder dans un fichier MD\n", "md_output_path = Path('formation_catalog_ocr2526.md')\n", "with open(md_output_path, 'w', encoding='utf-8') as f:\n", " f.write(full_markdown)\n", "\n", "print(f\"Contenu Markdown sauvegardé dans {md_output_path}\")\n", "print(f\"Taille du markdown : {len(full_markdown)} caractères\")\n", "print(f\"\\nPremiers 500 caractères :\\n{full_markdown[:500]}...\")\n" ] }, { "cell_type": "markdown", "id": "f9e2e89c", "metadata": {}, "source": [ "## Extraction structurée avec JSON Schema\n", "\n", "Utilisation du modèle de chat avec JSON Schema pour extraire les formations de manière structurée.\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "3c9e46e4", "metadata": {}, "outputs": [], "source": [ "SYSTEM_PROMPT = (\n", " \"You are an expert data extractor. From the provided French training catalog (Catalogue formations), \"\n", " \"extract a clean list of trainings (formations). For each formation, extract the following fields \"\n", " \"when available: title (titre/nom), content (contenu), prerequisites (pré-requis), domain (domaine), \"\n", " \"target_audience (cible), duration (durée), associated_tools (outils associés), label, service, \"\n", " \"contact, and sessions (list of sessions with date and location/lieu). \"\n", " \"Note: Not all fields will be available for every formation - use null for missing data. \"\n", " \"Sessions can vary in number (1, 2, 3 or more per formation). \"\n", " \"Keep French text as-is. Return only strict JSON following the schema.\"\n", ")\n", "\n", "JSON_SCHEMA = {\n", " \"name\": \"CatalogFormations\",\n", " \"schema_definition\": {\n", " \"title\": \"CatalogFormations\",\n", " \"type\": \"object\",\n", " \"properties\": {\n", " \"formations\": {\n", " \"type\": \"array\",\n", " \"items\": {\n", " \"type\": \"object\",\n", " \"properties\": {\n", " \"title\": {\"type\": \"string\", \"description\": \"Titre / Nom de la formation\"},\n", " \"content\": {\"type\": [\"string\", \"null\"], \"description\": \"Contenu de la formation\"},\n", " \"prerequisites\": {\"type\": [\"string\", \"null\"], \"description\": \"Pré-requis\"},\n", " \"domain\": {\"type\": [\"string\", \"null\"], \"description\": \"Domaine de la formation\"},\n", " \"target_audience\": {\"type\": [\"string\", \"null\"], \"description\": \"Cible / Public visé\"},\n", " \"duration\": {\"type\": [\"string\", \"null\"], \"description\": \"Durée de la formation\"},\n", " \"associated_tools\": {\"type\": [\"string\", \"null\"], \"description\": \"Outils associés\"},\n", " \"label\": {\"type\": [\"string\", \"null\"], \"description\": \"Label de la formation\"},\n", " \"service\": {\"type\": [\"string\", \"null\"], \"description\": \"Service proposant la formation\"},\n", " \"contact\": {\"type\": [\"string\", \"null\"], \"description\": \"Contact pour la formation\"},\n", " \"sessions\": {\n", " \"type\": [\"array\", \"null\"],\n", " \"description\": \"Liste des sessions de formation\",\n", " \"items\": {\n", " \"type\": \"object\",\n", " \"properties\": {\n", " \"date\": {\"type\": [\"string\", \"null\"], \"description\": \"Date de la session\"},\n", " \"location\": {\"type\": [\"string\", \"null\"], \"description\": \"Lieu de la session\"}\n", " },\n", " \"required\": [\"date\", \"location\"]\n", " }\n", " }\n", " },\n", " \"required\": [\"title\"]\n", " }\n", " }\n", " },\n", " \"required\": [\"formations\"],\n", " \"additionalProperties\": False\n", " },\n", " \"description\": \"Schema for extracting training catalog data\",\n", " \"strict\": True\n", "}\n" ] }, { "cell_type": "code", "execution_count": 13, "id": "03abd999", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "📂 Chargement de formation_ocr_result_2526.json\n", "📄 Pages à traiter: 16 à 108 (93 pages sur 112 total)\n", "🔄 Traitement en 10 batchs de 10 pages max...\n", "⏱️ Pause de 2.0s entre chaque batch\n", "\n", "📦 Batch 1/10 - Pages 16 à 25... ✅ 21 formations extraites\n", "📦 Batch 2/10 - Pages 26 à 35... ✅ 13 formations extraites\n", "📦 Batch 3/10 - Pages 36 à 45... ✅ 15 formations extraites\n", "📦 Batch 4/10 - Pages 46 à 55... ✅ 16 formations extraites\n", "📦 Batch 5/10 - Pages 56 à 65... ✅ 15 formations extraites\n", "📦 Batch 6/10 - Pages 66 à 75... ✅ 12 formations extraites\n", "📦 Batch 7/10 - Pages 76 à 85... ✅ 23 formations extraites\n", "📦 Batch 8/10 - Pages 86 à 95... ✅ 22 formations extraites\n", "📦 Batch 9/10 - Pages 96 à 105... ✅ 23 formations extraites\n", "📦 Batch 10/10 - Pages 106 à 108... ✅ 22 formations extraites\n", "\n", "============================================================\n", "📊 RÉSULTAT FINAL\n", "============================================================\n", "✅ Total formations extraites: 182\n", "\n", "💾 Résultat sauvegardé dans: formations_2526.json\n", "📝 Aperçu des 3 premières formations:\n", " 1. Piloter son exploitation avec clarté et confiance\n", " 2. Réaliser un budget de trésorerie pour piloter son entreprise\n", " 3. Assurances : choisir une couverture à sa taille\n" ] } ], "source": [ "# =====================================================\n", "# EXTRACTION PAR BATCH - Contourne le timeout API\n", "# =====================================================\n", "# Cette implémentation traite le document page par page ou par lots\n", "# pour éviter les timeouts lors de l'extraction avec l'API Mistral.\n", "\n", "import time\n", "from typing import List, Dict, Any, Optional\n", "from dataclasses import dataclass, field\n", "\n", "\n", "@dataclass\n", "class BatchConfig:\n", " \"\"\"Configuration pour le traitement par lots.\"\"\"\n", " batch_size: int = 10 # Nombre de pages par batch\n", " pause_seconds: float = 2.0 # Pause entre les batchs (rate limiting)\n", " max_retries: int = 3 # Tentatives max par batch en cas d'erreur\n", " retry_delay: float = 5.0 # Délai entre les tentatives\n", " model: str = \"mistral-large-latest\"\n", " start_page: Optional[int] = None # Page de début (1-indexed, None = début)\n", " end_page: Optional[int] = None # Page de fin (1-indexed, inclusive, None = fin)\n", "\n", "\n", "@dataclass\n", "class BatchResult:\n", " \"\"\"Résultat d'un batch.\"\"\"\n", " batch_index: int\n", " start_page: int\n", " end_page: int\n", " formations: List[Dict[str, Any]] = field(default_factory=list)\n", " success: bool = True\n", " error: Optional[str] = None\n", "\n", "\n", "class FormationBatchExtractor:\n", " \"\"\"\n", " Extracteur de formations par lots.\n", " \n", " Traite un résultat OCR en lots de pages pour éviter les timeouts API.\n", " Utilise le service Mistral existant pour les appels API.\n", " \"\"\"\n", " \n", " def __init__(\n", " self,\n", " ocr_result: Dict[str, Any],\n", " system_prompt: str,\n", " json_schema: Dict[str, Any],\n", " config: Optional[BatchConfig] = None\n", " ):\n", " self.pages = ocr_result.get('pages', [])\n", " self.system_prompt = system_prompt\n", " self.json_schema = json_schema\n", " self.config = config or BatchConfig()\n", " \n", " # Accès au client Mistral via le service singleton\n", " from services.mistral_service import mistral_service as ms\n", " self.client = ms.client\n", " \n", " def _pages_to_markdown(self, page_list: List[Dict]) -> str:\n", " \"\"\"Convertit une liste de pages OCR en markdown.\"\"\"\n", " parts = []\n", " for page in page_list:\n", " idx = page.get('index', 0)\n", " md = page.get('markdown', '')\n", " parts.append(f\"\\n--- PAGE {idx + 1} ---\\n{md}\")\n", " return \"\\n\".join(parts)\n", " \n", " def _build_messages(self, batch_markdown: str) -> List[Dict[str, Any]]:\n", " \"\"\"Construit les messages pour l'API chat.\"\"\"\n", " schema_str = json.dumps(self.json_schema, indent=2)\n", " system_content = (\n", " f\"{self.system_prompt}\\n\\n\"\n", " f\"**JSON Schema to follow:**\\n```json\\n{schema_str}\\n```\"\n", " )\n", " \n", " user_content = (\n", " \"Extrait les formations de ces pages du catalogue.\\n\\n\"\n", " f\"=== DOCUMENT CONTENT (OCR) ===\\n{batch_markdown}\"\n", " )\n", " \n", " return [\n", " {\"role\": \"system\", \"content\": system_content},\n", " {\"role\": \"user\", \"content\": user_content}\n", " ]\n", " \n", " def _extract_batch(self, batch_pages: List[Dict], batch_idx: int) -> BatchResult:\n", " \"\"\"Extrait les formations d'un batch de pages.\"\"\"\n", " start_page = batch_pages[0].get('index', 0) + 1\n", " end_page = batch_pages[-1].get('index', 0) + 1\n", " \n", " result = BatchResult(\n", " batch_index=batch_idx,\n", " start_page=start_page,\n", " end_page=end_page\n", " )\n", " \n", " batch_markdown = self._pages_to_markdown(batch_pages)\n", " messages = self._build_messages(batch_markdown)\n", " \n", " for attempt in range(self.config.max_retries):\n", " try:\n", " response = self.client.chat.complete(\n", " model=self.config.model,\n", " messages=messages,\n", " response_format={\"type\": \"json_object\"},\n", " )\n", " \n", " content = response.choices[0].message.content\n", " parsed = json.loads(content) if isinstance(content, str) else content\n", " result.formations = parsed.get('formations', [])\n", " result.success = True\n", " return result\n", " \n", " except Exception as e:\n", " error_msg = str(e)\n", " if attempt < self.config.max_retries - 1:\n", " print(f\" ⚠️ Tentative {attempt + 1} échouée, retry dans {self.config.retry_delay}s...\")\n", " time.sleep(self.config.retry_delay)\n", " else:\n", " result.success = False\n", " result.error = error_msg\n", " \n", " return result\n", " \n", " def extract_all(self, progress_callback=None) -> Dict[str, Any]:\n", " \"\"\"\n", " Extrait toutes les formations par lots.\n", " \n", " Args:\n", " progress_callback: Fonction optionnelle appelée après chaque batch\n", " signature: callback(batch_idx, total_batches, batch_result)\n", " \n", " Returns:\n", " Dict avec 'formations' (liste), 'stats' et 'errors'\n", " \"\"\"\n", " # Filtrer les pages selon la plage configurée (pages 1-indexed)\n", " all_pages = self.pages\n", " start_idx = 0\n", " end_idx = len(all_pages)\n", " \n", " if self.config.start_page is not None:\n", " start_idx = max(0, self.config.start_page - 1) # Convertir en 0-indexed\n", " if self.config.end_page is not None:\n", " end_idx = min(len(all_pages), self.config.end_page) # end_page est inclusive\n", " \n", " pages_to_process = all_pages[start_idx:end_idx]\n", " total_pages = len(pages_to_process)\n", " num_batches = (total_pages + self.config.batch_size - 1) // self.config.batch_size\n", " \n", " # Afficher la plage de pages\n", " actual_start = start_idx + 1\n", " actual_end = start_idx + total_pages\n", " print(f\"📄 Pages à traiter: {actual_start} à {actual_end} ({total_pages} pages sur {len(all_pages)} total)\")\n", " print(f\"🔄 Traitement en {num_batches} batchs de {self.config.batch_size} pages max...\")\n", " print(f\"⏱️ Pause de {self.config.pause_seconds}s entre chaque batch\\n\")\n", " \n", " all_formations = []\n", " errors = []\n", " \n", " for batch_idx in range(num_batches):\n", " batch_start = batch_idx * self.config.batch_size\n", " batch_end = min(batch_start + self.config.batch_size, total_pages)\n", " batch_pages = pages_to_process[batch_start:batch_end]\n", " \n", " # Calculer les numéros de pages réels (1-indexed)\n", " real_start_page = batch_pages[0].get('index', 0) + 1\n", " real_end_page = batch_pages[-1].get('index', 0) + 1\n", " print(f\"📦 Batch {batch_idx + 1}/{num_batches} - Pages {real_start_page} à {real_end_page}...\", end=\" \")\n", " \n", " batch_result = self._extract_batch(batch_pages, batch_idx)\n", " \n", " if batch_result.success:\n", " all_formations.extend(batch_result.formations)\n", " print(f\"✅ {len(batch_result.formations)} formations extraites\")\n", " else:\n", " errors.append(batch_result)\n", " print(f\"❌ Erreur: {batch_result.error[:60]}...\")\n", " \n", " if progress_callback:\n", " progress_callback(batch_idx, num_batches, batch_result)\n", " \n", " # Pause entre les batchs (sauf le dernier)\n", " if batch_idx < num_batches - 1:\n", " time.sleep(self.config.pause_seconds)\n", " \n", " # Résumé\n", " print(f\"\\n{'='*60}\")\n", " print(f\"📊 RÉSULTAT FINAL\")\n", " print(f\"{'='*60}\")\n", " print(f\"✅ Total formations extraites: {len(all_formations)}\")\n", " if errors:\n", " print(f\"⚠️ Batchs en erreur: {len(errors)}\")\n", " for err in errors:\n", " print(f\" - Batch {err.batch_index + 1}: Pages {err.start_page}-{err.end_page}\")\n", " \n", " return {\n", " \"formations\": all_formations,\n", " \"stats\": {\n", " \"total_pages_in_document\": len(all_pages),\n", " \"pages_processed\": total_pages,\n", " \"page_range\": f\"{actual_start}-{actual_end}\",\n", " \"total_batches\": num_batches,\n", " \"successful_batches\": num_batches - len(errors),\n", " \"failed_batches\": len(errors),\n", " \"total_formations\": len(all_formations)\n", " },\n", " \"errors\": [\n", " {\n", " \"batch\": e.batch_index,\n", " \"pages\": f\"{e.start_page}-{e.end_page}\",\n", " \"error\": e.error\n", " }\n", " for e in errors\n", " ]\n", " }\n", "\n", "\n", "# =====================================================\n", "# EXÉCUTION DE L'EXTRACTION\n", "# =====================================================\n", "\n", "# Charger le résultat OCR existant\n", "ocr_json_path = Path('formation_ocr_result_2526.json')\n", "if not ocr_json_path.exists():\n", " raise FileNotFoundError(f\"Fichier OCR introuvable: {ocr_json_path}\")\n", "\n", "with open(ocr_json_path, 'r', encoding='utf-8') as f:\n", " ocr_data = json.load(f)\n", "\n", "print(f\"📂 Chargement de {ocr_json_path}\")\n", "\n", "# Configuration personnalisable\n", "# ⚠️ Pour traiter uniquement une plage de pages, définir start_page et end_page\n", "# Les numéros de pages sont 1-indexed (page 1 = première page du PDF)\n", "# Exemples:\n", "# - Pages 1 à 20: start_page=1, end_page=20\n", "# - Pages 50 à 80: start_page=50, end_page=80\n", "# - Toutes les pages: start_page=None, end_page=None (par défaut)\n", "\n", "config = BatchConfig(\n", " batch_size=10, # 10 pages par batch (ajuster si timeout persiste)\n", " pause_seconds=2.0, # 2 secondes entre les batchs\n", " max_retries=3, # 3 tentatives max par batch\n", " retry_delay=5.0, # 5 secondes entre les tentatives\n", " start_page=16, # 👈 Page de début (1-indexed) ou None pour le début\n", " end_page=108, # 👈 Page de fin (1-indexed, inclusive) ou None pour la fin\n", ")\n", "\n", "# Créer l'extracteur et lancer l'extraction\n", "extractor = FormationBatchExtractor(\n", " ocr_result=ocr_data,\n", " system_prompt=SYSTEM_PROMPT,\n", " json_schema=JSON_SCHEMA,\n", " config=config\n", ")\n", "\n", "result = extractor.extract_all()\n", "\n", "# Sauvegarder le résultat\n", "output_path = Path('formations_2526.json')\n", "with open(output_path, 'w', encoding='utf-8') as f:\n", " json.dump(result, f, ensure_ascii=False, indent=2)\n", "\n", "print(f\"\\n💾 Résultat sauvegardé dans: {output_path}\")\n", "print(f\"📝 Aperçu des 3 premières formations:\")\n", "for i, formation in enumerate(result['formations'][:3], 1):\n", " print(f\" {i}. {formation.get('title', 'Sans titre')}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "54464636", "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "import json\n", "\n", "# Charger le résultat OCR\n", "OCR_JSON_PATH = Path('formation_ocr_result2526.json')\n", "if not OCR_JSON_PATH.exists():\n", " alt = Path('knowledge') / 'formation_ocr_result2526.json'\n", " if alt.exists():\n", " OCR_JSON_PATH = alt\n", " else:\n", " raise FileNotFoundError(f\"Fichier introuvable: {OCR_JSON_PATH} (ou {alt})\")\n", "\n", "with open(OCR_JSON_PATH, 'r', encoding='utf-8') as f:\n", " ocr_result = json.load(f)\n", "\n", "pages = ocr_result.get('pages', [])\n", "start_index = 0\n", "end_index_inclusive = 500" ] }, { "cell_type": "code", "execution_count": null, "id": "44e3e953", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SUPABASE_URL: https://cfsbhpvvynazdxjgidxm.supabase.co\n", "SUPABASE_KEY: ***dLwc\n", "✅ Vectorstore Supabase initialisé avec succès\n" ] } ], "source": [ "from dotenv import load_dotenv\n", "from langchain_community.vectorstores import SupabaseVectorStore\n", "from langchain_openai import OpenAIEmbeddings\n", "load_dotenv()\n", "import os, re\n", "from supabase import Client, create_client\n", "\n", "# Récupération et normalisation des variables d'environnement\n", "supabase_url: str = os.getenv(\"SUPABASE_URL\") or os.getenv(\"NEXT_PUBLIC_SUPABASE_URL\")\n", "raw_key = (\n", " os.getenv(\"SUPABASE_KEY\")\n", " or os.getenv(\"SUPABASE_SERVICE_ROLE_KEY\")\n", " or os.getenv(\"SUPABASE_ANON_KEY\")\n", " or os.getenv(\"NEXT_PUBLIC_SUPABASE_ANON_KEY\")\n", ")\n", "\n", "# Journalisation (masquée)\n", "print(f\"SUPABASE_URL: {supabase_url}\")\n", "print(f\"SUPABASE_KEY: {'***' + raw_key[-4:] if raw_key else 'None'}\")\n", "\n", "if not supabase_url or not raw_key:\n", " raise ValueError(\n", " \"Variables d'environnement Supabase manquantes. Définissez SUPABASE_URL et une clé (SUPABASE_SERVICE_ROLE_KEY ou SUPABASE_ANON_KEY) dans .env\"\n", " )\n", "\n", "# Nettoyage et validation du format (les clés Supabase sont des JWT)\n", "supabase_key: str = raw_key.strip().strip('\"').strip(\"'\")\n", "jwt_regex = r\"^[A-Za-z0-9-_=]+\\.[A-Za-z0-9-_=]+\\.?[A-Za-z0-9-_.+/=]*$\"\n", "if not re.match(jwt_regex, supabase_key):\n", " raise ValueError(\n", " \"SUPABASE_KEY invalide (format non-JWT). Retirez les guillemets/espaces superflus et utilisez la clé 'anon' ou 'service_role' fournie par Supabase.\"\n", " )\n", "\n", "supabase: Client = create_client(supabase_url, supabase_key)\n", "embeddings = OpenAIEmbeddings()\n", "\n", "# Initialiser le vectorstore\n", "vector_store = SupabaseVectorStore(\n", " embedding=embeddings,\n", " client=supabase,\n", " table_name=\"documents\",\n", " query_name=\"match_documents\",\n", ")\n", "\n", "print(\"✅ Vectorstore Supabase initialisé avec succès\")" ] }, { "cell_type": "code", "execution_count": null, "id": "62f3a47f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Traitement des pages 0 à 500...\n", "✓ Page 0 traitée (284 caractères)\n", "✓ Page 1 traitée (2550 caractères)\n", "✓ Page 2 traitée (2106 caractères)\n", "✓ Page 3 traitée (2609 caractères)\n", "✓ Page 4 traitée (2618 caractères)\n", "✓ Page 5 traitée (2215 caractères)\n", "✓ Page 6 traitée (2515 caractères)\n", "✓ Page 7 traitée (1807 caractères)\n", "✓ Page 8 traitée (1170 caractères)\n", "✓ Page 9 traitée (2426 caractères)\n", "✓ Page 10 traitée (2316 caractères)\n", "✓ Page 11 traitée (1029 caractères)\n", "✓ Page 12 traitée (1150 caractères)\n", "✓ Page 13 traitée (1351 caractères)\n", "✓ Page 14 traitée (1461 caractères)\n", "✓ Page 15 traitée (937 caractères)\n", "✓ Page 16 traitée (1087 caractères)\n", "✓ Page 17 traitée (897 caractères)\n", "✓ Page 18 traitée (868 caractères)\n", "✓ Page 19 traitée (791 caractères)\n", "✓ Page 20 traitée (924 caractères)\n", "✓ Page 21 traitée (1525 caractères)\n", "✓ Page 22 traitée (1686 caractères)\n", "✓ Page 23 traitée (1285 caractères)\n", "✓ Page 24 traitée (1478 caractères)\n", "✓ Page 25 traitée (1573 caractères)\n", "✓ Page 26 traitée (835 caractères)\n", "✓ Page 27 traitée (1011 caractères)\n", "✓ Page 28 traitée (975 caractères)\n", "✓ Page 29 traitée (1085 caractères)\n", "✓ Page 30 traitée (1759 caractères)\n", "✓ Page 31 traitée (1672 caractères)\n", "✓ Page 32 traitée (962 caractères)\n", "✓ Page 33 traitée (1428 caractères)\n", "✓ Page 34 traitée (1350 caractères)\n", "✓ Page 35 traitée (840 caractères)\n", "✓ Page 36 traitée (982 caractères)\n", "✓ Page 37 traitée (1084 caractères)\n", "✓ Page 38 traitée (1276 caractères)\n", "✓ Page 39 traitée (1187 caractères)\n", "✓ Page 40 traitée (1774 caractères)\n", "✓ Page 41 traitée (1211 caractères)\n", "✓ Page 42 traitée (1521 caractères)\n", "✓ Page 43 traitée (1643 caractères)\n", "✓ Page 44 traitée (2044 caractères)\n", "✓ Page 45 traitée (1727 caractères)\n", "✓ Page 46 traitée (1358 caractères)\n", "✓ Page 47 traitée (1465 caractères)\n", "✓ Page 48 traitée (758 caractères)\n", "✓ Page 49 traitée (1041 caractères)\n", "✓ Page 50 traitée (895 caractères)\n", "✓ Page 51 traitée (1298 caractères)\n", "✓ Page 52 traitée (1977 caractères)\n", "✓ Page 53 traitée (1343 caractères)\n", "✓ Page 54 traitée (1451 caractères)\n", "✓ Page 55 traitée (1481 caractères)\n", "✓ Page 56 traitée (1595 caractères)\n", "✓ Page 57 traitée (1443 caractères)\n", "✓ Page 58 traitée (1488 caractères)\n", "✓ Page 59 traitée (1506 caractères)\n", "✓ Page 60 traitée (1151 caractères)\n", "✓ Page 61 traitée (1298 caractères)\n", "✓ Page 62 traitée (1738 caractères)\n", "✓ Page 63 traitée (1676 caractères)\n", "✓ Page 64 traitée (1210 caractères)\n", "✓ Page 65 traitée (1361 caractères)\n", "✓ Page 66 traitée (1622 caractères)\n", "✓ Page 67 traitée (1420 caractères)\n", "✓ Page 68 traitée (1206 caractères)\n", "✓ Page 69 traitée (1198 caractères)\n", "✓ Page 70 traitée (1653 caractères)\n", "✓ Page 71 traitée (965 caractères)\n", "✓ Page 72 traitée (1338 caractères)\n", "✓ Page 73 traitée (1327 caractères)\n", "✓ Page 74 traitée (1368 caractères)\n", "✓ Page 75 traitée (870 caractères)\n", "✓ Page 76 traitée (1409 caractères)\n", "✓ Page 77 traitée (1900 caractères)\n", "✓ Page 78 traitée (1773 caractères)\n", "✓ Page 79 traitée (1266 caractères)\n", "✓ Page 80 traitée (1652 caractères)\n", "✓ Page 81 traitée (1501 caractères)\n", "✓ Page 82 traitée (1124 caractères)\n", "✓ Page 83 traitée (1659 caractères)\n", "✓ Page 84 traitée (1184 caractères)\n", "✓ Page 85 traitée (859 caractères)\n", "✓ Page 86 traitée (1552 caractères)\n", "✓ Page 87 traitée (1633 caractères)\n", "✓ Page 88 traitée (1810 caractères)\n", "✓ Page 89 traitée (1688 caractères)\n", "✓ Page 90 traitée (1211 caractères)\n", "✓ Page 91 traitée (1389 caractères)\n", "✓ Page 92 traitée (1300 caractères)\n", "✓ Page 93 traitée (856 caractères)\n", "✓ Page 94 traitée (1146 caractères)\n", "✓ Page 95 traitée (594 caractères)\n", "✓ Page 96 traitée (1186 caractères)\n", "✓ Page 97 traitée (1716 caractères)\n", "✓ Page 98 traitée (1558 caractères)\n", "✓ Page 99 traitée (947 caractères)\n", "✓ Page 100 traitée (1385 caractères)\n", "✓ Page 101 traitée (23644 caractères)\n", "✓ Page 102 traitée (2341 caractères)\n", "✓ Page 103 traitée (3265 caractères)\n", "✓ Page 104 traitée (1752 caractères)\n", "✓ Page 105 traitée (2802 caractères)\n", "✓ Page 106 traitée (823 caractères)\n", "✓ Page 107 traitée (1081 caractères)\n", "\n", "📊 Total: 108 documents prêts à être ajoutés au vectorstore\n", "\n", "📄 Aperçu du premier document:\n", " - Contenu: ![img-0.jpeg](img-0.jpeg)\n", "\n", "# FORMATIONS AGRICULTEURS - AGRICULTRICES SALARIÉ.E.S AGRICOLES\n", "\n", "## Se former pour gagner en performance\n", "\n", "### 2024 - 2025\n", "\n", "![img-1.jpeg](img-1.jpeg)\n", "\n", "pays-de-la-loire.chambr...\n", " - Métadonnées: {'source': 'CAPL Catalogue FORMATION 2024_2025.pdf', 'page_number': 1, 'type': 'formation'}\n" ] } ], "source": [ "from langchain_core.documents import Document\n", "from typing import List\n", "\n", "def process_page(page: dict) -> Document:\n", " \"\"\"\n", " Transforme une page OCR en document LangChain.\n", " Retourne un Document avec le contenu markdown et les métadonnées.\n", " \"\"\"\n", " idx = page.get('index')\n", " md = (page.get('markdown') or '').replace('$\\\\checkmark$', '-')\n", "\n", " metadata = {\n", " \"source\": \"CAPL Catalogue FORMATION 2024_2025.pdf\",\n", " \"page_number\": idx + 1,\n", " \"type\": \"formation\",\n", " }\n", "\n", " # Créer un document LangChain\n", " return Document(\n", " page_content=md,\n", " metadata=metadata\n", " )\n", "\n", "\n", "# Collecter tous les documents\n", "documents: List[Document] = []\n", "\n", "if not pages:\n", " print(\"Aucune page dans le JSON.\")\n", "else:\n", " print(f\"Traitement des pages {start_index} à {end_index_inclusive}...\")\n", " max_index = min(len(pages) - 1, end_index_inclusive)\n", " \n", " for page_index in range(start_index, max_index + 1):\n", " try:\n", " page = pages[page_index]\n", " doc = process_page(page)\n", " documents.append(doc)\n", " print(f\"✓ Page {page.get('index')} traitée ({len(doc.page_content)} caractères)\")\n", " except IndexError:\n", " break\n", " except Exception as e:\n", " print(f\"⚠ Erreur sur la page {page_index}: {e}\")\n", " continue\n", " \n", " print(f\"\\n📊 Total: {len(documents)} documents prêts à être ajoutés au vectorstore\")\n", "\n", "# Afficher un aperçu\n", "if documents:\n", " print(f\"\\n📄 Aperçu du premier document:\")\n", " print(f\" - Contenu: {documents[0].page_content[:200]}...\")\n", " print(f\" - Métadonnées: {documents[0].metadata}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "1eaae3a8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "🚀 Ajout de 108 documents au vectorstore Supabase...\n", "⏳ Cela peut prendre quelques instants (génération des embeddings)...\n", "\n", "✅ Succès ! 108 documents ajoutés au vectorstore\n", "📝 IDs des documents: ['f49244db-ef11-4751-908a-77e4191120e3', '3218a85a-f4cb-4822-afb0-8f83d1093e5b', 'fc0dbda7-ef5e-4c1f-8d6a-0f45adfa7af1', 'e0e84128-28af-4072-b20e-da7e4441d989', '2a080874-af15-4873-a96b-b81843acf21c']...\n" ] } ], "source": [ "# Ajouter les documents au vectorstore Supabase\n", "if documents:\n", " print(f\"🚀 Ajout de {len(documents)} documents au vectorstore Supabase...\")\n", " print(\"⏳ Cela peut prendre quelques instants (génération des embeddings)...\\n\")\n", " \n", " try:\n", " # Ajouter les documents en batch\n", " # La méthode add_documents crée automatiquement les embeddings\n", " ids = vector_store.add_documents(documents)\n", " \n", " print(f\"✅ Succès ! {len(ids)} documents ajoutés au vectorstore\")\n", " print(f\"📝 IDs des documents: {ids[:5]}...\" if len(ids) > 5 else f\"📝 IDs des documents: {ids}\")\n", " \n", " except Exception as e:\n", " print(f\"❌ Erreur lors de l'ajout des documents: {e}\")\n", " import traceback\n", " traceback.print_exc()\n", "else:\n", " print(\"⚠ Aucun document à ajouter\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "6a768f25", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "🔍 Recherche: 'formations sur le leadership'\n", "\n", "vector_store\n", "results\n", "📊 3 résultats trouvés:\n", "\n", "============================================================\n", "Résultat #1\n", "============================================================\n", "📄 Source: CAPL Catalogue FORMATION 2024_2025.pdf\n", "📖 Page: 103\n", "📝 Contenu (extrait):\n", "# CONDITIONS GÉNÉRALES \n", "\n", "## PUBLIC ET PRÉ-REQUIS\n", "\n", "Lesformationss'adressentenprioritéaux agriculteurs, agricultrices, conjoint.e.s collaborateur contributeurs du VIVEA. Elles sont ouvertes aux salarié.e.s d'exploitation agricole ou à d'autres catégories de personnes selon des modalités à définir. Lor...\n", "\n", "============================================================\n", "Résultat #2\n", "============================================================\n", "📄 Source: CAPL Catalogue FORMATION 2024_2025.pdf\n", "📖 Page: 28\n", "📝 Contenu (extrait):\n", "# RH ET ORGANISATION DU TRAVAIL \n", "\n", "## Manager, communiquer, entretenir la motivation de son.ses salarié.e.s\n", "\n", "## J'ai des relations de confiance avec mon.mes salarié.e.s et mon entreprise gagne en performance\n", "\n", "## Objectifs\n", "\n", "- Savoir déléguer, transmettre des consignes, motiver, faire adhérer son.ses s...\n", "\n", "============================================================\n", "Résultat #3\n", "============================================================\n", "📄 Source: CAPL Catalogue FORMATION 2024_2025.pdf\n", "📖 Page: 70\n", "📝 Contenu (extrait):\n", "# TOUS ÉLEVAGES \n", "\n", "## S'initier à l'éducation et au dressage de son chien de conduite de troupeau\n", "\n", "Je fais équipe avec mon chien et j'améliore mes conditions de travail\n", "\n", "## Objectifs\n", "\n", "- Avoir des repères pour comprendre le fonctionnement du troupeau\n", "- Connaître les bases du fonctionnement d'un chien ...\n", "\n" ] } ], "source": [ "# 🔍 Test de recherche dans le vectorstore\n", "query = \"formations sur le leadership\"\n", "\n", "print(f\"🔍 Recherche: '{query}'\\n\")\n", "\n", "vector_store = SupabaseVectorStore(\n", " embedding=embeddings,\n", " client=supabase,\n", " table_name=\"documents\",\n", " query_name=\"match_documents\",\n", ")\n", "print(\"vector_store\")\n", "# Effectuer une recherche de similarité\n", "results = vector_store.similarity_search(\n", " query,\n", " k=3 # Nombre de résultats à retourner\n", ")\n", "print(\"results\")\n", "\n", "print(f\"📊 {len(results)} résultats trouvés:\\n\")\n", "\n", "for i, doc in enumerate(results, 1):\n", " print(f\"{'='*60}\")\n", " print(f\"Résultat #{i}\")\n", " print(f\"{'='*60}\")\n", " print(f\"📄 Source: {doc.metadata.get('source', 'N/A')}\")\n", " print(f\"📖 Page: {doc.metadata.get('page_number', 'N/A')}\")\n", " print(f\"📝 Contenu (extrait):\")\n", " print(doc.page_content[:300] + \"...\" if len(doc.page_content) > 300 else doc.page_content)\n", " print()\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 5 }