# Service-catalogue extraction notebook (CAPL catalogue) -- environment and import bootstrap.
#
# Steps implemented by the rest of the notebook:
#   1. load and upload the catalogue PDF
#   2. OCR with Mistral
#   3. conversion of the OCR result to Markdown
#   4. batched structured extraction driven by a JSON Schema
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (presumably the API credentials read by `services.mistral_service` -- TODO confirm).
load_dotenv()

import os, sys, json
from pathlib import Path

# Make the project root importable so that `services.*` resolves.
# The notebook normally runs from 'knowledge/', so the parent directory is tried
# first (this is what the original `Path('..')` did). If the kernel was started
# from the project root instead, the current directory is the one that contains
# the `services` package, so it is accepted as well. When neither candidate
# contains `services`, fall back to the parent directory as before.
_cwd = Path.cwd().resolve()
project_root = next(
    (candidate for candidate in (_cwd.parent, _cwd) if (candidate / 'services').is_dir()),
    _cwd.parent,
)
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from services import mistral_service
from services.batch_extractor_service import BatchExtractor, BatchConfig, load_ocr_result
# --- 1. Locate the catalogue PDF and upload it (only when OCR is still needed) ---

catalog_filename = 'CAPL Catalogue SERVICE 2024_2025.pdf'

# Primary location: relative to the notebook directory (knowledge/docs).
catalog_path = Path('docs') / catalog_filename
if not catalog_path.exists():
    # Fallback: the kernel was started from the project root.
    alt_path = Path('knowledge') / 'docs' / catalog_filename
    if alt_path.exists():
        catalog_path = alt_path
    else:
        raise FileNotFoundError(f"Catalog not found at {catalog_path} or {alt_path}")

print("Resolved catalog path:", catalog_path.resolve())

# Read the PDF bytes.
pdf_bytes = catalog_path.read_bytes()
print("PDF size (bytes):", len(pdf_bytes))

# The raw OCR result is cached in this file. The cache is checked once, up
# front, so that the upload below and the OCR step further down agree on it.
ocr_json_path = Path('prestations_ocr_result.json')
ocr_is_cached = ocr_json_path.exists()


def _redact_url(url):
    """Return a printable form of `url` that carries no secret.

    Signed URLs keep their scheme/host/path but lose the query string (which
    holds the signature); data URIs are cut to a short prefix.
    """
    text = str(url)
    if text.startswith("data:"):
        return text[:40] + "..."
    base, has_query, _ = text.partition("?")
    return (base + "?REDACTED") if has_query else base


if ocr_is_cached:
    # The document source is only consumed by the OCR call; with a cached OCR
    # result there is nothing to upload.
    document_source = None
    print(f"OCR cache found at {ocr_json_path}; skipping the PDF upload.")
else:
    # Upload and build the document source for OCR (REST-based).
    source_or_url = mistral_service.upload_pdf(pdf_bytes, catalog_path.name)
    if source_or_url.startswith("data:"):
        print("Using data URI as document source (length):", len(source_or_url))
    else:
        print("Signed URL prefix:", str(source_or_url)[:60] + "...")
    document_source = mistral_service.build_document_url(source_or_url)

    # Never print the document source verbatim: a signed URL embeds its access
    # signature, and notebook outputs are saved with the file.
    if isinstance(document_source, dict):
        print({
            key: (_redact_url(value) if isinstance(value, str) else value)
            for key, value in document_source.items()
        })
    else:
        print("Document source type:", type(document_source).__name__)


# --- 2. OCR processing with Mistral -------------------------------------------
# Mistral OCR extracts the structured content of the PDF; the result is reused
# from disk when available to avoid re-processing.

if ocr_is_cached:
    print(f"OCR déjà effectué. Chargement depuis {ocr_json_path}")
    with open(ocr_json_path, 'r', encoding='utf-8') as f:
        ocr_result = json.load(f)
else:
    print("Exécution de l'OCR avec Mistral...")
    ocr_result = mistral_service.process_ocr(
        document_source=document_source,
        include_image_base64=False
    )

    # Save the full structure. Write to a temporary file and rename it, so an
    # interrupted dump can never leave a truncated file that the next run would
    # mistake for a valid cache.
    tmp_path = ocr_json_path.with_name(ocr_json_path.name + '.tmp')
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(ocr_result, f, ensure_ascii=False, indent=2)
    tmp_path.replace(ocr_json_path)
    print(f"Résultat OCR sauvegardé dans {ocr_json_path}")

# Show a few facts about the result.
num_pages = len(ocr_result.get('pages', []))
print(f"Nombre de pages extraites : {num_pages}")
# --- 3. OCR result -> Markdown --------------------------------------------------
from datetime import datetime

# Convert the OCR result to Markdown.
markdown_content = mistral_service.ocr_response_to_markdown(ocr_result)

# Processing date = modification time of the OCR cache file, rendered as a
# readable local timestamp (the raw st_mtime is an epoch float such as
# 1768924910.856373, which is meaningless to a reader of the document).
if ocr_json_path.exists():
    processed_at = datetime.fromtimestamp(ocr_json_path.stat().st_mtime).isoformat(
        sep=' ', timespec='seconds'
    )
else:
    processed_at = 'N/A'

# Metadata header. The two spaces before "\n" are Markdown hard line breaks;
# they are spelled out explicitly because trailing whitespace inside a
# triple-quoted string is invisible and easily stripped by editors.
header = (
    f"# {catalog_filename}\n"
    "\n"
    "**Document traité par OCR Mistral**  \n"
    f"**Nombre de pages :** {num_pages}  \n"
    f"**Date de traitement :** {processed_at}\n"
    "\n"
    "---\n"
    "\n"
)

full_markdown = header + markdown_content

# Save to a Markdown file.
md_output_path = Path('prestations_catalog_ocr.md')
with open(md_output_path, 'w', encoding='utf-8') as f:
    f.write(full_markdown)

print(f"Contenu Markdown sauvegardé dans {md_output_path}")
print(f"Taille du markdown : {len(full_markdown)} caractères")
print(f"\nPremiers 500 caractères :\n{full_markdown[:500]}...")
# --- 4. Structured extraction with a JSON Schema --------------------------------
# Prompt and schema handed to the BatchExtractor service to extract the
# prestations (services) in structured form.

SYSTEM_PROMPT = (
    "You are an expert data extractor. From the provided French service catalog (Catalogue des prestations), "
    "extract a clean list of services (prestations). For each prestation, extract the following fields "
    "when available: title (titre/nom), content (contenu/description), domain (domaine), "
    "target_audience (cible), associated_tools (outils associés), label, service (service proposant), "
    "and contact. "
    "Note: Not all fields will be available for every prestation - use null for missing data. "
    "Keep French text as-is. Return only strict JSON following the schema."
)

# Optional text fields of a prestation: JSON key -> schema description.
# Insertion order is the order in which the fields are listed below.
_NULLABLE_FIELDS = {
    "content": "Contenu / Description de la prestation",
    "domain": "Domaine de la prestation",
    "target_audience": "Cible / Public visé",
    "associated_tools": "Outils associés",
    "label": "Label de la prestation",
    "service": "Service proposant la prestation",
    "contact": "Contact pour la prestation",
}

# Schema of one prestation. Only the title is mandatory; every other field is
# a string-or-null.
_PRESTATION_SCHEMA = {
    "type": "object",
    "properties": {
        "title": {
            "type": "string",
            "description": "Titre / Nom de la prestation"
        },
        **{
            # A fresh dict/list per field so no two properties share state.
            field: {"type": ["string", "null"], "description": description}
            for field, description in _NULLABLE_FIELDS.items()
        },
    },
    "required": ["title"],
    # The schema is declared strict and the root object is closed; close the
    # item object too so undeclared keys cannot appear inside a prestation.
    "additionalProperties": False,
}

JSON_SCHEMA = {
    "name": "CatalogPrestations",
    "schema_definition": {
        "title": "CatalogPrestations",
        "type": "object",
        "properties": {
            "prestations": {
                "type": "array",
                "items": _PRESTATION_SCHEMA
            }
        },
        "required": ["prestations"],
        "additionalProperties": False
    },
    "description": "Schema for extracting service catalog data",
    "strict": True
}

print("✅ Prompts et schéma JSON définis")
print("\n📋 Champs à extraire:")
for field_name in _PRESTATION_SCHEMA['properties']:
    print(f" - {field_name}")
# --- 4 (continued). Batched extraction ------------------------------------------

# Load the OCR result from disk.
ocr_json_path = Path('prestations_ocr_result.json')
ocr_data = load_ocr_result(ocr_json_path)
print(f"📂 Chargement de {ocr_json_path}")

# Tunable settings.
# To process only a range of pages, set start_page and end_page.
# Page numbers are 1-indexed (page 1 = first page of the PDF). Examples:
#   - pages 1 to 20:  start_page=1,  end_page=20
#   - pages 50 to 80: start_page=50, end_page=80
#   - all pages:      start_page=None, end_page=None (the default)
config = BatchConfig(
    start_page=6,       # first page (1-indexed), or None for the beginning
    end_page=44,        # last page (1-indexed, inclusive), or None for the end
    batch_size=10,      # pages per batch (lower it if timeouts persist)
    pause_seconds=2.0,  # seconds to wait between batches
    max_retries=3,      # maximum attempts per batch
    retry_delay=5.0,    # seconds to wait between attempts
)

# Build the extractor from the reusable service.
extractor = BatchExtractor(
    ocr_result=ocr_data,
    system_prompt=SYSTEM_PROMPT,
    json_schema=JSON_SCHEMA,
    items_key="prestations",  # key of the item list in the JSON response
    user_prompt="Extrait les prestations de service de ces pages du catalogue.",
    config=config,
)

# Run the extraction and save the result.
output_path = Path('prestations_2425.json')
result = extractor.extract_to_file(output_path, items_key="prestations")

# Preview of the results.
print("\n📝 Aperçu des 3 premières prestations:")
first_three = result.items[:3]
for i, prestation in enumerate(first_three, start=1):
    title = prestation.get('title', 'Sans titre')
    print(f" {i}. {title}")
# --- 5. Exploring the results and exporting them to CSV -------------------------
import csv
from collections import Counter

# Load the extraction output.
with open('prestations_2425.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# `or` also covers a key that is present but null in the JSON file.
prestations = data.get('prestations') or []
stats = data.get('stats') or {}

print("📊 Statistiques d'extraction:")
for key, value in stats.items():
    print(f" - {key}: {value}")

print(f"\n📋 Nombre total de prestations: {len(prestations)}")

# Count prestations per domain; a missing or empty domain is grouped under
# 'Non spécifié'.
domains = Counter(p.get('domain') or 'Non spécifié' for p in prestations)

print("\n🏷️ Répartition par domaine:")
# most_common() orders by descending count and keeps first-seen order among
# equal counts, the same ordering as a stable sort on the negated count.
for domain, count in domains.most_common():
    print(f" - {domain}: {count}")


# CSV export for analysis in Excel.
csv_output_path = Path('prestations_2425.csv')

# JSON field -> CSV column name. This mapping is the single source of truth for
# both the set of exported columns and their order.
COLUMN_MAPPING = {
    'title': 'Titre',
    'content': 'Contenu',
    'domain': 'Domaine',
    'target_audience': 'Cible',
    'associated_tools': 'Outils associés',
    'label': 'label',
    'service': 'service',
    'contact': 'contact'
}

# Derived rather than retyped, so a column added to the mapping cannot be
# silently dropped from the file.
fieldnames = list(COLUMN_MAPPING.values())

# utf-8-sig writes a BOM and ';' is used as the delimiter (choices aimed at
# opening the file in Excel, per the export's stated purpose).
with open(csv_output_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=';', extrasaction='ignore')
    writer.writeheader()

    for prestation in prestations:
        # Map the JSON keys to the column names; missing or null values
        # become empty cells.
        row = {}
        for json_key, column in COLUMN_MAPPING.items():
            value = prestation.get(json_key)
            row[column] = '' if value is None else value
        writer.writerow(row)

print(f"✅ Export CSV sauvegardé dans: {csv_output_path}")
print(f" {len(prestations)} lignes exportées")