{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extraction des Prestations de Service - Catalogue CAPL\n",
"\n",
"Ce notebook extrait les données structurées des prestations de service depuis le catalogue PDF.\n",
"\n",
"**Étapes :**\n",
"1. Chargement et upload du PDF\n",
"2. OCR avec Mistral\n",
"3. Conversion en Markdown\n",
"4. Extraction structurée par lots avec JSON Schema\n",
"5. Exploration des résultats et export CSV\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load environment variables from a local .env file, if one is found.\n",
"# NOTE(review): presumably this provides the credentials used by mistral_service - confirm.\n",
"from dotenv import load_dotenv\n",
"load_dotenv()\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import os, sys, json\n",
"from collections import Counter\n",
"from datetime import datetime\n",
"from pathlib import Path\n",
"\n",
"# Make the project root importable (the notebook is expected to live under 'knowledge/')\n",
"project_root = Path('..').resolve()\n",
"root_entry = str(project_root)\n",
"if root_entry not in sys.path:\n",
"    sys.path.append(root_entry)\n",
"\n",
"from services import mistral_service\n",
"from services.batch_extractor_service import BatchExtractor, BatchConfig, load_ocr_result\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Chargement et upload du PDF\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Resolved catalog path: C:\\Users\\cd\\Documents\\CAPL\\ROUTEUR\\dev\\routeur_ia_api\\knowledge\\docs\\CAPL Catalogue SERVICE 2024_2025.pdf\n",
"PDF size (bytes): 31633158\n",
"Signed URL prefix: https://mistralaifilesapiprodswe.blob.core.windows.net/fine-...\n",
"{'type': 'document_url', 'document_url': 'https://mistralaifilesapiprodswe.blob.core.windows.net/fine-tune/[REDACTED]'}\n"
]
}
],
"source": [
"# Locate the catalog PDF\n",
"catalog_filename = 'CAPL Catalogue SERVICE 2024_2025.pdf'\n",
"\n",
"# Candidate locations, in priority order:\n",
"#   1. relative to the notebook directory (knowledge/docs)\n",
"#   2. relative to the project root, in case the kernel was started there\n",
"candidate_paths = [\n",
"    Path('docs') / catalog_filename,\n",
"    Path('knowledge') / 'docs' / catalog_filename,\n",
"]\n",
"catalog_path = next((path for path in candidate_paths if path.exists()), None)\n",
"if catalog_path is None:\n",
"    raise FileNotFoundError(f\"Catalog not found at {candidate_paths[0]} or {candidate_paths[1]}\")\n",
"\n",
"print(\"Resolved catalog path:\", catalog_path.resolve())\n",
"\n",
"# Read PDF bytes\n",
"pdf_bytes = catalog_path.read_bytes()\n",
"print(\"PDF size (bytes):\", len(pdf_bytes))\n",
"\n",
"# Upload the PDF, then wrap the returned reference into a document source for OCR.\n",
"# The upload result is handled as a string: either a data URI or a (signed) URL.\n",
"source_or_url = mistral_service.upload_pdf(pdf_bytes, catalog_path.name)\n",
"document_source = mistral_service.build_document_url(source_or_url)\n",
"\n",
"if source_or_url.startswith(\"data:\"):\n",
"    print(\"Using data URI as document source (length):\", len(source_or_url))\n",
"else:\n",
"    # Show only a short prefix: the full signed URL carries an access signature\n",
"    print(\"Signed URL prefix:\", str(source_or_url)[:60] + \"...\")\n",
"\n",
"# Never print document_source itself: it embeds the complete signed URL, which would then\n",
"# be stored in the notebook's saved outputs. Report only what kind of source it is.\n",
"source_kind = document_source.get(\"type\") if isinstance(document_source, dict) else type(document_source).__name__\n",
"print(\"Document source type:\", source_kind)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. OCR Processing avec Mistral\n",
"\n",
"Utilisation de l'OCR de Mistral pour extraire le contenu structuré du PDF.\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Exécution de l'OCR avec Mistral...\n",
"Résultat OCR sauvegardé dans prestations_ocr_result.json\n",
"Nombre de pages extraites : 52\n"
]
}
],
"source": [
"# Path of the cached raw OCR result\n",
"ocr_json_path = Path('prestations_ocr_result.json')\n",
"\n",
"# Reuse a previous OCR run when its result is already on disk\n",
"if ocr_json_path.exists():\n",
"    print(f\"OCR déjà effectué. Chargement depuis {ocr_json_path}\")\n",
"    with open(ocr_json_path, 'r', encoding='utf-8') as f:\n",
"        ocr_result = json.load(f)\n",
"else:\n",
"    print(\"Exécution de l'OCR avec Mistral...\")\n",
"    ocr_result = mistral_service.process_ocr(\n",
"        document_source=document_source,\n",
"        include_image_base64=False\n",
"    )\n",
"\n",
"    # Serialize completely in memory before touching the disk: if serialization raises,\n",
"    # no truncated cache file is left behind to be loaded as a valid cache on the next run.\n",
"    ocr_payload = json.dumps(ocr_result, ensure_ascii=False, indent=2)\n",
"    ocr_json_path.write_text(ocr_payload, encoding='utf-8')\n",
"    print(f\"Résultat OCR sauvegardé dans {ocr_json_path}\")\n",
"\n",
"# Basic information about the result\n",
"num_pages = len(ocr_result.get('pages', []))\n",
"print(f\"Nombre de pages extraites : {num_pages}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Conversion OCR → Markdown\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Contenu Markdown sauvegardé dans prestations_catalog_ocr.md\n",
"Taille du markdown : 70778 caractères\n",
"\n",
"Premiers 500 caractères :\n",
"# CAPL Catalogue SERVICE 2024_2025.pdf\n",
"\n",
"**Document traité par OCR Mistral** \n",
"**Nombre de pages :** 52 \n",
"**Date de traitement :** 1768924910.856373\n",
"\n",
"---\n",
"\n",
"\n",
"\n",
"---\n",
"\n",
"# Page 1\n",
"\n",
"# PROZGRI \n",
"\n",
"POUR VOUS. AUJOURD'HUI. ET DEMAIN\n",
"\n",
"\n",
"OFFRE DE SERVICES AUX AGRICULTEURS\n",
"\n",
"## Des
solutions
pour réussir vos projets\n",
"\n",
"\n",
"---\n",
"\n",
"# Page 2\n",
"\n",
"# Le réseau Chambre d'agriculture \n",
"\n",
"des Pays de la Loire\n",
"\n",
"## Les Chambres d'agriculture\n",
"\n",
"$1^{\\text {ER }}$ RÉSEAU de conseil agricole et territorial ce...\n"
]
}
],
"source": [
"# Convert the OCR result to Markdown\n",
"markdown_content = mistral_service.ocr_response_to_markdown(ocr_result)\n",
"\n",
"# Processing date = modification time of the OCR cache file, rendered as a readable\n",
"# local timestamp (the raw st_mtime value is a float of epoch seconds)\n",
"if ocr_json_path.exists():\n",
"    processing_date = datetime.fromtimestamp(ocr_json_path.stat().st_mtime).strftime('%Y-%m-%d %H:%M:%S')\n",
"else:\n",
"    processing_date = 'N/A'\n",
"\n",
"# Metadata header prepended to the OCR markdown.\n",
"# The metadata lines end with two spaces, which Markdown renders as a hard line break.\n",
"header = f\"\"\"# {catalog_filename}\n",
"\n",
"**Document traité par OCR Mistral**  \n",
"**Nombre de pages :** {num_pages}  \n",
"**Date de traitement :** {processing_date}\n",
"\n",
"---\n",
"\n",
"\"\"\"\n",
"\n",
"full_markdown = header + markdown_content\n",
"\n",
"# Save to a Markdown file\n",
"md_output_path = Path('prestations_catalog_ocr.md')\n",
"md_output_path.write_text(full_markdown, encoding='utf-8')\n",
"\n",
"print(f\"Contenu Markdown sauvegardé dans {md_output_path}\")\n",
"print(f\"Taille du markdown : {len(full_markdown)} caractères\")\n",
"print(f\"\\nPremiers 500 caractères :\\n{full_markdown[:500]}...\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Extraction structurée avec JSON Schema\n",
"\n",
"Utilisation du service BatchExtractor pour extraire les prestations de manière structurée.\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ Prompts et schéma JSON définis\n",
"\n",
"📋 Champs à extraire:\n",
" - title\n",
" - content\n",
" - domain\n",
" - target_audience\n",
" - associated_tools\n",
" - label\n",
" - service\n",
" - contact\n"
]
}
],
"source": [
"# =====================================================\n",
"# PROMPT AND SCHEMA CONFIGURATION FOR PRESTATIONS\n",
"# =====================================================\n",
"\n",
"# Instructions for the extraction model: which fields to pull out of the catalog\n",
"SYSTEM_PROMPT = (\n",
"    \"You are an expert data extractor. From the provided French service catalog (Catalogue des prestations), \"\n",
"    \"extract a clean list of services (prestations). For each prestation, extract the following fields \"\n",
"    \"when available: title (titre/nom), content (contenu/description), domain (domaine), \"\n",
"    \"target_audience (cible), associated_tools (outils associés), label, service (service proposant), \"\n",
"    \"and contact. \"\n",
"    \"Note: Not all fields will be available for every prestation - use null for missing data. \"\n",
"    \"Keep French text as-is. Return only strict JSON following the schema.\"\n",
")\n",
"\n",
"\n",
"def _nullable_text(description):\n",
"    \"\"\"Return the schema fragment for an optional field (string or null).\"\"\"\n",
"    return {\"type\": [\"string\", \"null\"], \"description\": description}\n",
"\n",
"\n",
"# Properties of one prestation; only the title is a plain (non-nullable) string\n",
"PRESTATION_PROPERTIES = {\n",
"    \"title\": {\"type\": \"string\", \"description\": \"Titre / Nom de la prestation\"},\n",
"    \"content\": _nullable_text(\"Contenu / Description de la prestation\"),\n",
"    \"domain\": _nullable_text(\"Domaine de la prestation\"),\n",
"    \"target_audience\": _nullable_text(\"Cible / Public visé\"),\n",
"    \"associated_tools\": _nullable_text(\"Outils associés\"),\n",
"    \"label\": _nullable_text(\"Label de la prestation\"),\n",
"    \"service\": _nullable_text(\"Service proposant la prestation\"),\n",
"    \"contact\": _nullable_text(\"Contact pour la prestation\"),\n",
"}\n",
"\n",
"# Top-level schema: an object holding the list of prestations\n",
"JSON_SCHEMA = {\n",
"    \"name\": \"CatalogPrestations\",\n",
"    \"schema_definition\": {\n",
"        \"title\": \"CatalogPrestations\",\n",
"        \"type\": \"object\",\n",
"        \"properties\": {\n",
"            \"prestations\": {\n",
"                \"type\": \"array\",\n",
"                \"items\": {\n",
"                    \"type\": \"object\",\n",
"                    \"properties\": PRESTATION_PROPERTIES,\n",
"                    \"required\": [\"title\"],\n",
"                },\n",
"            },\n",
"        },\n",
"        \"required\": [\"prestations\"],\n",
"        \"additionalProperties\": False,\n",
"    },\n",
"    \"description\": \"Schema for extracting service catalog data\",\n",
"    \"strict\": True,\n",
"}\n",
"\n",
"print(\"✅ Prompts et schéma JSON définis\")\n",
"print(\"\\n📋 Champs à extraire:\")\n",
"for field_name in PRESTATION_PROPERTIES:\n",
"    print(f\" - {field_name}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"📂 Chargement de prestations_ocr_result.json\n",
"📄 Pages à traiter: 6 à 44 (39 pages sur 52 total)\n",
"🔄 Traitement en 4 batchs de 10 pages max...\n",
"⏱️ Pause de 2.0s entre chaque batch\n",
"\n",
"📦 Batch 1/4 - Pages 6 à 15... ✅ 65 éléments extraits\n",
"📦 Batch 2/4 - Pages 16 à 25... ✅ 85 éléments extraits\n",
"📦 Batch 3/4 - Pages 26 à 35... ✅ 32 éléments extraits\n",
"📦 Batch 4/4 - Pages 36 à 44... ✅ 39 éléments extraits\n",
"\n",
"============================================================\n",
"📊 RÉSULTAT FINAL\n",
"============================================================\n",
"✅ Total éléments extraits: 221\n",
"\n",
"💾 Résultat sauvegardé dans: prestations_2425.json\n",
"\n",
"📝 Aperçu des 3 premières prestations:\n",
" 1. Point accueil installation (PAI)\n",
" 2. Trouver son site d'exploitation\n",
" 3. Renforcer ses compétences de chef d'entreprise\n"
]
}
],
"source": [
"# =====================================================\n",
"# BATCH EXTRACTION\n",
"# =====================================================\n",
"\n",
"# Key under which the extracted items are listed in the JSON response\n",
"ITEMS_KEY = \"prestations\"\n",
"\n",
"# Load the OCR result\n",
"ocr_json_path = Path('prestations_ocr_result.json')\n",
"ocr_data = load_ocr_result(ocr_json_path)\n",
"print(f\"📂 Chargement de {ocr_json_path}\")\n",
"\n",
"# Tunable configuration.\n",
"# To process only a page range, set start_page and end_page.\n",
"# Page numbers are 1-indexed (page 1 = first page of the PDF). Examples:\n",
"#   - pages 1 to 20:  start_page=1, end_page=20\n",
"#   - pages 50 to 80: start_page=50, end_page=80\n",
"#   - all pages:      start_page=None, end_page=None (default)\n",
"config = BatchConfig(\n",
"    batch_size=10,      # pages per batch (lower it if timeouts persist)\n",
"    pause_seconds=2.0,  # pause between batches, in seconds\n",
"    max_retries=3,      # maximum attempts per batch\n",
"    retry_delay=5.0,    # delay between attempts, in seconds\n",
"    start_page=6,       # first page (1-indexed), or None to start at the beginning\n",
"    end_page=44,        # last page (1-indexed, inclusive), or None to go to the end\n",
")\n",
"\n",
"# Build the extractor on top of the reusable service\n",
"extractor = BatchExtractor(\n",
"    ocr_result=ocr_data,\n",
"    system_prompt=SYSTEM_PROMPT,\n",
"    json_schema=JSON_SCHEMA,\n",
"    items_key=ITEMS_KEY,\n",
"    user_prompt=\"Extrait les prestations de service de ces pages du catalogue.\",\n",
"    config=config\n",
")\n",
"\n",
"# Run the extraction and save the result\n",
"output_path = Path('prestations_2425.json')\n",
"result = extractor.extract_to_file(output_path, items_key=ITEMS_KEY)\n",
"\n",
"# Preview of the first results\n",
"print(f\"\\n📝 Aperçu des 3 premières prestations:\")\n",
"for rank, item in enumerate(result.items[:3], start=1):\n",
"    print(f\" {rank}. {item.get('title', 'Sans titre')}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Exploration des résultats\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"📊 Statistiques d'extraction:\n",
" - total_pages_in_document: 52\n",
" - pages_processed: 39\n",
" - page_range: 6-44\n",
" - total_batches: 4\n",
" - successful_batches: 4\n",
" - failed_batches: 0\n",
" - total_items: 221\n",
"\n",
"📋 Nombre total de prestations: 221\n",
"\n",
"🏷️ Répartition par domaine:\n",
" - Environnement et réglementation: 10\n",
" - Installation: 8\n",
" - Élevage et traite: 8\n",
" - PRODUCTIONS ANIMALES: 8\n",
" - Transformation et qualité: 7\n",
" - Numérique et productions végétales: 7\n",
" - ÉNERGIE: 7\n",
" - Bâtiments et infrastructures: 6\n",
" - Productions végétales: 6\n",
" - Productions végétales (Viticulture): 6\n",
" - Climat: 6\n",
" - AGRONOMIE: 6\n",
" - Transmission: 5\n",
" - Démarches administratives et réglementaires: 5\n",
" - Numérique et productions animales: 5\n",
" - ÉCONOMIE CIRCULAIRE: 5\n",
" - FORMATION: 5\n",
" - Relations humaines, Association: 4\n",
" - Réglementation et environnement: 4\n",
" - Échanges parcellaires: 4\n",
" - Arbre et biodiversité: 4\n",
" - DIFFUSION DE CONNAISSANCES: 4\n",
" - Transmission, Association: 3\n",
" - Démarches administratives et réglementaires, Élevage: 3\n",
" - Management/Ressources humaines: 3\n",
" - Commercialisation: 3\n",
" - Commercialisation et aménagement: 3\n",
" - Export et international: 3\n",
" - Énergie et environnement: 3\n",
" - Conseil stratégique phytosanitaire: 3\n",
" - Maraîchage: 3\n",
" - Irrigation: 3\n",
" - Projets et stratégie d'entreprise: 2\n",
" - Organisation du travail: 2\n",
" - Management/Ressources humaines, Formation: 2\n",
" - Agriculture biologique, Conversion: 2\n",
" - Commercialisation et stratégie: 2\n",
" - Financement et aides: 2\n",
" - Communication commerciale: 2\n",
" - Élevage et performance économique: 2\n",
" - Élevage et fourrages: 2\n",
" - Élevage et performance: 2\n",
" - Environnement et certification: 2\n",
" - Non spécifié: 2\n",
" - AGROÉCOLOGIE: 2\n",
" - Installation, Formation: 1\n",
" - Installation, Transmission: 1\n",
" - Agriculture biologique, Installation: 1\n",
" - Démarches administratives et réglementaires, Transformation: 1\n",
" - Environnement: 1\n",
" - Environnement, Formation: 1\n",
" - Laboratoire, Productions végétales: 1\n",
" - Laboratoire, Productions animales: 1\n",
" - Laboratoire, Microbiologie: 1\n",
" - Projets et stratégie d'entreprise, Solidarité: 1\n",
" - Agriculture biologique: 1\n",
" - Agriculture biologique, Bâtiment: 1\n",
" - Agriculture biologique, Productions végétales: 1\n",
" - Agriculture biologique, Viticulture: 1\n",
" - Agriculture biologique, Maraîchage: 1\n",
" - Agriculture biologique, Élevage: 1\n",
" - Agriculture biologique, Élevage, Santé animale: 1\n",
" - Agriculture biologique, Diversification, Transformation: 1\n",
" - Agriculture biologique, Diversification, Commercialisation: 1\n",
" - Agriculture biologique, Diversification, Tourisme: 1\n",
" - Agriculture biologique, Formation, Conseil: 1\n",
" - Agriculture biologique, Installation, Projets: 1\n",
" - Agriculture biologique, Conversion, Formation: 1\n",
" - Agriculture biologique, Conseil technique: 1\n",
" - Agriculture biologique, Stratégie d'entreprise: 1\n",
" - Agriculture biologique, Réglementation: 1\n",
" - Installation et développement d'activité: 1\n",
" - Réseaux et commercialisation: 1\n",
" - Accompagnement et formation: 1\n",
" - Environnement et climat: 1\n",
" - Élevage et santé animale: 1\n",
" - Environnement et agronomie: 1\n",
" - Environnement et financement: 1\n",
" - Numérique et traçabilité: 1\n",
" - Numérique et commercialisation: 1\n"
]
}
],
"source": [
"# Load the saved extraction output and explore it\n",
"with open('prestations_2425.json', 'r', encoding='utf-8') as f:\n",
"    data = json.load(f)\n",
"\n",
"prestations = data.get('prestations', [])\n",
"stats = data.get('stats', {})\n",
"\n",
"print(f\"📊 Statistiques d'extraction:\")\n",
"for key, value in stats.items():\n",
"    print(f\" - {key}: {value}\")\n",
"\n",
"print(f\"\\n📋 Nombre total de prestations: {len(prestations)}\")\n",
"\n",
"# Count prestations per domain; a missing or empty domain is grouped under 'Non spécifié'\n",
"domains = Counter(p.get('domain') or 'Non spécifié' for p in prestations)\n",
"\n",
"print(f\"\\n🏷️ Répartition par domaine:\")\n",
"# most_common() lists domains by descending count; equal counts keep first-seen order\n",
"for domain, count in domains.most_common():\n",
"    print(f\" - {domain}: {count}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ Export CSV sauvegardé dans: prestations_2425.csv\n",
" 221 lignes exportées\n"
]
}
],
"source": [
"# CSV export for analysis in Excel (the csv module is imported in the notebook's import cell)\n",
"csv_output_path = Path('prestations_2425.csv')\n",
"\n",
"# JSON field name to CSV column header; insertion order defines the column order\n",
"COLUMN_MAPPING = {\n",
"    'title': 'Titre',\n",
"    'content': 'Contenu',\n",
"    'domain': 'Domaine',\n",
"    'target_audience': 'Cible',\n",
"    'associated_tools': 'Outils associés',\n",
"    'label': 'label',\n",
"    'service': 'service',\n",
"    'contact': 'contact'\n",
"}\n",
"\n",
"# Derive the header from the mapping so the two lists cannot drift apart\n",
"fieldnames = list(COLUMN_MAPPING.values())\n",
"\n",
"# Written as UTF-8 with BOM and ';' as delimiter\n",
"# NOTE(review): presumably chosen so Excel opens the file correctly - confirm\n",
"with open(csv_output_path, 'w', newline='', encoding='utf-8-sig') as csvfile:\n",
"    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=';', extrasaction='ignore')\n",
"    writer.writeheader()\n",
"\n",
"    for prestation in prestations:\n",
"        # Keep only mapped fields, rename them to their CSV header, write None as an empty cell;\n",
"        # fields absent from a prestation are left to DictWriter, which fills them with ''\n",
"        row = {\n",
"            COLUMN_MAPPING[k]: (v if v is not None else '')\n",
"            for k, v in prestation.items()\n",
"            if k in COLUMN_MAPPING\n",
"        }\n",
"        writer.writerow(row)\n",
"\n",
"print(f\"✅ Export CSV sauvegardé dans: {csv_output_path}\")\n",
"print(f\" {len(prestations)} lignes exportées\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}