{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "06bb4dd3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from dotenv import load_dotenv\n",
    "load_dotenv()  # load environment variables from a .env file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4ff743f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os, sys, json\n",
    "from datetime import datetime\n",
    "from pathlib import Path\n",
    "\n",
    "# Ensure project root on path (notebook is under 'knowledge/')\n",
    "project_root = Path('..').resolve()\n",
    "if str(project_root) not in sys.path:\n",
    "    sys.path.append(str(project_root))\n",
    "\n",
    "from services import mistral_service\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b20b23e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Locate the catalog PDF\n",
    "catalog_filename = 'CAPL Catalogue SERVICE 2024_2025.pdf'\n",
    "# Candidates, in priority order: notebook dir (knowledge/docs), then project root\n",
    "candidate_paths = [\n",
    "    Path('docs') / catalog_filename,\n",
    "    Path('knowledge') / 'docs' / catalog_filename,\n",
    "]\n",
    "catalog_path = next((p for p in candidate_paths if p.exists()), None)\n",
    "if catalog_path is None:\n",
    "    raise FileNotFoundError(f\"Catalog not found at {candidate_paths[0]} or {candidate_paths[1]}\")\n",
    "\n",
    "print(\"Resolved catalog path:\", catalog_path.resolve())\n",
    "\n",
    "# Read PDF bytes\n",
    "pdf_bytes = catalog_path.read_bytes()\n",
    "print(\"PDF size (bytes):\", len(pdf_bytes))\n",
    "\n",
    "# Upload and build document source for OCR/completion (REST-based)\n",
    "source_or_url = mistral_service.upload_pdf(pdf_bytes, catalog_path.name)\n",
    "# A data URI and a signed URL are wrapped the same way, so build the source once\n",
    "document_source = mistral_service.build_document_url(source_or_url)\n",
    "if source_or_url.startswith(\"data:\"):\n",
    "    print(\"Using data URI as document source (length):\", len(source_or_url))\n",
    "else:\n",
    "    # Show only a prefix: the query string of a signed URL carries the access signature\n",
    "    print(\"Signed URL prefix:\", str(source_or_url)[:60] + \"...\")\n",
    "# Do not print document_source itself: it embeds the full URL, which would be saved in the notebook output\n",
    "print(\"Document source type:\", document_source.get('type'))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8f4c33c3",
   "metadata": {},
   "source": [
    "## OCR Processing avec Mistral\n",
    "\n",
    "Utilisation de l'OCR de Mistral pour extraire le contenu structuré du PDF.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b29f622d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Exécution de l'OCR avec Mistral...\n",
      "Résultat OCR sauvegardé dans prestations_ocr_result.json\n",
      "Nombre de pages extraites : 52\n"
     ]
    }
   ],
   "source": [
    "# Path where the raw OCR result (the full structure returned by Mistral) is cached\n",
    "ocr_json_path = Path('prestations_ocr_result.json')\n",
    "\n",
    "# Reuse the cached OCR result when present to avoid reprocessing the PDF\n",
    "if ocr_json_path.exists():\n",
    "    print(f\"OCR déjà effectué. Chargement depuis {ocr_json_path}\")\n",
    "    with open(ocr_json_path, 'r', encoding='utf-8') as f:\n",
    "        ocr_result = json.load(f)\n",
    "else:\n",
    "    print(\"Exécution de l'OCR avec Mistral...\")\n",
    "    # Call Mistral OCR\n",
    "    ocr_result = mistral_service.process_ocr(\n",
    "        document_source=document_source,\n",
    "        include_image_base64=False  # base64 images are not needed for the markdown\n",
    "    )\n",
    "\n",
    "    # Save the full structure returned by Mistral\n",
    "    with open(ocr_json_path, 'w', encoding='utf-8') as f:\n",
    "        json.dump(ocr_result, f, ensure_ascii=False, indent=2)\n",
    "    print(f\"Résultat OCR sauvegardé dans {ocr_json_path}\")\n",
    "\n",
    "# Show some information about the result\n",
    "num_pages = len(ocr_result.get('pages', []))\n",
    "print(f\"Nombre de pages extraites : {num_pages}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "460f43d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the OCR result to Markdown\n",
    "markdown_content = mistral_service.ocr_response_to_markdown(ocr_result)\n",
    "\n",
    "# Processing date = modification time of the cached OCR file, formatted for humans\n",
    "# (the raw st_mtime is a float of seconds since the epoch, unreadable in the header)\n",
    "if ocr_json_path.exists():\n",
    "    processed_at = datetime.fromtimestamp(ocr_json_path.stat().st_mtime).strftime('%Y-%m-%d %H:%M:%S')\n",
    "else:\n",
    "    processed_at = 'N/A'\n",
    "\n",
    "# Prepend a header with metadata\n",
    "header = f\"\"\"# {catalog_filename}\n",
    "\n",
    "**Document traité par OCR Mistral**  \n",
    "**Nombre de pages :** {num_pages}  \n",
    "**Date de traitement :** {processed_at}\n",
    "\n",
    "---\n",
    "\n",
    "\"\"\"\n",
    "\n",
    "full_markdown = header + markdown_content\n",
    "\n",
    "# Save to a Markdown file\n",
    "md_output_path = Path('prestations_catalog_ocr.md')\n",
    "with open(md_output_path, 'w', encoding='utf-8') as f:\n",
    "    f.write(full_markdown)\n",
    "\n",
    "print(f\"Contenu Markdown sauvegardé dans {md_output_path}\")\n",
    "print(f\"Taille du markdown : {len(full_markdown)} caractères\")\n",
    "print(f\"\\nPremiers 500 caractères :\\n{full_markdown[:500]}...\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "54464636",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "import json\n",
    "\n",
    "# Load the cached OCR result\n",
    "OCR_JSON_PATH = Path('prestations_ocr_result.json')\n",
    "if not OCR_JSON_PATH.exists():\n",
    "    alt = Path('knowledge') / 'prestations_ocr_result.json'\n",
    "    if alt.exists():\n",
    "        OCR_JSON_PATH = alt\n",
    "    else:\n",
    "        raise FileNotFoundError(f\"Fichier introuvable: {OCR_JSON_PATH} (ou {alt})\")\n",
    "\n",
    "with open(OCR_JSON_PATH, 'r', encoding='utf-8') as f:\n",
    "    ocr_result = json.load(f)\n",
    "\n",
    "pages = ocr_result.get('pages', [])\n",
    "# Window of page positions to ingest (0-based, upper bound inclusive)\n",
    "start_index = 0\n",
    "end_index_inclusive = 500"
   ]
}, { "cell_type": "code", "execution_count": 2, "id": "44e3e953", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\cd\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "SUPABASE_URL: https://cfsbhpvvynazdxjgidxm.supabase.co\n", "SUPABASE_KEY: ***dLwc\n", "✅ Vectorstore Supabase initialisé avec succès\n" ] } ], "source": [ "from dotenv import load_dotenv\n", "from langchain_community.vectorstores import SupabaseVectorStore\n", "from langchain_openai import OpenAIEmbeddings\n", "load_dotenv()\n", "import os, re\n", "from supabase import Client, create_client\n", "\n", "# Récupération et normalisation des variables d'environnement\n", "supabase_url: str = os.getenv(\"SUPABASE_URL\") or os.getenv(\"NEXT_PUBLIC_SUPABASE_URL\")\n", "raw_key = (\n", " os.getenv(\"SUPABASE_KEY\")\n", " or os.getenv(\"SUPABASE_SERVICE_ROLE_KEY\")\n", " or os.getenv(\"SUPABASE_ANON_KEY\")\n", " or os.getenv(\"NEXT_PUBLIC_SUPABASE_ANON_KEY\")\n", ")\n", "\n", "# Journalisation (masquée)\n", "print(f\"SUPABASE_URL: {supabase_url}\")\n", "print(f\"SUPABASE_KEY: {'***' + raw_key[-4:] if raw_key else 'None'}\")\n", "\n", "if not supabase_url or not raw_key:\n", " raise ValueError(\n", " \"Variables d'environnement Supabase manquantes. 
Définissez SUPABASE_URL et une clé (SUPABASE_SERVICE_ROLE_KEY ou SUPABASE_ANON_KEY) dans .env\"\n", " )\n", "\n", "# Nettoyage et validation du format (les clés Supabase sont des JWT)\n", "supabase_key: str = raw_key.strip().strip('\"').strip(\"'\")\n", "jwt_regex = r\"^[A-Za-z0-9-_=]+\\.[A-Za-z0-9-_=]+\\.?[A-Za-z0-9-_.+/=]*$\"\n", "if not re.match(jwt_regex, supabase_key):\n", " raise ValueError(\n", " \"SUPABASE_KEY invalide (format non-JWT). Retirez les guillemets/espaces superflus et utilisez la clé 'anon' ou 'service_role' fournie par Supabase.\"\n", " )\n", "\n", "supabase: Client = create_client(supabase_url, supabase_key)\n", "embeddings = OpenAIEmbeddings()\n", "\n", "# Initialiser le vectorstore\n", "vector_store = SupabaseVectorStore(\n", " embedding=embeddings,\n", " client=supabase,\n", " table_name=\"documents\",\n", " query_name=\"match_documents\",\n", ")\n", "\n", "print(\"✅ Vectorstore Supabase initialisé avec succès\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "62f3a47f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Traitement des pages 0 à 500...\n", "✓ Page 0 traitée (115 caractères)\n", "✓ Page 1 traitée (833 caractères)\n", "✓ Page 2 traitée (1821 caractères)\n", "✓ Page 3 traitée (597 caractères)\n", "✓ Page 4 traitée (1348 caractères)\n", "✓ Page 5 traitée (1326 caractères)\n", "✓ Page 6 traitée (1540 caractères)\n", "✓ Page 7 traitée (1468 caractères)\n", "✓ Page 8 traitée (1514 caractères)\n", "✓ Page 9 traitée (1234 caractères)\n", "✓ Page 10 traitée (1286 caractères)\n", "✓ Page 11 traitée (1159 caractères)\n", "✓ Page 12 traitée (1871 caractères)\n", "✓ Page 13 traitée (1674 caractères)\n", "✓ Page 14 traitée (1340 caractères)\n", "✓ Page 15 traitée (1500 caractères)\n", "✓ Page 16 traitée (1393 caractères)\n", "✓ Page 17 traitée (1406 caractères)\n", "✓ Page 18 traitée (978 caractères)\n", "✓ Page 19 traitée (1085 caractères)\n", "✓ Page 20 traitée (1195 caractères)\n", "✓ Page 21 
traitée (1539 caractères)\n", "✓ Page 22 traitée (917 caractères)\n", "✓ Page 23 traitée (1282 caractères)\n", "✓ Page 24 traitée (1299 caractères)\n", "✓ Page 25 traitée (1083 caractères)\n", "✓ Page 26 traitée (1659 caractères)\n", "✓ Page 27 traitée (1226 caractères)\n", "✓ Page 28 traitée (933 caractères)\n", "✓ Page 29 traitée (1604 caractères)\n", "✓ Page 30 traitée (1048 caractères)\n", "✓ Page 31 traitée (985 caractères)\n", "✓ Page 32 traitée (1409 caractères)\n", "✓ Page 33 traitée (1486 caractères)\n", "✓ Page 34 traitée (898 caractères)\n", "✓ Page 35 traitée (1394 caractères)\n", "✓ Page 36 traitée (1473 caractères)\n", "✓ Page 37 traitée (1311 caractères)\n", "✓ Page 38 traitée (898 caractères)\n", "✓ Page 39 traitée (1440 caractères)\n", "✓ Page 40 traitée (1799 caractères)\n", "✓ Page 41 traitée (1250 caractères)\n", "✓ Page 42 traitée (1470 caractères)\n", "✓ Page 43 traitée (1226 caractères)\n", "✓ Page 44 traitée (595 caractères)\n", "✓ Page 45 traitée (1187 caractères)\n", "✓ Page 46 traitée (1252 caractères)\n", "✓ Page 47 traitée (513 caractères)\n", "✓ Page 48 traitée (936 caractères)\n", "✓ Page 49 traitée (1735 caractères)\n", "✓ Page 50 traitée (2136 caractères)\n", "✓ Page 51 traitée (681 caractères)\n", "\n", "📊 Total: 52 documents prêts à être ajoutés au vectorstore\n", "\n", "📄 Aperçu du premier document:\n", " - Contenu: # PROAGRI \n", "\n", "POUR VOUS. AUJOURD'HUI. 
ET DEMAIN\n", "![img-0.jpeg](img-0.jpeg)\n", "\n", "## Des\n", "\n", "solutions pour réussir vos projets...\n", " - Métadonnées: {'source': 'CAPL Catalogue SERVICE 2024_2025.pdf', 'page_number': 1, 'type': 'prestation'}\n" ] } ], "source": [ "from langchain_core.documents import Document\n", "from typing import List\n", "\n", "def process_page(page: dict) -> Document:\n", " \"\"\"\n", " Transforme une page OCR en document LangChain.\n", " Retourne un Document avec le contenu markdown et les métadonnées.\n", " \"\"\"\n", " idx = page.get('index')\n", " md = (page.get('markdown') or '').replace('$\\\\checkmark$', '-')\n", "\n", " metadata = {\n", " \"source\": \"CAPL Catalogue SERVICE 2024_2025.pdf\",\n", " \"page_number\": idx + 1,\n", " \"type\": \"prestation\",\n", " }\n", "\n", " # Créer un document LangChain\n", " return Document(\n", " page_content=md,\n", " metadata=metadata\n", " )\n", "\n", "\n", "# Collecter tous les documents\n", "documents: List[Document] = []\n", "\n", "if not pages:\n", " print(\"Aucune page dans le JSON.\")\n", "else:\n", " print(f\"Traitement des pages {start_index} à {end_index_inclusive}...\")\n", " max_index = min(len(pages) - 1, end_index_inclusive)\n", " \n", " for page_index in range(start_index, max_index + 1):\n", " try:\n", " page = pages[page_index]\n", " doc = process_page(page)\n", " # print(doc)\n", " documents.append(doc)\n", " print(f\"✓ Page {page.get('index')} traitée ({len(doc.page_content)} caractères)\")\n", " except IndexError:\n", " break\n", " except Exception as e:\n", " print(f\"⚠ Erreur sur la page {page_index}: {e}\")\n", " continue\n", " \n", " print(f\"\\n📊 Total: {len(documents)} documents prêts à être ajoutés au vectorstore\")\n", "\n", "# Afficher un aperçu\n", "if documents:\n", " print(f\"\\n📄 Aperçu du premier document:\")\n", " print(f\" - Contenu: {documents[0].page_content[:200]}...\")\n", " print(f\" - Métadonnées: {documents[0].metadata}\")\n" ] }, { "cell_type": "code", "execution_count": 4, 
"id": "1eaae3a8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "🚀 Ajout de 52 documents au vectorstore Supabase...\n", "⏳ Cela peut prendre quelques instants (génération des embeddings)...\n", "\n", "✅ Succès ! 52 documents ajoutés au vectorstore\n", "📝 IDs des documents: ['7e712a4f-5370-40b6-b68d-72068a003c3e', 'ebf26ff1-d79d-4f99-bfc6-618efa3f22f4', 'b30869ae-dafb-4e12-a1e9-d4254cb44d47', 'c3ab18eb-6579-4695-b44b-5d8c45e1d2cc', '2d76fe7f-797f-4c64-b902-869beeb0ec96']...\n" ] } ], "source": [ "# Ajouter les documents au vectorstore Supabase\n", "if documents:\n", " print(f\"🚀 Ajout de {len(documents)} documents au vectorstore Supabase...\")\n", " print(\"⏳ Cela peut prendre quelques instants (génération des embeddings)...\\n\")\n", " \n", " try:\n", " # Ajouter les documents en batch\n", " # La méthode add_documents crée automatiquement les embeddings\n", " ids = vector_store.add_documents(documents)\n", " \n", " print(f\"✅ Succès ! {len(ids)} documents ajoutés au vectorstore\")\n", " print(f\"📝 IDs des documents: {ids[:5]}...\" if len(ids) > 5 else f\"📝 IDs des documents: {ids}\")\n", " \n", " except Exception as e:\n", " print(f\"❌ Erreur lors de l'ajout des documents: {e}\")\n", " import traceback\n", " traceback.print_exc()\n", "else:\n", " print(\"⚠ Aucun document à ajouter\")\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "6a768f25", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "🔍 Recherche: 'formations sur le leadership'\n", "\n", "vector_store\n", "results\n", "📊 3 résultats trouvés:\n", "\n", "============================================================\n", "Résultat #1\n", "============================================================\n", "📄 Source: CAPL Catalogue FORMATION 2024_2025.pdf\n", "📖 Page: 103\n", "📝 Contenu (extrait):\n", "# CONDITIONS GÉNÉRALES \n", "\n", "## PUBLIC ET PRÉ-REQUIS\n", "\n", "Lesformationss'adressentenprioritéaux agriculteurs, 
agricultrices, conjoint.e.s collaborateur contributeurs du VIVEA. Elles sont ouvertes aux salarié.e.s d'exploitation agricole ou à d'autres catégories de personnes selon des modalités à définir. Lor...\n", "\n", "============================================================\n", "Résultat #2\n", "============================================================\n", "📄 Source: CAPL Catalogue FORMATION 2024_2025.pdf\n", "📖 Page: 28\n", "📝 Contenu (extrait):\n", "# RH ET ORGANISATION DU TRAVAIL \n", "\n", "## Manager, communiquer, entretenir la motivation de son.ses salarié.e.s\n", "\n", "## J'ai des relations de confiance avec mon.mes salarié.e.s et mon entreprise gagne en performance\n", "\n", "## Objectifs\n", "\n", "- Savoir déléguer, transmettre des consignes, motiver, faire adhérer son.ses s...\n", "\n", "============================================================\n", "Résultat #3\n", "============================================================\n", "📄 Source: CAPL Catalogue FORMATION 2024_2025.pdf\n", "📖 Page: 70\n", "📝 Contenu (extrait):\n", "# TOUS ÉLEVAGES \n", "\n", "## S'initier à l'éducation et au dressage de son chien de conduite de troupeau\n", "\n", "Je fais équipe avec mon chien et j'améliore mes conditions de travail\n", "\n", "## Objectifs\n", "\n", "- Avoir des repères pour comprendre le fonctionnement du troupeau\n", "- Connaître les bases du fonctionnement d'un chien ...\n", "\n" ] } ], "source": [ "# 🔍 Test de recherche dans le vectorstore\n", "query = \"formations sur le leadership\"\n", "\n", "print(f\"🔍 Recherche: '{query}'\\n\")\n", "\n", "vector_store = SupabaseVectorStore(\n", " embedding=embeddings,\n", " client=supabase,\n", " table_name=\"documents\",\n", " query_name=\"match_documents\",\n", ")\n", "print(\"vector_store\")\n", "# Effectuer une recherche de similarité\n", "results = vector_store.similarity_search(\n", " query,\n", " k=3 # Nombre de résultats à retourner\n", ")\n", "print(\"results\")\n", "\n", "print(f\"📊 {len(results)} 
résultats trouvés:\\n\")\n", "\n", "for i, doc in enumerate(results, 1):\n", " print(f\"{'='*60}\")\n", " print(f\"Résultat #{i}\")\n", " print(f\"{'='*60}\")\n", " print(f\"📄 Source: {doc.metadata.get('source', 'N/A')}\")\n", " print(f\"📖 Page: {doc.metadata.get('page_number', 'N/A')}\")\n", " print(f\"📝 Contenu (extrait):\")\n", " print(doc.page_content[:300] + \"...\" if len(doc.page_content) > 300 else doc.page_content)\n", " print()\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 5 }