Spaces:

ChambreAgriculturePaysLoire
/

routeur_ia_api

Running

Cyril Dupland

Implement PDF summary generation tool: add a new tool for generating project-aware PDF summaries from conversations, enhancing the summarization capabilities of the agent. Update retrieval tools for improved metadata handling and streamline the agent's tool integration. Refactor PDF generation logic to handle link sanitization and improve error handling during PDF creation.

2337dfb about 2 months ago

raw

history blame

3.26 kB

	"""PDF generation from Markdown using markdown-pdf."""
	import logging
	import re
	from io import BytesIO
	from pathlib import Path
	from typing import Optional

	from markdown_pdf import MarkdownPdf, Section

	_PROJECT_ROOT = Path(__file__).resolve().parent.parent

	# markdown_pdf (via pymupdf) resolves every link target at build time. A link
	# with an empty href or a dangling "#anchor" raises
	# RuntimeError: No destination with id=...
	# The LLM sometimes produces such placeholders (e.g. `[contact]()`,
	# `[lien](#)`, `[foo](mailto:)`). We strip them before building the PDF.
	# Every pattern captures the link text in group 1, so substituting r"\1"
	# keeps the visible text and drops the target.
	# `[text]()` or `[text](   )`: empty or whitespace-only href.
	_EMPTY_HREF_RE = re.compile(r"\[([^\]]*)\]\(\s*\)")
	# `[text](#)` or `[text](#anchor)`: in-document anchor href.
	_HASH_HREF_RE = re.compile(r"\[([^\]]*)\]\(#[^)]*\)")
	# `[text](mailto:)`, `[text](https: )`: a known scheme with nothing after it.
	_EMPTY_SCHEME_HREF_RE = re.compile(
	    r"\[([^\]]*)\]\((?:mailto|tel|https?|ftp):\s*\)",
	    re.IGNORECASE,
	)
	# Any inline `[text](target)` link, whatever the target is.
	_ANY_LINK_RE = re.compile(r"\[([^\]]*)\]\([^)]*\)")


	def _sanitize_markdown_for_pdf(md: str) -> str:
	    """Replace placeholder links in *md* with their link text.

	    Applies the empty-href, hash-href and empty-scheme patterns in that
	    order; each match is replaced by its first capture group.
	    """
	    cleaned = md
	    for pattern in (_EMPTY_HREF_RE, _HASH_HREF_RE, _EMPTY_SCHEME_HREF_RE):
	        cleaned = pattern.sub(r"\1", cleaned)
	    return cleaned


	def _strip_all_links(md: str) -> str:
	    """Fallback of last resort: reduce each `[text](url)` in *md* to `text`."""
	    without_links = _ANY_LINK_RE.sub(r"\1", md)
	    return without_links


	def _build_pdf(md_text: str, toc_level: int, logo_path: Optional[str]) -> BytesIO:
	    """Render *md_text* to a PDF held in memory.

	    When *logo_path* is set and names an existing file under the project
	    root, a separate section containing only the centered logo image is
	    added before the content and kept out of the table of contents.

	    Returns a BytesIO positioned at offset 0.
	    """
	    root_dir = str(_PROJECT_ROOT)
	    document = MarkdownPdf(toc_level=toc_level, optimize=True)

	    if logo_path:
	        # The image reference uses logo_path as given; the file check is
	        # made against the project root.
	        if (_PROJECT_ROOT / logo_path).is_file():
	            header = Section(f"![Logo]({logo_path})", toc=False)
	            header.root = root_dir
	            document.add_section(
	                header,
	                user_css="img { display: block; margin-left: auto; margin-right: auto; }",
	            )

	    body = Section(md_text)
	    body.root = root_dir
	    body_css = (
	        "body, p, li, ul, ol, td, th { font-size: 9pt; } "
	        "h1 { font-size: 14pt; } h2 { font-size: 12pt; } h3 { font-size: 10pt; }"
	    )
	    document.add_section(body, user_css=body_css)

	    out = BytesIO()
	    document.save_bytes(out)
	    out.seek(0)  # rewind so the caller reads from the start
	    return out


	def markdown_to_pdf(
	    md_text: str,
	    toc_level: int = 2,
	    logo_path: Optional[str] = None,
	) -> BytesIO:
	    """Render Markdown text to an in-memory PDF via markdown-pdf.

	    Args:
	        md_text: Markdown source to render.
	        toc_level: Passed through to MarkdownPdf as its ``toc_level``.
	        logo_path: Logo image path relative to the project root. When
	            None, ``settings.pdf_logo_path`` from ``config.settings`` is
	            used if it can be loaded; otherwise no logo is added.

	    Returns:
	        A BytesIO holding the PDF, positioned at the start.

	    Placeholder links are stripped before the first build attempt. If that
	    attempt raises RuntimeError, a warning is logged and the build is
	    retried once with every link reduced to its text; an error from the
	    retry propagates to the caller.
	    """
	    if logo_path is None:
	        # Best-effort lookup: any failure loading settings means "no logo".
	        try:
	            from config.settings import settings
	            logo_path = getattr(settings, "pdf_logo_path", None)
	        except Exception:
	            logo_path = None

	    cleaned = _sanitize_markdown_for_pdf(md_text)
	    try:
	        pdf_stream = _build_pdf(cleaned, toc_level, logo_path)
	    except RuntimeError as exc:
	        logging.warning(
	            "markdown_to_pdf: retrying without links after pymupdf error: %s", exc
	        )
	        pdf_stream = _build_pdf(_strip_all_links(cleaned), toc_level, logo_path)
	    return pdf_stream