Cyril Dupland
Implement PDF summary generation tool: add a new tool for generating project-aware PDF summaries from conversations, enhancing the summarization capabilities of the agent. Update retrieval tools for improved metadata handling and streamline the agent's tool integration. Refactor PDF generation logic to handle link sanitization and improve error handling during PDF creation.
2337dfb
raw
history blame
3.26 kB
"""PDF generation from Markdown using markdown-pdf."""
import logging
import re
from io import BytesIO
from pathlib import Path
from typing import Optional
from markdown_pdf import MarkdownPdf, Section
# Directory two levels above this file; relative asset paths (e.g. the logo)
# and section roots are resolved against it.
_PROJECT_ROOT = Path(__file__).resolve().parents[1]
# markdown_pdf (via pymupdf) resolves every link target at build time. A link
# with an empty href or a dangling "#anchor" raises
#     RuntimeError: No destination with id=...
# The LLM sometimes produces such placeholders (e.g. `[contact]()`,
# `[lien](#)`, `[foo](mailto:)`). We strip them before building the PDF.
#
# Every pattern captures the link text as group 1 and accepts an optional
# leading "!" so that image syntax (`![alt](src)`) is consumed as a whole;
# otherwise the substitution would leave a stray "!" in front of the alt text.
# Whitespace between "(" and the target is tolerated.
_EMPTY_HREF_RE = re.compile(r"!?\[([^\]]*)\]\(\s*\)")
_HASH_HREF_RE = re.compile(r"!?\[([^\]]*)\]\(\s*#[^)]*\)")
_EMPTY_SCHEME_HREF_RE = re.compile(
    r"!?\[([^\]]*)\]\(\s*(?:mailto|tel|https?|ftp):\s*\)",
    re.IGNORECASE,
)
_ANY_LINK_RE = re.compile(r"!?\[([^\]]*)\]\([^)]*\)")


def _sanitize_markdown_for_pdf(md: str) -> str:
    """Remove link patterns that confuse pymupdf's anchor resolver.

    Inline links (and images) whose target is empty, starts with ``#``, or is
    a bare scheme such as ``mailto:`` are replaced by their text. Links with
    any other target are left untouched.
    """
    for pattern in (_EMPTY_HREF_RE, _HASH_HREF_RE, _EMPTY_SCHEME_HREF_RE):
        md = pattern.sub(r"\1", md)
    return md


def _strip_all_links(md: str) -> str:
    """Last-resort fallback: turn every `[text](url)` into plain `text`.

    Images are reduced to their alt text as well. The substitution is repeated
    until the text stops changing, so nested constructs such as a linked image
    ``[![alt](img)](url)`` are unwrapped completely instead of being left as a
    half-stripped ``![alt](url)``.
    """
    previous = None
    # Each pass only removes characters, so the loop always terminates.
    while previous != md:
        previous = md
        md = _ANY_LINK_RE.sub(r"\1", md)
    return md
def _build_pdf(md_text: str, toc_level: int, logo_path: Optional[str]) -> BytesIO:
    """Render Markdown into an in-memory PDF, optionally preceded by a logo.

    When ``logo_path`` is set and, resolved against ``_PROJECT_ROOT``, names
    an existing file, a section containing only that image is added first
    (created with ``toc=False`` and CSS that centres it). The Markdown body
    is then added as its own section with reduced font sizes.

    Returns:
        A ``BytesIO`` holding the saved document, rewound to position 0.
    """
    document = MarkdownPdf(toc_level=toc_level, optimize=True)
    root_dir = str(_PROJECT_ROOT)

    # Skip the logo section entirely when the configured file is missing.
    if logo_path and (_PROJECT_ROOT / logo_path).is_file():
        header = Section(f"![Logo]({logo_path})", toc=False)
        header.root = root_dir
        document.add_section(
            header,
            user_css="img { display: block; margin-left: auto; margin-right: auto; }",
        )

    body = Section(md_text)
    body.root = root_dir
    document.add_section(
        body,
        user_css=(
            "body, p, li, ul, ol, td, th { font-size: 9pt; } "
            "h1 { font-size: 14pt; } h2 { font-size: 12pt; } h3 { font-size: 10pt; }"
        ),
    )

    output = BytesIO()
    document.save_bytes(output)
    output.seek(0)
    return output
def markdown_to_pdf(
    md_text: str,
    toc_level: int = 2,
    logo_path: Optional[str] = None,
) -> BytesIO:
    """Convert Markdown to PDF using markdown-pdf (PyMuPDF + markdown-it-py).

    Supports UTF-8, tables, links, images, and TOC from headings.
    Optionally adds a dedicated logo section at the top (centered) if
    logo_path is set.

    Links with empty or dangling hrefs are stripped to avoid pymupdf errors; if
    the build still fails, every link is converted to plain text as a fallback.

    Args:
        md_text: Markdown source to render.
        toc_level: Forwarded to the PDF builder as its ``toc_level``.
        logo_path: Logo image path, resolved against the project root. When
            ``None``, ``settings.pdf_logo_path`` is used if the settings
            module can be loaded.

    Returns:
        A ``BytesIO`` containing the generated PDF.

    Raises:
        RuntimeError: If the build fails and there are no links left to strip,
            or if the link-free retry fails as well.
    """
    if logo_path is None:
        try:
            from config.settings import settings
            logo_path = getattr(settings, "pdf_logo_path", None)
        except Exception:
            # Best effort: a missing or broken settings module must not
            # prevent PDF generation; simply render without a logo.
            logo_path = None

    sanitized = _sanitize_markdown_for_pdf(md_text)
    try:
        return _build_pdf(sanitized, toc_level, logo_path)
    except RuntimeError as exc:
        link_free = _strip_all_links(sanitized)
        if link_free == sanitized:
            # Nothing to strip: rebuilding identical input would presumably
            # just repeat the failure, so surface the original error instead.
            raise
        # Use a module-named logger: the module-level logging.warning() would
        # implicitly call basicConfig() on an unconfigured root logger.
        logging.getLogger(__name__).warning(
            "markdown_to_pdf: retrying without links after pymupdf error: %s", exc
        )
        return _build_pdf(link_free, toc_level, logo_path)