Cyril Dupland
Implement PDF summary generation tool: add a new tool for generating project-aware PDF summaries from conversations, enhancing the summarization capabilities of the agent. Update retrieval tools for improved metadata handling and streamline the agent's tool integration. Refactor PDF generation logic to handle link sanitization and improve error handling during PDF creation.
2337dfb | """PDF generation from Markdown using markdown-pdf.""" | |
| import logging | |
| import re | |
| from io import BytesIO | |
| from pathlib import Path | |
| from typing import Optional | |
| from markdown_pdf import MarkdownPdf, Section | |
# Directory two levels above this module's resolved location; relative asset
# paths (e.g. the logo) are resolved against it.
_PROJECT_ROOT = Path(__file__).resolve().parent.parent

# markdown_pdf (through pymupdf) resolves every link target while the PDF is
# being built. An empty href or a dangling "#anchor" makes it raise
#     RuntimeError: No destination with id=...
# LLM-written Markdown occasionally contains such placeholders, for example
# `[contact]()`, `[lien](#)` or `[foo](mailto:)`, so they are unwrapped to
# their visible text before building. Group 1 of every pattern below is the
# link text.

# `[text]()` -- nothing (or only whitespace) between the parentheses.
_EMPTY_HREF_RE = re.compile(r"\[([^\]]*)\]\(\s*\)")

# `[text](#...)` -- any in-document anchor, including a bare `#`.
_HASH_HREF_RE = re.compile(r"\[([^\]]*)\]\(#[^)]*\)")

# `[text](scheme:)` -- a known scheme with nothing after the colon.
_EMPTY_SCHEME_HREF_RE = re.compile(
    r"\[([^\]]*)\]\((?:mailto|tel|https?|ftp):\s*\)", flags=re.IGNORECASE
)

# `[text](anything)` -- every inline link; used by the last-resort fallback.
_ANY_LINK_RE = re.compile(r"\[([^\]]*)\]\([^)]*\)")
def _sanitize_markdown_for_pdf(md: str) -> str:
    """Unwrap links whose targets trip up pymupdf's anchor resolver.

    Links with an empty href, a ``#...`` href, or a bare scheme such as
    ``mailto:`` are replaced by their link text. All other Markdown is
    returned unchanged.
    """
    cleaned = md
    # Applied in this fixed order: empty href, hash href, empty-scheme href.
    for pattern in (_EMPTY_HREF_RE, _HASH_HREF_RE, _EMPTY_SCHEME_HREF_RE):
        cleaned = pattern.sub(r"\1", cleaned)
    return cleaned
def _strip_all_links(md: str) -> str:
    """Last-resort fallback: turn every `[text](url)` into plain `text`.

    Image syntax (`![alt](src)`) is left untouched. ``_ANY_LINK_RE`` also
    matches the `[alt](src)` tail of an image, and unwrapping it would leave
    a stray ``!alt`` in the output instead of the image.
    """

    def _unwrap(match: "re.Match[str]") -> str:
        start = match.start()
        # A "!" directly before the "[" marks an image, not a link.
        if start > 0 and match.string[start - 1] == "!":
            return match.group(0)
        return match.group(1)

    return _ANY_LINK_RE.sub(_unwrap, md)
def _build_pdf(md_text: str, toc_level: int, logo_path: Optional[str]) -> BytesIO:
    """Render Markdown into an in-memory PDF, optionally preceded by a logo.

    Args:
        md_text: Markdown source of the main content section.
        toc_level: Forwarded to ``MarkdownPdf`` as ``toc_level``.
        logo_path: Path of a logo image, resolved against ``_PROJECT_ROOT``.
            The logo section is added only when the resolved path is an
            existing file; otherwise it is skipped silently.

    Returns:
        A ``BytesIO`` containing the PDF, rewound to position 0.
    """
    pdf = MarkdownPdf(toc_level=toc_level, optimize=True)

    logo_file = _PROJECT_ROOT / logo_path if logo_path else None
    if logo_file and logo_file.is_file():
        # Reference the image by the path given relative to the section root
        # set below. as_posix() keeps forward slashes so that backslashes are
        # not read as Markdown escapes.
        # NOTE(review): assumes markdown_pdf resolves image sources against
        # Section.root, and that "Logo" is an acceptable alt text -- confirm.
        logo_md = f"![Logo]({Path(logo_path).as_posix()})"
        logo_css = "img { display: block; margin-left: auto; margin-right: auto; }"
        # toc=False keeps the logo section out of the table of contents.
        logo_section = Section(logo_md, toc=False)
        logo_section.root = str(_PROJECT_ROOT)
        pdf.add_section(logo_section, user_css=logo_css)

    content_css = (
        "body, p, li, ul, ol, td, th { font-size: 9pt; } "
        "h1 { font-size: 14pt; } h2 { font-size: 12pt; } h3 { font-size: 10pt; }"
    )
    content_section = Section(md_text)
    content_section.root = str(_PROJECT_ROOT)
    pdf.add_section(content_section, user_css=content_css)

    buffer = BytesIO()
    pdf.save_bytes(buffer)
    # Rewind so callers can read the PDF from the start.
    buffer.seek(0)
    return buffer
def markdown_to_pdf(
    md_text: str,
    toc_level: int = 2,
    logo_path: Optional[str] = None,
) -> BytesIO:
    """Convert Markdown to PDF using markdown-pdf (PyMuPDF + markdown-it-py).

    Supports UTF-8, tables, links, images, and TOC from headings.
    Optionally adds a dedicated logo section at the top (centered) if
    ``logo_path`` is set.

    Links with empty or dangling hrefs are stripped to avoid pymupdf errors.
    If the build still fails with ``RuntimeError``, it is retried once with
    the output of ``_strip_all_links``.

    Args:
        md_text: Markdown source to render.
        toc_level: Heading depth forwarded to the PDF builder.
        logo_path: Logo image path. When ``None``, ``settings.pdf_logo_path``
            from ``config.settings`` is used if it can be loaded.

    Returns:
        A ``BytesIO`` containing the generated PDF.

    Raises:
        RuntimeError: If the build fails and stripping links changes nothing,
            or if the retry without links fails as well.
    """
    if logo_path is None:
        try:
            from config.settings import settings

            logo_path = getattr(settings, "pdf_logo_path", None)
        except Exception:
            # Best effort: the logo is optional, so a missing or broken
            # settings module must not prevent PDF generation.
            logo_path = None

    sanitized = _sanitize_markdown_for_pdf(md_text)
    try:
        return _build_pdf(sanitized, toc_level, logo_path)
    except RuntimeError as exc:
        without_links = _strip_all_links(sanitized)
        if without_links == sanitized:
            # Nothing to strip: a retry would rebuild the same input and fail
            # the same way, so surface the original error immediately.
            raise
        # Use a module-named logger: the module-level logging.warning() logs
        # on the root logger and implicitly calls basicConfig() when no
        # handlers are configured.
        logging.getLogger(__name__).warning(
            "markdown_to_pdf: retrying without links after pymupdf error: %s", exc
        )
        return _build_pdf(without_links, toc_level, logo_path)