"""PDF generation from Markdown using markdown-pdf.""" import logging import re from io import BytesIO from pathlib import Path from typing import Optional from markdown_pdf import MarkdownPdf, Section _PROJECT_ROOT = Path(__file__).resolve().parent.parent # markdown_pdf (via pymupdf) resolves every link target at build time. A link # with an empty href or a dangling "#anchor" raises # RuntimeError: No destination with id=... # The LLM sometimes produces such placeholders (e.g. `[contact]()`, # `[lien](#)`, `[foo](mailto:)`). We strip them before building the PDF. _EMPTY_HREF_RE = re.compile(r"\[([^\]]*)\]\(\s*\)") _HASH_HREF_RE = re.compile(r"\[([^\]]*)\]\(#[^)]*\)") _EMPTY_SCHEME_HREF_RE = re.compile( r"\[([^\]]*)\]\((?:mailto|tel|https?|ftp):\s*\)", re.IGNORECASE, ) _ANY_LINK_RE = re.compile(r"\[([^\]]*)\]\([^)]*\)") def _sanitize_markdown_for_pdf(md: str) -> str: """Remove link patterns that confuse pymupdf's anchor resolver.""" md = _EMPTY_HREF_RE.sub(r"\1", md) md = _HASH_HREF_RE.sub(r"\1", md) md = _EMPTY_SCHEME_HREF_RE.sub(r"\1", md) return md def _strip_all_links(md: str) -> str: """Last-resort fallback: turn every `[text](url)` into plain `text`.""" return _ANY_LINK_RE.sub(r"\1", md) def _build_pdf(md_text: str, toc_level: int, logo_path: Optional[str]) -> BytesIO: pdf = MarkdownPdf(toc_level=toc_level, optimize=True) logo_file = _PROJECT_ROOT / logo_path if logo_path else None if logo_file and logo_file.is_file(): logo_md = f"![Logo]({logo_path})" logo_css = "img { display: block; margin-left: auto; margin-right: auto; }" logo_section = Section(logo_md, toc=False) logo_section.root = str(_PROJECT_ROOT) pdf.add_section(logo_section, user_css=logo_css) content_css = ( "body, p, li, ul, ol, td, th { font-size: 9pt; } " "h1 { font-size: 14pt; } h2 { font-size: 12pt; } h3 { font-size: 10pt; }" ) content_section = Section(md_text) content_section.root = str(_PROJECT_ROOT) pdf.add_section(content_section, user_css=content_css) buffer = BytesIO() pdf.save_bytes(buffer) buffer.seek(0) return buffer def markdown_to_pdf( md_text: str, toc_level: int = 2, logo_path: Optional[str] = None, ) -> BytesIO: """Convert Markdown to PDF using markdown-pdf (PyMuPDF + markdown-it-py). Supports UTF-8, tables, links, images, and TOC from headings. Optionally adds a dedicated logo section at the top (centered) if logo_path is set. Links with empty or dangling hrefs are stripped to avoid pymupdf errors; if the build still fails, every link is converted to plain text as a fallback. """ if logo_path is None: try: from config.settings import settings logo_path = getattr(settings, "pdf_logo_path", None) except Exception: logo_path = None sanitized = _sanitize_markdown_for_pdf(md_text) try: return _build_pdf(sanitized, toc_level, logo_path) except RuntimeError as exc: logging.warning( "markdown_to_pdf: retrying without links after pymupdf error: %s", exc ) return _build_pdf(_strip_all_links(sanitized), toc_level, logo_path)