""" PAGE XML Exporter Exports line segmentation and transcription data to PAGE XML format. Compatible with party and other PAGE XML processors. """ import xml.etree.ElementTree as ET from xml.dom import minidom from pathlib import Path from typing import List, Optional from datetime import datetime from inference_page import LineSegment class PageXMLExporter: """Export line segmentation data to PAGE XML format.""" # PAGE XML namespace NAMESPACE = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" def __init__(self, image_path: str, image_width: int, image_height: int): """ Initialize PAGE XML exporter. Args: image_path: Path to the page image file image_width: Width of the page image in pixels image_height: Height of the page image in pixels """ self.image_path = Path(image_path) self.image_width = image_width self.image_height = image_height def _make_root(self, creator: str, comments: Optional[str]) -> tuple: """Build root PcGts element with Metadata and Page. Returns (root, page).""" ET.register_namespace('', self.NAMESPACE) root = ET.Element('PcGts', { 'xmlns': self.NAMESPACE, 'xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance', 'xsi:schemaLocation': ( f'{self.NAMESPACE} ' 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd' ), 'pcGtsId': f'pc-{self.image_path.stem}' }) metadata = ET.SubElement(root, 'Metadata') ET.SubElement(metadata, 'Creator').text = creator ET.SubElement(metadata, 'Created').text = datetime.now().isoformat() ET.SubElement(metadata, 'LastChange').text = datetime.now().isoformat() if comments: ET.SubElement(metadata, 'Comments').text = comments page = ET.SubElement(root, 'Page', { 'imageFilename': str(self.image_path.name), 'imageWidth': str(self.image_width), 'imageHeight': str(self.image_height) }) return root, page @staticmethod def _write_xml(root: ET.Element, output_path: str) -> None: xml_str = ET.tostring(root, encoding='utf-8', method='xml') dom = minidom.parseString(xml_str) pretty_xml = dom.toprettyxml(indent=' ', encoding='utf-8') with open(output_path, 'wb') as f: f.write(pretty_xml) @staticmethod def _baseline_points(segment) -> str: """Return PAGE XML baseline points string for a segment.""" if hasattr(segment, 'baseline') and segment.baseline: return ' '.join(f'{x},{y}' for x, y in segment.baseline) x1, y1, x2, y2 = segment.bbox bl_y = y2 - 5 return f'{x1},{bl_y} {x2},{bl_y}' @staticmethod def _coords_points(segment) -> str: """Return PAGE XML coords points string for a segment.""" if hasattr(segment, 'coords') and segment.coords: return ' '.join(f'{x},{y}' for x, y in segment.coords) x1, y1, x2, y2 = segment.bbox return f'{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}' def _add_text_line(self, parent: ET.Element, line_id: str, segment, text: Optional[str], line_idx: int) -> None: """Add a TextLine element to parent with coords, baseline and optional text.""" line_elem = ET.SubElement(parent, 'TextLine', { 'id': line_id, 'custom': f'readingOrder {{index:{line_idx};}}' }) ET.SubElement(line_elem, 'Coords').set('points', self._coords_points(segment)) ET.SubElement(line_elem, 'Baseline').set('points', self._baseline_points(segment)) if text: conf = '1.0' if hasattr(segment, 'confidence') and segment.confidence is not None: conf = str(segment.confidence) text_equiv = ET.SubElement(line_elem, 'TextEquiv', {'conf': conf}) ET.SubElement(text_equiv, 'Unicode').text = text def export(self, segments: List[LineSegment], output_path: str, creator: str = "TrOCR-GUI", comments: Optional[str] = None) -> None: """ Export line segments to PAGE XML (single TextRegion, no region info). Args: segments: List of LineSegment objects (may carry .text attribute) output_path: Path where to save the PAGE XML file creator: Software/tool that created this PAGE XML comments: Optional comments about the document """ root, page = self._make_root(creator, comments) # Reading order reading_order = ET.SubElement(page, 'ReadingOrder') ordered_group = ET.SubElement(reading_order, 'OrderedGroup', { 'id': 'ro_1', 'caption': 'Regions reading order' }) ET.SubElement(ordered_group, 'RegionRefIndexed', { 'index': '0', 'regionRef': 'region_1' }) # Single text region spanning all lines text_region = ET.SubElement(page, 'TextRegion', { 'id': 'region_1', 'type': 'paragraph', 'custom': 'readingOrder {index:0;}' }) if segments: x1 = min(seg.bbox[0] for seg in segments) y1 = min(seg.bbox[1] for seg in segments) x2 = max(seg.bbox[2] for seg in segments) y2 = max(seg.bbox[3] for seg in segments) ET.SubElement(text_region, 'Coords').set( 'points', f'{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}' ) for idx, segment in enumerate(segments): text = getattr(segment, 'text', None) or None self._add_text_line(text_region, f'line_{idx + 1}', segment, text, idx) self._write_xml(root, output_path) def export_with_regions( self, regions, lines, output_path: str, transcriptions: Optional[List[str]] = None, creator: str = "TrOCR-GUI", comments: Optional[str] = None, ) -> None: """ Export with proper multi-region PAGE XML structure. Creates one TextRegion per detected region (e.g. columns, marginalia), with TextLines nested inside their region and actual baseline polylines. ReadingOrder lists regions left-to-right and lines top-to-bottom within each region, matching how blla / column clustering ordered them. Args: regions: List of SegRegion objects (duck-typed: .id, .line_ids, .bbox, optional .polygon). lines: Flat list of LineSegment objects, already ordered by region (region[0]'s lines first, then region[1]'s, …). The count of lines per region is len(region.line_ids). output_path: Where to write the PAGE XML file. transcriptions: Optional list of text strings, parallel to *lines*. Pass self.transcriptions from the GUI when available. creator: Creator string for Metadata. comments: Optional comments string for Metadata. """ root, page = self._make_root(creator, comments) # ReadingOrder — one RegionRefIndexed per region reading_order = ET.SubElement(page, 'ReadingOrder') ordered_group = ET.SubElement(reading_order, 'OrderedGroup', { 'id': 'ro_1', 'caption': 'Regions reading order' }) for ri, region in enumerate(regions): ET.SubElement(ordered_group, 'RegionRefIndexed', { 'index': str(ri), 'regionRef': region.id }) # TextRegions — one per region, lines nested inside line_offset = 0 for ri, region in enumerate(regions): n = len(region.line_ids) if hasattr(region, 'line_ids') else 0 region_lines = lines[line_offset:line_offset + n] line_offset += n text_region = ET.SubElement(page, 'TextRegion', { 'id': region.id, 'type': 'paragraph', 'custom': f'readingOrder {{index:{ri};}}' }) # Region polygon (prefer neural boundary over convex hull over bbox) if hasattr(region, 'polygon') and region.polygon and len(region.polygon) >= 3: pts = ' '.join(f'{x},{y}' for x, y in region.polygon) else: x1, y1, x2, y2 = region.bbox pts = f'{x1},{y1} {x2},{y1} {x2},{y2} {x1},{y2}' ET.SubElement(text_region, 'Coords').set('points', pts) for li, segment in enumerate(region_lines): global_line_idx = line_offset - n + li # index in the flat lines list text = None if transcriptions and global_line_idx < len(transcriptions): text = transcriptions[global_line_idx] or None elif hasattr(segment, 'text'): text = getattr(segment, 'text', None) or None self._add_text_line( text_region, f'line_{ri + 1}_{li + 1}', segment, text, li, ) self._write_xml(root, output_path) @staticmethod def quick_export(image_path: str, segments: List[LineSegment], output_path: Optional[str] = None) -> str: """ Quick export helper that automatically determines output path and image dimensions. Args: image_path: Path to the page image segments: List of LineSegment objects output_path: Optional output path (default: same as image with .xml extension) Returns: Path to the exported PAGE XML file """ from PIL import Image # Load image to get dimensions img = Image.open(image_path) width, height = img.size # Determine output path if output_path is None: output_path = Path(image_path).with_suffix('.xml') # Export exporter = PageXMLExporter(image_path, width, height) exporter.export(segments, str(output_path)) return str(output_path) if __name__ == "__main__": # Example usage from PIL import Image # Create a dummy segment for testing dummy_img = Image.new('L', (100, 30)) dummy_segment = LineSegment( image=dummy_img, bbox=(10, 10, 200, 40), text="Example text", confidence=0.95 ) exporter = PageXMLExporter("test_page.jpg", 800, 1200) exporter.export([dummy_segment], "test_output.xml", creator="PAGE XML Exporter Test", comments="This is a test export") print("Test PAGE XML created: test_output.xml")