Spaces:
Running
Running
import io
from typing import Optional

import fitz  # PyMuPDF
class PdfHandler:
    """Extract text from in-memory PDFs and render annotated page previews.

    Text is read with PyMuPDF. When a page yields no text and the injected
    OCR engine reports itself as available, the page is rasterized and sent
    to that engine instead.
    """

    def __init__(self, ocr_engine):
        """
        :param ocr_engine: Instance of OcrEngine to handle scanned pages.
            This class reads its ``available`` attribute and calls its
            ``extract_text(png_bytes)`` method.
        """
        self.ocr_engine = ocr_engine
        print("[OK] PDF Handler loaded.")

    @staticmethod
    def _open_document(file_bytes: bytes):
        """Open *file_bytes* as a PDF document; the caller must close it."""
        return fitz.open(stream=file_bytes, filetype="pdf")

    def get_total_pages(self, file_bytes: bytes) -> int:
        """
        Returns the number of pages in the PDF.

        :param file_bytes: Raw PDF content.
        :return: Page count, or 0 when the input is empty or cannot be opened.
        """
        if not file_bytes:
            return 0
        try:
            # The context manager closes the document even if len() fails.
            with self._open_document(file_bytes) as doc:
                return len(doc)
        except Exception as e:
            # Best-effort API: report the reason, then fall back to 0.
            print(f"PDF Page Count Error: {e}")
            return 0

    def get_page_text(self, file_bytes: bytes, page_num: int) -> str:
        """
        Extracts text from a specific page. Falls back to OCR if text is empty.

        :param file_bytes: Raw PDF content.
        :param page_num: Zero-based page index.
        :return: The page text, or "" when the input is empty, the index is
            out of range, or extraction fails.
        """
        # Reject inputs that can never succeed before paying for a PDF parse.
        if not file_bytes or page_num < 0:
            return ""
        try:
            with self._open_document(file_bytes) as doc:
                if page_num >= len(doc):
                    return ""
                page = doc[page_num]
                text = page.get_text("text")

                # OCR fallback for scanned PDFs. getattr() keeps a missing
                # engine from being reported as a PDF error.
                if not text.strip() and getattr(self.ocr_engine, "available", False):
                    print(f"[WARN] Page {page_num+1} appears empty/scanned. Running OCR...")
                    pix = page.get_pixmap()
                    text = self.ocr_engine.extract_text(pix.tobytes("png"))
                return text
        except Exception as e:
            print(f"PDF Text Error: {e}")
            return ""

    def render_labeled_image(
        self, file_bytes: bytes, page_num: int, matches: list, color_map: dict
    ) -> Optional[bytes]:
        """
        Draws bounding boxes around detected PII on the PDF page image.

        :param file_bytes: Raw PDF content.
        :param page_num: Zero-based page index.
        :param matches: Mappings with a ``'text'`` entry (the string to locate
            on the page) and a ``'label'`` entry (caption drawn above each
            hit). Entries without text are skipped; a missing label only
            omits the caption.
        :param color_map: Currently unused; every box is drawn in red.
        :return: PNG bytes of the annotated page rendered at 2x zoom, or None
            when the input is empty, the index is out of range, or rendering
            fails.
        """
        if not file_bytes or page_num < 0:
            return None

        box_color = (1, 0, 0)  # Red, expressed in PyMuPDF's 0-1 RGB range.
        label_color = (0, 0, 0)

        try:
            # Annotations are drawn on this in-memory copy only; it is
            # discarded when the context manager closes the document.
            with self._open_document(file_bytes) as doc:
                if page_num >= len(doc):
                    return None
                page = doc[page_num]

                for match in matches or ():
                    needle = match.get("text")
                    if not needle:
                        # Nothing to search for; do not let it abort the render.
                        continue
                    label = match.get("label") or ""

                    # Every occurrence of the string on the page gets a box.
                    for rect in page.search_for(needle):
                        page.draw_rect(
                            rect,
                            color=box_color,
                            width=1.5,
                            fill=box_color,
                            fill_opacity=0.2,
                        )
                        if label:
                            # Place the caption just above the box's top-left corner.
                            page.insert_text(
                                fitz.Point(rect.x0, rect.y0 - 2),
                                label,
                                fontsize=6,
                                color=label_color,
                            )

                # Zoom=2 for higher quality.
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                return pix.tobytes("png")
        except Exception as e:
            print(f"PDF Render Error: {e}")
            return None