Spaces:
Running
Running
File size: 2,894 Bytes
42af764 a90aec8 42af764 a90aec8 42af764 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 | import fitz # PyMuPDF
import io
class PdfHandler:
def __init__(self, ocr_engine):
"""
:param ocr_engine: Instance of OcrEngine to handle scanned pages.
"""
self.ocr_engine = ocr_engine
print("[OK] PDF Handler loaded.")
def get_total_pages(self, file_bytes: bytes) -> int:
try:
doc = fitz.open(stream=file_bytes, filetype="pdf")
return len(doc)
except:
return 0
def get_page_text(self, file_bytes: bytes, page_num: int) -> str:
"""
Extracts text from a specific page. Falls back to OCR if text is empty.
"""
try:
doc = fitz.open(stream=file_bytes, filetype="pdf")
if not (0 <= page_num < len(doc)): return ""
page = doc[page_num]
text = page.get_text("text")
# OCR Fallback for scanned PDFs
if not text.strip() and self.ocr_engine.available:
print(f"[WARN] Page {page_num+1} appears empty/scanned. Running OCR...")
pix = page.get_pixmap()
img_bytes = pix.tobytes("png")
text = self.ocr_engine.extract_text(img_bytes)
return text
except Exception as e:
print(f"PDF Text Error: {e}")
return ""
def render_labeled_image(self, file_bytes: bytes, page_num: int, matches: list, color_map: dict) -> bytes:
"""
Draws bounding boxes around detected PII on the PDF page image.
"""
try:
doc = fitz.open(stream=file_bytes, filetype="pdf")
if not (0 <= page_num < len(doc)): return None
page = doc[page_num]
# Draw rectangles for each match
for m in matches:
# Get color for this PII type (normalize 0-255 rgb to 0-1 for PyMuPDF)
# color_map values are hex strings or tuples. Assuming the backend passes hex or we default.
# Simplification: Use Red for all boxes for visibility, or logic below:
color_norm = (1, 0, 0) # Default Red
# Search for the text string on the page
quads = page.search_for(m['text'])
for q in quads:
# Draw Box
page.draw_rect(q, color=color_norm, width=1.5, fill=color_norm, fill_opacity=0.2)
# Add Label
page.insert_text(fitz.Point(q.x0, q.y0-2), m['label'], fontsize=6, color=(0,0,0))
# Render page to image
pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) # Zoom=2 for higher quality
return pix.tobytes("png")
except Exception as e:
print(f"PDF Render Error: {e}")
return None |