Spaces:
Running on Zero
Running on Zero
File size: 839 Bytes
#!/usr/bin/env python3
"""CLI: extract text from a PDF."""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
from researchmind.scrape_pdf import extract_pdf
def main(argv: list[str] | None = None) -> int:
    """Extract text from a PDF and print a short summary of it.

    Prints the document title, the ``page_count`` metadata entry (``?`` when
    absent) and the character count. With ``--out`` the full text is written
    to that file as UTF-8; without it the first 2000 characters are printed.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read them from ``sys.argv``.

    Returns:
        ``0`` once the summary has been printed.

    Raises:
        SystemExit: Raised by argparse with status 2 on invalid arguments
            or when ``path`` does not point to an existing file.
    """
    parser = argparse.ArgumentParser(description="Extract PDF text for ResearchMind")
    parser.add_argument("path", type=Path, help="Path to PDF file")
    parser.add_argument("--out", help="Write full text to this file")
    args = parser.parse_args(argv)

    # Report a bad path as a usage error rather than handing it to extract_pdf.
    if not args.path.is_file():
        parser.error(f"PDF file not found: {args.path}")

    doc = extract_pdf(args.path)
    if args.out:
        Path(args.out).write_text(doc.text, encoding="utf-8")

    print(f"Title: {doc.title}")
    print(f"Pages metadata: {doc.metadata.get('page_count', '?')}")
    print(f"Chars: {len(doc.text)}")
    if not args.out:
        # No output file requested: show only a preview of the text.
        print(doc.text[:2000])
    return 0
if __name__ == "__main__":
    # Script entry point: use main()'s return value as the process exit status.
    raise SystemExit(main())
|