File size: 839 Bytes
e7fd66f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
#!/usr/bin/env python3
"""CLI: extract text from a PDF."""

from __future__ import annotations

import argparse
import sys
from pathlib import Path

from researchmind.scrape_pdf import extract_pdf


def main() -> int:
    """Run the PDF text-extraction command-line interface.

    Reads a positional PDF ``path`` and an optional ``--out`` destination
    from the command line, hands the path to ``extract_pdf`` and prints a
    short summary: the document title, the ``page_count`` entry of the
    document metadata (``?`` is passed as the fallback) and the length of
    the extracted text.

    When ``--out`` is given (and non-empty) the full text is written to
    that file as UTF-8; otherwise the first 2000 characters of the text
    are printed after the summary.

    Returns:
        Always ``0``, intended to be used as the process exit status.
    """
    cli = argparse.ArgumentParser(description="Extract PDF text for ResearchMind")
    cli.add_argument("path", type=Path, help="Path to PDF file")
    cli.add_argument("--out", help="Write full text to this file")
    options = cli.parse_args()

    document = extract_pdf(options.path)

    # Truthiness (not ``is not None``) so that an empty ``--out ""`` falls
    # back to printing the preview.
    write_to_file = bool(options.out)
    if write_to_file:
        Path(options.out).write_text(document.text, encoding="utf-8")

    print(f"Title: {document.title}")
    page_count = document.metadata.get("page_count", "?")
    print(f"Pages metadata: {page_count}")
    char_count = len(document.text)
    print(f"Chars: {char_count}")

    if write_to_file:
        return 0

    # No output file requested: show a preview of the text on stdout.
    print(document.text[:2000])
    return 0


# Script entry point: run the CLI and propagate its return value as the
# process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)