#!/bin/bash
# One-shot pipeline: ingest local directory → JSONL → tau-rag server.
#
# Runs everything:
#   1. Installs Python deps
#      (python-docx pypdf python-pptx openpyxl pyarrow requests)
#   2. Starts the server (if not running)
#   3. Runs the standalone ingester (with resume + caps)
#   4. Uploads the resulting JSONL to the server
#   5. Verifies end-to-end with a test query
#
# Usage:
#   ./scripts/run_all.sh /Users/avrahambarzel/LawDBHeb_local
#   ./scripts/run_all.sh /Users/avrahambarzel/LawDBHeb_local ~/my_corpus.jsonl
#
# Arguments:
#   $1  data directory to ingest (required; must be an existing directory)
#   $2  output JSONL path (optional; default: $HOME/tau_corpus.jsonl)
#
# Environment overrides (defaults in parentheses):
#   MAX_CHUNKS (5000), MAX_FILE_MB (100), EXTENSIONS (empty), PORT (8000)
#
# Exit codes set in this section:
#   1  data directory argument missing
#   2  data directory does not exist

set -eu

# ---- Configuration: positional arguments first, then env overrides ----------
DATA_DIR="${1:-}"
OUTPUT="${2:-$HOME/tau_corpus.jsonl}"
# Error report sits next to the output; a trailing ".jsonl" is stripped first.
ERRORS="${OUTPUT%.jsonl}_errors.txt"
MAX_CHUNKS="${MAX_CHUNKS:-5000}"
MAX_FILE_MB="${MAX_FILE_MB:-100}"
EXTENSIONS="${EXTENSIONS:-}"   # e.g. ".pdf .docx .txt"
PORT="${PORT:-8000}"

# ---- Argument validation (diagnostics go to stderr) -------------------------
if [[ -z "$DATA_DIR" ]]; then
  {
    echo "Usage: $0 <data_dir> [output_jsonl]"
    echo ""
    echo "Env vars you can override:"
    echo " MAX_CHUNKS=5000 cap chunks per file"
    echo " MAX_FILE_MB=100 skip files bigger than this"
    echo " EXTENSIONS='.pdf .docx .txt' whitelist"
    echo " PORT=8000 server port"
  } >&2
  exit 1
fi

if [[ ! -d "$DATA_DIR" ]]; then
  echo "❌ Data dir not found: $DATA_DIR" >&2
  exit 2
fi

# Figure out the script's directory (the tau_rag root)
# SCRIPT_DIR is where this file lives, TAU_RAG_DIR is one level up, and
# PARENT_DIR one level above that (used later as PYTHONPATH).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TAU_RAG_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
PARENT_DIR="$(cd "$TAU_RAG_DIR/.." && pwd)"

# ---- Banner -----------------------------------------------------------------
echo "┌──────────────────────────────────────────"
echo "│ tau-rag · run-all"
echo "│ Data: $DATA_DIR"
echo "│ Output: $OUTPUT"
echo "│ Errors: $ERRORS"
echo "│ Caps: $MAX_CHUNKS chunks/file, $MAX_FILE_MB MB/file"
echo "└──────────────────────────────────────────"
echo

# ============================= Step 1: dependencies ==============
echo "→ [1/4] Checking Python dependencies..."
# Probe the interpreter that will actually run the pipeline (python3) for every
# required module; exit status 1 means at least one is missing.
python3 -c '
import importlib.util, sys
needed = {"pypdf": "pypdf", "docx": "python-docx", "pptx": "python-pptx",
          "openpyxl": "openpyxl", "pyarrow": "pyarrow", "requests": "requests"}
missing = []
for mod, pkg in needed.items():
    if importlib.util.find_spec(mod) is None:
        missing.append(pkg)
if missing:
    print("missing:", " ".join(missing))
    sys.exit(1)
print(" ✓ all deps present")
' || {
  echo " Installing missing packages..."
  # Install through `python3 -m pip` so the packages land in the same
  # interpreter that was just probed (a bare `pip` may belong to another one).
  # First try with --break-system-packages (stderr silenced); if that attempt
  # fails for any reason, e.g. a pip that rejects the flag, retry without it.
  python3 -m pip install python-docx pypdf python-pptx openpyxl pyarrow requests \
    --break-system-packages 2>/dev/null \
    || python3 -m pip install python-docx pypdf python-pptx openpyxl pyarrow requests
}
echo

# ============================= Step 2: start server =============
# Server log location; overridable, defaults to the historical fixed path.
SERVER_LOG="${SERVER_LOG:-/tmp/tau_rag_server.log}"

# Returns 0 when GET /health on the local server completes within 2 seconds.
server_healthy() {
  curl -s --max-time 2 "http://127.0.0.1:$PORT/health" >/dev/null 2>&1
}

echo "→ [2/4] Server on :$PORT ..."
SERVER_PID=""
if server_healthy; then
  echo " ✓ server already running"
else
  echo " starting server in background..."
  cd "$TAU_RAG_DIR"
  # Launched directly (no wrapping subshell) so that $! is the PID of the
  # python3 process itself and the "kill $SERVER_PID" hint below works.
  TAU_RAG_CORS_ORIGINS="*" PYTHONPATH="$PARENT_DIR" \
    python3 -m uvicorn tau_rag.api.fastapi_app:app \
    --host 127.0.0.1 --port "$PORT" \
    >"$SERVER_LOG" 2>&1 &
  SERVER_PID=$!
  echo " PID: $SERVER_PID, log: $SERVER_LOG"

  # Poll once per second for up to 30 seconds.
  server_up=0
  for ((i = 1; i <= 30; i++)); do
    sleep 1
    if server_healthy; then
      echo " ✓ server up after ${i}s"
      server_up=1
      break
    fi
    # Stop waiting as soon as the server process is gone (startup crash).
    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
      break
    fi
  done
  if [ "$server_up" -ne 1 ]; then
    echo " ❌ server failed to start — see $SERVER_LOG" >&2
    exit 3
  fi
fi
echo

# ============================= Step 3: ingest ===================
echo "→ [3/4] Ingesting $DATA_DIR..."
cd "$TAU_RAG_DIR"

# Build the ingester command line as an array so every value stays one word.
ingest_args=(
  "$DATA_DIR"
  -o "$OUTPUT"
  --errors "$ERRORS"
  --max-chunks-per-file "$MAX_CHUNKS"
  --max-file-size-mb "$MAX_FILE_MB"
)
if [ -f "$OUTPUT" ]; then
  echo " (resuming from existing $OUTPUT)"
  ingest_args+=(--resume)
fi
if [ -n "$EXTENSIONS" ]; then
  # EXTENSIONS is a whitespace-separated list on one line; split it into
  # words without glob expansion.
  read -r -a ext_list <<<"$EXTENSIONS"
  # The ${arr[@]+...} form keeps an empty list from tripping `set -u` on
  # older bash versions.
  ingest_args+=(--extensions ${ext_list[@]+"${ext_list[@]}"})
fi

PYTHONPATH="$PARENT_DIR" python3 -m scripts.ingest_local "${ingest_args[@]}"

if [ ! -f "$OUTPUT" ] || [ ! -s "$OUTPUT" ]; then
  echo "❌ no output produced — nothing to upload" >&2
  exit 4
fi

# Not named LINES: bash itself maintains a variable of that name
# (terminal height) and may overwrite it.
CHUNK_COUNT=$(wc -l <"$OUTPUT" | tr -d ' ')
SIZE=$(ls -lh "$OUTPUT" | awk '{print $5}')
echo " ✓ $CHUNK_COUNT chunks in $OUTPUT ($SIZE)"
echo

# ============================= Step 4: upload to server =========
echo "→ [4/4] Uploading to server..."
# Capture curl's status explicitly; otherwise `set -e` would abort here with
# no explanation. The script still exits with curl's own exit code.
upload_rc=0
UPLOAD_RES=$(curl -sS -X POST "http://127.0.0.1:$PORT/v1/data/upload" \
  -F "file=@$OUTPUT" \
  --max-time 600) || upload_rc=$?
if [ "$upload_rc" -ne 0 ]; then
  echo " ❌ upload request failed (curl exit code $upload_rc)" >&2
  exit "$upload_rc"
fi

echo " Response:"
# stdin is read exactly once into `raw`, so the fallback branch can still show
# the first 400 characters of a non-JSON response.
printf '%s\n' "$UPLOAD_RES" | python3 -c '
import json, sys
raw = sys.stdin.read()
try:
    d = json.loads(raw)
    print(" ok: {}".format(d.get("ok")))
    print(" kind: {}".format(d.get("kind")))
    print(" parsed: {}".format(d.get("n_rows_parsed")))
    print(" indexed: {}".format(d.get("n_indexed_in_pipeline")))
    print(" saved to: {}".format(d.get("saved_to")))
except Exception as e:
    print(" ⚠ response not JSON: {}".format(e))
    print(" raw: {}".format(raw[:400]))
'
echo

# ============================= Smoke test ======================
echo "→ Sanity query to /v1/query..."
query_rc=0
QUERY_RES=$(curl -sS -X POST "http://127.0.0.1:$PORT/v1/query" \
  -H "Content-Type: application/json" \
  -d '{"query":"מה אומר סעיף 39","top_k":3}' \
  --max-time 30) || query_rc=$?
if [ "$query_rc" -ne 0 ]; then
  echo " ❌ sanity query failed (curl exit code $query_rc)" >&2
  exit "$query_rc"
fi

# Validate/pretty-print first, then truncate: in a single pipeline ending in
# `head`, the JSON validator's failure status would be lost.
if QUERY_PRETTY=$(python3 -m json.tool <<<"$QUERY_RES" 2>/dev/null); then
  head -n 25 <<<"$QUERY_PRETTY"
else
  echo " ⚠ query response was not valid JSON"
fi
echo

echo "┌──────────────────────────────────────────"
echo "│ ✓ Done!"
echo "│ Open: http://127.0.0.1:$PORT/"
echo "│ Admin: http://127.0.0.1:$PORT/admin"
echo "└──────────────────────────────────────────"

# An `if` (rather than `[ ... ] && echo`) keeps the script's final exit status
# at 0 when no server was started by this run.
if [ -n "$SERVER_PID" ]; then
  echo "Server PID: $SERVER_PID (kill with: kill $SERVER_PID)"
fi