import gradio as gr
import time
import spaces
from PIL import Image
from transformers import Qwen3VLForConditionalGeneration, Qwen2VLForConditionalGeneration, AutoProcessor, AutoModelForImageTextToText
from qwen_vl_utils import process_vision_info
import torch
import uuid
import os
import numpy as np

# Model configurations
# Registry of the OCR models offered by the app. Each key is the label shown
# in the model dropdown; each value describes how to load and prompt the model:
#   "name"      - checkpoint identifier passed to from_pretrained()
#   "class"     - model class whose from_pretrained() loads the checkpoint
#   "prompt"    - text instruction sent to the model together with the image
#   "use_qwen3" - selects which input-preparation path perform_ocr() takes
MODEL_CONFIGS = {

    # NOTE(review): the label says "0.1" while the checkpoint id ends in
    # "-0.3" -- confirm which version is intended and align the two.
    "KATIB OCR 0.8B 0.1": {
        "name": "oddadmix/Katib-Qwen3.5-0.8B-0.3",
        "class": AutoModelForImageTextToText,
        "prompt": "Free OCR.",
        "use_qwen3": True
    }
}

# Eagerly load every configured checkpoint at import time. Both registries are
# keyed by the same labels as MODEL_CONFIGS so a dropdown choice resolves
# directly to its model and processor.
models, processors = {}, {}

# Loading options shared by all models.
_load_kwargs = {"torch_dtype": "auto", "device_map": "auto"}

for model_key, config in MODEL_CONFIGS.items():
    print(f"Loading {model_key}...")
    checkpoint = config["name"]
    model_cls = config["class"]
    models[model_key] = model_cls.from_pretrained(checkpoint, **_load_kwargs)
    processors[model_key] = AutoProcessor.from_pretrained(checkpoint)

# Upper bound on the number of newly generated tokens per OCR request.
max_tokens = 2000

def resizeImage(image, max_height=1500, resample=None):
    """Downscale *image* so that it is at most *max_height* pixels tall.

    The aspect ratio is preserved. An image that already fits is returned
    unchanged (the very same object, no copy is made).

    Args:
        image: Object exposing ``width``, ``height`` and a PIL-style
            ``resize(size, resample)`` method, e.g. a ``PIL.Image.Image``.
        max_height: Maximum allowed height in pixels; must be positive.
        resample: Resampling filter forwarded to ``image.resize``. When
            omitted, ``Image.Resampling.LANCZOS`` is used.

    Returns:
        The original image when no resize is needed, otherwise the result of
        ``image.resize``.

    Raises:
        ValueError: If *max_height* is not positive.
    """
    if max_height <= 0:
        raise ValueError("max_height must be a positive number of pixels")

    if image.height <= max_height:
        return image

    if resample is None:
        resample = Image.Resampling.LANCZOS

    # Clamp to 1 px: for very tall, narrow inputs the proportional width
    # truncates to 0, which is not a valid target size.
    new_width = max(1, int(image.width * max_height / image.height))
    return image.resize((new_width, max_height), resample)

@spaces.GPU
def perform_ocr(image, model_choice):
    """Process image and extract text using selected OCR model.

    Args:
        image: Image pixels as a NumPy array, as delivered by the
            ``gr.Image(type="numpy")`` input. May be ``None`` (e.g. when the
            image input has been cleared).
        model_choice: Key of ``MODEL_CONFIGS`` selecting which loaded model,
            processor and prompt to use.

    Returns:
        The text decoded from the model output, or the string
        ``"Error Processing"`` when no usable image was supplied (``None`` or
        an array whose elements are all zero).

    Raises:
        KeyError: If *model_choice* is not a key of ``MODEL_CONFIGS``.
    """
    # Check None explicitly instead of relying on how np.any treats a None
    # input; an all-zero array is rejected as before.
    if image is None or not np.any(image):
        return "Error Processing"

    image = Image.fromarray(image)

    # Get model configuration
    config = MODEL_CONFIGS[model_choice]
    model = models[model_choice]
    processor = processors[model_choice]
    prompt = config["prompt"]
    use_qwen3 = config["use_qwen3"]

    # NOTE: down-scaling via resizeImage() is currently disabled; the image is
    # sent to the model at its original resolution.

    # The image is handed to the processor by file path, so it is written to a
    # uniquely named temporary PNG that is removed again in the finally block.
    src = str(uuid.uuid4()) + ".png"
    try:
        image.save(src)
        print(src)

        # The two input pipelines only differ in how the image is referenced.
        image_ref = f"./{src}" if use_qwen3 else f"file://{src}"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image_ref},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        # Process inputs based on model type
        if use_qwen3:
            # The chat template loads the image and tokenizes in one step.
            inputs = processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
            )
        else:
            # Render the prompt text first, then load the vision inputs
            # separately and combine them in the processor call.
            text = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
        # Follow the model's actual device in both branches rather than
        # hard-coding "cuda" for one of them.
        inputs = inputs.to(model.device)

        # Generate text
        generated_ids = model.generate(
            **inputs, max_new_tokens=max_tokens, use_cache=True
        )
        # generate() returns prompt + completion; strip the prompt tokens so
        # only the newly generated text is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
        return output_text
    finally:
        # Cleanup runs on failures too, so an exception during preprocessing
        # or generation no longer leaves the temporary PNG behind.
        if os.path.exists(src):
            os.remove(src)

# Create Gradio interface
model_labels = list(MODEL_CONFIGS.keys())

# Build the "Model Information" text from MODEL_CONFIGS and max_tokens so the
# panel always lists exactly the checkpoints that are loaded and the token
# limit that is actually applied, instead of a hand-maintained copy.
model_info_lines = ["**Available Models:**", ""]
for position, label in enumerate(model_labels, start=1):
    model_info_lines += [
        f"{position}. **{label}**",
        f"   - Model: {MODEL_CONFIGS[label]['name']}",
        "",
    ]
model_info_lines.append(
    f"**Context window:** Supports up to {max_tokens} output tokens"
)
model_info_markdown = "\n".join(model_info_lines)

with gr.Blocks(title="Arabic OCR Models Demo") as demo:
    gr.Markdown("# Arabic OCR Models Demo")
    gr.Markdown("Upload an image to extract Arabic text in real-time. Choose between different OCR models.")

    with gr.Row():
        # Left column: model choice, image input, examples and submit button.
        with gr.Column(scale=1):
            # Model selection dropdown
            model_dropdown = gr.Dropdown(
                choices=model_labels,
                value=model_labels[0],
                label="Select OCR Model",
                interactive=True
            )

            # Input image
            image_input = gr.Image(type="numpy", label="Upload Image")

            # Example gallery
            gr.Examples(
                examples=[
                    ["0.4.png"],
                    ["2.jpg"],
                    ["3.jpg"]
                ],
                inputs=image_input,
                label="Example Images",
                examples_per_page=4
            )

            # Submit button
            submit_btn = gr.Button("Extract Text")

        # Right column: OCR result and model details.
        with gr.Column(scale=1):
            # Output text
            output = gr.Textbox(label="Extracted Text", lines=20, show_copy_button=True)

            # Model details
            with gr.Accordion("Model Information", open=False):
                gr.Markdown(model_info_markdown)

    # Set up processing flow: OCR runs both on an explicit button click and
    # whenever the image input changes.
    submit_btn.click(fn=perform_ocr, inputs=[image_input, model_dropdown], outputs=output)
    image_input.change(fn=perform_ocr, inputs=[image_input, model_dropdown], outputs=output)

demo.launch()