"""Gradio demo that runs Arabic OCR on an uploaded image with a vision-language model."""

# NOTE(review): the original import order is preserved on purpose; `spaces` is
# imported ahead of the torch-based libraries. Confirm before re-sorting.
import gradio as gr
import time
import spaces
from PIL import Image
from transformers import (
    Qwen3VLForConditionalGeneration,
    Qwen2VLForConditionalGeneration,
    AutoProcessor,
    AutoModelForImageTextToText,
)
from qwen_vl_utils import process_vision_info
import torch
import uuid
import os
import numpy as np

# Model configurations
MODEL_CONFIGS = {
    "KATIB OCR 0.8B 0.1": {
        "name": "oddadmix/Katib-Qwen3.5-0.8B-0.3",
        "class": AutoModelForImageTextToText,
        "prompt": "Free OCR.",
        "use_qwen3": True,
    }
}

# Load models
models = {}
processors = {}

for model_key, config in MODEL_CONFIGS.items():
    print(f"Loading {model_key}...")
    models[model_key] = config["class"].from_pretrained(
        config["name"],
        torch_dtype="auto",
        device_map="auto",
    )
    processors[model_key] = AutoProcessor.from_pretrained(config["name"])

# Upper bound on the number of newly generated tokens per request.
max_tokens = 2000


def resizeImage(image):
    """Return ``image`` scaled down to a height of 1500 px, keeping aspect ratio.

    Images whose height is 1500 px or less are returned unchanged. Only the
    height is checked; the width is scaled by the same factor.
    """
    if image.height > 1500:
        new_width = int(image.width * 1500 / image.height)
        image = image.resize((new_width, 1500), Image.Resampling.LANCZOS)
    return image


@spaces.GPU
def perform_ocr(image, model_choice):
    """Process image and extract text using selected OCR model.

    Args:
        image: Image as a numpy array (the Gradio input uses ``type="numpy"``),
            or ``None`` when no image is present.
        model_choice: Key into ``MODEL_CONFIGS`` selecting the model to run.

    Returns:
        The decoded model output as a string, or ``"Error Processing"`` when
        the image is missing or contains only zero values.
    """
    # The `change` event also fires when the image is cleared, in which case
    # there is nothing to process. An all-zero array is rejected as well.
    if image is None or not np.any(image):
        return "Error Processing"

    image = Image.fromarray(image)

    # Get model configuration
    config = MODEL_CONFIGS[model_choice]
    model = models[model_choice]
    processor = processors[model_choice]
    prompt = config["prompt"]
    use_qwen3 = config["use_qwen3"]

    # Resizing is currently disabled; call resizeImage(image) here to re-enable.

    # The image is handed to the processor through a uniquely named temp file.
    src = str(uuid.uuid4()) + ".png"

    try:
        image.save(src)
        print(src)

        # Prepare messages based on model type (only the image URI differs)
        image_uri = f"./{src}" if use_qwen3 else f"file://{src}"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image_uri},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        # Process inputs based on model type
        if use_qwen3:
            inputs = processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
            )
        else:
            text = processor.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )

        # Use the model's own device in both branches instead of hard-coding "cuda".
        inputs = inputs.to(model.device)

        # Generate text
        generated_ids = model.generate(
            **inputs, max_new_tokens=max_tokens, use_cache=True
        )

        # Drop the prompt tokens so only the newly generated part is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
    finally:
        # Cleanup: remove the temp file even when preprocessing/generation fails.
        if os.path.exists(src):
            os.remove(src)

    return output_text


# Create Gradio interface
with gr.Blocks(title="Arabic OCR Models Demo") as demo:
    gr.Markdown("# Arabic OCR Models Demo")
    gr.Markdown("Upload an image to extract Arabic text in real-time. Choose between different OCR models.")

    with gr.Row():
        with gr.Column(scale=1):
            # Model selection dropdown
            model_dropdown = gr.Dropdown(
                choices=list(MODEL_CONFIGS.keys()),
                value=list(MODEL_CONFIGS.keys())[0],
                label="Select OCR Model",
                interactive=True,
            )

            # Input image
            image_input = gr.Image(type="numpy", label="Upload Image")

            # Example gallery
            gr.Examples(
                examples=[
                    ["0.4.png"],
                    ["2.jpg"],
                    ["3.jpg"],
                ],
                inputs=image_input,
                label="Example Images",
                examples_per_page=4,
            )

            # Submit button
            submit_btn = gr.Button("Extract Text")

        with gr.Column(scale=1):
            # Output text
            output = gr.Textbox(label="Extracted Text", lines=20, show_copy_button=True)

            # Model details
            # NOTE(review): this text lists a Katib "-0.1" checkpoint and a Qari
            # model, while MODEL_CONFIGS loads only "oddadmix/Katib-Qwen3.5-0.8B-0.3".
            # Confirm which is intended and bring the two in sync.
            with gr.Accordion("Model Information", open=False):
                gr.Markdown("""
                **Available Models:**

                1. **KATIB OCR 0.1 0.8B**
                   - Model: oddadmix/Katib-Qwen3.5-0.8B-0.1
                   - Based on Qwen3.5
                   - Size: 0.8B parameters

                2. **Qari OCR 0.2.2.1**
                   - Model: NAMAA-Space/Qari-OCR-0.2.2.1-VL-2B-Instruct
                   - Based on Qwen2-VL architecture
                   - Size: 2B parameters

                **Context window:** Supports up to 2000 output tokens
                """)

    # Set up processing flow
    submit_btn.click(fn=perform_ocr, inputs=[image_input, model_dropdown], outputs=output)
    image_input.change(fn=perform_ocr, inputs=[image_input, model_dropdown], outputs=output)

demo.launch()