# Arabic-OCR / app.py
# 666satan666's picture
# Update app.py
# 6fc2cf1 verified
import gradio as gr
import time
import spaces
from PIL import Image
from transformers import Qwen3VLForConditionalGeneration, Qwen2VLForConditionalGeneration, AutoProcessor, AutoModelForImageTextToText
from qwen_vl_utils import process_vision_info
import torch
import uuid
import os
import numpy as np
# Model configurations
# Maps the label shown in the model dropdown to the settings used to load and
# prompt that model:
#   "name"      - identifier passed to ``from_pretrained`` for model and processor
#   "class"     - model class whose ``from_pretrained`` is called
#   "prompt"    - text instruction sent alongside the image
#   "use_qwen3" - selects which preprocessing path ``perform_ocr`` takes
MODEL_CONFIGS = {
    "KATIB OCR 0.8B 0.1": {
        "name": "oddadmix/Katib-Qwen3.5-0.8B-0.3",
        "class": AutoModelForImageTextToText,
        "prompt": "Free OCR.",
        "use_qwen3": True,
    },
}
# Upper bound on newly generated tokens per OCR request.
max_tokens = 2000

# Load models
# Every configured model and its processor are loaded once at import time and
# cached by dropdown label so requests can look them up directly.
models = {}
processors = {}
for model_key, config in MODEL_CONFIGS.items():
    print(f"Loading {model_key}...")
    checkpoint = config["name"]
    model_class = config["class"]
    models[model_key] = model_class.from_pretrained(
        checkpoint,
        torch_dtype="auto",
        device_map="auto",
    )
    processors[model_key] = AutoProcessor.from_pretrained(checkpoint)
def resizeImage(image, max_height=1500):
    """Downscale *image* so that its height does not exceed *max_height*.

    The aspect ratio is preserved. An image whose height is already within
    the limit is returned unchanged (the very same object).

    Args:
        image: A PIL image (anything exposing ``width``, ``height`` and
            ``resize``).
        max_height: Maximum allowed height in pixels. Defaults to 1500, the
            limit that used to be hard-coded.

    Returns:
        The original image if it is short enough, otherwise a resized copy
        that is exactly ``max_height`` pixels tall.
    """
    if image.height <= max_height:
        return image

    # For extremely tall, narrow images the scaled width would truncate to 0,
    # which is not a valid target size; keep at least one pixel.
    scaled_width = max(1, int(image.width * max_height / image.height))
    return image.resize((scaled_width, max_height), Image.Resampling.LANCZOS)
@spaces.GPU
def perform_ocr(image, model_choice):
    """Process image and extract text using selected OCR model.

    Args:
        image: The input image as a NumPy array (the Gradio image input is
            declared with ``type="numpy"``), or ``None`` when no image is
            present.
        model_choice: Key into ``MODEL_CONFIGS`` selecting the model,
            processor and prompt to use.

    Returns:
        The decoded model output for the image, or the string
        ``"Error Processing"`` when the image is missing or contains only
        zero values.

    Raises:
        KeyError: If ``model_choice`` is not a configured model.
    """
    # The same handler is wired to the image "change" event, so it can be
    # invoked without an image (presumably when the upload is cleared).
    # An all-zero array is rejected as well, as before.
    if image is None or not np.any(image):
        return "Error Processing"

    # Get model configuration
    config = MODEL_CONFIGS[model_choice]
    model = models[model_choice]
    processor = processors[model_choice]
    prompt = config["prompt"]
    use_qwen3 = config["use_qwen3"]

    # NOTE: down-scaling via resizeImage() is currently not applied here.
    image = Image.fromarray(image)

    # The image is handed to the processor through a uniquely named
    # temporary PNG in the working directory.
    src = str(uuid.uuid4()) + ".png"
    try:
        image.save(src)
        print(src)

        # The two pipelines differ only in how the image path is spelled.
        image_ref = f"./{src}" if use_qwen3 else f"file://{src}"
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image_ref},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        # Process inputs based on model type
        if use_qwen3:
            inputs = processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
            )
        else:
            text = processor.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
        # Follow the model's actual device instead of assuming "cuda".
        inputs = inputs.to(model.device)

        # Generate text
        generated_ids = model.generate(
            **inputs, max_new_tokens=max_tokens, use_cache=True
        )
    finally:
        # Always remove the temporary file, even if preprocessing or
        # generation failed; it may not exist if saving itself failed.
        try:
            os.remove(src)
        except FileNotFoundError:
            pass

    # Drop the prompt tokens so that only newly generated text is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return output_text
def _build_model_info():
    """Render the "Model Information" panel text from the live configuration.

    The text is generated from ``MODEL_CONFIGS`` and ``max_tokens`` so the
    panel always lists exactly the models offered in the dropdown, with the
    identifiers that are actually loaded and the real output-token limit.

    Returns:
        A Markdown string.
    """
    lines = ["**Available Models:**", ""]
    for index, (label, model_config) in enumerate(MODEL_CONFIGS.items(), start=1):
        lines.append(f"{index}. **{label}**")
        lines.append(f"   - Model: {model_config['name']}")
    lines.append("")
    lines.append(f"**Context window:** Supports up to {max_tokens} output tokens")
    return "\n".join(lines)


# Create Gradio interface
with gr.Blocks(title="Arabic OCR Models Demo") as demo:
    gr.Markdown("# Arabic OCR Models Demo")
    gr.Markdown("Upload an image to extract Arabic text in real-time. Choose between different OCR models.")

    with gr.Row():
        with gr.Column(scale=1):
            # Model selection dropdown; defaults to the first configured model
            model_dropdown = gr.Dropdown(
                choices=list(MODEL_CONFIGS),
                value=next(iter(MODEL_CONFIGS)),
                label="Select OCR Model",
                interactive=True,
            )

            # Input image
            image_input = gr.Image(type="numpy", label="Upload Image")

            # Example gallery
            gr.Examples(
                examples=[
                    ["0.4.png"],
                    ["2.jpg"],
                    ["3.jpg"],
                ],
                inputs=image_input,
                label="Example Images",
                examples_per_page=4,
            )

            # Submit button
            submit_btn = gr.Button("Extract Text")

        with gr.Column(scale=1):
            # Output text
            output = gr.Textbox(label="Extracted Text", lines=20, show_copy_button=True)

            # Model details
            with gr.Accordion("Model Information", open=False):
                gr.Markdown(_build_model_info())

    # Set up processing flow: OCR runs both on an explicit button click and
    # whenever the input image changes.
    submit_btn.click(fn=perform_ocr, inputs=[image_input, model_dropdown], outputs=output)
    image_input.change(fn=perform_ocr, inputs=[image_input, model_dropdown], outputs=output)

demo.launch()