Arabic-OCR

Sleeping

App Files Files Community

Arabic-OCR / app.py

666satan666

Update app.py

6fc2cf1 verified 2 months ago

raw

history blame contribute delete

5.68 kB

	import gradio as gr
	import time
	import spaces
	from PIL import Image
	from transformers import Qwen3VLForConditionalGeneration, Qwen2VLForConditionalGeneration, AutoProcessor, AutoModelForImageTextToText
	from qwen_vl_utils import process_vision_info
	import torch
	import uuid
	import os
	import numpy as np

	# Model configurations
	# Registry of selectable OCR models, keyed by the label used for selection.
	# Each entry holds the checkpoint id ("name"), the transformers class used to
	# load it ("class"), the instruction sent with the image ("prompt"), and a
	# flag choosing the input-preparation path ("use_qwen3").
	# NOTE(review): the label ends in "0.1" while the checkpoint id ends in "0.3";
	# confirm which version is intended.
	MODEL_CONFIGS = {
	    "KATIB OCR 0.8B 0.1": {
	        "name": "oddadmix/Katib-Qwen3.5-0.8B-0.3",
	        "class": AutoModelForImageTextToText,
	        "prompt": "Free OCR.",
	        "use_qwen3": True,
	    },
	}

	# Model and processor instances, filled in below under the same keys as
	# MODEL_CONFIGS.
	models = {}
	processors = {}

	# Everything is loaded eagerly when the module is imported.
	for model_key, config in MODEL_CONFIGS.items():
	    print(f"Loading {model_key}...")
	    checkpoint = config["name"]
	    models[model_key] = config["class"].from_pretrained(
	        checkpoint, torch_dtype="auto", device_map="auto"
	    )
	    processors[model_key] = AutoProcessor.from_pretrained(checkpoint)

	# Upper bound on the number of newly generated tokens per request.
	max_tokens = 2000

	def resizeImage(image, max_height=1500, resample=None):
	    """Downscale *image* so that its height does not exceed *max_height*.

	    The aspect ratio is preserved. Images that already fit are returned
	    unchanged (the very same object).

	    Args:
	        image: Object exposing ``width``, ``height`` and
	            ``resize(size, resample)`` (a PIL image in this file).
	        max_height: Largest allowed height in pixels. Defaults to 1500.
	        resample: Resampling filter handed to ``image.resize``. When None,
	            ``Image.Resampling.LANCZOS`` is used.

	    Returns:
	        The original image, or a resized copy whose height is *max_height*.
	    """
	    if image.height <= max_height:
	        return image

	    if resample is None:
	        resample = Image.Resampling.LANCZOS

	    # Truncation can yield 0 for extremely tall, narrow images; a zero width
	    # is not a usable target size, so keep at least one pixel.
	    scaled_width = max(1, int(image.width * max_height / image.height))
	    return image.resize((scaled_width, max_height), resample)

	@spaces.GPU
	def perform_ocr(image, model_choice):
	    """Extract text from an image with the selected OCR model.

	    Args:
	        image: Image pixels as a NumPy array, or None when no image is set.
	        model_choice: Key into MODEL_CONFIGS / models / processors.

	    Returns:
	        The decoded model output as a string, or "Error Processing" when the
	        input is missing or contains no non-zero pixel.

	    Raises:
	        KeyError: If *model_choice* is not a configured model.
	    """
	    # np.any is False for None and for arrays without any non-zero element.
	    if image is None or not np.any(image):
	        return "Error Processing"

	    image = Image.fromarray(image)

	    # Look up everything that belongs to the selected model.
	    config = MODEL_CONFIGS[model_choice]
	    model = models[model_choice]
	    processor = processors[model_choice]
	    use_qwen3 = config["use_qwen3"]

	    # The image is handed to the processor by path, so it is written to a
	    # uniquely named PNG in the current working directory first.
	    src = str(uuid.uuid4()) + ".png"
	    try:
	        image.save(src)
	        print(src)

	        # Both paths send the same message; only the image URI form differs.
	        image_uri = f"./{src}" if use_qwen3 else f"file://{src}"
	        messages = [
	            {
	                "role": "user",
	                "content": [
	                    {"type": "image", "image": image_uri},
	                    {"type": "text", "text": config["prompt"]},
	                ],
	            }
	        ]

	        if use_qwen3:
	            # The chat template tokenizes and loads the image in one step.
	            inputs = processor.apply_chat_template(
	                messages,
	                tokenize=True,
	                add_generation_prompt=True,
	                return_dict=True,
	                return_tensors="pt",
	            )
	        else:
	            # Render the prompt text first, then let qwen_vl_utils collect
	            # the vision inputs referenced by the messages.
	            text = processor.apply_chat_template(
	                messages, tokenize=False, add_generation_prompt=True
	            )
	            image_inputs, video_inputs = process_vision_info(messages)
	            inputs = processor(
	                text=[text],
	                images=image_inputs,
	                videos=video_inputs,
	                padding=True,
	                return_tensors="pt",
	            )

	        # Follow the model's device in both paths instead of assuming "cuda".
	        inputs = inputs.to(model.device)

	        generated_ids = model.generate(
	            **inputs, max_new_tokens=max_tokens, use_cache=True
	        )
	    finally:
	        # Remove the temporary file even when preprocessing or generation
	        # raised, so failed requests do not leave PNGs behind.
	        if os.path.exists(src):
	            os.remove(src)

	    # generate() returns prompt + completion; drop the prompt tokens.
	    generated_ids_trimmed = [
	        out_ids[len(in_ids):]
	        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
	    ]
	    return processor.batch_decode(
	        generated_ids_trimmed,
	        skip_special_tokens=True,
	        clean_up_tokenization_spaces=False,
	    )[0]

	# Create Gradio interface
	with gr.Blocks(title="Arabic OCR Models Demo") as demo:
	    gr.Markdown("# Arabic OCR Models Demo")
	    gr.Markdown("Upload an image to extract Arabic text in real-time. Choose between different OCR models.")

	    # The dropdown and the information panel are both derived from
	    # MODEL_CONFIGS so the UI cannot advertise a model that is not loaded.
	    model_names = list(MODEL_CONFIGS)
	    model_info = "\n".join(
	        ["Available Models:", ""]
	        + [
	            f"{index}. {key}\n   - Model: {cfg['name']}"
	            for index, (key, cfg) in enumerate(MODEL_CONFIGS.items(), start=1)
	        ]
	        + ["", f"Context window: Supports up to {max_tokens} output tokens"]
	    )

	    with gr.Row():
	        # Left column: model choice, image input, examples, submit button.
	        with gr.Column(scale=1):
	            model_dropdown = gr.Dropdown(
	                choices=model_names,
	                value=model_names[0],
	                label="Select OCR Model",
	                interactive=True,
	            )

	            image_input = gr.Image(type="numpy", label="Upload Image")

	            gr.Examples(
	                examples=[
	                    ["0.4.png"],
	                    ["2.jpg"],
	                    ["3.jpg"],
	                ],
	                inputs=image_input,
	                label="Example Images",
	                examples_per_page=4,
	            )

	            submit_btn = gr.Button("Extract Text")

	        # Right column: OCR result and model details.
	        with gr.Column(scale=1):
	            output = gr.Textbox(label="Extracted Text", lines=20, show_copy_button=True)

	            with gr.Accordion("Model Information", open=False):
	                gr.Markdown(model_info)

	    # OCR runs both on an explicit click and whenever the image changes.
	    submit_btn.click(fn=perform_ocr, inputs=[image_input, model_dropdown], outputs=output)
	    image_input.change(fn=perform_ocr, inputs=[image_input, model_dropdown], outputs=output)

	demo.launch()