"""Gradio demo: caption an uploaded image with a fine-tuned BLIP model.

The caption is generated in English and, on request, translated to Arabic
with a MarianMT English->Arabic model.
"""

import gradio as gr
import torch
from PIL import Image  # noqa: F401  (kept: part of the file's original imports)
from transformers import (
    BlipForConditionalGeneration,
    BlipProcessor,
    MarianMTModel,
    MarianTokenizer,
)

# Languages offered in the UI and accepted by describe_image().
SUPPORTED_LANGUAGES = ("Arabic", "English")

# Load translation model (English -> Arabic).
translator_model_ar = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
translator_tokenizer_ar = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
translator_model_ar.eval()  # inference only, same as the caption model below

# Load BLIP model (fine-tuned).
model_path = "saja003/MuniVis"
processor_en = BlipProcessor.from_pretrained(model_path)
model_en = BlipForConditionalGeneration.from_pretrained(model_path)
model_en.eval()


def _caption_image(image):
    """Return the English caption the BLIP model generates for *image*."""
    inputs = processor_en(image, return_tensors="pt")
    with torch.no_grad():  # no gradients needed for generation
        output_ids = model_en.generate(**inputs)
    return processor_en.decode(output_ids[0], skip_special_tokens=True)


def _translate_to_arabic(text):
    """Return *text* translated from English to Arabic by the MarianMT model."""
    inputs = translator_tokenizer_ar(text, return_tensors="pt")
    with torch.no_grad():
        translated_ids = translator_model_ar.generate(**inputs)
    return translator_tokenizer_ar.decode(translated_ids[0], skip_special_tokens=True)


# Function to describe image
def describe_image(image, language):
    """Describe *image* in the requested *language*.

    Args:
        image: The uploaded picture as delivered by the ``gr.Image(type="pil")``
            input, or ``None`` when nothing has been uploaded.
        language: ``"Arabic"`` or ``"English"``.

    Returns:
        The generated caption; for ``"Arabic"`` the English caption is
        translated before being returned.

    Raises:
        gr.Error: If no image was provided or *language* is not one of the
            supported choices, so the UI shows a message instead of an empty
            result or an opaque model error.
    """
    if language not in SUPPORTED_LANGUAGES:
        raise gr.Error("Please select a language: Arabic or English.")
    if image is None:
        raise gr.Error("Please upload an image first.")

    description = _caption_image(image)
    if language == "Arabic":
        return _translate_to_arabic(description)
    return description


# Gradio UI
iface = gr.Interface(
    fn=describe_image,
    inputs=[
        gr.Image(type="pil", label="Upload an Image"),
        gr.Dropdown(choices=["Arabic", "English"], label="Select Language"),
    ],
    outputs="text",
    title="Image Captioning with Arabic Translation",
    description="Select the language and upload an image to get a description.",
)

if __name__ == "__main__":
    iface.launch()