# =========================================================
# OMEGA MULTILINGUAL VISION AI ENGINE
# LMLM + BNN + BPE + OCR + DOCUMENT REASONING
# =========================================================
"""Inference wrapper around a Donut-style vision encoder/decoder model.

The module loads a ``DonutProcessor`` and a ``VisionEncoderDecoderModel``
from a local directory and exposes a callable engine that turns an image
(file path or ``PIL.Image.Image``) into a parsed prediction.
"""

import json
import logging
import os
from typing import Any, Dict

import torch
from PIL import Image
from transformers import (
    DonutProcessor,
    VisionEncoderDecoderModel,
)

# =========================================================
# LOGGING
# =========================================================

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s"
)

# =========================================================
# DEVICE CONFIGURATION
# =========================================================

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Upper bound on generated sequence length; clamped further to the decoder's
# position-embedding capacity when the model config exposes it.
MAX_GENERATION_LENGTH = 2048


# =========================================================
# MAIN ENGINE
# =========================================================

class OmegaMultilingualVisionAI:
    """Callable image-to-prediction engine.

    Calling an instance with ``{"inputs": <path or PIL image>, "task": <name>}``
    runs preprocessing, prompt selection, beam-search generation and output
    parsing, and returns a result dictionary (see ``__call__``).
    """

    def __init__(self, model_path: str = "./model"):
        """Load the processor and model from *model_path* and move the model to DEVICE.

        Args:
            model_path: Directory (or identifier) handed to ``from_pretrained``.
        """
        logging.info("Initializing Omega AI Engine")

        self.model_path = model_path

        # =================================================
        # LOAD PROCESSOR
        # =================================================

        self.processor = DonutProcessor.from_pretrained(
            model_path,
            use_fast=True
        )

        # =================================================
        # LOAD MODEL
        # =================================================

        # Half precision on GPU, full precision on CPU.
        self.model = VisionEncoderDecoderModel.from_pretrained(
            model_path,
            torch_dtype=(
                torch.float16
                if DEVICE == "cuda"
                else torch.float32
            ),
            low_cpu_mem_usage=True
        )

        self.model.to(DEVICE)
        self.model.eval()

        # =================================================
        # TASK TOKENS
        # =================================================

        # NOTE(review): every prompt below is an empty string. This looks like
        # the angle-bracket task tags were stripped from the file at some
        # point; restore the real prompt tokens from the model's tokenizer.
        # Until then get_decoder_ids() falls back to the decoder start token.
        self.task_tokens = {
            # OCR
            "receipt": "",
            "invoice": "",
            "document": "",
            "form": "",

            # MULTILINGUAL
            "multilingual": "",

            # REASONING
            "reasoning": "",

            # BNN
            "binary": "",

            # WEB4 AGENT
            "web4": ""
        }

        logging.info("Model loaded on %s", DEVICE)

    # =====================================================
    # IMAGE PREPROCESSING
    # =====================================================

    def preprocess(self, image: Image.Image) -> torch.Tensor:
        """Convert *image* to RGB and return its pixel tensor on DEVICE.

        The tensor is cast to the model's parameter dtype: the model is loaded
        in float16 on CUDA, while the processor emits float32, and feeding
        mismatched dtypes to the encoder raises at the first layer.
        """
        image = image.convert("RGB")

        pixel_values = self.processor(
            image,
            return_tensors="pt"
        ).pixel_values

        return pixel_values.to(device=DEVICE, dtype=self.model.dtype)

    # =====================================================
    # TASK ROUTER
    # =====================================================

    def get_decoder_ids(self, task="receipt") -> torch.Tensor:
        """Return the decoder prompt ids for *task* as a tensor on DEVICE.

        Unknown tasks, and tasks whose prompt is empty, fall back to the
        decoder start token, because a zero-length prompt tensor cannot be
        used to seed generation.

        Raises:
            ValueError: If the prompt is empty and no start token is available.
        """
        if task not in self.task_tokens:
            logging.warning("Unknown task %r; using default decoder prompt", task)

        prompt = self.task_tokens.get(task, "")

        if prompt:
            decoder_input_ids = self.processor.tokenizer(
                prompt,
                add_special_tokens=False,
                return_tensors="pt"
            ).input_ids

            if decoder_input_ids.numel() > 0:
                return decoder_input_ids.to(DEVICE)

        return self._decoder_start_ids()

    def _decoder_start_ids(self) -> torch.Tensor:
        """Build a 1x1 prompt tensor holding the decoder start token id."""
        generation_config = getattr(self.model, "generation_config", None)
        candidates = (
            getattr(self.model.config, "decoder_start_token_id", None),
            getattr(generation_config, "decoder_start_token_id", None),
            self.processor.tokenizer.bos_token_id,
        )

        for token_id in candidates:
            if token_id is not None:
                return torch.tensor(
                    [[token_id]],
                    dtype=torch.long,
                    device=DEVICE
                )

        raise ValueError(
            "No task prompt and no decoder start token available"
        )

    # =====================================================
    # GENERATION ENGINE
    # =====================================================

    def _max_generation_length(self) -> int:
        """Return MAX_GENERATION_LENGTH clamped to the decoder's position limit."""
        decoder = getattr(self.model, "decoder", None)
        decoder_config = getattr(decoder, "config", None)
        position_limit = getattr(
            decoder_config,
            "max_position_embeddings",
            None
        )

        if position_limit:
            return min(MAX_GENERATION_LENGTH, position_limit)

        return MAX_GENERATION_LENGTH

    @torch.no_grad()
    def inference(self, pixel_values, decoder_input_ids):
        """Run beam-search generation and return the ``generate`` output object.

        Args:
            pixel_values: Image tensor as produced by ``preprocess``.
            decoder_input_ids: Prompt ids as produced by ``get_decoder_ids``.
        """
        tokenizer = self.processor.tokenizer

        generate_kwargs = {
            "decoder_input_ids": decoder_input_ids,
            "max_length": self._max_generation_length(),
            "early_stopping": True,
            "pad_token_id": tokenizer.pad_token_id,
            "eos_token_id": tokenizer.eos_token_id,
            "use_cache": True,
            "num_beams": 4,
            "do_sample": False,
            "temperature": 1.0,
            "repetition_penalty": 1.1,
            "return_dict_in_generate": True,
        }

        # Ban the unknown token only when the tokenizer actually defines one;
        # a [[None]] entry is not a valid bad-words list.
        if tokenizer.unk_token_id is not None:
            generate_kwargs["bad_words_ids"] = [[tokenizer.unk_token_id]]

        return self.model.generate(pixel_values, **generate_kwargs)

    # =====================================================
    # OUTPUT PARSER
    # =====================================================

    def postprocess(self, outputs):
        """Decode the first generated sequence and parse it with ``token2json``.

        Returns:
            The parsed structure, or ``{"raw_output": <text>}`` when parsing
            raises.
        """
        prediction = self.processor.batch_decode(
            outputs.sequences
        )[0]

        tokenizer = self.processor.tokenizer

        # Either special token may be undefined; str.replace(None, ...) raises.
        for special_token in (tokenizer.eos_token, tokenizer.pad_token):
            if special_token:
                prediction = prediction.replace(special_token, "")

        prediction = prediction.strip()

        try:
            return self.processor.token2json(prediction)
        except Exception:
            # Best effort: hand back the decoded text rather than failing.
            logging.debug(
                "token2json failed; returning raw output",
                exc_info=True
            )
            return {
                "raw_output": prediction
            }

    # =====================================================
    # MAIN EXECUTION PIPELINE
    # =====================================================

    @staticmethod
    def _load_image(inputs: Any) -> Image.Image:
        """Resolve a payload input (path or PIL image) to a PIL image.

        Raises:
            FileNotFoundError: If a path is given that does not exist.
            TypeError: If *inputs* is neither a path nor a PIL image.
        """
        if isinstance(inputs, Image.Image):
            return inputs

        if isinstance(inputs, (str, os.PathLike)):
            if not os.path.exists(inputs):
                raise FileNotFoundError(inputs)

            # Image.open is lazy and keeps the file open; convert() forces the
            # pixel data to load so the handle can be closed right away.
            with Image.open(inputs) as opened:
                return opened.convert("RGB")

        raise TypeError(
            "Unsupported input type"
        )

    def __call__(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Run the full pipeline for one payload.

        Args:
            payload: Mapping with ``"inputs"`` (image path or PIL image) and an
                optional ``"task"`` name (defaults to ``"receipt"``).

        Returns:
            On success: ``{"success": True, "device", "task", "prediction"}``.
            On any failure: ``{"success": False, "error": <message>}``; the
            exception is logged and not re-raised.
        """
        try:
            # =============================================
            # INPUT EXTRACTION
            # =============================================

            inputs = payload.get("inputs")
            task = payload.get(
                "task",
                "receipt"
            )

            if inputs is None:
                raise ValueError(
                    "Missing 'inputs'"
                )

            # =============================================
            # IMAGE LOADING
            # =============================================

            image = self._load_image(inputs)

            # =============================================
            # PREPROCESS
            # =============================================

            pixel_values = self.preprocess(image)

            # =============================================
            # TASK ROUTING
            # =============================================

            decoder_input_ids = self.get_decoder_ids(task)

            # =============================================
            # AI INFERENCE
            # =============================================

            outputs = self.inference(
                pixel_values,
                decoder_input_ids
            )

            # =============================================
            # POSTPROCESS
            # =============================================

            prediction = self.postprocess(outputs)

            return {
                "success": True,
                "device": DEVICE,
                "task": task,
                "prediction": prediction
            }

        except Exception as error:
            # Top-level boundary: log the traceback, report failure to caller.
            logging.exception(
                "Omega inference failure"
            )

            return {
                "success": False,
                "error": str(error)
            }


# =========================================================
# TEST EXECUTION
# =========================================================

if __name__ == "__main__":

    engine = OmegaMultilingualVisionAI(
        "./model"
    )

    payload = {
        "inputs": "sample.png",
        "task": "receipt"
    }

    result = engine(payload)

    print(
        json.dumps(
            result,
            indent=2
        )
    )