""" MODA-Fashion-Vision-FP16 — Standalone Inference Script Model: ViT-B/16-SigLIP (vision tower ONLY, no text encoder) Parameters: 92.9M (54% fewer than full CLIP) Dimensions: 768 Precision: float16 weights (loaded as fp32 for CPU, or fp16 on GPU) Size: 186 MB (4.2x smaller than full CLIP) Benchmark: 67.42% Fine R@1 on LookBench (-0.21 pp vs full model) Ideal for: Edge / mobile / serverless deployment where model size matters. The text encoder is removed because image-to-image retrieval never uses it. Usage: python inference.py --image path/to/image.jpg python inference.py --image img1.jpg img2.jpg --similarity """ import argparse from pathlib import Path import open_clip import torch import torch.nn.functional as F from PIL import Image from safetensors.torch import load_file MODEL_DIR = Path(__file__).parent def load_model(device="cpu"): """Load MODA-Fashion-Vision-FP16 from local directory.""" base_model, _, preprocess = open_clip.create_model_and_transforms( "hf-hub:Marqo/marqo-fashionSigLIP" ) vision_sd = load_file(str(MODEL_DIR / "vision_encoder.safetensors")) vision_sd_fp32 = {k: v.float() for k, v in vision_sd.items()} full_sd = base_model.state_dict() for k, v in vision_sd_fp32.items(): full_sd[k] = v base_model.load_state_dict(full_sd, strict=True) encoder = base_model.visual.to(device).eval() return encoder, preprocess def encode_images(encoder, preprocess, image_paths, device="cpu"): """Encode images to 768-d L2-normalized embeddings.""" images = torch.stack([ preprocess(Image.open(p).convert("RGB")) for p in image_paths ]).to(device) with torch.no_grad(): emb = encoder(images) emb = F.normalize(emb, p=2, dim=-1) return emb def main(): parser = argparse.ArgumentParser(description="MODA-Fashion-Vision-FP16 inference") parser.add_argument("--image", nargs="+", required=True, help="Image path(s)") parser.add_argument("--similarity", action="store_true", help="Print pairwise cosine similarity matrix") parser.add_argument("--device", default="cpu", help="Device: cpu, cuda, mps") args = parser.parse_args() print("Loading MODA-Fashion-Vision-FP16 ...") encoder, preprocess = load_model(args.device) print(f" Parameters : 92.9M (vision tower only)") print(f" Embedding : 768-d, L2-normalized") print(f" Precision : FP16 weights (loaded to fp32 on CPU)") print(f" Size : 186 MB (4.2x smaller than full CLIP)") print(f" Device : {args.device}") embeddings = encode_images(encoder, preprocess, args.image, args.device) print(f"\nEncoded {len(args.image)} image(s) → shape {embeddings.shape}") for i, path in enumerate(args.image): norm = embeddings[i].norm().item() print(f" [{i}] {Path(path).name} norm={norm:.4f} first5={embeddings[i, :5].tolist()}") if args.similarity and len(args.image) > 1: sim = embeddings @ embeddings.T print(f"\nCosine similarity matrix:") names = [Path(p).stem[:20] for p in args.image] header = "".ljust(22) + "".join(n.rjust(12) for n in names) print(header) for i, name in enumerate(names): row = name.ljust(22) + "".join(f"{sim[i,j].item():12.4f}" for j in range(len(names))) print(row) if __name__ == "__main__": main()