ReSiReg Mini
Compact vision-language model (25M-parameter vision path), as described in the paper "ReSiReg: Towards Spatially Consistent Semantics in Language-Conditioned Robotic Tasks" by Simon Schwaiger, David Seyser, Alessandro Scherl, Wilfried Wöber, and Gerald Steinbauer-Wagner.
The vision path is kept as small as possible while retaining language grounding and spatial consistency in the patch tokens. This is attractive for robotic control tasks, where prompts typically need to be encoded only occasionally, whereas the vision path is queried frequently during controller updates.
Model details
- Architecture: EUPE image backbone + Vision-language tower. Projections from vision and SigLIP2 to a shared embedding space.
- Base models:
facebook/EUPE-ViT-S and google/siglip2-base-patch16-224.
- Trained on ~1M image-caption pairs from the cauldron, COCO caption, pexels_568, and datacomp_100k datasets.
Single-View ReSiReg Feature Reconstruction Demo
Minimal Example: Dense Image-Prompt Similarity
import torch
import torch.nn.functional as F
from PIL import Image
import requests
import matplotlib.pyplot as plt
from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
#-----------------------------------------------
## Instantiate model and load to target device
MODEL_ID = "SimonSchwaiger/resireg_mini"
# Prefer CUDA, then Apple MPS, and fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
# NOTE: trust_remote_code=True executes the custom modelling code shipped in the Hub repository.
model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True).to(device).eval()
#-----------------------------------------------
## Image inference
url = "https://raw.githubusercontent.com/simonschwaiger/otas/main/img/demo/dataset_demo.png"
# Fail fast on a stalled connection or an HTTP error instead of handing PIL an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True  # undo gzip/deflate content encoding before PIL reads the stream
img = Image.open(response.raw).convert("RGB")
with torch.no_grad():  # no gradients are needed for this visualisation
    dense = model.encode_image(img)  # [1, C, H, W]
#-----------------------------------------------
## Text inference
prompt = "wooden bridge"
prompts_neg = ["thing", "object", "stuff"]  # Optional negative prompts for multi-class segmentation
prompts = [prompt, *prompts_neg]  # the positive prompt always sits at index 0
with torch.no_grad():
    text = model.encode_text(prompts)  # [K, C] (K = 1 + n_negatives)
#-----------------------------------------------
## Calculate dense similarity and perform optional multi-class voting
dense_n = F.normalize(dense, dim=1)  # unit-length along the channel axis
text_n = F.normalize(text, dim=-1)
# Dot product of unit vectors = cosine similarity of every spatial location with every prompt.
sims = torch.einsum("bchw,kc->bkhw", dense_n, text_n)[0]  # [K, H, W]
if len(prompts) > 1:
    # Softmax across the prompt axis (temperature 2); keep the positive prompt's share.
    probs = F.softmax(sims / 2, dim=0)
    sim = probs[0]
else:
    sim = sims[0]
# Min-max normalise for display. The divisor is clamped to at least 0.2 so that a
# nearly uniform map is not stretched to full contrast.
sim_range = torch.clamp(sim.max() - sim.min(), min=0.2)
sim = (sim - sim.min()) / (sim_range + 1e-8)
#-----------------------------------------------
## Visualise resulting similarity heatmap
# Upsample the similarity map to the input image resolution before overlaying it.
heat = F.interpolate(
    sim.unsqueeze(0).unsqueeze(0),
    size=(img.height, img.width),
    mode="bilinear",
    align_corners=False,
)[0, 0].cpu().numpy()
plt.figure(figsize=(8, 8))
plt.imshow(img)
plt.imshow(heat, cmap="jet", alpha=0.45)
plt.axis("off")
plt.title(f"Patch/Text Cosine Similarity (ReSiReg-Lite): '{prompt}'")
plt.tight_layout()
plt.show()
Citation
@article{schwaiger2026_resireg,
title = {{ReSiReg}: Towards Spatially Consistent Semantics in Language-Conditioned Robotic Tasks},
author = {Schwaiger, Simon and Seyser, David and Scherl, Alessandro and W{\"o}ber, Wilfried and Steinbauer-Wagner, Gerald},
journal = {arXiv preprint arXiv:2606.19088},
year = {2026},
url = {https://arxiv.org/abs/2606.19088}
}
License and attribution
This repository is released under CC-BY-SA-4.0 for original code and documentation in this repo.
This model build depends on upstream components with their own licenses:
- EUPE (facebook/EUPE, facebook/EUPE-ViT-S): FAIR Noncommercial Research License
- Source: EUPE LICENSE.md
- Weights card: facebook/EUPE-ViT-S
- SigLIP2 (google/siglip2-base-patch16-224): Apache-2.0
- C-RADIOv3-B (used for RADSeg teacher distillation): NVIDIA Open Model License
- Source: nvidia/C-RADIOv3-B README
- Also, big thanks to RADSeg for their improved dense language grounding mechanism for RADIO.
To note: EUPE is noncommercial-research licensed, so downstream usage of this combined model must comply with that restriction.
Tensor-Only API
The model also has a tensor-only API that propagates gradients through our trained vision-language head and the linear projection, which allows for potential fine-tuning.
import torch
import torch.nn.functional as F
from PIL import Image
import requests
import matplotlib.pyplot as plt
from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
# Prefer CUDA, then Apple MPS, and fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
#-----------------------------------------------
## Instantiate model and load to target device
MODEL_ID = "SimonSchwaiger/resireg_mini"
# NOTE: trust_remote_code=True executes the custom code shipped in the Hub repository.
model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
image_processor = AutoImageProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
#-----------------------------------------------
## Load image, positive and optional negative prompts
url = "https://raw.githubusercontent.com/simonschwaiger/otas/main/img/demo/dataset_demo.png"
# Fail fast on a stalled connection or an HTTP error instead of handing PIL an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True  # undo gzip/deflate content encoding before PIL reads the stream
img = Image.open(response.raw).convert("RGB")
prompt = "wooden bridge"
prompts_neg = ["thing", "object", "stuff"]  # Optional negative prompts for multi-class segmentation
#-----------------------------------------------
## Combined inference. The model.forward also supports individually encoding images or text
prompts = [prompt, *prompts_neg]  # the positive prompt always sits at index 0
pixel_values = image_processor(images=img, return_tensors="pt")["pixel_values"].to(device)
tok = tokenizer(prompts, padding="max_length", truncation=True, max_length=64, return_tensors="pt")
tok = {k: v.to(device) for k, v in tok.items()}
# Gradients are disabled here because this demo only visualises the result;
# drop the no_grad context when fine-tuning.
with torch.no_grad():
    out = model(
        pixel_values=pixel_values,
        input_ids=tok["input_ids"],
        attention_mask=tok.get("attention_mask"),
    )
dense = out.dense_embeds_resireg_lite  # [1, C, H, W]
text = out.text_embeds  # [K, C] (K = 1 + n_negatives)
del out  # only the two embeddings are used below
#-----------------------------------------------
## Calculate dense similarity and perform optional multi-class voting
dense_n = F.normalize(dense, dim=1)  # unit-length along the channel axis
text_n = F.normalize(text, dim=-1)
# Dot product of unit vectors = cosine similarity of every spatial location with every prompt.
sims = torch.einsum("bchw,kc->bkhw", dense_n, text_n)[0]  # [K, H, W]
if len(prompts) > 1:
    # Softmax across the prompt axis (temperature 2); keep the positive prompt's share.
    probs = F.softmax(sims / 2, dim=0)
    sim = probs[0]
else:
    sim = sims[0]
# Min-max normalise for display. The divisor is clamped to at least 0.2 so that a
# nearly uniform map is not stretched to full contrast.
sim_range = torch.clamp(sim.max() - sim.min(), min=0.2)
sim = (sim - sim.min()) / (sim_range + 1e-8)
#-----------------------------------------------
## Visualise resulting similarity heatmap
# Upsample the similarity map to the input image resolution before overlaying it.
heat = F.interpolate(
    sim.unsqueeze(0).unsqueeze(0),
    size=(img.height, img.width),
    mode="bilinear",
    align_corners=False,
)[0, 0].cpu().numpy()
plt.figure(figsize=(8, 8))
plt.imshow(img)
plt.imshow(heat, cmap="jet", alpha=0.45)
plt.axis("off")
plt.title(f"Patch/Text Cosine Similarity (ReSiReg-Lite): '{prompt}'")
plt.tight_layout()
plt.show()
- Downloads last month
- 522