katerynaCh committed on
Commit
9591148
·
verified ·
1 Parent(s): 9fa0db6

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ ---
4
+ # Model Overview
5
+
6
+ ### Description:
7
+ NVIDIA Nemotron Parse v1.2 is designed to understand document semantics and extract text and table elements with spatial grounding. Given an image, NVIDIA Nemotron Parse v1.2 produces structured annotations, including formatted text, bounding boxes, and the corresponding semantic classes, ordered according to the document's reading flow. It overcomes the shortcomings of traditional OCR technologies that struggle with complex document layouts with structural variability, and helps transform unstructured documents into actionable and machine-usable representations. This has several downstream benefits such as increasing the availability of training data for Large Language Models (LLMs), improving the accuracy of extractor, curator, retriever and AI agentic applications, and enhancing document understanding pipelines.<br>
8
+
9
+ This model is ready for commercial use. <br>
10
+
11
+ ## Quick Start
12
+
13
+ ### Install dependencies in your environment
14
+
15
+ You can use a public image _nvcr.io/nvidia/pytorch:25.03-py3_ with the following library versions installed on top:
16
+
17
+ ```bash
18
+ pip install accelerate==1.12.0
19
+ pip install albumentations==2.0.8
20
+ pip install transformers==4.51.3
21
+ pip install timm==1.0.22
22
+ ```
23
+
24
+ ### Usage example
25
+
26
+ ```python
27
+ import torch
28
+ from PIL import Image, ImageDraw
29
+ from transformers import AutoModel, AutoProcessor, AutoTokenizer, AutoConfig, AutoImageProcessor, GenerationConfig
30
+ from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
31
+
32
+ # Load model and processor
33
+ model_path = "nvidia/NVIDIA-Nemotron-Parse-v1.2" # Or use a local path
34
+ device = "cuda:0"
35
+
36
+ model = AutoModel.from_pretrained(
37
+ model_path,
38
+ trust_remote_code=True,
39
+ torch_dtype=torch.bfloat16
40
+ ).to(device).eval()
41
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
42
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
43
+
44
+ # Load image
45
+ image = Image.open("path/to/your/image.jpg")
46
+ task_prompt = "</s><s><predict_bbox><predict_classes><output_markdown><predict_no_text_in_pic>"
47
+ # task_prompt = "</s><s><predict_bbox><predict_classes><output_markdown><predict_text_in_pic>"
48
+
49
+ # Process image
50
+ inputs = processor(images=[image], text=task_prompt, return_tensors="pt", add_special_tokens=False).to(device)
51
+
52
+ generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True)
53
+ # Generate text
54
+ outputs = model.generate(**inputs, generation_config=generation_config)
55
+
56
+ # Decode the generated text
57
+ generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
58
+ ```
59
+ ### Postprocessing
60
+
61
+ ```python
62
+ from PIL import Image, ImageDraw
63
+ from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
64
+
65
+ classes, bboxes, texts = extract_classes_bboxes(generated_text)
66
+ bboxes = [transform_bbox_to_original(bbox, image.width, image.height) for bbox in bboxes]
67
+
68
+ # Specify output formats for postprocessing
69
+ table_format = 'latex' # latex | HTML | markdown
70
+ text_format = 'markdown' # markdown | plain
71
+ blank_text_in_figures = False # remove text inside 'Picture' class
72
+ texts = [postprocess_text(text, cls = cls, table_format=table_format, text_format=text_format, blank_text_in_figures=blank_text_in_figures) for text, cls in zip(texts, classes)]
73
+
74
+ for cl, bb, txt in zip(classes, bboxes, texts):
75
+ print(cl, ': ', txt)
76
+
77
+ draw = ImageDraw.Draw(image)
78
+ for bbox in bboxes:
79
+ draw.rectangle((bbox[0], bbox[1], bbox[2], bbox[3]), outline="red")
80
+ ```
81
+
82
+ ## Inference with VLLM
83
+ Nemotron-Parse-v1.2 is [available in vllm main](https://github.com/vllm-project/vllm/pull/30864) and can be found in [vllm/vllm-openai:v0.14.1 docker image](https://hub.docker.com/layers/vllm/vllm-openai/v0.14.1/images/sha256-8e67731819426f7df194e5a0dfd6649d3aa3474f80c44f75b1e8711e76f8030a).
84
+
85
+ Note: when running on A100/A10 we recommend running `vllm serve` with `--attention-backend=TRITON_ATTN`.
86
+
87
+ You will need to install the following dependencies on top, and then follow the VLLM Inference example below:
88
+ ```bash
89
+ pip install albumentations timm open_clip_torch
90
+ ```
91
+
92
+ ### VLLM Inference example
93
+
94
+ #### Option 1: end-to-end python inference
95
+ ```python
96
+ from vllm import LLM, SamplingParams
97
+ from PIL import Image
98
+
99
+ def main():
100
+ sampling_params = SamplingParams(
101
+ temperature=0,
102
+ top_k=1,
103
+ repetition_penalty=1.1,
104
+ max_tokens=9000,
105
+ skip_special_tokens=False,
106
+ )
107
+
108
+ llm = LLM(
109
+ model="nvidia/NVIDIA-Nemotron-Parse-v1.2",
110
+ max_num_seqs=64,
111
+ limit_mm_per_prompt={"image": 1},
112
+ dtype="bfloat16",
113
+ trust_remote_code=True,
114
+ )
115
+
116
+ image = Image.open("<YOUR-IMAGE-PATH>")
117
+
118
+ prompts = [
119
+ { # Implicit prompt
120
+ "prompt": "</s><s><predict_bbox><predict_classes><output_markdown><predict_no_text_in_pic>",
121
+ "multi_modal_data": {
122
+ "image": image
123
+ },
124
+ },
125
+ { # Explicit encoder/decoder prompt
126
+ "encoder_prompt": {
127
+ "prompt": "",
128
+ "multi_modal_data": {
129
+ "image": image
130
+ },
131
+ },
132
+ "decoder_prompt": "</s><s><predict_bbox><predict_classes><output_markdown><predict_no_text_in_pic>",
133
+ },
134
+ ]
135
+
136
+ outputs = llm.generate(prompts, sampling_params)
137
+
138
+ for output in outputs:
139
+ prompt = output.prompt
140
+ generated_text = output.outputs[0].text
141
+ print(f"Decoder prompt: {prompt!r}, Generated text: {generated_text!r}")
142
+
143
+ if __name__ == "__main__":
144
+ main()
145
+ ```
146
+
147
+ #### Option 2: vllm serve
148
+
149
+ Alternatively, you can start a vllm server as:
150
+ ```bash
151
+ vllm serve nvidia/NVIDIA-Nemotron-Parse-v1.2 \
152
+ --dtype bfloat16 \
153
+ --max-num-seqs 8 \
154
+ --limit-mm-per-prompt '{"image": 1}' \
155
+ --trust-remote-code \
156
+ --port 8000 \
157
+ --chat-template chat_template.jinja
158
+ ```
159
+ with *chat_template.jinja* provided in this repository. Then, you can run inference as:
160
+
161
+ ```python
162
+ import base64
163
+ from openai import OpenAI
164
+
165
+ client = OpenAI(
166
+ base_url="http://localhost:8000/v1",
167
+ )
168
+
169
+ # Read and base64-encode the image
170
+ with open("<your-image-path>", "rb") as f:
171
+ img_b64 = base64.b64encode(f.read()).decode("utf-8")
172
+ prompt_text = "</s><s><predict_bbox><predict_classes><output_markdown><predict_no_text_in_pic>"
173
+
174
+ resp = client.chat.completions.create(
175
+ model="nvidia/NVIDIA-Nemotron-Parse-v1.2",
176
+ messages=[
177
+ {
178
+ "role": "user",
179
+ "content": [
180
+ {
181
+ "type": "text",
182
+ "text": prompt_text,
183
+ },
184
+ {
185
+ "type": "image_url",
186
+ "image_url": {
187
+ "url": f"data:image/png;base64,{img_b64}",
188
+ },
189
+ },
190
+ ],
191
+ }
192
+ ],
193
+ max_tokens=9000,
194
+ temperature=0.0,
195
+ extra_body={
196
+ "repetition_penalty": 1.1,
197
+ "top_k": 1,
198
+ "skip_special_tokens": False,
199
+ },
200
+ )
201
+ print(resp.choices[0].message.content)
202
+ ```
203
+
204
+ *Note:* we recommend using the default prompt that extracts bounding boxes, classes, and text in markdown formatting for all use cases (`</s><s><predict_bbox><predict_classes><output_markdown><predict_no_text_in_pic>` or `</s><s><predict_bbox><predict_classes><output_markdown><predict_text_in_pic>`). If you only need layout information, you can instead use the prompt that omits text extraction and outputs only bounding boxes and classes: `</s><s><predict_bbox><predict_classes><output_no_text><predict_no_text_in_pic>`.
205
+
206
+ #### Logits processors
207
+ With Nemotron-Parse-v1.2 we share 2 logits processors available in logitsprocessors/ dir for vllm and in hf_logits_processor.py for the python model.
208
+ * `NemotronParseRepetitionStopProcessor` - when used during generation, detects repeating n-grams and forces the model to close the current `<x_><y_>` block when it detects potential hallucination.
209
+ * `NemotronParseTableInsertionLogitsProcessor` - forces every block to follow a table structure (useful if, e.g., you are running the model on table image crops).
210
+
211
+ Please refer to example_with_processor.py for example usage with the Python model. With vllm, you can provide these as arguments to `vllm serve`, after adding logitsprocs/ to PYTHONPATH, e.g.:
212
+ ```
213
+ vllm serve nvidia/NVIDIA-Nemotron-Parse-v1.2 \
214
+ --dtype bfloat16 \
215
+ --max-num-seqs 4 \
216
+ --limit-mm-per-prompt '{"image": 1}' \
217
+ --attention-backend=TRITON_ATTN \
218
+ --trust-remote-code \
219
+ --logits-processors nemotron_parse_vllm_logitprocs:NemotronParseTableInsertionLogitsProcessor \
220
+ --port 8000
221
+ ```
222
+
223
+ An example of inference with vllm openai server is available in vllm_example.py
224
+
225
+ ### License/Terms of Use
226
+
227
+ GOVERNING TERMS: [Product-Specific Terms for NVIDIA AI Products](https://www.nvidia.com/en-us/agreements/enterprise-software/product-specific-terms-for-ai-products/). Use of this model is governed by the [NVIDIA Community Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-community-models-license/). Use of the tokenizer included in this model is governed by the [CC-BY-4.0 license](https://creativecommons.org/licenses/by/4.0/).<br>
228
+
229
+
230
+ ### Deployment Geography:
231
+ Global<br>
232
+
233
+ ### Use Case: <br>
234
+ NVIDIA Nemotron Parse v1.2 will be capable of comprehensive text understanding and document structure understanding. It will be used in retriever and curator solutions. Its text extraction datasets and capabilities will help with LLM and VLM training, as well as improve run-time inference accuracy of VLMs. The NVIDIA Nemotron Parse v1.2 model will perform text extraction from PDF and PPT documents. The NVIDIA Nemotron Parse v1.2 can classify the objects (title, section, caption, index, footnote, lists, tables, bibliography, image) in a given document, and provide bounding boxes with coordinates.<br>
235
+
236
+ ### Release Date: <br>
237
+ Hugging Face [02/17/2026] via [[URL](https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.2)] <br>
238
+
239
+ ## Reference(s):
240
+ * https://huggingface.co/docs/transformers/en/model_doc/mbart <br>
241
+
242
+ ## Model Architecture:
243
+ **Architecture Type:** Transformer-based vision-encoder-decoder model<br>
244
+
245
+ **Network Architecture:** <br>
246
+ * Vision Encoder: ViT-H model (https://huggingface.co/nvidia/C-RADIO)<br>
247
+ * Adapter Layer: 1D convolutions & norms to compress dimensionality and sequence length of the latent space (1280 tokens to 320 tokens)<br>
248
+ * Decoder: mBart [1] 10 blocks<br>
249
+ * Tokenizer: Use of the tokenizer included in this model is governed by the [CC-BY-4.0 license](https://creativecommons.org/licenses/by/4.0/)<br>
250
+ * Number of Parameters: < 1B<br>
251
+
252
+ ## Input: <br>
253
+ * Input Type: Image, Text<br>
254
+ * Input Format(s): Red, Green, Blue (RGB) + Prompt (String)
255
+ * Input Parameters: Two-Dimensional (2D), One-Dimensional (1D)
256
+ * Other Properties Related to Input:
257
+ - Max Input Resolution (Width, Height): 1664, 2048
258
+ - Min Input Resolution (Width, Height): 1024, 1280
259
+ * Channel Count: 3
260
+
261
+ ## Output: <br>
262
+ * Output Type: Text<br>
263
+ * Output Format: String
264
+ * Output Parameters: One-Dimensional (1D)
265
+ * Other Properties Related to Output: Nemotron-parse output format is a string which encodes text content (formatted or not) as well as bounding boxes and class attributes.<br>
266
+
267
+ Our AI models are designed and/or optimized to run on NVIDIA GPU-accelerated systems. By leveraging NVIDIA’s hardware (e.g. GPU cores) and software frameworks (e.g., CUDA libraries), the model achieves faster training and inference times compared to CPU-only solutions. <br>
268
+
269
+ ## Software Integration:
270
+ **Runtime Engine(s):**
271
+ * TensorRT-LLM <br>
272
+ * vLLM <br>
273
+
274
+
275
+ **Supported Hardware Microarchitecture Compatibility:** <br>
276
+ * NVIDIA Ampere <br>
277
+ * NVIDIA Blackwell <br>
278
+ * NVIDIA Hopper <br>
279
+ * NVIDIA Turing <br>
280
+
281
+
282
+ **Supported Operating System(s):**
283
+ * [Linux] <br>
284
+
285
+ The integration of foundation and fine-tuned models into AI systems requires additional testing using use-case-specific data to ensure safe and effective deployment. Following the V-model methodology, iterative testing and validation at both unit and system levels are essential to mitigate risks, meet technical and functional requirements, and ensure compliance with safety and ethical standards before deployment. <br>
286
+
287
+
288
+ ## Model Version(s):
289
+ Nemotron Parse 1.2 <br>
290
+
291
+
292
+ ## Training, Testing, and Evaluation Datasets:
293
+
294
+ ### Training Dataset
295
+
296
+ **Image Training Data Size** <br>
297
+ * [1 Million to 1 Billion Images] <br>
298
+
299
+ **Text Training Data Size** <br>
300
+ * [1 Billion to 10 Trillion Tokens] <br>
301
+
302
+ **Data Collection Method by dataset** <br>
303
+ * Hybrid: Automated, Human, Synthetic <br>
304
+
305
+ **Labeling Method by dataset** <br>
306
+ * Hybrid: Automated, Human, Synthetic <br>
307
+
308
+ **Properties (Quantity, Dataset Descriptions, Sensor(s)):** The training set contains millions of image–text items, aggregated across many large document and table datasets totaling several terabytes of data. The data consists of document-page and table images paired with OCR text, bounding boxes, and layout labels, drawn from real-world sources (scientific papers, PDFs, Wikipedia pages) as well as fully synthetic tables and word/character renderings. Modalities are primarily images plus associated text and structural annotations; content spans public-domain resources, and synthetic data. Images are obtained by rendering digital documents or generating synthetic layouts, and annotations come from OCR/layout models, third-party OCR services, and human labeling. <br>
309
+
310
+
311
+ # Inference:
312
+ **Acceleration Engine:** TensorRT-LLM, vLLM <br>
313
+ **Test Hardware:** <br>
314
+ * H100 <br>
315
+ * A100 <br>
316
+
317
+ ## Ethical Considerations:
318
+ NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse. <br>
319
+
320
+ Please make sure you have proper rights and permissions for all input image and video content; if image or video includes people, personal health information, or intellectual property, the image or video generated will not blur or maintain proportions of image subjects included. <br>
321
+
322
+ For more detailed information on ethical considerations for this model, please see the Model Card++ [Bias](https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.2/blob/main/bias.md), [Explainability](https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.2/blob/main/explainability.md), [Safety & Security](https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.2/blob/main/safety.md), and [Privacy](https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.2/blob/main/privacy.md) Subcards.
323
+ Please report model quality, risk, security vulnerabilities or NVIDIA AI Concerns [here](https://app.intigriti.com/programs/nvidia/nvidiavdp/detail).
324
+ <br>
bias.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ ## Bias
2
+ | Field | Response |
3
+ | :---- | :---- |
4
+ | Participation considerations from adversely impacted groups in model design and testing: | None |
5
+ | Measures taken to mitigate against unwanted bias: | None|
6
+ | Bias Metric (If Measured): | None |
chat_template.jinja ADDED
@@ -0,0 +1 @@
 
 
1
+ {% for message in messages %}{{ message['content'] }}{% endfor %}
config.json ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "NemotronParseForConditionalGeneration"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "hf_nemotron_parse_config.NemotronParseConfig",
7
+ "AutoModel": "hf_nemotron_parse_modeling.NemotronParseForConditionalGeneration",
8
+ "AutoImageProcessor": "hf_nemotron_parse_processor.NemotronParseImageProcessor",
9
+ "AutoProcessor": "hf_nemotron_parse_processor.NemotronParseProcessor"
10
+ },
11
+ "bos_token_id": 0,
12
+ "decoder": {
13
+ "_attn_implementation": "sdpa",
14
+ "_name_or_path": "",
15
+ "activation_dropout": 0.0,
16
+ "activation_function": "gelu",
17
+ "add_cross_attention": true,
18
+ "add_final_layer_norm": true,
19
+ "architectures": null,
20
+ "attention_dropout": 0.0,
21
+ "bad_words_ids": null,
22
+ "begin_suppress_tokens": null,
23
+ "bos_token_id": 0,
24
+ "chunk_size_feed_forward": 0,
25
+ "classifier_dropout": 0.0,
26
+ "cross_attention_hidden_size": null,
27
+ "d_model": 1024,
28
+ "decoder_attention_heads": 16,
29
+ "decoder_ffn_dim": 4096,
30
+ "decoder_layerdrop": 0.0,
31
+ "decoder_layers": 10,
32
+ "decoder_start_token_id": null,
33
+ "diversity_penalty": 0.0,
34
+ "do_sample": false,
35
+ "dropout": 0.1,
36
+ "early_stopping": false,
37
+ "encoder_attention_heads": 16,
38
+ "encoder_ffn_dim": 4096,
39
+ "encoder_layerdrop": 0.0,
40
+ "encoder_layers": 12,
41
+ "encoder_no_repeat_ngram_size": 0,
42
+ "eos_token_id": 2,
43
+ "exponential_decay_length_penalty": null,
44
+ "finetuning_task": null,
45
+ "forced_bos_token_id": null,
46
+ "forced_eos_token_id": 2,
47
+ "hidden_size": 1024,
48
+ "id2label": {
49
+ "0": "LABEL_0",
50
+ "1": "LABEL_1",
51
+ "2": "LABEL_2"
52
+ },
53
+ "init_std": 0.02,
54
+ "is_decoder": true,
55
+ "is_encoder_decoder": false,
56
+ "label2id": {
57
+ "LABEL_0": 0,
58
+ "LABEL_1": 1,
59
+ "LABEL_2": 2
60
+ },
61
+ "length_penalty": 1.0,
62
+ "max_length": 20,
63
+ "min_length": 0,
64
+ "model_type": "nemotron_parse_text",
65
+ "no_repeat_ngram_size": 0,
66
+ "num_beam_groups": 1,
67
+ "num_beams": 1,
68
+ "num_hidden_layers": 12,
69
+ "num_return_sequences": 1,
70
+ "output_attentions": false,
71
+ "output_hidden_states": false,
72
+ "output_scores": false,
73
+ "pad_token_id": 1,
74
+ "prefix": null,
75
+ "problem_type": null,
76
+ "pruned_heads": {},
77
+ "remove_invalid_values": false,
78
+ "repetition_penalty": 1.0,
79
+ "return_dict": true,
80
+ "return_dict_in_generate": false,
81
+ "scale_embedding": true,
82
+ "sep_token_id": null,
83
+ "suppress_tokens": null,
84
+ "task_specific_params": null,
85
+ "temperature": 1.0,
86
+ "tf_legacy_loss": false,
87
+ "tie_encoder_decoder": false,
88
+ "tie_word_embeddings": false,
89
+ "tokenizer_class": null,
90
+ "top_k": 50,
91
+ "top_p": 1.0,
92
+ "torch_dtype": "bfloat16",
93
+ "torchscript": false,
94
+ "transformers_version": "4.51.3",
95
+ "typical_p": 1.0,
96
+ "use_bfloat16": true,
97
+ "use_cache": true,
98
+ "vocab_size": 52352
99
+ },
100
+ "decoder_start_token_id": 2,
101
+ "encoder": {
102
+ "attn_implementation": "eager",
103
+ "_name_or_path": "nvidia/C-RADIOv2-H",
104
+ "adaptor_configs": {},
105
+ "adaptor_names": null,
106
+ "add_cross_attention": false,
107
+ "architectures": [
108
+ "RADIOModel"
109
+ ],
110
+ "args": {
111
+ "aa": null,
112
+ "amp": true,
113
+ "amp_dtype": "bfloat16",
114
+ "amp_impl": "native",
115
+ "aug_repeats": 0,
116
+ "aug_splits": 0,
117
+ "bn_eps": null,
118
+ "bn_momentum": null,
119
+ "cache_dir": null,
120
+ "channels_last": false,
121
+ "checkpoint_hist": 10,
122
+ "chk_keep_forever": 100,
123
+ "class_map": "",
124
+ "clip_grad": null,
125
+ "clip_mode": "norm",
126
+ "cls_token_per_teacher": true,
127
+ "coco_annotations_file": "/datasets/coco2017-adlsa/annotations/captions_val2017.json",
128
+ "coco_image_dir": "/datasets/coco2017-adlsa/val2017",
129
+ "color_jitter": 0.4,
130
+ "cooldown_epochs": 0,
131
+ "cpe_max_size": 2048,
132
+ "crd_loss": false,
133
+ "crd_loss_weight": 0.8,
134
+ "crop_pct": null,
135
+ "cutmix": 0.0,
136
+ "cutmix_minmax": null,
137
+ "dataset_download": false,
138
+ "debug_full_knn": false,
139
+ "decay_epochs": 90,
140
+ "decay_milestones": [
141
+ 90,
142
+ 180,
143
+ 270
144
+ ],
145
+ "decay_rate": 0.1,
146
+ "depchain": true,
147
+ "dist_bn": "reduce",
148
+ "dist_norm_weight": 0.0,
149
+ "distributed": true,
150
+ "drop": 0.0,
151
+ "drop_block": null,
152
+ "drop_connect": null,
153
+ "drop_path": null,
154
+ "dtype": "bfloat16",
155
+ "epoch_repeats": 0.0,
156
+ "eval": false,
157
+ "eval_metric": "knn_top1",
158
+ "eval_teacher": false,
159
+ "eval_teacher_only": false,
160
+ "eval_throughput": false,
161
+ "fast_norm": false,
162
+ "fd_loss_fn": "MSE",
163
+ "feature_normalization": "SHIP_NORM",
164
+ "feature_summarizer": "cls_token",
165
+ "feature_upscale_factor": null,
166
+ "force_new_wandb_id": false,
167
+ "force_spectral_reparam": true,
168
+ "freeze_bn": false,
169
+ "fsdp": false,
170
+ "fuser": "",
171
+ "gp": null,
172
+ "grad_accum_steps": 1,
173
+ "grad_checkpointing": false,
174
+ "head_init_bias": null,
175
+ "head_init_scale": null,
176
+ "head_warmup": 5,
177
+ "head_weight_decay": 0.001,
178
+ "hflip": 0.5,
179
+ "img_size": null,
180
+ "in_chans": null,
181
+ "initial_checkpoint": null,
182
+ "input_size": null,
183
+ "interpolation": "",
184
+ "layer_decay": null,
185
+ "local_rank": 0,
186
+ "log_interval": 50,
187
+ "log_mlflow": false,
188
+ "log_wandb": true,
189
+ "loss_auto_balance": false,
190
+ "lr_base": 0.1,
191
+ "lr_base_scale": "",
192
+ "lr_base_size": 256,
193
+ "lr_cycle_decay": 0.5,
194
+ "lr_cycle_limit": 1,
195
+ "lr_cycle_mul": 1.0,
196
+ "lr_k_decay": 1.0,
197
+ "lr_noise": null,
198
+ "lr_noise_pct": 0.67,
199
+ "lr_noise_std": 1.0,
200
+ "mean": null,
201
+ "mesa": false,
202
+ "min_lr": 0,
203
+ "mixup": 0.0,
204
+ "mixup_mode": "batch",
205
+ "mixup_off_epoch": 0,
206
+ "mixup_prob": 1.0,
207
+ "mixup_switch_prob": 0.5,
208
+ "mlp_hidden_size": 1520,
209
+ "mlp_num_inner": 3,
210
+ "mlp_version": "v2",
211
+ "model": "vit_huge_patch16_224",
212
+ "model_kwargs": {},
213
+ "model_norm": false,
214
+ "momentum": 0.9,
215
+ "no_aug": false,
216
+ "no_ddp_bb": true,
217
+ "no_prefetcher": false,
218
+ "no_resume_opt": false,
219
+ "num_classes": null,
220
+ "opt_betas": null,
221
+ "opt_eps": null,
222
+ "patience_epochs": 10,
223
+ "pin_mem": false,
224
+ "prefetcher": true,
225
+ "pretrained": false,
226
+ "rank": 0,
227
+ "ratio": [
228
+ 0.75,
229
+ 1.3333333333333333
230
+ ],
231
+ "recount": 1,
232
+ "recovery_interval": 0,
233
+ "register_multiple": 8,
234
+ "remode": "pixel",
235
+ "reprob": 0.0,
236
+ "reset_loss_state": false,
237
+ "resplit": false,
238
+ "save_images": false,
239
+ "scale": [
240
+ 0.5,
241
+ 1.0
242
+ ],
243
+ "sched": "cosine",
244
+ "seed": 42,
245
+ "smoothing": 0.1,
246
+ "spectral_heads": false,
247
+ "spectral_reparam": false,
248
+ "split_bn": false,
249
+ "start_epoch": null,
250
+ "std": null,
251
+ "stream_teachers": true,
252
+ "sync_bn": false,
253
+ "synchronize_step": false,
254
+ "teachers": [
255
+ {
256
+ "fd_normalize": false,
257
+ "feature_distillation": true,
258
+ "input_size": 378,
259
+ "model": "ViT-H-14-378-quickgelu",
260
+ "name": "clip",
261
+ "pretrained": "dfn5b",
262
+ "type": "open_clip",
263
+ "use_summary": true
264
+ },
265
+ {
266
+ "fd_normalize": false,
267
+ "feature_distillation": true,
268
+ "input_size": 378,
269
+ "model": "ViT-SO400M-14-SigLIP-384",
270
+ "name": "siglip",
271
+ "pretrained": "webli",
272
+ "type": "open_clip",
273
+ "use_summary": true
274
+ },
275
+ {
276
+ "fd_normalize": false,
277
+ "feature_distillation": true,
278
+ "input_size": 378,
279
+ "model": "dinov2_vitg14_reg",
280
+ "name": "dino_v2",
281
+ "type": "dino_v2",
282
+ "use_summary": true
283
+ },
284
+ {
285
+ "fd_normalize": false,
286
+ "feature_distillation": true,
287
+ "input_size": 1024,
288
+ "model": "vit-h",
289
+ "name": "sam",
290
+ "type": "sam",
291
+ "use_summary": false
292
+ }
293
+ ],
294
+ "torchcompile": null,
295
+ "torchscript": false,
296
+ "train_interpolation": "random",
297
+ "train_split": "train",
298
+ "tta": 0,
299
+ "use_coco": false,
300
+ "use_multi_epochs_loader": false,
301
+ "val_ema_only": false,
302
+ "val_split": "val",
303
+ "vflip": 0.0,
304
+ "vitdet_version": 1,
305
+ "wandb_entity": "",
306
+ "wandb_job_type": "",
307
+ "wandb_name": "",
308
+ "wandb_project": "",
309
+ "warmup_lr": 1e-05,
310
+ "warmup_prefix": false,
311
+ "worker_seeding": "all",
312
+ "workers": 8,
313
+ "world_size": 256
314
+ },
315
+ "auto_map": {
316
+ "AutoConfig": "nvidia/C-RADIOv2-H--hf_model.RADIOConfig",
317
+ "AutoModel": "nvidia/C-RADIOv2-H--hf_model.RADIOModel"
318
+ },
319
+ "bad_words_ids": null,
320
+ "begin_suppress_tokens": null,
321
+ "bos_token_id": null,
322
+ "chunk_size_feed_forward": 0,
323
+ "cross_attention_hidden_size": null,
324
+ "decoder_start_token_id": null,
325
+ "diversity_penalty": 0.0,
326
+ "do_sample": false,
327
+ "early_stopping": false,
328
+ "encoder_no_repeat_ngram_size": 0,
329
+ "eos_token_id": null,
330
+ "exponential_decay_length_penalty": null,
331
+ "feature_normalizer_config": null,
332
+ "finetuning_task": null,
333
+ "forced_bos_token_id": null,
334
+ "forced_eos_token_id": null,
335
+ "id2label": {
336
+ "0": "LABEL_0",
337
+ "1": "LABEL_1"
338
+ },
339
+ "inter_feature_normalizer_config": null,
340
+ "is_decoder": false,
341
+ "is_encoder_decoder": false,
342
+ "label2id": {
343
+ "LABEL_0": 0,
344
+ "LABEL_1": 1
345
+ },
346
+ "length_penalty": 1.0,
347
+ "max_length": 20,
348
+ "max_resolution": 2048,
349
+ "min_length": 0,
350
+ "model_type": "",
351
+ "no_repeat_ngram_size": 0,
352
+ "num_beam_groups": 1,
353
+ "num_beams": 1,
354
+ "num_return_sequences": 1,
355
+ "output_attentions": false,
356
+ "output_hidden_states": false,
357
+ "output_scores": false,
358
+ "pad_token_id": null,
359
+ "patch_size": 16,
360
+ "preferred_resolution": [
361
+ 768,
362
+ 768
363
+ ],
364
+ "prefix": null,
365
+ "problem_type": null,
366
+ "pruned_heads": {},
367
+ "remove_invalid_values": false,
368
+ "repetition_penalty": 1.0,
369
+ "return_dict": true,
370
+ "return_dict_in_generate": false,
371
+ "sep_token_id": null,
372
+ "suppress_tokens": null,
373
+ "task_specific_params": null,
374
+ "temperature": 1.0,
375
+ "tf_legacy_loss": false,
376
+ "tie_encoder_decoder": false,
377
+ "tie_word_embeddings": true,
378
+ "tokenizer_class": null,
379
+ "top_k": 50,
380
+ "top_p": 1.0,
381
+ "torch_dtype": "bfloat16",
382
+ "torchscript": false,
383
+ "transformers_version": "4.51.3",
384
+ "typical_p": 1.0,
385
+ "use_bfloat16": false,
386
+ "version": "radio_v2.5-h",
387
+ "vitdet_window_size": null
388
+ },
389
+ "eos_token_id": 2,
390
+ "image_size": [
391
+ 2048,
392
+ 1664
393
+ ],
394
+ "is_encoder_decoder": true,
395
+ "max_sequence_length": 9000,
396
+ "model_type": "nemotron_parse",
397
+ "pad_token_id": 1,
398
+ "tie_word_embeddings": false,
399
+ "torch_dtype": "bfloat16",
400
+ "transformers_version": "4.51.3",
401
+ "vocab_size": 52329
402
+ }
example_with_processor.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Example usage of LogitsProcessors for document parsing.
3
+
4
+ This example shows how to use:
5
+ - TableInsertionLogitsProcessor: Force \\begin{tabular} at the start of every object
6
+ - RepetitionStopProcessor: Detect hallucination/repetition and force coordinate tokens
7
+ """
8
+
9
+ import torch
10
+ from PIL import Image, ImageDraw
11
+ from transformers import AutoModel, AutoProcessor, AutoTokenizer, GenerationConfig
12
+ from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
13
+ from hf_logits_processor import TableInsertionLogitsProcessor, RepetitionStopProcessor
14
+
15
+ # Load model and processor
16
+ model_path = "nvidia/NVIDIA-Nemotron-Parse-v1.2"  # Or use a local path
17
+ device = "cuda:0"
18
+
19
+ model = AutoModel.from_pretrained(
20
+ model_path,
21
+ trust_remote_code=True,
22
+ torch_dtype=torch.bfloat16
23
+ ).to(device).eval()
24
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
25
+ processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
26
+
27
+ # Load image
28
+ image = Image.open('example.png').convert("RGB")
29
+ task_prompt = "</s><s><predict_bbox><predict_classes><output_markdown><predict_no_text_in_pic>"
30
+
31
+ # Process image
32
+ inputs = processor(images=[image], text=task_prompt, return_tensors="pt", add_special_tokens=False).to(device)
33
+
34
+ generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True)
35
+
36
+ # Create the table processor - inserts \begin{tabular} after every <x_...><y_...> that starts a new object
37
+ table_processor = TableInsertionLogitsProcessor(
38
+ tokenizer=tokenizer,
39
+ table_prefix="\\begin{tabular}"
40
+ )
41
+
42
+ # Create the repetition stop processor - detects hallucination and forces <x_...> tokens
43
+ repetition_processor = RepetitionStopProcessor(
44
+ tokenizer=tokenizer,
45
+ max_repetitions=10, # Force stop after any pattern repeats 10+ times
46
+ ngram_sizes=[3, 4, 5, 6], # Check these n-gram sizes for repetition
47
+ window_size=500 # Only check the last 500 tokens
48
+ )
49
+
50
+ # Generate with both logits processors
51
+ outputs = model.generate(
52
+ **inputs,
53
+ generation_config=generation_config,
54
+ logits_processor=[table_processor, repetition_processor]
55
+ )
56
+
57
+ # Reset processor states for next generation (important for batch processing)
58
+ table_processor.reset()
59
+ repetition_processor.reset()
60
+
61
+ # Decode and process the generated text
62
+ generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
63
+ print(outputs)
64
+ print('--------------------------------')
65
+ print("Generated text:", generated_text)
66
+ print('--------------------------------')
67
+ classes, bboxes, texts = extract_classes_bboxes(generated_text)
68
+ bboxes = [transform_bbox_to_original(bbox, image.width, image.height) for bbox in bboxes]
69
+
70
+ # Specify output formats for postprocessing
71
+ table_format = 'HTML' # latex | HTML | markdown
72
+ text_format = 'markdown' # markdown | plain
73
+ blank_text_in_figures = False # remove text inside 'Picture' class
74
+
75
+ texts = [
76
+ postprocess_text(
77
+ text,
78
+ cls=cls,
79
+ table_format=table_format,
80
+ text_format=text_format,
81
+ blank_text_in_figures=blank_text_in_figures
82
+ )
83
+ for text, cls in zip(texts, classes)
84
+ ]
85
+
86
+ for cl, bb, txt in zip(classes, bboxes, texts):
87
+ print(cl, ': ', txt)
88
+
89
+ # OPTIONAL - Draw bounding boxes
90
+ draw = ImageDraw.Draw(image)
91
+ for bbox in bboxes:
92
+ draw.rectangle((bbox[0], bbox[1], (max(bbox[0], bbox[2])), (max(bbox[1], bbox[3]))), outline="red", width=2)
93
+
94
+ # Save or display the image
95
+ image.save("output_with_boxes.jpg")
96
+ # image.show()
explainability.md ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Explainability
2
+ | Field | Response |
3
+ | :---- | :---- |
4
+ | Intended Task/Domain: | Document Understanding & Text Extraction|
5
+ | Model Type: | Transformer-based vision-encoder-decoder model|
6
+ | Intended Users: | Generative AI creators working with conversational AI models and image content. |
7
+ | Output: | Text |
8
+ | Describe how the model works: | Generates text by predicting the next word or token based on the context provided in the input sequence using multiple self-attention layers. |
9
+ | Name the adversely impacted groups this has been tested to deliver comparable outcomes regardless of: | Not Applicable |
10
+ | Technical Limitations & Mitigation: | The model is vulnerable to alignment-breaking attacks. Users are advised to deploy language model guardrails alongside this model to prevent potentially harmful outputs. The model may generate answers that are inaccurate, omit key information, or include irrelevant or redundant text. |
11
+ | Verified to have met prescribed NVIDIA quality standards: | Yes |
12
+ | Performance Metrics: | Accuracy, Throughput, and User-side throughput |
13
+ | Potential Known Risks: | The model was optimized explicitly for instruction following and as such is more susceptible to prompt injection and jailbreaking in various forms as a result of its instruction tuning. This means that the model should be paired with additional rails or system filtering to limit exposure to instructions from malicious sources \-- either directly or indirectly by retrieval (e.g. via visiting a website) \-- as they may yield outputs that can lead to harmful, system-level outcomes up to and including remote code execution in agentic systems when effective security controls including guardrails are not in place. The model was trained on data that contains toxic language and societal biases originally crawled from the internet. Therefore, the model may amplify those biases and return toxic responses especially when prompted with toxic prompts. The model may generate answers that are inaccurate, omit key information, or include irrelevant or redundant text, and it may produce socially unacceptable or undesirable text even if the prompt itself does not include anything explicitly offensive. |
14
+ | Licensing: | GOVERNING TERMS: The NIM container is governed by the [NVIDIA Software License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-software-license-agreement/) and [Product-Specific Terms for NVIDIA AI Products](https://www.nvidia.com/en-us/agreements/enterprise-software/product-specific-terms-for-ai-products/). Use of this model is governed by the [NVIDIA Community Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-community-models-license/). Use of the tokenizer included in this model is governed by the [CC-BY-4.0 license](https://creativecommons.org/licenses/by/4.0/) |
15
+
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "eos_token_id": 2,
6
+ "forced_eos_token_id": 2,
7
+ "pad_token_id": 1,
8
+ "transformers_version": "4.51.3",
9
+ "max_new_tokens": 9000,
10
+ "do_sample": false,
11
+ "num_beams": 1,
12
+ "repetition_penalty": 1.1
13
+ }
hf_logits_processor.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LogitsProcessors for document parsing models.
3
+
4
+ Contains:
5
+ - TableInsertionLogitsProcessor: Forces table structure insertion after coordinates
6
+ - RepetitionStopProcessor: Detects repetition/hallucination and forces coordinate tokens
7
+ """
8
+
9
+ import torch
10
+ from collections import Counter
11
+ from transformers import LogitsProcessor
12
+ from typing import Set, List, Tuple
13
+
14
+
15
+ class TableInsertionLogitsProcessor(LogitsProcessor):
16
+ """
17
+ A LogitsProcessor that inserts \begin{tabular} tokens after coordinate pairs
18
+ that mark the START of an object (not the END coordinates).
19
+
20
+ Pattern: <x_start><y_start>CONTENT<x_end><y_end><class_...>
21
+
22
+ This processor triggers after the first <x_...><y_...> pair (start coords),
23
+ but not after the second pair (end coords).
24
+
25
+ Args:
26
+ tokenizer: The tokenizer used for encoding/decoding
27
+ table_prefix: The string to insert (default: "\\begin{tabular}")
28
+ """
29
+
30
+ def __init__(
31
+ self,
32
+ tokenizer,
33
+ table_prefix: str = "\\begin{tabular}"
34
+ ):
35
+ self.tokenizer = tokenizer
36
+ self.table_prefix = table_prefix
37
+
38
+ # Tokenize the table prefix to get the sequence of tokens to force
39
+ self.table_prefix_ids = tokenizer.encode(
40
+ table_prefix,
41
+ add_special_tokens=False
42
+ )
43
+
44
+ # Build sets of token IDs for detection
45
+ self._build_token_sets()
46
+
47
+ # State tracking for forced insertion
48
+ self._insertion_position = {} # batch_idx -> position in table_prefix_ids
49
+ self._insertion_active = {} # batch_idx -> bool
50
+
51
+ # State tracking for coordinate pairs
52
+ # False = expecting START coordinates (should trigger)
53
+ # True = expecting END coordinates (should NOT trigger)
54
+ self._expecting_end_coords = {} # batch_idx -> bool
55
+
56
+ def _build_token_sets(self):
57
+ """Build sets of token IDs for x_, y_, class_, and special tokens."""
58
+ vocab = self.tokenizer.get_vocab()
59
+
60
+ # Collect all x_ coordinate tokens
61
+ self.x_token_ids: Set[int] = set()
62
+ for token, token_id in vocab.items():
63
+ if token.startswith("<x_") and token.endswith(">"):
64
+ self.x_token_ids.add(token_id)
65
+
66
+ # Collect all y_ coordinate tokens
67
+ self.y_token_ids: Set[int] = set()
68
+ for token, token_id in vocab.items():
69
+ if token.startswith("<y_") and token.endswith(">"):
70
+ self.y_token_ids.add(token_id)
71
+
72
+ # Collect all class tokens
73
+ self.class_token_ids: Set[int] = set()
74
+ for token, token_id in vocab.items():
75
+ if token.startswith("<class_") and token.endswith(">"):
76
+ self.class_token_ids.add(token_id)
77
+
78
+ def _is_xy_pair(self, input_ids: torch.Tensor, batch_idx: int) -> bool:
79
+ """Check if the sequence ends with <x_...><y_...>."""
80
+ seq = input_ids[batch_idx].tolist()
81
+
82
+ if len(seq) < 2:
83
+ return False
84
+
85
+ return seq[-1] in self.y_token_ids and seq[-2] in self.x_token_ids
86
+
87
+ def _last_token_is_class(self, input_ids: torch.Tensor, batch_idx: int) -> bool:
88
+ """Check if the last token is a <class_...> token."""
89
+ seq = input_ids[batch_idx].tolist()
90
+
91
+ if len(seq) < 1:
92
+ return False
93
+
94
+ return seq[-1] in self.class_token_ids
95
+
96
+ def reset(self):
97
+ """Reset the processor state for a new generation."""
98
+ self._insertion_position = {}
99
+ self._insertion_active = {}
100
+ self._expecting_end_coords = {}
101
+
102
+ def __call__(
103
+ self,
104
+ input_ids: torch.LongTensor,
105
+ scores: torch.FloatTensor
106
+ ) -> torch.FloatTensor:
107
+ """
108
+ Process logits to force table prefix insertion when appropriate.
109
+
110
+ Args:
111
+ input_ids: (batch_size, seq_len) - tokens generated so far
112
+ scores: (batch_size, vocab_size) - logits for next token
113
+
114
+ Returns:
115
+ Modified scores with forced tokens where appropriate
116
+ """
117
+ batch_size = input_ids.shape[0]
118
+
119
+ for batch_idx in range(batch_size):
120
+ # Check if we're currently in an insertion sequence
121
+ if self._insertion_active.get(batch_idx, False):
122
+ pos = self._insertion_position[batch_idx]
123
+
124
+ if pos < len(self.table_prefix_ids):
125
+ # Force the next token in the sequence
126
+ forced_token_id = self.table_prefix_ids[pos]
127
+ scores[batch_idx] = torch.full_like(scores[batch_idx], float('-inf'))
128
+ scores[batch_idx, forced_token_id] = 0.0
129
+
130
+ # Advance position
131
+ self._insertion_position[batch_idx] = pos + 1
132
+ else:
133
+ # Finished inserting, deactivate
134
+ self._insertion_active[batch_idx] = False
135
+ continue
136
+
137
+ # Check if we just saw a <class_...> token - reset state to expect START coords
138
+ if self._last_token_is_class(input_ids, batch_idx):
139
+ self._expecting_end_coords[batch_idx] = False
140
+
141
+ # Check if we have an <x_...><y_...> pair
142
+ if self._is_xy_pair(input_ids, batch_idx):
143
+ expecting_end = self._expecting_end_coords.get(batch_idx, False)
144
+
145
+ if not expecting_end:
146
+ # This is START coordinates - trigger insertion!
147
+ self._insertion_active[batch_idx] = True
148
+ self._insertion_position[batch_idx] = 0
149
+
150
+ # After START coords, we expect END coords next
151
+ self._expecting_end_coords[batch_idx] = True
152
+
153
+ # Force the first token
154
+ if len(self.table_prefix_ids) > 0:
155
+ forced_token_id = self.table_prefix_ids[0]
156
+ scores[batch_idx] = torch.full_like(scores[batch_idx], float('-inf'))
157
+ scores[batch_idx, forced_token_id] = 0.0
158
+ self._insertion_position[batch_idx] = 1
159
+ else:
160
+ # This is END coordinates - don't trigger, but reset for next object
161
+ # The <class_...> token will come next, which resets expecting_end_coords
162
+ pass
163
+
164
+ return scores
165
+
166
+
167
+ class RepetitionStopProcessor(LogitsProcessor):
168
+ """
169
+ A LogitsProcessor that detects CONSECUTIVE repetition/hallucination in generated
170
+ content and forces the model to output <x_...> tokens to close the current object.
171
+
172
+ This detects patterns like:
173
+ - <x_0.0><x_0.0><x_0.0>... (same token repeating consecutively)
174
+ - "ABC ABC ABC ABC..." (same phrase repeating consecutively)
175
+
176
+ But NOT patterns like:
177
+ - "& 1 & 2 & 3 & 4..." (delimiters with different content between them)
178
+
179
+ When consecutive repetition exceeds the threshold, the processor forces an <x_...>
180
+ token to start coordinates, effectively ending the current content. After triggering,
181
+ it enters a cooldown until a <class_...> token is seen (indicating the object was
182
+ closed properly).
183
+
184
+ Args:
185
+ tokenizer: The tokenizer used for encoding/decoding
186
+ max_repetitions: Max consecutive repetitions before forcing stop (default: 10)
187
+ ngram_sizes: List of n-gram sizes to check for repetition (default: [1, 2, 3, 4, 5])
188
+ window_size: Size of the sliding window to check for repetitions (default: 2 * max_ngram * (max_repetitions + 1))
189
+ """
190
+
191
+ def __init__(
192
+ self,
193
+ tokenizer,
194
+ max_repetitions: int = 10,
195
+ ngram_sizes: List[int] = None,
196
+ window_size: int | None = None
197
+ ):
198
+ self.tokenizer = tokenizer
199
+ self.max_repetitions = max_repetitions
200
+ # Check various n-gram sizes for consecutive repetition
201
+ # n=1 catches single token repetition like <x_0.0><x_0.0><x_0.0>...
202
+ # n=2,3,4,5 catch phrase repetition like "ABC ABC ABC..."
203
+ self.ngram_sizes = ngram_sizes if ngram_sizes is not None else [1, 2, 3, 4, 5]
204
+ max_ngram = max((n for n in self.ngram_sizes if n > 0), default=1)
205
+ default_window = 2 * max_ngram * (self.max_repetitions + 1)
206
+ self.window_size = int(window_size) if window_size is not None else int(default_window)
207
+
208
+ # Build set of token IDs
209
+ self._build_token_sets()
210
+
211
+ # State tracking - once triggered, stay in cooldown until we see <class_...>
212
+ self._in_cooldown = {} # batch_idx -> bool
213
+ # Track the start of the "current object segment" (tokens after the last <class_...>)
214
+ # We only look for repetition inside this segment to avoid old repetition re-triggering immediately.
215
+ self._segment_start = {} # batch_idx -> int (seq index)
216
+
217
+ def _build_token_sets(self):
218
+ """Build sets of token IDs for detection."""
219
+ vocab = self.tokenizer.get_vocab()
220
+
221
+ # Collect all x_ coordinate tokens
222
+ self.x_token_ids: Set[int] = set()
223
+ self.x_token_list: List[int] = []
224
+ for token, token_id in vocab.items():
225
+ if token.startswith("<x_") and token.endswith(">"):
226
+ self.x_token_ids.add(token_id)
227
+ self.x_token_list.append(token_id)
228
+
229
+ # Sort and pick a middle x token as default
230
+ self.x_token_list.sort()
231
+ self.default_x_token = self.x_token_list[len(self.x_token_list) // 2] if self.x_token_list else None
232
+
233
+ # Collect all class tokens (used to reset cooldown)
234
+ self.class_token_ids: Set[int] = set()
235
+ for token, token_id in vocab.items():
236
+ if token.startswith("<class_") and token.endswith(">"):
237
+ self.class_token_ids.add(token_id)
238
+
239
+ def _count_consecutive_repetitions(self, seq: List[int], n: int) -> int:
240
+ """
241
+ Count the maximum number of times an n-gram repeats CONSECUTIVELY.
242
+
243
+ For example, if seq contains "ABC ABC ABC ABC DEF ABC":
244
+ - The n-gram "ABC" appears 4 times consecutively at the start
245
+ - Returns 4 (the max consecutive count)
246
+
247
+ Returns:
248
+ Maximum consecutive repetition count for any n-gram
249
+ """
250
+ if len(seq) < n:
251
+ return 0
252
+
253
+ max_consecutive = 1
254
+ current_consecutive = 1
255
+
256
+ # Slide through the sequence comparing adjacent n-grams
257
+ prev_ngram = tuple(seq[0:n])
258
+
259
+ i = n
260
+ while i <= len(seq) - n:
261
+ current_ngram = tuple(seq[i:i + n])
262
+
263
+ if current_ngram == prev_ngram:
264
+ current_consecutive += 1
265
+ max_consecutive = max(max_consecutive, current_consecutive)
266
+ i += n # Skip by n to check the next occurrence
267
+ else:
268
+ current_consecutive = 1
269
+ prev_ngram = current_ngram
270
+ i += 1 # Move by 1 to find new patterns
271
+
272
+ return max_consecutive
273
+
274
+ def _has_excessive_repetition(self, seq: List[int]) -> bool:
275
+ """
276
+ Check if the sequence has excessive CONSECUTIVE repetition of any n-gram.
277
+
278
+ This detects hallucination patterns like:
279
+ - <x_0.0><x_0.0><x_0.0><x_0.0>... (same token repeating)
280
+ - "ABC ABC ABC ABC..." (same phrase repeating)
281
+
282
+ But NOT patterns like:
283
+ - "& 1 & 2 & 3 & 4..." (delimiters with different content)
284
+
285
+ Returns:
286
+ True if any n-gram repeats consecutively more than max_repetitions times
287
+ """
288
+ # Only check the recent window
289
+ check_seq = seq[-self.window_size:] if len(seq) > self.window_size else seq
290
+
291
+ for n in self.ngram_sizes:
292
+ consecutive_count = self._count_consecutive_repetitions(check_seq, n)
293
+ if consecutive_count > self.max_repetitions:
294
+ return True
295
+
296
+ return False
297
+
298
+ def reset(self):
299
+ """Reset the processor state for a new generation."""
300
+ self._in_cooldown = {}
301
+ self._segment_start = {}
302
+
303
+ def __call__(
304
+ self,
305
+ input_ids: torch.LongTensor,
306
+ scores: torch.FloatTensor
307
+ ) -> torch.FloatTensor:
308
+ """
309
+ Process logits to force <x_...> token when repetition is detected.
310
+
311
+ Args:
312
+ input_ids: (batch_size, seq_len) - tokens generated so far
313
+ scores: (batch_size, vocab_size) - logits for next token
314
+
315
+ Returns:
316
+ Modified scores with forced <x_...> tokens where repetition detected
317
+ """
318
+ batch_size = input_ids.shape[0]
319
+
320
+ for batch_idx in range(batch_size):
321
+ seq = input_ids[batch_idx].tolist()
322
+ seq_len = len(seq)
323
+
324
+ # Check if we just saw a class token - reset cooldown and start a new segment
325
+ if seq_len > 0 and seq[-1] in self.class_token_ids:
326
+ self._in_cooldown[batch_idx] = False
327
+ self._segment_start[batch_idx] = seq_len
328
+
329
+ # If in cooldown, don't check for repetition (let model generate naturally)
330
+ if self._in_cooldown.get(batch_idx, False):
331
+ continue
332
+
333
+ # Check if repetition threshold is exceeded
334
+ segment_start = self._segment_start.get(batch_idx, 0)
335
+ segment_seq = seq[segment_start:]
336
+ if self._has_excessive_repetition(segment_seq):
337
+ # Enter cooldown to avoid repeated triggering
338
+ self._in_cooldown[batch_idx] = True
339
+
340
+ # Force one of the <x_...> tokens to start closing the object
341
+ # Keep the model's original preferences for which x_ coordinate to use
342
+ original_scores = scores[batch_idx].clone()
343
+ scores[batch_idx] = torch.full_like(scores[batch_idx], float('-inf'))
344
+
345
+ # Restore original logits for x_ tokens only
346
+ for x_token_id in self.x_token_ids:
347
+ scores[batch_idx, x_token_id] = original_scores[x_token_id]
348
+
349
+ return scores
hf_nemotron_parse_config.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from os import truncate
2
+ from quopri import decodestring
3
+ from transformers import PretrainedConfig
4
+ from typing import List, Optional
5
+
6
+ from transformers.dynamic_module_utils import get_class_from_dynamic_module
7
+
8
class NemotronParseTextConfig(PretrainedConfig):
    """
    Configuration for the NemotronParse text decoder (mBART-based).

    Every constructor argument is stored as an attribute of the same name.
    Two aliases are derived for compatibility: ``hidden_size`` mirrors
    ``d_model`` and ``num_attention_heads`` mirrors ``encoder_attention_heads``.
    """
    model_type = "nemotron_parse_text"

    def __init__(
        self,
        vocab_size: int = 250027,
        d_model: int = 1024,
        encoder_layers: int = 12,
        decoder_layers: int = 12,
        encoder_attention_heads: int = 16,
        decoder_attention_heads: int = 16,
        decoder_ffn_dim: int = 4096,
        encoder_ffn_dim: int = 4096,
        activation_function: str = "gelu",
        dropout: float = 0.1,
        attention_dropout: float = 0.0,
        activation_dropout: float = 0.0,
        classifier_dropout: float = 0.0,
        init_std: float = 0.02,
        encoder_layerdrop: float = 0.0,
        decoder_layerdrop: float = 0.0,
        scale_embedding: bool = False,
        use_cache: bool = True,
        num_labels: int = 3,
        forced_eos_token_id: int = 2,
        add_cross_attention: bool = True,  # Enable cross-attention for vision-encoder-decoder
        is_decoder: bool = True,  # This is a decoder
        max_sequence_length: int = 9000,
        **kwargs
    ):
        super().__init__(**kwargs)

        # Attribute name -> value, listed in the order they are assigned.
        settings = {
            "vocab_size": vocab_size,
            "d_model": d_model,
            "encoder_layers": encoder_layers,
            "decoder_layers": decoder_layers,
            "encoder_attention_heads": encoder_attention_heads,
            "decoder_attention_heads": decoder_attention_heads,
            "decoder_ffn_dim": decoder_ffn_dim,
            "encoder_ffn_dim": encoder_ffn_dim,
            "activation_function": activation_function,
            "dropout": dropout,
            "attention_dropout": attention_dropout,
            "activation_dropout": activation_dropout,
            "classifier_dropout": classifier_dropout,
            "init_std": init_std,
            "encoder_layerdrop": encoder_layerdrop,
            "decoder_layerdrop": decoder_layerdrop,
            "scale_embedding": scale_embedding,
            "use_cache": use_cache,
            "num_labels": num_labels,
            "add_cross_attention": add_cross_attention,
            "is_decoder": is_decoder,
            # hidden_size is an alias for d_model (for compatibility)
            "hidden_size": d_model,
            "forced_eos_token_id": forced_eos_token_id,
            "num_attention_heads": encoder_attention_heads,
            "max_sequence_length": max_sequence_length,
        }
        for attr_name, attr_value in settings.items():
            setattr(self, attr_name, attr_value)
+
71
+
72
class NemotronParseConfig(PretrainedConfig):
    """
    Configuration class for NemotronParse model.

    This configuration class is used to store the configuration of a [`NemotronParseForConditionalGeneration`] model.
    It is used to instantiate a NemotronParse model according to the specified arguments, defining the vision and text model configs.

    Args:
        encoder: Keyword arguments for the vision encoder config. Must contain
            ``auto_map["AutoConfig"]`` so the config class can be loaded dynamically.
            When ``None``, a bare ``PretrainedConfig`` is used.
        decoder: Keyword arguments for [`NemotronParseTextConfig`]. Its
            ``max_sequence_length`` is always overridden by the top-level value.
            The dict passed in is not modified.
        image_size: Input image size as a two-element list (copied on assignment).
        max_sequence_length: Maximum sequence length, propagated to the decoder config.
    """
    model_type = "nemotron_parse"
    is_composition = True
    max_sequence_length = 9000

    def __init__(
        self,
        encoder: Optional[dict] = None,
        decoder: Optional[dict] = None,
        tie_word_embeddings: bool = False,
        decoder_start_token_id: int = 2,
        pad_token_id: int = 1,
        eos_token_id: int = 2,
        bos_token_id: int = 0,
        image_size: List[int] = [2048, 1664],
        is_encoder_decoder: bool = True,
        max_sequence_length: int = 9000,
        **kwargs
    ):
        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            decoder_start_token_id=decoder_start_token_id,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            bos_token_id=bos_token_id,
            max_sequence_length=max_sequence_length,
            **kwargs
        )

        if encoder is not None:
            assert "auto_map" in encoder and "AutoConfig" in encoder["auto_map"], (
                "`encoder` must define auto_map['AutoConfig'] to locate the vision config class"
            )
            # "repo--module.Class" -> (module.Class, repo), the argument order
            # expected by get_class_from_dynamic_module.
            vision_auto_config = get_class_from_dynamic_module(*encoder["auto_map"]["AutoConfig"].split("--")[::-1])
            self.encoder = vision_auto_config(**encoder)
        else:
            self.encoder = PretrainedConfig()

        # Work on a copy so the caller's dict is not mutated.
        decoder_kwargs = dict(decoder) if decoder is not None else {}
        decoder_kwargs["max_sequence_length"] = max_sequence_length
        self.decoder = NemotronParseTextConfig(**decoder_kwargs)

        # Copy so instances never share (or mutate) the default list object.
        self.image_size = list(image_size) if image_size is not None else None

        # Initialize vocab size from text config
        self.vocab_size = self.decoder.vocab_size
        self.is_encoder_decoder = is_encoder_decoder
        self.max_sequence_length = max_sequence_length

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
        """
        output = super().to_dict()
        output["encoder"] = self.encoder.to_dict()
        output["decoder"] = self.decoder.to_dict()
        output["model_type"] = self.model_type
        output["is_encoder_decoder"] = self.is_encoder_decoder
        return output
hf_nemotron_parse_modeling.py ADDED
@@ -0,0 +1,585 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch.nn import CrossEntropyLoss
5
+ from transformers import PreTrainedModel, GenerationMixin
6
+ from transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder import VisionEncoderDecoderModel
7
+ from transformers.models.vision_encoder_decoder.configuration_vision_encoder_decoder import VisionEncoderDecoderConfig
8
+ from transformers.modeling_outputs import Seq2SeqLMOutput
9
+ from transformers.models.mbart.modeling_mbart import MBartPreTrainedModel, MBartConfig, MBartScaledWordEmbedding, MBartDecoderLayer, BaseModelOutputWithPastAndCrossAttentions
10
+ from transformers.models.donut.modeling_donut_swin import DonutSwinModelOutput
11
+ from einops import rearrange
12
+ from typing import Optional, List, Union, Tuple
13
+ import warnings
14
+ from transformers.modeling_outputs import BaseModelOutput
15
+ from transformers.models.encoder_decoder.modeling_encoder_decoder import shift_tokens_right
16
+ from .hf_nemotron_parse_config import NemotronParseConfig
17
+ from transformers import AutoModel
18
+ import time
19
+ from transformers.modeling_attn_mask_utils import (
20
+ _prepare_4d_attention_mask,
21
+ _prepare_4d_attention_mask_for_sdpa,
22
+ _prepare_4d_causal_attention_mask,
23
+ _prepare_4d_causal_attention_mask_for_sdpa,
24
+ )
25
+
26
+
27
+ class NemotronParseDecoder(MBartPreTrainedModel):
28
+ """
29
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MBartDecoderLayer`]
30
+
31
+ Args:
32
+ config: MBartConfig
33
+ embed_tokens (nn.Embedding): output embedding
34
+ """
35
+
36
+ def __init__(self, config: MBartConfig, embed_tokens: Optional[nn.Embedding] = None):
37
+ super().__init__(config)
38
+ self.dropout = config.dropout
39
+ self.layerdrop = config.decoder_layerdrop
40
+ self.padding_idx = config.pad_token_id
41
+ embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
42
+
43
+ self.embed_tokens = MBartScaledWordEmbedding(
44
+ config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
45
+ )
46
+
47
+ if embed_tokens is not None:
48
+ self.embed_tokens.weight = embed_tokens.weight
49
+
50
+ self.layers = nn.ModuleList([MBartDecoderLayer(config) for _ in range(config.decoder_layers)])
51
+ self.config = config
52
+
53
+ self.layernorm_embedding = nn.LayerNorm(config.d_model)
54
+ self.layer_norm = nn.LayerNorm(config.d_model)
55
+
56
+ self.gradient_checkpointing = False
57
+ # Initialize weights and apply final processing
58
+ self.post_init()
59
+
60
+ def get_input_embeddings(self):
61
+ return self.embed_tokens
62
+
63
+ def set_input_embeddings(self, value):
64
+ self.embed_tokens = value
65
+
66
+ def forward(
67
+ self,
68
+ input_ids: Optional[torch.LongTensor] = None,
69
+ attention_mask: Optional[torch.Tensor] = None,
70
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
71
+ encoder_attention_mask: Optional[torch.LongTensor] = None,
72
+ head_mask: Optional[torch.Tensor] = None,
73
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
74
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
75
+ inputs_embeds: Optional[torch.FloatTensor] = None,
76
+ use_cache: Optional[bool] = None,
77
+ output_attentions: Optional[bool] = None,
78
+ output_hidden_states: Optional[bool] = None,
79
+ return_dict: Optional[bool] = None,
80
+ ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
81
+ r"""
82
+ Args:
83
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
84
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
85
+ provide it.
86
+
87
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
88
+ [`PreTrainedTokenizer.__call__`] for details.
89
+
90
+ [What are input IDs?](../glossary#input-ids)
91
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
92
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
93
+
94
+ - 1 for tokens that are **not masked**,
95
+ - 0 for tokens that are **masked**.
96
+
97
+ [What are attention masks?](../glossary#attention-mask)
98
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
99
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
100
+ of the decoder.
101
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
102
+ Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
103
+ selected in `[0, 1]`:
104
+
105
+ - 1 for tokens that are **not masked**,
106
+ - 0 for tokens that are **masked**.
107
+
108
+ [What are attention masks?](../glossary#attention-mask)
109
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
110
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
111
+
112
+ - 1 indicates the head is **not masked**,
113
+ - 0 indicates the head is **masked**.
114
+
115
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
116
+ Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
117
+ cross-attention on hidden heads. Mask values selected in `[0, 1]`:
118
+
119
+ - 1 indicates the head is **not masked**,
120
+ - 0 indicates the head is **masked**.
121
+
122
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
123
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
124
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
125
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
126
+
127
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
128
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
129
+
130
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
131
+ that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
132
+ all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
133
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
134
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
135
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
136
+ than the model's internal embedding lookup matrix.
137
+ output_attentions (`bool`, *optional*):
138
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
139
+ returned tensors for more detail.
140
+ output_hidden_states (`bool`, *optional*):
141
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
142
+ for more detail.
143
+ return_dict (`bool`, *optional*):
144
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
145
+ """
146
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
147
+ output_hidden_states = (
148
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
149
+ )
150
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
151
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
152
+
153
+ # retrieve input_ids and inputs_embeds
154
+ if input_ids is not None and inputs_embeds is not None:
155
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
156
+ elif input_ids is not None:
157
+ input = input_ids
158
+ input_shape = input.size()
159
+ input_ids = input_ids.view(-1, input_shape[-1])
160
+ elif inputs_embeds is not None:
161
+ input_shape = inputs_embeds.size()[:-1]
162
+ input = inputs_embeds[:, :, -1]
163
+ else:
164
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
165
+
166
+ # past_key_values_length
167
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
168
+
169
+ if inputs_embeds is None:
170
+ inputs_embeds = self.embed_tokens(input_ids)
171
+
172
+ if self.config._attn_implementation == "flash_attention_2":
173
+ # 2d mask is passed through the layers
174
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
175
+ elif self.config._attn_implementation == "sdpa" and not output_attentions and cross_attn_head_mask is None:
176
+ # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
177
+ # the manual implementation that requires a 4D causal mask in all cases.
178
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
179
+ attention_mask,
180
+ input_shape,
181
+ inputs_embeds,
182
+ past_key_values_length,
183
+ )
184
+ else:
185
+ # 4d mask is passed through the layers
186
+ attention_mask = _prepare_4d_causal_attention_mask(
187
+ attention_mask, input_shape, inputs_embeds, past_key_values_length
188
+ )
189
+
190
+ # expand encoder attention mask
191
+ if encoder_hidden_states is not None and encoder_attention_mask is not None:
192
+ if self.config._attn_implementation == "flash_attention_2":
193
+ encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
194
+ elif self.config._attn_implementation == "sdpa" and cross_attn_head_mask is None and not output_attentions:
195
+ # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
196
+ # the manual implementation that requires a 4D causal mask in all cases.
197
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
198
+ encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
199
+ encoder_attention_mask,
200
+ inputs_embeds.dtype,
201
+ tgt_len=input_shape[-1],
202
+ )
203
+ else:
204
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
205
+ encoder_attention_mask = _prepare_4d_attention_mask(
206
+ encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
207
+ )
208
+ hidden_states = inputs_embeds
209
+ hidden_states = self.layernorm_embedding(hidden_states)
210
+
211
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
212
+
213
+ if self.gradient_checkpointing and self.training:
214
+ if use_cache:
215
+ logger.warning_once(
216
+ "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..."
217
+ )
218
+ use_cache = False
219
+
220
+ # decoder layers
221
+ all_hidden_states = () if output_hidden_states else None
222
+ all_self_attns = () if output_attentions else None
223
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
224
+ next_decoder_cache = () if use_cache else None
225
+
226
+ # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
227
+ for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
228
+ if attn_mask is not None:
229
+ if attn_mask.size()[0] != len(self.layers):
230
+ raise ValueError(
231
+ f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
232
+ f" {attn_mask.size()[0]}."
233
+ )
234
+ for idx, decoder_layer in enumerate(self.layers):
235
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
236
+ if output_hidden_states:
237
+ all_hidden_states += (hidden_states,)
238
+ if self.training:
239
+ dropout_probability = torch.rand([])
240
+ if dropout_probability < self.layerdrop:
241
+ continue
242
+
243
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
244
+
245
+ if self.gradient_checkpointing and self.training:
246
+ layer_outputs = self._gradient_checkpointing_func(
247
+ decoder_layer.__call__,
248
+ hidden_states,
249
+ attention_mask,
250
+ encoder_hidden_states,
251
+ encoder_attention_mask,
252
+ head_mask[idx] if head_mask is not None else None,
253
+ cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
254
+ None,
255
+ output_attentions,
256
+ use_cache,
257
+ )
258
+ else:
259
+ layer_outputs = decoder_layer(
260
+ hidden_states,
261
+ attention_mask=attention_mask,
262
+ encoder_hidden_states=encoder_hidden_states,
263
+ encoder_attention_mask=encoder_attention_mask,
264
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
265
+ cross_attn_layer_head_mask=(
266
+ cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
267
+ ),
268
+ past_key_value=past_key_value,
269
+ output_attentions=output_attentions,
270
+ use_cache=use_cache,
271
+ )
272
+ hidden_states = layer_outputs[0]
273
+
274
+ if use_cache:
275
+ next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
276
+
277
+ if output_attentions:
278
+ all_self_attns += (layer_outputs[1],)
279
+
280
+ if encoder_hidden_states is not None:
281
+ all_cross_attentions += (layer_outputs[2],)
282
+
283
+ hidden_states = self.layer_norm(hidden_states)
284
+
285
+ # add hidden states from the last decoder layer
286
+ if output_hidden_states:
287
+ all_hidden_states += (hidden_states,)
288
+
289
+ next_cache = next_decoder_cache if use_cache else None
290
+ if not return_dict:
291
+ return tuple(
292
+ v
293
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
294
+ if v is not None
295
+ )
296
+ return BaseModelOutputWithPastAndCrossAttentions(
297
+ last_hidden_state=hidden_states,
298
+ past_key_values=next_cache,
299
+ hidden_states=all_hidden_states,
300
+ attentions=all_self_attns,
301
+ cross_attentions=all_cross_attentions,
302
+ )
303
+
304
+
305
class RadioWithNeck(nn.Module):
    """RADIO vision backbone followed by a small projection neck.

    The backbone returns a ``(summary, feature)`` pair. The neck projects the
    1280-wide patch features to 1024 channels, merges every four horizontally
    adjacent patches with a strided convolution, projects the 3840-wide summary
    vector to the same width and appends it as one extra token.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.model_encoder = AutoModel.from_config(config, trust_remote_code=True)

        # Neck components. Attribute names and creation order are kept stable
        # because they define the checkpoint keys and the init RNG sequence.
        neck_width = 1024
        norm_options = {"eps": 1e-06, "elementwise_affine": True}
        self.conv1 = nn.Conv1d(1280, neck_width, 1)
        self.layer_norm1 = nn.LayerNorm(neck_width, **norm_options)
        self.conv2 = nn.Conv2d(
            neck_width,
            neck_width,
            kernel_size=(1, 4),
            stride=(1, 4),
            padding=0,
            bias=False,
        )
        self.layer_norm2 = nn.LayerNorm(neck_width, **norm_options)
        self.sum_proj = nn.Linear(3840, neck_width)
        self.layer_norm3 = nn.LayerNorm(neck_width, **norm_options)

    def forward(self, pixel_values, output_attentions=False, output_hidden_states=False, return_dict=False, **kwargs):
        """Encode ``pixel_values`` into a token sequence.

        ``output_attentions``, ``output_hidden_states``, ``return_dict`` and any
        extra keyword arguments are accepted for interface compatibility but are
        not used: the result is always a ``DonutSwinModelOutput`` carrying only
        ``last_hidden_state``.
        """
        summary, feature = self.model_encoder(pixel_values)

        # Conv1d expects channels before length, hence the permute round trip.
        tokens = self.conv1(feature.permute(0, 2, 1)).permute(0, 2, 1)
        tokens = self.layer_norm1(tokens)

        # Rebuild the 2-D patch grid from the input resolution.
        patch = self.config.patch_size
        grid_h = pixel_values.shape[-2] // patch
        grid_w = pixel_values.shape[-1] // patch
        grid = rearrange(tokens, 'b (h w) d -> b d h w', h=grid_h, w=grid_w)

        # Width-only (1, 4) convolution with matching stride, then flatten back to tokens.
        grid = self.conv2(grid)
        tokens = self.layer_norm2(rearrange(grid, 'b d h w -> b (h w) d'))

        # The projected summary vector becomes the final token of the sequence.
        summary_token = self.layer_norm3(self.sum_proj(summary)).unsqueeze(1)
        sequence = torch.cat((tokens, summary_token), dim=1)

        return DonutSwinModelOutput(last_hidden_state=sequence)
343
+
344
+
345
class NemotronParsePreTrainedModel(PreTrainedModel):
    """
    Shared base class for NemotronParse models: declares the loading/placement
    metadata used by the `PreTrainedModel` machinery and the weight initializer.
    """

    config_class = NemotronParseConfig
    base_model_prefix = "vision_encoder_decoder"  # Use VisionEncoderDecoder prefix
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["RadioWithNeck", "MBartDecoder"]
    _skip_keys_device_placement = "past_key_values"

    def _init_weights(self, module):
        """Initialize `nn.Linear` / `nn.Embedding` weights from N(0, decoder.init_std).

        Linear biases and the embedding row at `padding_idx` are zeroed; every
        other module type is left untouched.
        """
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return

        module.weight.data.normal_(mean=0.0, std=self.config.decoder.init_std)

        if isinstance(module, nn.Linear):
            if module.bias is not None:
                module.bias.data.zero_()
        elif module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
366
+
367
+ # Based on transformers.models.encoder_decoder.modeling_encoder_decoder
368
class NemotronParseForConditionalGeneration(NemotronParsePreTrainedModel, GenerationMixin):
    """
    NemotronParse model for conditional generation tasks.

    This model combines a RADIO-based vision encoder with an mBART-based text decoder.
    A separate `lm_head` maps decoder hidden states to vocabulary logits; optional
    extra heads (stored on the decoder as `extra_heads` / `extra_proj`) are only
    used when computing the training loss.
    """

    def __init__(self, config: NemotronParseConfig):
        super().__init__(config)

        self.encoder = RadioWithNeck(config.encoder)
        self.encoder.main_input_name = 'pixel_values'
        self.encoder = self.encoder.to(config.encoder.torch_dtype)

        self.decoder = NemotronParseDecoder(config.decoder)
        self.decoder = self.decoder.to(config.decoder.torch_dtype)

        self.lm_head = nn.Linear(
            config.decoder.d_model,
            config.decoder.vocab_size,
            bias=False,
            dtype=config.decoder.torch_dtype,
        )

        # Extra heads
        num_extra_heads = getattr(config, 'num_extra_heads', 0)
        self.decoder.extra_heads = nn.ModuleList([
            nn.Linear(config.decoder.d_model, config.decoder.d_model)
            for _ in range(num_extra_heads)
        ])
        self.decoder.extra_proj = nn.ModuleList([
            nn.Linear(config.decoder.d_model, config.decoder.d_model)
            for _ in range(num_extra_heads)
        ])

        # Class token index for loss weighting
        self.class_token_indx_start = getattr(config, 'class_token_start_idx', 50000)

        self.post_init()

    def get_encoder(self):
        """Return the vision encoder module."""
        return self.encoder

    def get_decoder(self):
        """Return the text decoder module."""
        return self.decoder

    def get_output_embeddings(self):
        """Return the projection from decoder hidden states to vocabulary logits."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the vocabulary projection."""
        self.lm_head = new_embeddings

    def get_input_embeddings(self):
        """Return the decoder's token embedding module."""
        return self.decoder.get_input_embeddings()

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        __subflavors__: Optional[str] = None,
        __keys__: Optional[List[str]] = None,
        return_sample_losses: Optional[torch.FloatTensor] = None,
        **kwargs,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        """Run the encoder (unless `encoder_outputs` is given) and the decoder.

        Args:
            pixel_values: Images for the encoder; required when `encoder_outputs` is None.
            decoder_input_ids / decoder_inputs_embeds: Decoder inputs. When neither is
                given but `labels` is, the inputs are `labels` shifted one step right.
            decoder_attention_mask: Attention mask for the decoder inputs.
            encoder_outputs: Pre-computed encoder outputs (a tuple is wrapped in
                `BaseModelOutput`).
            past_key_values, use_cache, output_attentions: Forwarded to the decoder.
            output_hidden_states: Forwarded to the encoder only; the decoder is always
                asked for hidden states because the extra heads read them.
            labels: Target ids; enables the loss. Head `k` (0 is the main head) is
                trained against the labels shifted left by `k` positions, positions
                labelled -100 are ignored, and targets with id >=
                `class_token_indx_start` are weighted 10x.
            return_sample_losses: Optional buffer that receives the per-sample mean
                loss in place (only when `labels` is given).
            __subflavors__, __keys__: Accepted and ignored.
            **kwargs: Keys starting with `decoder_` go to the decoder (prefix
                stripped); all others go to the encoder.

        Returns:
            A `Seq2SeqLMOutput`, or its `to_tuple()` form (fields that are None are
            omitted) when `return_dict` is False.

        Raises:
            ValueError: if both `encoder_outputs` and `pixel_values` are None.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}

        kwargs_decoder = {
            argument[len("decoder_"):]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
        }

        if encoder_outputs is None:
            if pixel_values is None:
                raise ValueError("You have to specify pixel_values")

            encoder_outputs = self.encoder(
                pixel_values,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                **kwargs_encoder,
            )
        elif isinstance(encoder_outputs, tuple):
            encoder_outputs = BaseModelOutput(*encoder_outputs)

        encoder_hidden_states = encoder_outputs[0]

        encoder_attention_mask = None

        if (labels is not None) and (decoder_input_ids is None and decoder_inputs_embeds is None):
            decoder_input_ids = shift_tokens_right(
                labels, self.config.pad_token_id, self.config.decoder_start_token_id
            )

        # The extra heads read decoder_outputs['hidden_states'], so always request them.
        output_hidden_states = True

        # Always ask the decoder for a ModelOutput: the code below reads its fields by
        # name. Conversion to a tuple (if requested) happens once, at the very end.
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
            past_key_values=past_key_values,
            return_dict=True,
            **kwargs_decoder,
        )

        # Main-head logits, computed once and shared by the loss and the returned output.
        output_logits = self.lm_head(decoder_outputs.last_hidden_state)

        loss = None
        if labels is not None:
            logits = [output_logits]
            decoder_inputs_embeds = decoder_outputs.inputs_embeds
            for head, proj in zip(self.decoder.extra_heads, self.decoder.extra_proj):
                # Input embeddings shifted one step left, zero-padded at the end.
                shifted_embeds = torch.cat(
                    (
                        decoder_inputs_embeds[:, 1:, :],
                        torch.zeros_like(decoder_inputs_embeds[:, 0, :].unsqueeze(1)),
                    ),
                    axis=1,
                )
                decoder_input_embeds_shift = proj(shifted_embeds)
                hidden = head(decoder_outputs['hidden_states'][-1] + decoder_input_embeds_shift)
                logits.append(self.lm_head(hidden))  # Use main lm_head, NOT decoder.lm_head

            logits = torch.stack(logits, dim=-2)
            loss_fct = CrossEntropyLoss(reduction="none")

            losses_per_head = []
            tokens_per_head = []
            for head_num in range(len(self.decoder.extra_heads) + 1):
                logits_head = logits[:, :, head_num, :]
                # Head k is trained on the labels shifted left by k; the tail is ignored.
                labels_head = torch.cat(
                    (labels[:, head_num:], torch.full_like(labels[:, :head_num], -100)),
                    1,
                )
                loss_full = loss_fct(logits_head.permute(0, 2, 1), labels_head)
                loss_full[labels_head >= self.class_token_indx_start] *= 10
                losses_per_head.append(loss_full.sum(1))
                tokens_per_head.append((labels_head != -100).sum(1))

            losses_per_sample = torch.stack(losses_per_head, dim=1).sum(1)
            tokens_per_sample = torch.stack(tokens_per_head, dim=1).sum(1)
            loss = losses_per_sample.sum() / (tokens_per_sample.sum() + 1e-6)
            if return_sample_losses is not None:
                return_sample_losses.copy_(losses_per_sample.detach() / (tokens_per_sample + 1e-6))

        outputs = Seq2SeqLMOutput(
            loss=loss,
            logits=output_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )
        if not return_dict:
            # Previously this path concatenated a tuple with the encoder's ModelOutput
            # (a TypeError) and never included the logits.
            return outputs.to_tuple()
        return outputs

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        """Build decoder input ids by shifting `labels` one position to the right."""
        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)

    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None):
        """Resize token embeddings and update lm_head accordingly.

        The decoder resizes its own embeddings; if the resulting vocabulary size
        differs from `lm_head`, a new bias-free `lm_head` is created on the same
        device/dtype and the overlapping rows are copied over. Rows beyond the old
        vocabulary keep `nn.Linear`'s default initialization.

        Returns:
            Whatever the decoder's `resize_token_embeddings` returned.
        """
        # Resize decoder embeddings
        new_embeddings = self.decoder.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)

        # Update lm_head to match new vocab size
        if new_embeddings is not None:
            old_vocab_size, hidden_size = self.lm_head.weight.shape
            new_vocab_size = new_embeddings.num_embeddings

            if old_vocab_size != new_vocab_size:
                print(f"Resizing lm_head from {old_vocab_size} to {new_vocab_size} tokens")
                new_lm_head = nn.Linear(
                    hidden_size,
                    new_vocab_size,
                    bias=False,
                    device=self.lm_head.weight.device,
                    dtype=self.lm_head.weight.dtype,
                )

                # Copy old weights to new lm_head
                num_tokens_to_copy = min(old_vocab_size, new_vocab_size)
                new_lm_head.weight.data[:num_tokens_to_copy] = self.lm_head.weight.data[:num_tokens_to_copy]

                # Update reference
                self.lm_head = new_lm_head
                # DO NOT update decoder.lm_head - keep them separate

        return new_embeddings

    def _reorder_cache(self, past_key_values, beam_idx):
        """Reorder the decoder cache for beam search by delegating to the decoder."""
        return self.decoder._reorder_cache(past_key_values, beam_idx)
567
+
568
+
569
+ # Copied from transformers.models.encoder_decoder.modeling_encoder_decoder.shift_tokens_right
570
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Return a copy of `input_ids` shifted one position to the right.

    Column 0 of the result is `decoder_start_token_id`, the last column of the
    input is dropped, and any -100 left in the shifted ids is replaced by
    `pad_token_id`. The input tensor is not modified.

    Raises:
        ValueError: if `decoder_start_token_id` or `pad_token_id` is None.
    """
    shifted = input_ids.new_zeros(input_ids.size())
    shifted[:, 1:] = input_ids[:, :-1]

    if decoder_start_token_id is None:
        raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
    shifted[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")

    # Labels use -100 for ignored positions; those are not valid input ids.
    ignored_positions = shifted == -100
    return shifted.masked_fill_(ignored_positions, pad_token_id)
hf_nemotron_parse_processor.py ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from PIL import Image
3
+ from typing import List, Optional, Union, Dict, Any
4
+ import torch
5
+ from torchvision import transforms as T
6
+ import albumentations as A
7
+ import cv2
8
+ import json
9
+
10
+ from transformers import ProcessorMixin, BaseImageProcessor, ImageProcessingMixin
11
+ from transformers.tokenization_utils_base import BatchEncoding
12
+ from transformers.image_utils import ChannelDimension, ImageInput, PILImageResampling, infer_channel_dimension_format
13
+ from transformers.utils import TensorType
14
+
15
+
16
class NemotronParseImageProcessor(BaseImageProcessor, ImageProcessingMixin):
    """
    Image processor for NemotronParse model.

    This processor inherits from BaseImageProcessor to be compatible with transformers AutoImageProcessor.

    Each image is shrunk (never enlarged) to fit inside ``final_size`` =
    (height, width) while keeping its aspect ratio, padded with white up to
    ``final_size`` and converted with torchvision's ``ToTensor``.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        final_size: tuple = (2048, 1664),
        **kwargs,
    ):
        """
        Args:
            final_size: Target (height, width); a single number means a square.
            **kwargs: Forwarded to the base class after dropping private keys and
                the transform objects. A ``size`` dict, when present, overrides
                ``final_size``: ``longest_edge`` (a number or a pair) is checked
                first, then ``height``/``width``.
        """
        clean_kwargs = {
            k: v
            for k, v in kwargs.items()
            if not k.startswith('_') and k not in ['transform', 'torch_transform']
        }

        if 'size' in clean_kwargs:
            size_config = clean_kwargs.pop('size')
            if isinstance(size_config, dict):
                if 'longest_edge' in size_config:
                    longest_edge = size_config['longest_edge']
                    if isinstance(longest_edge, (list, tuple)):
                        final_size = tuple(int(x) for x in longest_edge)
                    else:
                        final_size = (int(longest_edge), int(longest_edge))
                elif 'height' in size_config and 'width' in size_config:
                    final_size = (int(size_config['height']), int(size_config['width']))

        super().__init__(**clean_kwargs)

        if isinstance(final_size, (list, tuple)) and len(final_size) >= 2:
            self.final_size = (int(final_size[0]), int(final_size[1]))
        elif isinstance(final_size, (int, float)):
            self.final_size = (int(final_size), int(final_size))
        else:
            self.final_size = (2048, 1664)  # Default fallback

        self._create_transforms()

    def _create_transforms(self):
        """Create transform objects (not serialized to JSON)."""
        if isinstance(self.final_size, (list, tuple)):
            self.target_height, self.target_width = int(self.final_size[0]), int(self.final_size[1])
        else:
            self.target_height = self.target_width = int(self.final_size)

        self.transform = A.Compose([
            A.PadIfNeeded(
                min_height=self.target_height,
                min_width=self.target_width,
                border_mode=cv2.BORDER_CONSTANT,
                value=[255, 255, 255],
                p=1.0
            ),
        ])

        self.torch_transform = T.Compose([
            T.ToTensor(),
            # Note: Normalization is done within RADIO model
        ])

    def to_dict(self):
        """Override to exclude non-serializable transforms."""
        output = super().to_dict()
        output.pop('transform', None)
        output.pop('torch_transform', None)
        return output

    @classmethod
    def from_dict(cls, config_dict: dict, **kwargs):
        """Override to recreate transforms after loading.

        Private keys, None-valued keys and the transform entries are dropped
        from a copy of ``config_dict`` before calling the constructor. If the
        constructor fails, a warning is printed and a processor built from
        ``kwargs`` alone is returned (deliberate best-effort fallback).
        """
        config_dict = config_dict.copy()
        config_dict.pop('transform', None)
        config_dict.pop('torch_transform', None)

        # Clean any problematic entries
        for key in list(config_dict.keys()):
            if key.startswith('_') or config_dict[key] is None:
                config_dict.pop(key, None)

        # Ensure numeric types are correct
        if 'final_size' in config_dict:
            final_size = config_dict['final_size']
            if isinstance(final_size, (list, tuple)):
                config_dict['final_size'] = tuple(int(x) for x in final_size)

        try:
            return cls(**config_dict, **kwargs)
        except Exception as e:
            print(f"Warning: Error in from_dict: {e}")
            print("Using default parameters...")
            return cls(**kwargs)

    def save_pretrained(self, save_directory, **kwargs):
        """Save image processor configuration.

        Writes ``preprocessor_config.json`` into ``save_directory`` (created if
        missing). ``kwargs`` are accepted for signature compatibility and ignored.
        """
        import os
        import json

        os.makedirs(save_directory, exist_ok=True)

        # Save preprocessor config in standard HuggingFace format
        config = {
            "feature_extractor_type": "NemotronParseImageProcessor",
            "image_processor_type": "NemotronParseImageProcessor",
            "processor_class": "NemotronParseImageProcessor",
            "size": {
                "height": self.final_size[0],
                "width": self.final_size[1],
                "longest_edge": self.final_size
            },
            "final_size": self.final_size,
        }

        config_path = os.path.join(save_directory, "preprocessor_config.json")
        with open(config_path, 'w', encoding="utf-8") as f:
            json.dump(config, f, indent=2)

    def _resize_with_aspect_ratio(self, image: np.ndarray) -> np.ndarray:
        """Shrink ``image`` to fit inside the target size, keeping its aspect ratio.

        Height is limited first, then width. Images that already fit keep their
        size but are still passed through ``cv2.resize``. Each side is clamped to
        at least one pixel so extreme aspect ratios cannot produce a zero-sized
        target, which ``cv2.resize`` rejects.
        """
        height, width = image.shape[:2]
        max_size_height = self.target_height
        max_size_width = self.target_width

        # Original LongestMaxSizeHW algorithm from custom_augmentations.py
        aspect_ratio = width / height
        new_height = height
        new_width = width

        if height > max_size_height:
            new_height = max_size_height
            new_width = int(new_height * aspect_ratio)

        if new_width > max_size_width:
            new_width = max_size_width
            new_height = int(new_width / aspect_ratio)

        # int() truncation can reach 0 for very thin images.
        new_height = max(1, new_height)
        new_width = max(1, new_width)

        return cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LINEAR)

    def _pad_to_size(self, image: np.ndarray) -> np.ndarray:
        """Pad image to target size with white (255) on the bottom and right edges.

        Images already at least as large as the target are returned unchanged.
        NOTE(review): only used when ``self.transform`` is None; the albumentations
        transform may place the padding differently - confirm before relying on
        the two paths being interchangeable.
        """
        h, w = image.shape[:2]
        min_height, min_width = self.target_height, self.target_width

        pad_h = max(0, min_height - h)
        pad_w = max(0, min_width - w)

        if pad_h == 0 and pad_w == 0:
            return image

        if len(image.shape) == 3:
            padded = np.pad(
                image,
                ((0, pad_h), (0, pad_w), (0, 0)),
                mode='constant',
                constant_values=255
            )
        else:
            padded = np.pad(
                image,
                ((0, pad_h), (0, pad_w)),
                mode='constant',
                constant_values=255
            )

        return padded

    def preprocess(
        self,
        images: ImageInput,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> Dict[str, torch.Tensor]:
        """
        Preprocess an image or batch of images for the NemotronParse model.

        Args:
            images: A single image or a list/tuple of images. PIL images are
                converted to RGB; anything else is passed on as-is and is expected
                to be a numpy array.
            return_tensors: Type of tensors to return ("np" converts to numpy;
                every other value leaves the torch tensors untouched).

        Returns:
            ``{"pixel_values": ...}`` with all images stacked along a new first
            dimension.
        """
        # Ensure images is a sequence
        if not isinstance(images, (list, tuple)):
            images = [images]

        # Convert PIL images to RGB numpy arrays. A new list is built so the
        # caller's list is not modified, and non-PIL inputs (which have no
        # `.convert`) pass through untouched.
        processed_images = []
        for image in images:
            if isinstance(image, Image.Image):
                image = np.asarray(image.convert('RGB'))
            processed_images.append(image)

        # Apply NemotronParse-specific transforms
        pixel_values = []
        for image in processed_images:
            processed_image = self._resize_with_aspect_ratio(image)

            if self.transform is not None:
                transformed = self.transform(image=processed_image)
                processed_image = transformed["image"]
            else:
                # Fallback: just pad to target size
                processed_image = self._pad_to_size(processed_image)

            pixel_values_tensor = self.torch_transform(processed_image)

            # Single-channel input: repeat the channel to get three.
            if pixel_values_tensor.shape[0] == 1:
                pixel_values_tensor = pixel_values_tensor.expand(3, -1, -1)

            pixel_values.append(pixel_values_tensor)

        pixel_values = torch.stack(pixel_values)

        data = {"pixel_values": pixel_values}

        if return_tensors is not None:
            data = self._convert_output_format(data, return_tensors)

        return data

    def _convert_output_format(self, data: Dict[str, torch.Tensor], return_tensors: Union[str, TensorType]) -> Dict:
        """Convert output format based on return_tensors parameter.

        Only "np" / ``TensorType.NUMPY`` triggers a conversion; "pt" and any
        unrecognized value return ``data`` unchanged.
        """
        if return_tensors == "pt" or return_tensors == TensorType.PYTORCH:
            return data
        elif return_tensors == "np" or return_tensors == TensorType.NUMPY:
            return {k: v.numpy() for k, v in data.items()}
        else:
            return data

    def __call__(self, images: Union[Image.Image, List[Image.Image]], **kwargs) -> Dict[str, torch.Tensor]:
        """Process images for the model (backward compatibility)."""
        return self.preprocess(images, **kwargs)
254
+
255
+
256
class NemotronParseProcessor(ProcessorMixin):
    """Bundle the NemotronParse image processor and a tokenizer behind one call."""

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        """Create the processor; a default image processor is built from `kwargs` when none is given."""
        if image_processor is None:
            image_processor = NemotronParseImageProcessor(**kwargs)

        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        images: Optional[Union[Image.Image, List[Image.Image]]] = None,
        text: Optional[Union[str, List[str]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str] = False,
        truncation: Union[bool, str] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_token_type_ids: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        return_tensors: Optional[Union[str, "TensorType"]] = None,
        **kwargs
    ) -> BatchEncoding:
        """
        Main method to prepare for the model one or several text(s) and image(s).

        `images` go to the image processor and `text` to the tokenizer; either may
        be omitted. The named tokenizer options and `return_tensors` are passed to
        the tokenizer only, while extra `kwargs` are passed to both components.
        On key collisions the tokenizer's entries win.
        """
        # Process images
        if images is not None:
            image_inputs = self.image_processor(images, **kwargs)
        else:
            image_inputs = {}

        # Process text
        if text is not None:
            text_inputs = self.tokenizer(
                text,
                add_special_tokens=add_special_tokens,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_token_type_ids=return_token_type_ids,
                return_length=return_length,
                verbose=verbose,
                return_tensors=return_tensors,
                **kwargs,
            )
        else:
            text_inputs = {}

        # Combine inputs
        return BatchEncoding({**image_inputs, **text_inputs})

    def decode(self, *args, **kwargs):
        """Decode token ids to strings."""
        return self.tokenizer.decode(*args, **kwargs)

    def batch_decode(self, *args, **kwargs):
        """Batch decode token ids to strings."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def post_process_generation(self, sequences, fix_markdown=False):
        """Post-process generated sequences.

        Delegates to the tokenizer's `post_process_generation` when it has one.
        Otherwise strips the `<s>` / `</s>` markers and surrounding whitespace.
        Note that this fallback returns a bare string whenever exactly one
        sequence was processed, even if it was passed inside a list.
        """
        if hasattr(self.tokenizer, 'post_process_generation'):
            return self.tokenizer.post_process_generation(sequences, fix_markdown=fix_markdown)

        # Fallback processing
        if isinstance(sequences, str):
            sequences = [sequences]

        # Basic cleaning
        processed = [seq.replace('<s>', '').replace('</s>', '').strip() for seq in sequences]

        return processed[0] if len(processed) == 1 else processed

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """
        Load processor from pretrained model.

        This method is compatible with AutoProcessor.from_pretrained().

        `trust_remote_code`, `revision` and `token` are always forwarded (None
        when absent). The hub-loading options `cache_dir`, `force_download`,
        `local_files_only` and `proxies` are forwarded when the caller supplied
        them, so offline or custom-cache loading is honored. Other keyword
        arguments are ignored.
        """
        # Explicitly load subcomponents via Auto* to ensure remote auto_map is honored.
        from transformers import AutoImageProcessor, AutoTokenizer

        load_kwargs = {
            "trust_remote_code": kwargs.get("trust_remote_code", None),
            "revision": kwargs.get("revision", None),
            "token": kwargs.get("token", None),
        }
        # Only pass these through when given, so library defaults stay in effect.
        for option in ("cache_dir", "force_download", "local_files_only", "proxies"):
            if option in kwargs:
                load_kwargs[option] = kwargs[option]

        image_processor = AutoImageProcessor.from_pretrained(
            pretrained_model_name_or_path,
            **load_kwargs,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            **load_kwargs,
        )
        return cls(image_processor=image_processor, tokenizer=tokenizer)

    def save_pretrained(self, save_directory, **kwargs):
        """
        Save processor to directory.

        This method is compatible with AutoProcessor/AutoImageProcessor loading.
        The tokenizer and image processor are saved explicitly first, then the
        parent class's `save_pretrained` is called with the same arguments.
        """
        import os
        os.makedirs(save_directory, exist_ok=True)

        # Save tokenizer with proper configuration for AutoTokenizer
        print("Saving tokenizer for AutoTokenizer compatibility...")
        self.tokenizer.save_pretrained(save_directory, **kwargs)

        # Save image processor
        print("Saving image processor...")
        self.image_processor.save_pretrained(save_directory, **kwargs)

        # Use the parent class's save_pretrained method for processor config
        super().save_pretrained(save_directory, **kwargs)
        print(f"NemotronParseProcessor saved to {save_directory}")
        print(f"AutoTokenizer.from_pretrained('{save_directory}') should now work!")
latex2html.py ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from bs4 import BeautifulSoup
3
+
4
def skip_whitespace(text, i):
    """Return the first index at or after ``i`` whose character is not whitespace.

    If only whitespace remains (or ``i`` is already past the end), the
    returned index is at least ``len(text)``.
    """
    limit = len(text)
    cursor = i
    while cursor < limit and text[cursor].isspace():
        cursor += 1
    return cursor
9
+
10
def parse_braced_argument(text, i):
    """
    Extract one balanced ``{...}`` argument starting at ``text[i]``.

    Args:
        text: The string being parsed.
        i: Index that must point at an opening '{'.

    Returns:
        A tuple ``(argument_content, new_index)`` where ``argument_content`` is
        everything between the outer braces (nested braces included verbatim)
        and ``new_index`` is the position just after the matching '}'.

    Raises:
        ValueError: If ``text[i]`` is not '{' or the braces never balance.
    """
    if i >= len(text) or text[i] != '{':
        # The literal brace must be doubled inside an f-string; the previous
        # str.format() call choked on it and raised an unrelated error message.
        raise ValueError(f"Expected '{{' at position {i}")
    start = i + 1  # first character after the opening brace
    level = 1
    i = start
    while i < len(text) and level > 0:
        if text[i] == '{':
            level += 1
        elif text[i] == '}':
            level -= 1
        i += 1
    if level != 0:
        raise ValueError("Unbalanced braces starting at position {}".format(start - 1))
    # i now sits just past the closing brace, so the content ends at i - 1.
    return text[start:i - 1], i
31
+
32
def parse_command(text, i):
    r"""
    Parse a \multirow or \multicolumn command starting at index i.

    The command must have exactly three braced arguments. Each argument is
    processed recursively with ``clean_multi_cells`` so nested commands are
    handled too. In the third argument (the cell text) every unescaped ``&``
    is replaced by ``\&`` after that recursive processing.

    Args:
        text: The LaTeX string being parsed.
        i: Index at which the command name starts.

    Returns:
        A tuple ``(command_text, new_index)`` where ``command_text`` is the
        reconstructed command (no whitespace between its arguments) and
        ``new_index`` is the position right after the third argument.

    Raises:
        ValueError: If no supported command starts at ``i`` or one of the
            three braced arguments is missing or unbalanced.
    """
    # Determine which command we have.
    for candidate in (r"\multirow", r"\multicolumn"):
        if text.startswith(candidate, i):
            command_name = candidate
            i += len(candidate)
            break
    else:
        raise ValueError("Expected \\multirow or \\multicolumn at position {}".format(i))

    # Skip whitespace between the command name and the first argument.
    i = skip_whitespace(text, i)
    args = []
    # Expect exactly three arguments
    for arg_index in range(3):
        if i >= len(text) or text[i] != '{':
            # Literal '{' is doubled for the f-string; str.format() used to fail on it.
            raise ValueError(f"Expected '{{' for argument {arg_index + 1} at position {i}")
        arg_content, i = parse_braced_argument(text, i)
        # Process the content recursively to catch nested commands
        processed_arg = clean_multi_cells(arg_content)
        if arg_index == 2:
            # For the cell text (third argument), replace any unescaped &
            processed_arg = re.sub(r'(?<!\\)&', r'\\&', processed_arg)
        args.append(processed_arg)
        # Only skip whitespace between arguments, not after the last one.
        if arg_index < 2:
            i = skip_whitespace(text, i)
    # Reconstruct the full command with its three arguments
    command_text = f"{command_name}{{{args[0]}}}{{{args[1]}}}{{{args[2]}}}"
    return command_text, i
72
+
73
def clean_multi_cells(text):
    r"""
    Rewrite every \multirow / \multicolumn command found in a LaTeX string.

    The text is scanned left to right. Each command is handed to
    ``parse_command`` (which deals with nested braces and nested commands and
    escapes ``&`` in the third argument); everything between commands is
    copied through unchanged.

    Args:
        text: Arbitrary LaTeX text.

    Returns:
        The processed text.

    Raises:
        ValueError: Propagated from ``parse_command`` for malformed commands.
    """
    result = []
    i = 0
    while i < len(text):
        # Find next occurrence of either command.
        hits = [
            idx
            for idx in (text.find(r"\multirow", i), text.find(r"\multicolumn", i))
            if idx != -1
        ]
        if not hits:
            # No command left: keep the remaining tail as-is.
            result.append(text[i:])
            break
        next_idx = min(hits)

        # Append text before the command (preserving any whitespace)
        result.append(text[i:next_idx])
        # Process the command starting at next_idx
        command_text, new_index = parse_command(text, next_idx)
        result.append(command_text)
        i = new_index
    return ''.join(result)
105
+
106
def parse_brace(s, pos):
    """
    Read one balanced brace group whose opening '{' is at ``s[pos]``.

    Returns ``(content, new_pos)``: ``content`` is the text strictly between
    the outer braces (inner braces are kept) and ``new_pos`` is the index
    right after the closing '}'.

    Raises ValueError when ``s[pos]`` is not '{' or the group is never closed.
    """
    if pos >= len(s) or s[pos] != '{':
        raise ValueError("Expected '{' at position %d" % pos)
    begin = pos + 1  # first character inside the braces
    cursor = begin
    end = len(s)
    depth = 1
    while cursor < end and depth:
        ch = s[cursor]
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
        cursor += 1
    if depth != 0:
        raise ValueError("Unmatched '{' in string.")
    # cursor is one past the closing brace, so drop that last character.
    return s[begin:cursor - 1], cursor
133
+
134
def parse_command_merge(s, pos):
    """
    Parse a multirow or multicolumn command at ``s[pos]`` into a dictionary.

    A multirow yields the keys ``rowspan`` and ``width``; a multicolumn yields
    ``colspan`` and ``alignment``. When the third argument itself contains a
    multirow/multicolumn command, the first such nested command is parsed
    recursively and its dictionary is merged over the outer one, so the result
    carries the outer parameters, the inner parameters and the innermost
    ``content``. Without nesting, ``content`` is the stripped third argument.

    Returns ``(merged_dict, new_pos)`` with ``new_pos`` just after the command,
    or ``(None, pos)`` when no supported command starts at ``pos``.
    """
    layouts = (
        (r"\multirow", ("rowspan", "width")),
        (r"\multicolumn", ("colspan", "alignment")),
    )
    for command, (first_key, second_key) in layouts:
        if not s.startswith(command, pos):
            continue
        cursor = pos + len(command)
        # Three required brace groups: two parameters, then the cell content.
        first_arg, cursor = parse_brace(s, cursor)
        second_arg, cursor = parse_brace(s, cursor)
        body, cursor = parse_brace(s, cursor)

        merged = {first_key: first_arg.strip(), second_key: second_arg.strip()}
        nested_at = [
            idx
            for idx in (body.find(r"\multirow"), body.find(r"\multicolumn"))
            if idx != -1
        ]
        if not nested_at:
            merged["content"] = body.strip()
            return merged, cursor
        # Descend into the earliest nested command; its values win on key clashes.
        inner, _ = parse_command_merge(body, min(nested_at))
        merged.update(inner)
        return merged, cursor

    # Not a recognized command starting at pos.
    return None, pos
191
+
192
def extract_merged_commands(s):
    """
    Collect every top-level multirow/multicolumn command found in ``s``.

    The string is walked once; at each backslash ``parse_command_merge`` is
    tried. A successful parse appends its merged dictionary and scanning
    resumes right after that command, so nested commands are reported only
    through their enclosing command.

    Returns a list of dictionaries (empty when nothing matched).
    """
    found = []
    cursor = 0
    total = len(s)
    while cursor < total:
        if s[cursor] != '\\':
            cursor += 1
            continue
        parsed, after = parse_command_merge(s, cursor)
        if parsed is None:
            # Some other backslash sequence; move on by one character.
            cursor += 1
            continue
        found.append(parsed)
        cursor = after
    return found
212
+
213
def remove_tags(html, tags_to_remove):
    """Strip the given tags from ``html`` while keeping their contents.

    ``tags_to_remove`` is an iterable of tag names; every matching element is
    unwrapped (the element goes away, its children stay in place). The
    re-serialized document is returned as a string.
    """
    document = BeautifulSoup(html, "html.parser")
    for name in tags_to_remove:
        for element in document.find_all(name):
            # unwrap() replaces the element with its own children.
            element.unwrap()
    return str(document)
222
+
223
def convert_th_to_td(html):
    """Replace all th tags with td tags and return the resulting HTML string.

    The parser is pinned to ``html.parser`` like the other helpers in this
    module. Without an explicit parser BeautifulSoup picks whichever one is
    installed, which makes the output environment-dependent.
    """
    soup = BeautifulSoup(html, "html.parser")
    for th_tag in soup.find_all('th'):
        th_tag.name = 'td'
    return str(soup)
230
+
231
def replace_italic(text):
    """Turn ``_text_`` spans into ``<i>text</i>``.

    Only underscores that are not preceded by a backslash delimit a span.
    Escaped underscores inside a span are unescaped in the output.
    """
    def _to_html(match):
        # Drop the backslash from escaped underscores inside the span.
        inner = match.group(1).replace(r'\_', '_')
        return f"<i>{inner}</i>"

    return re.sub(r'(?<!\\)_(.*?)(?<!\\)_', _to_html, text)
243
+
244
+
245
def replace_bold(text):
    """Turn ``**text**`` spans into ``<b>text</b>``.

    A ``**`` delimiter only counts when it is not preceded by a backslash.
    Escaped asterisks inside a span are unescaped in the output.
    """
    def _to_html(match):
        # Unescape any escaped asterisks within the captured text.
        inner = match.group(1).replace(r'\*', '*')
        return f"<b>{inner}</b>"

    return re.sub(r'(?<!\\)\*\*(.*?)(?<!\\)\*\*', _to_html, text)
255
+
256
def latex_table_to_html(latex_str, add_head_body = False):
    r"""
    Convert every ``\begin{tabular}...\end{tabular}`` environment in a string to HTML.

    Text outside tabular environments is returned unchanged. Rows are split on
    ``\\`` and cells on unescaped ``&``. ``\multirow`` / ``\multicolumn``
    become ``rowspan`` / ``colspan`` attributes. A cell containing newlines or
    ``<br>`` tokens makes the row expand into several ``<tr>`` rows.

    Args:
        latex_str: Text that may contain tabular environments.
        add_head_body: When True, the first emitted row is wrapped in
            ``<thead>`` and all following rows in ``<tbody>``.

    Returns:
        The input with each tabular environment replaced by an HTML table.
    """
    # Pattern to match the entire tabular environment
    table_pattern = r'\\begin{tabular}{([^}]*)}\s*(.*?)\\end{tabular}'

    def process_cell(cell):
        # Clean up cell content
        cell = cell.strip()

        out = extract_merged_commands(cell)
        if len(out) > 0:
            # Spanning cell: format the innermost content, keep the span sizes.
            cell = process_cell(out[0]["content"])["content"]
            rowspan = int(out[0].get("rowspan", "1"))
            colspan = int(out[0].get("colspan", "1"))
            return {
                "content": cell,
                "colspan": colspan,
                "rowspan": rowspan
            }

        # Replace latex and markdown formatting with HTML tags
        cell = re.sub(r'\$([^$]*)\$', r'\1', cell)  # Remove math mode
        cell = re.sub(r'\\textbf{([^}]*)}', r'<b>\1</b>', cell)  # Convert latex bold
        cell = re.sub(r'\\textit{([^}]*)}', r'<i>\1</i>', cell)  # Convert latex italic
        cell = replace_italic(cell)
        cell = replace_bold(cell)
        cell = cell.replace("\\$", "$").replace("\\%", "%").replace("\\newline", "\n").replace("\\textless", "<").replace("\\textgreater", ">").replace("\\*", "*").replace("\\_", "_").replace("\\backslash", "\\")

        # Replace \& with & in the cell text
        cell = cell.replace(r'\&', '&')
        cell = cell.replace('<tbc>', '')
        # Preserve newlines for downstream row-splitting; clean other tokens
        cell = cell.replace('\\unknown', '').replace('\\<|unk|\\>', '').replace('<u>', '<underline>').replace('</u>', '</underline>')
        return {
            'content': cell,
            'colspan': 1,
            'rowspan': 1
        }

    def split_row(input_string):
        # Use a regular expression to split on '&' that is not preceded by a backslash
        return re.split(r'(?<!\\)&', input_string)

    def split_lines(text):
        # Split one cell into visual lines on newline or <br> tokens
        return re.split(r'(?:\n|<br\s*/?>)+', text)

    def convert_table(match):
        # Only the table body is used; the column spec (group 1) does not
        # influence the generated HTML.
        content = match.group(2)

        # Start building HTML table
        html = ['<table>']

        # Track cells for multirow
        multirow_tracker = set()
        # Remember whether <tbody> was opened so it is closed only then.
        body_open = False

        # Process rows
        rows = re.split(r'\\\\', content)
        current_row = 0

        for row in rows:
            if not row.strip():
                continue

            row = row.strip()

            # Skip \hline
            if '\\hline' in row:
                row = row.replace('\\hline', '')
                if not row.strip():
                    continue

            row = clean_multi_cells(row)

            # Process cells
            cells = split_row(row)
            processed_cells = [process_cell(cell) for cell in cells]

            # Build per-cell line lists splitting on newline or <br> tokens
            line_lists = [split_lines(cell['content']) for cell in processed_cells]
            max_lines = max(len(lst) for lst in line_lists) if line_lists else 1

            # Emit one or more rows based on max_lines
            for line_idx in range(max_lines):
                if add_head_body:
                    if current_row == 0:
                        html.append(' <thead>')
                    if current_row == 1:
                        html.append(' <tbody>')
                        body_open = True
                html.append(' <tr>')
                current_col = 0

                for col_idx, cell in enumerate(processed_cells):
                    content_segment = line_lists[col_idx][line_idx] if line_idx < len(line_lists[col_idx]) else ''

                    attrs = []
                    if cell['colspan'] > 1:
                        attrs.append(f'colspan="{cell["colspan"]}"')
                    # Only apply original rowspan to the first emitted line
                    if cell['rowspan'] > 1 and line_idx == 0:
                        attrs.append(f'rowspan="{cell["rowspan"]}"')
                        for r in range(current_row + 1, current_row + cell['rowspan']):
                            for c in range(current_col, current_col + cell['colspan']):
                                multirow_tracker.add((r, c))

                    # If this position is covered by a prior rowspan, skip rendering a duplicate cell
                    if cell['rowspan'] > 1 and line_idx > 0:
                        current_col += cell['colspan']
                        continue

                    if (current_row, current_col) in multirow_tracker and content_segment == '' and cell["colspan"] == 1 and cell["rowspan"] == 1:
                        current_col += cell['colspan']
                        continue

                    attr_str = ' ' + ' '.join(attrs) if attrs else ''
                    cell_tag = 'td'
                    html.append(f' <{cell_tag}{attr_str}>{content_segment}</{cell_tag}>')
                    current_col += cell['colspan']

                # Close the row before the header section so the tags nest
                # correctly (<thead><tr>...</tr></thead>).
                html.append(' </tr>')
                if add_head_body and current_row == 0:
                    html.append(' </thead>')
                current_row += 1
        if body_open:
            html.append(' </tbody>')
        html.append('</table>')
        return '\n'.join(html)

    # Convert all tabular environments in the input
    return re.sub(table_pattern, convert_table, latex_str, flags=re.DOTALL)
386
def convert_single_table(table):
    """
    Convert a single HTML table to Markdown format.

    Each ``<tr>`` with at least one ``<td>``/``<th>`` becomes one Markdown row;
    rows without cells are skipped. The ``---`` separator is placed after the
    first row that is actually emitted, so a leading empty ``<tr>`` no longer
    leaves the Markdown table without a header separator.

    Args:
        table: BeautifulSoup table element

    Returns:
        str: Markdown table string
    """
    markdown_lines = []
    header_written = False

    for row in table.find_all('tr'):
        cells = row.find_all(['td', 'th'])
        if not cells:
            continue

        # Flatten nested markup to text and escape pipes, which would
        # otherwise be read as column delimiters.
        row_data = [
            cell.get_text(separator=' ', strip=True).replace('|', '\\|')
            for cell in cells
        ]
        markdown_lines.append('| ' + ' | '.join(row_data) + ' |')

        # Add separator after header row
        if not header_written:
            markdown_lines.append('| ' + ' | '.join(['---'] * len(cells)) + ' |')
            header_written = True

    return '\n'.join(markdown_lines)
422
def convert_html_tables_to_markdown(html_content):
    """
    Replace each HTML table in ``html_content`` with its Markdown rendering.

    Content outside tables is kept. When the input has no ``<table>`` at all,
    the original string is returned untouched (it is not re-serialized).

    Args:
        html_content (str): HTML content that may contain tables

    Returns:
        str: HTML content with tables converted to Markdown
    """
    document = BeautifulSoup(html_content, 'html.parser')
    found_tables = document.find_all('table')

    if not found_tables:
        return html_content  # nothing to convert

    for element in found_tables:
        rendered = convert_single_table(element)
        # Surround the Markdown with newlines so it stays on its own lines.
        element.replace_with(document.new_string('\n' + rendered + '\n'))

    return str(document)
logitsprocs/nemotron_parse_vllm_logitprocs.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Custom vLLM v1 logits processors for NVIDIA Nemotron Parse.
3
+
4
+ Tuning via environment variables:
5
+ - NEMOTRON_PARSE_TABLE_PREFIX (default: \\begin{tabular})
6
+ - NEMOTRON_PARSE_REP_MAX (default: 10)
7
+ - NEMOTRON_PARSE_REP_WINDOW (default: 2 * max_ngram * (rep_max + 1))
8
+ - NEMOTRON_PARSE_REP_NGRAMS (default: 1,2,3,4,5)
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import os
14
+ import warnings
15
+ from dataclasses import dataclass
16
+ from typing import Dict, List, Set
17
+
18
+ import torch
19
+ from transformers import AutoTokenizer
20
+
21
+ from vllm.v1.sample.logits_processor.builtin import process_dict_updates
22
+ from vllm.v1.sample.logits_processor.interface import BatchUpdate, LogitsProcessor
23
+
24
+
25
def _strip_trailing_negative_token_ids(token_ids: List[int]) -> List[int]:
    """Return ``token_ids`` without its trailing run of negative ids.

    vLLM v1 keeps a trailing -1 placeholder in output_token_ids. Negative ids
    that are not part of the trailing run are left in place.
    """
    end = len(token_ids)
    for tid in reversed(token_ids):
        if tid >= 0:
            break
        end -= 1
    return token_ids[:end]
31
+
32
+
33
def _env_int(name: str, default: int) -> int:
    """Read environment variable ``name`` as an int.

    ``default`` is returned when the variable is unset, blank, or not a valid
    integer literal.
    """
    raw = os.environ.get(name, "").strip()
    try:
        return int(raw) if raw else default
    except ValueError:
        return default
41
+
42
+
43
def _env_csv_ints(name: str, default: List[int]) -> List[int]:
    """Read environment variable ``name`` as a comma-separated list of ints.

    Empty items are ignored. ``default`` is returned when the variable is
    unset or blank, when any item is not an integer, or when no item remains.
    """
    raw = os.environ.get(name, "").strip()
    if not raw:
        return default
    pieces = [piece.strip() for piece in raw.split(",")]
    try:
        parsed = [int(piece) for piece in pieces if piece]
    except ValueError:
        # One bad item invalidates the whole setting.
        return default
    return parsed or default
57
+
58
+
59
def _warn_if_unexpected_arch(vllm_config) -> None:
    """Emit a RuntimeWarning unless the model declares the Nemotron-Parse architecture.

    The architecture list is read from ``vllm_config.model_config.architectures``.
    If it cannot be read it is treated as empty, which also triggers the warning.
    """
    try:
        arches = getattr(vllm_config.model_config, "architectures", None) or []
    except Exception:
        arches = []

    for arch in arches:
        if arch == "NemotronParseForConditionalGeneration":
            return

    message = (
        "Nemotron-Parse logits processors enabled for a model whose "
        f"architectures={arches!r}. These processors assume Nemotron-Parse-style "
        "special tokens like <x_...>, <y_...>, and <class_...>; behavior may be "
        "unsupported for other models."
    )
    warnings.warn(message, RuntimeWarning, stacklevel=2)
74
+
75
+
76
def _load_hf_tokenizer(vllm_config):
    """Load the Hugging Face tokenizer described by ``vllm_config.model_config``.

    The tokenizer name falls back to the model name, and the tokenizer
    revision falls back to the model revision. ``revision`` and ``token`` are
    only passed to ``AutoTokenizer.from_pretrained`` when they are set.
    """
    cfg = vllm_config.model_config
    source = getattr(cfg, "tokenizer", None) or getattr(cfg, "model", None)

    load_args = {"trust_remote_code": bool(getattr(cfg, "trust_remote_code", False))}

    rev = getattr(cfg, "tokenizer_revision", None) or getattr(cfg, "revision", None)
    if rev:
        load_args["revision"] = rev

    hf_token = getattr(cfg, "hf_token", None)
    if hf_token:
        # HF uses `token=` in modern versions.
        load_args["token"] = hf_token

    return AutoTokenizer.from_pretrained(source, **load_args)
89
+
90
+
91
def _build_token_sets(tokenizer) -> tuple[Set[int], Set[int], Set[int]]:
    """Return the token-id sets for ``<x_...>``, ``<y_...>`` and ``<class_...>`` tokens.

    The added vocabulary is scanned first. If that yields fewer than 100 x
    ids, fewer than 100 y ids, or no class id, the result is replaced by a
    scan of the full vocabulary. A token counts only when it ends with '>'
    and contains exactly one '>'.
    """
    prefixes = ("<x_", "<y_", "<class_")

    def _collect(pairs) -> tuple[Set[int], Set[int], Set[int]]:
        buckets = {prefix: set() for prefix in prefixes}
        for piece, token_id in pairs:
            if not piece.endswith(">") or piece.count(">") != 1:
                continue
            for prefix in prefixes:
                if piece.startswith(prefix):
                    buckets[prefix].add(token_id)
                    break
        return buckets["<x_"], buckets["<y_"], buckets["<class_"]

    extra_vocab = {}
    if hasattr(tokenizer, "get_added_vocab"):
        try:
            extra_vocab = tokenizer.get_added_vocab() or {}
        except Exception:
            # Best effort: fall through to the full-vocabulary scan below.
            extra_vocab = {}

    x_ids, y_ids, class_ids = _collect(extra_vocab.items())

    enough = len(x_ids) >= 100 and len(y_ids) >= 100 and bool(class_ids)
    if not enough:
        x_ids, y_ids, class_ids = _collect(tokenizer.get_vocab().items())

    return x_ids, y_ids, class_ids
122
+
123
+
124
@dataclass
class _TableReqState:
    """Per-request state used by NemotronParseTableInsertionLogitsProcessor."""

    # Output token list handed to the state factory in update_state; it is
    # re-read on every update (presumably kept current by vLLM - confirm).
    output_ids: List[int]
    # True while apply() is still forcing table-prefix tokens for this request.
    insertion_active: bool = False
    # Index into table_prefix_ids of the next token to force.
    insertion_pos: int = 0
    # Set once a START <x_...><y_...> pair was handled; cleared when a new
    # <class_...> token shows up in the output.
    expecting_end_coords: bool = False
    # Position of the most recent <class_...> token seen so far (-1 = none).
    _last_class_pos: int = -1
131
+
132
+
133
class NemotronParseTableInsertionLogitsProcessor(LogitsProcessor):
    """Force a table prefix right after each START <x_...><y_...> coordinate pair.

    State is kept per request in ``req_states``. ``update_state`` decides when
    an insertion starts; ``apply`` then forces one prefix token per call until
    the whole prefix has been emitted.
    """

    def __init__(self, vllm_config, device: torch.device, is_pin_memory: bool) -> None:
        """Load the tokenizer and precompute prefix ids and special-token id sets.

        ``device`` and ``is_pin_memory`` are accepted but not used here.
        """
        self.enabled = True
        # Keys are used as row indices into the logits tensor in apply().
        self.req_states: Dict[int, _TableReqState] = {}

        self.table_prefix = os.environ.get("NEMOTRON_PARSE_TABLE_PREFIX", r"\begin{tabular}")
        self.table_prefix_ids: List[int] = []
        self.x_token_ids: Set[int] = set()
        self.y_token_ids: Set[int] = set()
        self.class_token_ids: Set[int] = set()

        if self.enabled:
            _warn_if_unexpected_arch(vllm_config)
            tok = _load_hf_tokenizer(vllm_config)
            self.table_prefix_ids = tok.encode(self.table_prefix, add_special_tokens=False)
            self.x_token_ids, self.y_token_ids, self.class_token_ids = _build_token_sets(tok)

        # Without coordinate tokens or an encodable prefix there is nothing to force.
        if not self.x_token_ids or not self.y_token_ids or not self.table_prefix_ids:
            self.enabled = False

    @classmethod
    def validate_params(cls, sampling_params):
        """No per-request parameters are validated; always returns None."""
        return None

    def is_argmax_invariant(self) -> bool:
        """Return False: apply() can change which token has the highest logit."""
        return False

    def update_state(self, batch_update: BatchUpdate | None) -> None:
        """Sync per-request state with the batch and arm insertions where needed."""
        if not self.enabled:
            return

        def _new_state(params, prompt_ids, output_ids):
            # Only the output token list is needed for this processor.
            return _TableReqState(output_ids=output_ids)

        process_dict_updates(self.req_states, batch_update, _new_state)

        for st in self.req_states.values():
            out = st.output_ids
            if not out:
                continue

            view = _strip_trailing_negative_token_ids(out)
            if not view:
                continue

            # If we ended an object (a newly-generated <class_...>), next xy should be START.
            if self.class_token_ids:
                last_class_pos = -1
                # Scan backwards for the most recent class token.
                for i in range(len(view) - 1, -1, -1):
                    if view[i] in self.class_token_ids:
                        last_class_pos = i
                        break
                if last_class_pos != -1 and last_class_pos != st._last_class_pos:
                    st._last_class_pos = last_class_pos
                    st.expecting_end_coords = False

            # A prefix is already being forced; do not re-arm.
            if st.insertion_active:
                continue

            # The last two tokens form an <x_...><y_...> pair.
            if len(view) >= 2 and view[-2] in self.x_token_ids and view[-1] in self.y_token_ids:
                if not st.expecting_end_coords:
                    st.insertion_active = True
                    st.insertion_pos = 0
                    st.expecting_end_coords = True
                else:
                    # END pair of the current object: nothing to insert.
                    pass

    def apply(self, logits: torch.Tensor) -> torch.Tensor:
        """Force the next prefix token for every request with an active insertion.

        The affected rows of ``logits`` are modified in place: every entry is
        set to -inf except the forced token, which is set to 0.0.
        """
        if not self.enabled or not self.req_states:
            return logits

        for req_idx, st in self.req_states.items():
            if not st.insertion_active:
                continue
            pos = st.insertion_pos
            if pos >= len(self.table_prefix_ids):
                st.insertion_active = False
                continue

            forced_tid = self.table_prefix_ids[pos]
            logits[req_idx].fill_(-float("inf"))
            logits[req_idx, forced_tid] = 0.0

            # Advance here; assumes apply() runs once per generated token - TODO confirm.
            st.insertion_pos = pos + 1
            if st.insertion_pos >= len(self.table_prefix_ids):
                st.insertion_active = False

        return logits
223
+
224
+
225
@dataclass
class _RepReqState:
    """Per-request state used by NemotronParseRepetitionStopLogitsProcessor."""

    # Output token list handed to the state factory in update_state; it is
    # re-read on every update (presumably kept current by vLLM - confirm).
    output_ids: List[int]
    # True after the processor intervened; cleared by a new <class_...> token.
    in_cooldown: bool = False
    # Index of the first output token of the segment checked for repetition.
    segment_start: int = 0
    # Position of the most recent <class_...> token seen so far (-1 = none).
    _last_class_pos: int = -1
231
+
232
+
233
class NemotronParseRepetitionStopLogitsProcessor(LogitsProcessor):
    """Force an <x_...> token when consecutive repetition exceeds a threshold.

    Repetition is measured on the current segment, i.e. the output tokens
    after the most recent <class_...> token. After an intervention the request
    stays in cooldown until the next <class_...> token is generated.
    """

    def __init__(self, vllm_config, device: torch.device, is_pin_memory: bool) -> None:
        """Read tuning values from the environment and load the special-token ids.

        ``device`` and ``is_pin_memory`` are accepted but not used here.
        """
        self.enabled = True
        # Keys are used as row indices into the logits tensor in apply().
        self.req_states: Dict[int, _RepReqState] = {}

        self.max_repetitions = _env_int("NEMOTRON_PARSE_REP_MAX", 10)
        self.ngram_sizes = _env_csv_ints("NEMOTRON_PARSE_REP_NGRAMS", [1, 2, 3, 4, 5])
        max_ngram = max((n for n in self.ngram_sizes if n > 0), default=1)
        default_window = 2 * max_ngram * (self.max_repetitions + 1)
        self.window_size = _env_int("NEMOTRON_PARSE_REP_WINDOW", default_window)

        self.x_token_ids: Set[int] = set()
        self.class_token_ids: Set[int] = set()

        if self.enabled:
            _warn_if_unexpected_arch(vllm_config)
            tok = _load_hf_tokenizer(vllm_config)
            x_ids, _, class_ids = _build_token_sets(tok)
            self.x_token_ids = x_ids
            self.class_token_ids = class_ids

        # Without <x_...> tokens there is nothing to force.
        if not self.x_token_ids:
            self.enabled = False

    @classmethod
    def validate_params(cls, sampling_params):
        """No per-request parameters are validated; always returns None."""
        return None

    def is_argmax_invariant(self) -> bool:
        """Return False: apply() can change which token has the highest logit."""
        return False

    def update_state(self, batch_update: BatchUpdate | None) -> None:
        """Sync per-request state with the batch and track segment boundaries."""
        if not self.enabled:
            return

        def _new_state(params, prompt_ids, output_ids):
            # Only the output token list is needed for this processor.
            return _RepReqState(output_ids=output_ids)

        process_dict_updates(self.req_states, batch_update, _new_state)

        for st in self.req_states.values():
            out = st.output_ids
            if not out:
                continue
            view = _strip_trailing_negative_token_ids(out)
            if not view:
                continue
            # Clear cooldown and advance segment on a newly-generated <class_...> token.
            if self.class_token_ids:
                last_class_pos = -1
                for i in range(len(view) - 1, -1, -1):
                    if view[i] in self.class_token_ids:
                        last_class_pos = i
                        break
                if last_class_pos != -1 and last_class_pos != st._last_class_pos:
                    st._last_class_pos = last_class_pos
                    st.in_cooldown = False
                    st.segment_start = last_class_pos + 1

    @staticmethod
    def _max_consecutive_repetitions(seq: List[int], n: int) -> int:
        """Return the largest number of back-to-back copies of any n-gram in ``seq``.

        Returns 0 when ``seq`` is shorter than ``n`` (or ``n`` is not positive)
        and 1 when no n-gram is immediately repeated.

        The scan is alignment-independent: a position continues a run when its
        token equals the token ``n`` positions earlier, and every ``n`` such
        positions complete one more copy. The previous implementation compared
        overlapping n-grams after a mismatch, so for n >= 2 it missed patterns
        that did not start at index 0 of the inspected window.
        """
        if n <= 0 or len(seq) < n:
            return 0
        best = 1
        matched = 0  # consecutive positions i with seq[i] == seq[i - n]
        for i in range(n, len(seq)):
            if seq[i] == seq[i - n]:
                matched += 1
                copies = matched // n + 1
                if copies > best:
                    best = copies
            else:
                matched = 0
        return best

    def _has_excessive_repetition(self, seq: List[int]) -> bool:
        """Return True if any configured n-gram size repeats more than ``max_repetitions`` times.

        Only the last ``window_size`` tokens of ``seq`` are inspected.
        """
        if not seq:
            return False
        check_seq = seq[-self.window_size:] if len(seq) > self.window_size else seq
        for n in self.ngram_sizes:
            if n <= 0:
                continue
            if self._max_consecutive_repetitions(check_seq, n) > self.max_repetitions:
                return True
        return False

    def apply(self, logits: torch.Tensor) -> torch.Tensor:
        """Restrict requests that repeat excessively to <x_...> tokens.

        For such a request the logits row is modified in place: every entry
        becomes -inf except the <x_...> token ids, which keep their original
        values. The request then enters cooldown.
        """
        if not self.enabled or not self.req_states:
            return logits

        for req_idx, st in self.req_states.items():
            if st.in_cooldown:
                continue
            view = _strip_trailing_negative_token_ids(st.output_ids)
            seg = view[st.segment_start:]
            if not self._has_excessive_repetition(seg):
                continue

            st.in_cooldown = True
            row = logits[req_idx]
            # Keep a copy so the <x_...> logits can be restored after masking.
            original = row.clone()
            row.fill_(-float("inf"))
            for tid in self.x_token_ids:
                row[tid] = original[tid]

        return logits
346
+
347
+
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d77822f43a04504619e4c3527e1371568b6e33dba94b7c3a0c0cf509a200cd4
3
+ size 3745188184
postprocessing.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from latex2html import convert_html_tables_to_markdown, latex_table_to_html
3
+
4
def extract_classes_bboxes(text: str):
    """Parse model output into parallel lists of classes, bounding boxes and texts.

    Every ``<x_..><y_..>content<x_..><y_..><class_..>`` occurrence contributes
    one entry: the class name, the box ``(x1, y1, x2, y2)`` as floats, and the
    raw content between the two coordinate pairs (it may span lines).

    The class "Inline-formula" is reported as "Formula". The function asserts
    that no "Page-number" class is present.
    """
    number = r'(\d+(?:\.\d+)?)'
    region_re = re.compile(
        r'<x_' + number + r'><y_' + number + r'>(.*?)<x_' + number + r'><y_' + number + r'><class_([^>]+)>',
        re.DOTALL,
    )

    classes = []
    bboxes = []
    texts = []
    for found in region_re.finditer(text):
        left, top, body, right, bottom, label = found.groups()
        classes.append(label)
        bboxes.append((float(left), float(top), float(right), float(bottom)))
        texts.append(body)

    # TODO: Remove when fixed
    classes = ["Formula" if label == "Inline-formula" else label for label in classes]
    assert "Page-number" not in classes

    return classes, bboxes, texts
22
+
23
def transform_bbox_to_original(bbox, original_width, original_height, target_w=1664, target_h=2048):
    """Map a normalized box on the padded target canvas back to original-image pixels.

    ``bbox`` is ``(x1, y1, x2, y2)`` as fractions of the ``target_w`` x
    ``target_h`` canvas. The function recomputes the resized image size (shrunk
    to fit the canvas, aspect ratio kept, never enlarged) and the centering
    padding. It then removes the padding and rescales to the original size.

    Returns ``(left, top, right, bottom)`` in original-image coordinates.
    """
    aspect_ratio = original_width / original_height
    resized_width, resized_height = original_width, original_height

    # Replicate exact resize logic: limit the height first, then the width.
    if original_height > target_h:
        resized_height = target_h
        resized_width = int(resized_height * aspect_ratio)

    if resized_width > target_w:
        resized_width = target_w
        resized_height = int(resized_width / aspect_ratio)

    # The resized image is centered on the canvas.
    pad_left = (target_w - resized_width) // 2
    pad_top = (target_h - resized_height) // 2

    def _to_original(fraction, canvas, pad, original, resized):
        # Uses the ACTUAL resized dimensions rather than a single scale factor.
        return ((fraction * canvas) - pad) * original / resized

    x_start, y_start, x_end, y_end = bbox[0], bbox[1], bbox[2], bbox[3]
    left = _to_original(x_start, target_w, pad_left, original_width, resized_width)
    right = _to_original(x_end, target_w, pad_left, original_width, resized_width)
    top = _to_original(y_start, target_h, pad_top, original_height, resized_height)
    bottom = _to_original(y_end, target_h, pad_top, original_height, resized_height)

    return left, top, right, bottom
54
+
55
def postprocess_text(text, cls = 'Text', text_format='markdown', table_format='latex', blank_text_in_figures=False):
    """Convert the text of one detected element into the requested output format.

    For every class except 'Table', the text is converted to plain text when
    ``text_format`` is 'plain' and left alone otherwise. For 'Table', the
    LaTeX table is converted to HTML or Markdown depending on
    ``table_format`` ('latex' keeps it unchanged). When
    ``blank_text_in_figures`` is set, the text of 'Picture' elements becomes
    an empty string.

    Unsupported ``text_format`` / ``table_format`` values trip an assertion.
    """
    assert text_format in ['markdown', 'plain'], 'Unknown text format. Supported: markdown | plain'
    assert table_format in ['latex', 'HTML', 'markdown'], 'Unknown table format. Supported: latex | HTML | markdown'

    if cls == 'Table':
        if table_format == 'HTML':
            text = latex_table_to_html(text)
        elif table_format == 'markdown':
            text = convert_html_tables_to_markdown(latex_table_to_html(text))
    elif text_format == 'plain':
        text = convert_mmd_to_plain_text_ours(text)

    return '' if blank_text_in_figures and cls == 'Picture' else text
68
+
69
def remove_nemotron_formatting(text):
    """Delete the Nemotron marker tokens from ``text``.

    The removed tokens are ``<tbc>``, the escaped unknown-token marker and
    the ``\\unknown`` command.
    """
    for marker in ('<tbc>', '\\<|unk|\\>', '\\unknown'):
        text = text.replace(marker, '')
    return text
74
def convert_mmd_to_plain_text_ours(mmd_text):
    """Strip Markdown-style markup from ``mmd_text`` and return plain text.

    ``<sup>x</sup>`` becomes ``^{x}`` and ``<sub>x</sub>`` becomes ``_{x}``.
    ``<br>`` becomes a newline. Header markers (``#`` runs followed by
    whitespace), ``**bold**``, ``*italic*`` and ``_emphasis_`` markers are
    removed while their text is kept. The result is stripped of leading and
    trailing whitespace.
    """
    # The replacement must reference group 1 as \1. The former r'^{\\1}'
    # template emitted a literal backslash-1 and dropped the captured text.
    mmd_text = re.sub(r'<sup>(.*?)</sup>', r'^{\1}', mmd_text, flags=re.DOTALL)
    mmd_text = re.sub(r'<sub>(.*?)</sub>', r'_{\1}', mmd_text, flags=re.DOTALL)
    mmd_text = mmd_text.replace('<br>', '\n')

    # Remove headers (e.g., ##)
    mmd_text = re.sub(r'#+\s', '', mmd_text)

    # Remove bold (e.g., **)
    mmd_text = re.sub(r'\*\*(.*?)\*\*', r'\1', mmd_text)
    #mmd_text = mmd_text.replace("**","")
    # Remove italic (e.g., *)
    mmd_text = re.sub(r'\*(.*?)\*', r'\1', mmd_text)
    # Remove emphasized text formatting (e.g., _)
    mmd_text = re.sub(r'(?<!\w)_([^_]+)_', r'\1', mmd_text)

    # Remove formulas inside paragraphs (e.g., \(R_{ij}(P^{a})=0\))
    #mmd_text = re.sub(r'\\\((.*?)\\\)', '', mmd_text)

    # Remove asterisk in lists
    #mmd_text = re.sub(r'^\*\s', '', mmd_text, flags=re.MULTILINE)
    return mmd_text.strip()
preprocessor_config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "feature_extractor_type": "NemotronParseImageProcessor",
3
+ "image_processor_type": "NemotronParseImageProcessor",
4
+ "processor_class": "NemotronParseProcessor",
5
+ "auto_map": {
6
+ "AutoImageProcessor": "hf_nemotron_parse_processor.NemotronParseImageProcessor",
7
+ "AutoProcessor": "hf_nemotron_parse_processor.NemotronParseProcessor"
8
+ },
9
+ "do_normalize": false,
10
+ "do_rescale": true,
11
+ "rescale_factor": 0.00392156862745098,
12
+ "size": {
13
+ "height": 2048,
14
+ "width": 1664,
15
+ "longest_edge": [
16
+ 2048,
17
+ 1664
18
+ ]
19
+ },
20
+ "final_size": [
21
+ 2048,
22
+ 1664
23
+ ]
24
+ }
privacy.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ | Field | Response |
2
+ | :---- | :---- |
3
+ | Generatable or reverse engineerable personal data? | No |
4
+ | Personal data used to create this model? | No |
5
+ | How often is the dataset reviewed? | Before Release |
6
+ | Was data from user interactions with the AI model (e.g. user input and prompts) used to train the model?| No|
7
+ | Is there provenance for all datasets used in training? | Yes |
8
+ | Does data labeling (annotation, metadata) comply with privacy laws? | Yes |
9
+ | Is data compliant with data subject requests for data correction or removal, if such a request was made? | No, not possible with externally-sourced data. |
10
+ | Applicable Privacy Policy | [NVIDIA Privacy Policy](https://www.nvidia.com/en-us/about-nvidia/privacy-policy/) |
processor_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "NemotronParseProcessor",
3
+ "image_processor_type": "NemotronParseImageProcessor",
4
+ "auto_map": {
5
+ "AutoProcessor": "hf_nemotron_parse_processor.NemotronParseProcessor",
6
+ "AutoImageProcessor": "hf_nemotron_parse_processor.NemotronParseImageProcessor"
7
+ }
8
+ }
safety.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ | Field | Response |
2
+ | :---- | :---- |
3
+ | Model Application Field(s): | Chat, Instruction Following, Chatbot Development, Code Generation, Reasoning, Customer Service |
4
+ | Describe the life critical impact (if present). | Not Applicable |
5
+ | Use Case Restrictions: | GOVERNING TERMS: [Product-Specific Terms for NVIDIA AI Products](https://www.nvidia.com/en-us/agreements/enterprise-software/product-specific-terms-for-ai-products/). Use of this model is governed by the [NVIDIA Community Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-community-models-license/). Use of the tokenizer included in this model is governed by the [CC-BY-4.0 license](https://creativecommons.org/licenses/by/4.0/)|
6
+ | Model and dataset restrictions: | The Principle of least privilege (PoLP) is applied, limiting access for dataset generation and model development. Restrictions are enforced on dataset access during training, and dataset license constraints are adhered to. |
7
+
8
+ **You are responsible for ensuring that your use of NVIDIA provided models complies with all applicable laws.**
special_tokens_map.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<predict_no_text_in_pic>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ }
10
+ ],
11
+ "bos_token": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "eos_token": {
19
+ "content": "</s>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "pad_token": {
26
+ "content": "<pad>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ },
32
+ "unk_token": {
33
+ "content": "<unk>",
34
+ "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ }
39
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
vllm_example.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import base64
3
+ import os
4
+ from concurrent.futures import ThreadPoolExecutor, as_completed
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
8
+
9
+ from openai import OpenAI
10
+
11
+
12
def _guess_mime(path: str) -> str:
    """Return the image MIME type implied by the file extension of ``path``.

    The extension is compared case-insensitively. ``jpg``/``jpeg`` map to
    ``image/jpeg`` and ``webp`` to ``image/webp``; anything else, including
    a missing extension, falls back to ``image/png``.
    """
    by_extension = {
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "webp": "image/webp",
    }
    extension = Path(path).suffix.lower().lstrip(".")
    # default
    return by_extension.get(extension, "image/png")
20
+
21
+
22
def _b64_image_data_url(path: str) -> str:
    """Read the file at ``path`` and return it as a base64 ``data:`` URL.

    The MIME type in the URL comes from ``_guess_mime(path)``.
    """
    payload = base64.b64encode(Path(path).read_bytes()).decode("utf-8")
    return f"data:{_guess_mime(path)};base64,{payload}"
27
+
28
+
29
def _iter_images(paths: List[str], image_dir: Optional[str]) -> List[str]:
    """Build the ordered, duplicate-free list of image paths to process.

    Args:
        paths: Explicitly given image paths; they come first, in the given order.
        image_dir: Optional directory. When set, its ``*.png``, ``*.jpg``,
            ``*.jpeg`` and ``*.webp`` files are appended, one pattern after
            another and sorted within each pattern.

    Returns:
        All collected paths as strings, keeping only the first occurrence
        of each one.
    """
    collected: List[str] = list(paths)
    if image_dir:
        folder = Path(image_dir)
        for pattern in ("*.png", "*.jpg", "*.jpeg", "*.webp"):
            collected.extend(str(match) for match in sorted(folder.glob(pattern)))
    # De-dupe, keep order (dict keys preserve first-insertion order).
    return list(dict.fromkeys(collected))
45
+
46
+
47
@dataclass(frozen=True)
class _ReqSpec:
    """Immutable description of a single request."""

    # Path of the image to send.
    image_path: str
    # Index of the request; used as a numeric prefix for its output files.
    request_idx: int
51
+
52
+
53
def _make_client(base_url: str) -> OpenAI:
    """Create an ``OpenAI`` client that talks to the server at ``base_url``.

    The API key is read from the ``OPENAI_API_KEY`` environment variable and
    defaults to the placeholder ``"EMPTY"`` when it is not set.
    """
    # openai>=1.x requires an API key; vLLM ignores it by default.
    return OpenAI(
        base_url=base_url,
        api_key=os.environ.get("OPENAI_API_KEY", "EMPTY"),
    )
57
+
58
+
59
def _run_one(
    req: _ReqSpec,
    *,
    base_url: str,
    model: str,
    prompt_text: str,
    max_tokens: int,
    temperature: float,
    extra_body: Dict[str, Any],
) -> Tuple[_ReqSpec, str]:
    """Send one chat-completion request for ``req`` and return its text.

    A fresh client is created for the call. The user message carries
    ``prompt_text`` followed by the image, embedded as a base64 data URL.

    Args:
        req: The request to run; ``req.image_path`` is the image that is sent.
        base_url: Base URL of the OpenAI-compatible server.
        model: Model name passed to the server.
        prompt_text: Text part of the user message.
        max_tokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.
        extra_body: Additional request fields, forwarded as ``extra_body``.

    Returns:
        ``(req, text)`` where ``text`` is the content of the first choice,
        or ``""`` when that content is empty or ``None``.
    """
    api = _make_client(base_url)
    data_url = _b64_image_data_url(req.image_path)
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt_text},
            {"type": "image_url", "image_url": {"url": data_url}},
        ],
    }
    completion = api.chat.completions.create(
        model=model,
        messages=[user_message],
        max_tokens=max_tokens,
        temperature=temperature,
        extra_body=extra_body,
    )
    content = completion.choices[0].message.content
    return req, (content if content else "")
88
+
89
+
90
def _maybe_annotate(image_path: str, generated_text: str, out_image_path: str) -> None:
    """Draw the bounding boxes found in ``generated_text`` onto the image and save it.

    Args:
        image_path: Image that was sent to the model; it is opened and
            converted to RGB.
        generated_text: Model output, parsed with ``extract_classes_bboxes``.
        out_image_path: Where the annotated image is written.
    """
    # Optional visualization (similar to example_with_table_processor.py).
    from PIL import Image, ImageDraw  # local import so batching can run without pillow

    from postprocessing import extract_classes_bboxes, postprocess_text, transform_bbox_to_original

    canvas = Image.open(image_path).convert("RGB")

    classes, raw_boxes, texts = extract_classes_bboxes(generated_text)
    boxes = [transform_bbox_to_original(box, canvas.width, canvas.height) for box in raw_boxes]

    table_format = "HTML"  # latex | HTML | markdown
    text_format = "markdown"  # markdown | plain
    blank_text_in_figures = False

    # The post-processed texts are computed but not used by the drawing below;
    # the call is kept so the example exercises the same path as before.
    for text, cls in zip(texts, classes):
        postprocess_text(
            text,
            cls=cls,
            table_format=table_format,
            text_format=text_format,
            blank_text_in_figures=blank_text_in_figures,
        )

    pen = ImageDraw.Draw(canvas)
    for left, top, right, bottom in boxes:
        # Clamp so the second corner never lies before the first one.
        corners = (left, top, max(left, right), max(top, bottom))
        pen.rectangle(corners, outline="red", width=2)

    canvas.save(out_image_path)
125
+
126
+
127
def main() -> None:
    """Command-line entry point: run a batch of images and write the results.

    For every image, the generated text is written to
    ``<out-dir>/<idx>_<image name>.txt``. Unless ``--no-annotate`` is given,
    an ``.annotated.jpg`` with the predicted boxes is written next to it.
    Finally ``summary.txt`` lists ``image path<TAB>output path`` pairs, sorted.

    Raises:
        SystemExit: If neither ``--image`` nor ``--image-dir`` yields any image.
    """
    parser = argparse.ArgumentParser(description="vLLM OpenAI-compatible example (batch + .txt outputs).")
    parser.add_argument("--base-url", default="http://localhost:8000/v1")
    parser.add_argument("--model", default="nvidia/NVIDIA-Nemotron-Parse-v1.2")
    parser.add_argument("--image", action="append", default=[], help="Image path (repeatable).")
    parser.add_argument("--image-dir", default=None, help="Directory of images to run (png/jpg/jpeg/webp).")
    parser.add_argument("--out-dir", default="vllm_outputs", help="Where to write .txt outputs.")
    parser.add_argument("--concurrency", type=int, default=4, help="How many concurrent requests to send.")
    parser.add_argument("--max-tokens", type=int, default=8994)
    parser.add_argument("--temperature", type=float, default=0.0)
    parser.add_argument(
        "--annotate",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Write annotated images with boxes to --out-dir (default: enabled). Use --no-annotate to disable.",
    )
    args = parser.parse_args()

    image_paths = _iter_images(args.image, args.image_dir)
    if not image_paths:
        raise SystemExit("No images provided. Use --image PATH (repeatable) or --image-dir DIR.")

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Alternative prompt kept from the original example:
    # "</s><s><predict_bbox><predict_classes><output_markdown><predict_text_in_pic>"
    prompt_text = "</s><s><predict_bbox><predict_classes><output_markdown><predict_no_text_in_pic>"

    # Keyword arguments shared by every request.
    shared_kwargs = {
        "base_url": args.base_url,
        "model": args.model,
        "prompt_text": prompt_text,
        "max_tokens": args.max_tokens,
        "temperature": args.temperature,
        "extra_body": {
            "repetition_penalty": 1.1,
            "top_k": 1,
            "skip_special_tokens": False,
        },
    }

    reqs: List[_ReqSpec] = [
        _ReqSpec(image_path=img, request_idx=idx) for idx, img in enumerate(image_paths)
    ]

    # Concurrency is the simplest way to make sure vLLM batches requests internally.
    summary_lines: List[str] = []
    with ThreadPoolExecutor(max_workers=max(1, args.concurrency)) as pool:
        pending = [pool.submit(_run_one, spec, **shared_kwargs) for spec in reqs]
        # Results are handled in completion order; the index prefix keeps
        # output names unique and tied to the input order.
        for finished in as_completed(pending):
            spec, generated = finished.result()
            stem = f"{spec.request_idx:04d}_{Path(spec.image_path).name}"
            txt_path = out_dir / f"{stem}.txt"
            txt_path.write_text(generated, encoding="utf-8")
            summary_lines.append(f"{spec.image_path}\t{txt_path}")

            if args.annotate:
                annotated_path = out_dir / f"{stem}.annotated.jpg"
                _maybe_annotate(spec.image_path, generated, str(annotated_path))

    (out_dir / "summary.txt").write_text("\n".join(sorted(summary_lines)) + "\n", encoding="utf-8")
195
+
196
+
197
+ if __name__ == "__main__":
198
+ main()