helizac
/

dots.ocr-4bit

@@ -47,53 +47,46 @@ You can then use the 4-bit model with the following Python script. Note the incl
 import torch
 from transformers import AutoModelForCausalLM, AutoProcessor
 from PIL import Image
-import os  # NOTE(review): not referenced anywhere in the visible script
-import traceback  # NOTE(review): not referenced anywhere in the visible script
-# This assumes the utility script is available in your environment
 from qwen_vl_utils import process_vision_info
 MODEL_ID = "helizac/dots.ocr-4bit"
-print("Loading 4-bit quantized model from the Hub...")
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    device_map="auto",
-    trust_remote_code=True,
-    torch_dtype=torch.bfloat16,
-)
-processor = AutoProcessor.from_pretrained(
-    MODEL_ID,
-    trust_remote_code=True
-)
-print("✅ Model and processor loaded successfully!")
-# --- Inference ---
-image_path = "demo/demo_image1.jpg" # Make sure you have this image
 image = Image.open(image_path)  # NOTE(review): `image` is not referenced again below; messages pass image_path instead
-prompt_text = "Parse all layout info, both detection and recognition"
-messages = [
-    {"role": "user", "content": [{"type": "image", "image": image_path}, {"type": "text", "text": prompt_text}]}
-]
-# Prepare inputs using the official workflow
 text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 image_inputs, _ = process_vision_info(messages)  # second return value is discarded
-inputs = processor(
-    text=[text], images=image_inputs, padding=True, return_tensors="pt"
-).to(model.device)
-# Generate with parameters to prevent looping with the 4-bit model
-generated_ids = model.generate(
-    **inputs, max_new_tokens=4096, do_sample=True, temperature=0.6, top_p=0.9, repetition_penalty=1.15
-)
-# Trim and decode output
 generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]  # drop the leading prompt tokens from each generated sequence
 output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]  # keep the first decoded sequence
-print("\n--- Inference Result ---")
 print(output_text)
 ```

 import torch
 from transformers import AutoModelForCausalLM, AutoProcessor
 from PIL import Image
+from huggingface_hub import snapshot_download
 from qwen_vl_utils import process_vision_info
 MODEL_ID = "helizac/dots.ocr-4bit"
+local_model_path = snapshot_download(repo_id=MODEL_ID)  # presumably returns the local directory of the downloaded repo snapshot; confirm against huggingface_hub docs
+model = AutoModelForCausalLM.from_pretrained(local_model_path, device_map="auto", trust_remote_code=True, torch_dtype=torch.bfloat16)  # loads from the local snapshot path rather than the Hub ID
+processor = AutoProcessor.from_pretrained(local_model_path, trust_remote_code=True)  # same local path as the model
+image_path = "test.jpg"  # path of the image to run OCR on
 image = Image.open(image_path)  # NOTE(review): `image` is not referenced again below; messages pass image_path instead
+prompt_text = """\
+Please output the layout information from the image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
+1. Bbox format: [x1, y1, x2, y2]
+2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
+3. Text Extraction & Formatting Rules:
+- Picture: For the 'Picture' category, the text field should be omitted.
+- Formula: Format its text as LaTeX.
+- Table: Format its text as HTML.
+- All Others (Text, Title, etc.): Format their text as Markdown.
+4. Constraints:
+- The output text must be the original text from the image, with no translation.
+- All layout elements must be sorted according to human reading order.
+5. Final Output: The entire output must be a single JSON object.\
+"""
+messages = [{"role": "user", "content": [{"type": "image", "image": image_path}, {"type": "text", "text": prompt_text}]}]  # one user turn: the image (by path) followed by the text prompt
 text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)  # tokenize=False: tokenization is left to the processor call below
 image_inputs, _ = process_vision_info(messages)  # second return value is discarded
+inputs = processor(text=[text], images=image_inputs, padding=True, return_tensors="pt").to(model.device)  # batch of one prompt, moved to the model's device
+generated_ids = model.generate(**inputs, max_new_tokens=1048, do_sample=True, temperature=0.6, top_p=0.9, repetition_penalty=1.15)  # NOTE(review): 1048 is an unusual cap (the replaced version used 4096); confirm it is not a typo for 1024/2048
 generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]  # drop the leading prompt tokens from each generated sequence
 output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]  # keep the first decoded sequence
 print(output_text)
 ```