DNivalis
/

med-jargon-crf

@@ -56,23 +56,82 @@ Fine-tuned on the **MedReadMe** dataset introduced by Jiang & Xu (2024).
 ## 🔧 Quick Start
 ```python
-from transformers import AutoTokenizer
-from modeling_jargon import CRFTokenClassificationModel
 model_name = "DNivalis/med-jargon-crf"
 tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
 model = CRFTokenClassificationModel.from_pretrained(model_name)
 model.eval()
 text = "The patient presented with elevated CRP and intermittent AF."
 inputs = tokenizer(text, return_tensors="pt")
-with torch.no_grad():
-    logits = model(**inputs)["logits"]
-    tags = model.decode(logits, inputs["attention_mask"])[0]
-# Convert IDs → labels
-id2label = model.config.id2label
-spans = [(i, id2label[t]) for i, t in enumerate(tags) if t != 0]
 ```
 ---

 ## 🔧 Quick Start
 ```python
+from transformers import AutoTokenizer, AutoModel
+from huggingface_hub import PyTorchModelHubMixin
+from torchcrf import CRF
+import torch
+import torch.nn as nn
+class CRFTokenClassificationModel(nn.Module, PyTorchModelHubMixin):
+    """Transformer encoder with a linear tag classifier and a CRF layer on top.
+    ``config`` is read as a mapping with the keys ``pretrained_model_name``,
+    ``hidden_dropout_prob``, ``hidden_size``, ``num_labels`` and ``label_map``.
+    """
+    def __init__(self, config):
+        super().__init__()
+        # Load base transformer model
+        self.transformer = AutoModel.from_pretrained(config["pretrained_model_name"])
+        # Classification layers
+        self.dropout = nn.Dropout(config["hidden_dropout_prob"])
+        self.classifier = nn.Linear(config["hidden_size"], config["num_labels"])
+        # CRF layer for sequence labeling
+        self.crf = CRF(config["num_labels"], batch_first=True)
+        # Label mappings: label_map is inverted so that its values (tag IDs) become the keys
+        self.id2label = {v: k for k, v in config["label_map"].items()}
+    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
+        """Return ``{"logits": ...}``, plus a ``"loss"`` entry when ``labels`` is given.
+        Extra keyword arguments are accepted and ignored.
+        """
+        # Get transformer outputs
+        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
+        sequence_output = self.dropout(outputs.last_hidden_state)
+        logits = self.classifier(sequence_output)
+        # Return logits only (inference mode)
+        if labels is None:
+            return {"logits": logits}
+        # attention_mask is optional here; convert it only when one was supplied,
+        # otherwise hand None to the CRF instead of failing on None.bool().
+        mask = attention_mask.bool() if attention_mask is not None else None
+        # Calculate loss (training mode): the CRF score is negated so it can be minimised
+        loss = -self.crf(logits, labels, mask=mask, reduction='mean')
+        return {"loss": loss, "logits": logits}
+    def decode(self, logits, mask=None):
+        """Decode the best tag sequence for each input with the CRF; ``mask`` may be omitted."""
+        # Use CRF to decode best sequence
+        return self.crf.decode(logits, mask.bool() if mask is not None else None)
+# 1. Load model and tokenizer
 model_name = "DNivalis/med-jargon-crf"
 tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
 model = CRFTokenClassificationModel.from_pretrained(model_name)
 model.eval()
+# 2. Prepare input text
 text = "The patient presented with elevated CRP and intermittent AF."
 inputs = tokenizer(text, return_tensors="pt")
+# 3. Run inference
+with torch.no_grad():
+    outputs = model(**inputs)
+    logits = outputs["logits"]
+    # Decode best sequence using CRF
+    predicted_tags = model.decode(logits, inputs["attention_mask"])[0]
+# 4. Extract spans from predictions
+# One (token index, label) pair per tagged token; tag ID 0 is skipped.
+spans = [(i, model.id2label[tag_id]) for i, tag_id in enumerate(predicted_tags) if tag_id != 0]
+tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
+# 5. Display results
+print("Detected medical jargon:")
+next_start = 0  # first token index not yet covered by a printed span
+for token_idx, label in spans:
+    # Skip tokens already printed as part of the previous span, so a
+    # multi-token term is reported once instead of once per token.
+    if token_idx < next_start:
+        continue
+    # Find continuous spans of the same entity
+    end_idx = token_idx + 1
+    while (end_idx < len(predicted_tags) and
+           predicted_tags[end_idx] == predicted_tags[token_idx]):
+        end_idx += 1
+    next_start = end_idx
+    # Convert tokens back to text
+    detected_tokens = tokens[token_idx:end_idx]
+    detected_text = tokenizer.convert_tokens_to_string(detected_tokens)
+    print(f"{label}: '{detected_text.strip()}'")
 ```
 ---