Spaces:

hugofara
/

wavlm-phonemizer-word-detection

Sleeping

App Files Community

Hugo Farajallah committed on Sep 22, 2025

Commit

54520eb

1 Parent(s): b6bd379

fix(HF): the displayed alignment matrix was not correct.

Browse files

Files changed (2) hide show

dataset_process.py +23 -11
hf_space.py +19 -4

dataset_process.py CHANGED Viewed

@@ -283,6 +283,25 @@ def score_phoneme_deletion(matching, prediction, target, threshold):
     return 0
 def get_alignment_score(
     prediction,
     target,
@@ -304,17 +323,10 @@ def get_alignment_score(
     :param common.Scoring scoring: Type of scoring to use
     :return int: Scoring score.
     """
-    logits = torch.softmax(
-        torch.as_tensor(prediction) / weights[3],
-        dim=-1
-    )
-    reduced_logits = logits[torch.argmax(logits, -1) != pad_token_id]
-    reduced_logits = reduced_logits.reshape((1, reduced_logits.shape[0], reduced_logits.shape[1]))
     matching, alignment_score = bellman_matching(
-        reduced_logits,
         target,
         insertion_cost=weights[0],
         deletion_cost=weights[1],
@@ -323,9 +335,9 @@ def get_alignment_score(
     np_matching = np.array(matching)
     if scoring is common.Scoring.NUMBER_CORRECT:
-        return score_correct(np_matching, reduced_logits, target, weights[2])
     if scoring is common.Scoring.PHONEME_DELETION:
-        return score_phoneme_deletion(np_matching, reduced_logits, target, weights[2])
     raise NotImplementedError("Unknown scoring method.")

     return 0
+def remove_pad_tokens(prediction, pad_token_id, temperature):
+    """
+    Remove the pad token from a prediction to decrease temporal effects.
+
+    The prediction is divided by the temperature, passed through a SoftMax over its
+    last dimension, and only the rows whose argmax differs from the pad token ID are kept.
+    A leading dimension of size 1 is then added to the kept rows.
+
+    :param prediction: Predicted logits (converted with ``torch.as_tensor``).
+    :param int pad_token_id: ID of the pad token.
+    :param float temperature: Temperature to pass to the SoftMax (the prediction is divided by it).
+    :return torch.Tensor: Probabilities where no row has a pad token id as an argmax.
+    """
+    # NOTE: despite its name, `logits` holds probabilities once the SoftMax is applied.
+    logits = torch.softmax(
+        torch.as_tensor(prediction) / temperature,
+        dim=-1
+    )
+    # Boolean mask on the argmax of the last dimension: drops every row predicted as pad.
+    reduced_logits = logits[torch.argmax(logits, -1) != pad_token_id]
+    # Assumes the masked result is 2-D (rows, last dimension) -- TODO confirm the input rank.
+    reduced_logits = reduced_logits.reshape((1, reduced_logits.shape[0], reduced_logits.shape[1]))
+    return reduced_logits
 def get_alignment_score(
     prediction,
     target,
     :param common.Scoring scoring: Type of scoring to use
     :return int: Scoring score.
     """
+    collapsed_prediction = remove_pad_tokens(prediction, pad_token_id, weights[3])
     matching, alignment_score = bellman_matching(
+        collapsed_prediction,
         target,
         insertion_cost=weights[0],
         deletion_cost=weights[1],
     np_matching = np.array(matching)
     if scoring is common.Scoring.NUMBER_CORRECT:
+        return score_correct(np_matching, collapsed_prediction, target, weights[2])
     if scoring is common.Scoring.PHONEME_DELETION:
+        return score_phoneme_deletion(np_matching, collapsed_prediction, target, weights[2])
     raise NotImplementedError("Unknown scoring method.")

hf_space.py CHANGED Viewed

@@ -84,20 +84,35 @@ def process_audio_advanced(audio_data, target_word, language, advanced_mode, ins
                     prediction_logits,
                     target_encoded,
                     weights,
-                    94,
                     scoring=scoring_enum
                 )
-                # Generate alignment plot
                 path_matrix = dataset_process.compute_path_matrix(
-                    prediction_logits,
                     target_encoded,
                     dataset_process.l2_logit_norm,
                     insertion_cost,
                     deletion_cost
                 )
                 alignment_plot_fig = dataset_process.display_matrix_result(
-                    path_matrix, matching, prediction_logits, target_encoded, processor
                 )
                 alignment_result = f"**🔬 Advanced Alignment Analysis:**\n\n"

                     prediction_logits,
                     target_encoded,
                     weights,
+                    processor.tokenizer.pad_token_id,
                     scoring=scoring_enum
                 )
+                # Use reduced prediction tensor for alignment plot (remove temporal effects)
+                reduced_prediction = dataset_process.remove_pad_tokens(
+                    prediction_logits, processor.tokenizer.pad_token_id, temperature
+                )
+                # Generate alignment plot with reduced prediction
                 path_matrix = dataset_process.compute_path_matrix(
+                    reduced_prediction,
                     target_encoded,
                     dataset_process.l2_logit_norm,
                     insertion_cost,
                     deletion_cost
                 )
+                # Re-compute matching with reduced prediction for visualization
+                matching_for_plot, _ = dataset_process.bellman_matching(
+                    reduced_prediction,
+                    target_encoded,
+                    insertion_cost=insertion_cost,
+                    deletion_cost=deletion_cost,
+                    metric=dataset_process.l2_logit_norm
+                )
                 alignment_plot_fig = dataset_process.display_matrix_result(
+                    path_matrix, matching_for_plot, reduced_prediction, target_encoded, processor
                 )
                 alignment_result = f"**🔬 Advanced Alignment Analysis:**\n\n"