Spaces:

hugofara
/

wavlm-phonemizer-word-detection

Sleeping

App Files Community

Hugo Farajallah committed on Sep 22, 2025

Commit

0cde9d4

1 Parent(s): 54520eb

ui(charts): better display of the data.

Browse files

Files changed (2) hide show

dataset_process.py +52 -39
main.py +23 -9

dataset_process.py CHANGED Viewed

@@ -126,58 +126,71 @@ def display_matrix_result(path_matrix, matching, prediction, target, processor=N
     Returns the figure instead of showing it directly for use in Gradio.
     """
-    fig, axis = plt.subplots(figsize=(10, 6))
     if processor is None:
         _model, processor = common.get_model()
     # Display the matrix
     im = axis.matshow(path_matrix.T, aspect="auto", cmap='Blues')
-    plt.colorbar(im, ax=axis)
-    # Set the labels for the axes
-    axis.set_xlabel('Predicted String', fontsize=12)
-    axis.set_title('Alignment Matrix: Predicted vs Target Phonemes', fontsize=14, pad=20)
-    # String for the x-axis
     predicted_labels = tuple(map(processor.decode, torch.argmax(prediction, -1)[0]))
-    axis.set_xticks(
-        [i for i, label in enumerate(predicted_labels) if label == ""],
-        labels=[label for label in predicted_labels if label == ""]
-    )
-    axis.set_xticks(
-        [i for i, label in enumerate(predicted_labels) if label not in ("[PAD]", "")],
-        labels=[label for label in predicted_labels if label not in ("[PAD]", "")],
-        minor=True
-    )
-    axis.set_ylabel('Target String', fontsize=12)
     target_labels = tuple(map(processor.decode, torch.argmax(target, -1)[0]))
-    axis.set_yticks(
-        [i for i, label in enumerate(target_labels) if label == ""],
-        labels=[label for label in target_labels if label == ""]
-    )
-    axis.set_yticks(
-        [i for i, label in enumerate(target_labels) if label != ""],
-        labels=[label for label in target_labels if label != ""],
-        minor=True
-    )
-    axis.grid(which="major", color="black", alpha=0.3)
-    axis.grid(which="minor", linestyle="--", alpha=0.2)
-    # Plot the optimal path in red
-    axis.plot(
-        [val[0] for val in matching],
-        [val[1] for val in matching],
-        color="red",
-        linewidth=2,
-        marker='o',
-        markersize=3,
-        label="Optimal Alignment Path"
     )
-    axis.legend()
     plt.tight_layout()
     return fig

     Returns the figure instead of showing it directly for use in Gradio.
     """
+    fig, axis = plt.subplots(figsize=(12, 8))
     if processor is None:
         _model, processor = common.get_model()
     # Display the matrix
     im = axis.matshow(path_matrix.T, aspect="auto", cmap='Blues')
+    cbar = plt.colorbar(im, ax=axis)
+    cbar.set_label('Alignment Cost', rotation=270, labelpad=20, fontsize=11)
+    # Set the labels for the axes with clearer names
+    axis.set_xlabel('Predicted Phoneme Sequence', fontsize=12)
+    axis.set_ylabel('Target Phoneme Sequence', fontsize=12)
+    axis.set_title('Phoneme Alignment Matrix\n(Blue = Lower Cost, Red Line = Optimal Path)',
+                  fontsize=14, pad=20)
+    # Get phoneme labels for both axes
     predicted_labels = tuple(map(processor.decode, torch.argmax(prediction, -1)[0]))
     target_labels = tuple(map(processor.decode, torch.argmax(target, -1)[0]))
+    # Set x-axis ticks (predicted phonemes)
+    non_empty_pred_indices = [i for i, label in enumerate(predicted_labels) if label not in ("", "[PAD]")]
+    non_empty_pred_labels = [label for i, label in enumerate(predicted_labels) if label not in ("", "[PAD]")]
+    if non_empty_pred_indices:
+        axis.set_xticks(non_empty_pred_indices)
+        axis.set_xticklabels(non_empty_pred_labels, rotation=45, ha='right', fontsize=10)
+    # Set y-axis ticks (target phonemes)
+    non_empty_target_indices = [i for i, label in enumerate(target_labels) if label not in ("", "[PAD]")]
+    non_empty_target_labels = [label for i, label in enumerate(target_labels) if label not in ("", "[PAD]")]
+    if non_empty_target_indices:
+        axis.set_yticks(non_empty_target_indices)
+        axis.set_yticklabels(non_empty_target_labels, fontsize=10)
+    # Add subtle grid
+    axis.grid(which="major", color="gray", alpha=0.2, linestyle="-")
+    # Plot the optimal path in red with better visibility
+    if matching:
+        axis.plot(
+            [val[0] for val in matching],
+            [val[1] for val in matching],
+            color="red",
+            linewidth=3,
+            marker='o',
+            markersize=4,
+            markerfacecolor='white',
+            markeredgecolor='red',
+            markeredgewidth=2,
+            label="Optimal Alignment Path",
+            alpha=0.9
+        )
+    # Add legend with better positioning
+    axis.legend(loc='upper right', bbox_to_anchor=(1.0, 1.0), fontsize=11)
+    # Add text annotations for better understanding
+    axis.text(
+        0.02, 0.98, 'Lower values indicate\nbetter alignment',
+        transform=axis.transAxes, fontsize=9, va='top', ha='left',
+        bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8)
     )
     plt.tight_layout()
     return fig

main.py CHANGED Viewed

@@ -20,25 +20,36 @@ def fake_model(chunk):
     return np.random.rand(output_length, vocab_size)
-def update_frame(frames, ax, matrix_plot, tokenizer=None):
     ax.clear()
     ax.set_title(
         "Activation levels for WavLM Base +'s hidden layers\n"
-        f"Layer = {frames[0]}, T = {frames[1]}s"
     )
-    ax.set_xlabel("Phonemes list")
-    ax.set_ylabel("Selected phoneme by Timestamp")
     data = frames[2].detach().clone()
-    matrix_plot = ax.matshow(data, vmin=0, vmax=1)
     if tokenizer is not None:
         label_ids = torch.argmax(data, -1)
         labels = tokenizer.batch_decode(label_ids)
         ax.set_xticks([i for v, i in tokenizer.vocab.items() if v in labels])
-        ax.set_xticklabels([v for v, i in tokenizer.vocab.items() if v in labels])
         ax.set_yticks([i for i, v in enumerate(labels) if v not in ("", "[PAD]")])
         ax.set_yticklabels([v for i, v in enumerate(labels) if v not in ("", "[PAD]")])
-        ax.text(0, data.shape[0] + 15, "Decoded: " + tokenizer.decode(label_ids))
-    # matrix_plot.set_data(data)
     return ax, matrix_plot
@@ -100,7 +111,10 @@ def main(record_mic=False):
     ]
     fig, ax = plt.subplots(animated=True)
     ax.set_title("Animation Preview")
-    matrix_plot = ax.matshow(logit_groups[0][0], animated=True, vmin=0, vmax=1)
     logits_list = []
     masks = inputs["attention_mask"].sum(dim=1) / common.SAMPLING_RATE
     for i, chunk in enumerate(chunks):

     return np.random.rand(output_length, vocab_size)
+def update_frame(frames, ax, matrix_plot, tokenizer=None, colorbar=None):
     ax.clear()
     ax.set_title(
         "Activation levels for WavLM Base +'s hidden layers\n"
+        f"Layer = {frames[0] + 1}, T = {frames[1]}s"
     )
+    ax.set_xlabel("Phoneme Vocabulary")
+    ax.set_ylabel("Time Steps, and Selected Phoneme")
     data = frames[2].detach().clone()
+    matrix_plot = ax.matshow(data, vmin=0, vmax=1, cmap='Blues')
     if tokenizer is not None:
         label_ids = torch.argmax(data, -1)
         labels = tokenizer.batch_decode(label_ids)
         ax.set_xticks([i for v, i in tokenizer.vocab.items() if v in labels])
+        ax.set_xticklabels([v for v, i in tokenizer.vocab.items() if v in labels], rotation=45, ha='right')
         ax.set_yticks([i for i, v in enumerate(labels) if v not in ("", "[PAD]")])
         ax.set_yticklabels([v for i, v in enumerate(labels) if v not in ("", "[PAD]")])
+        # Position the decoded text below the plot with proper spacing
+        decoded_text = tokenizer.decode(label_ids)
+        if len(decoded_text) > 50:
+            decoded_text = decoded_text[:50] + "..."
+        ax.text(
+            0.5, -0.15, f"Decoded: {decoded_text}",
+            transform=ax.transAxes, ha='center', va='top',
+            bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgray", alpha=0.8)
+        )
+    plt.tight_layout()
     return ax, matrix_plot
     ]
     fig, ax = plt.subplots(animated=True)
     ax.set_title("Animation Preview")
+    matrix_plot = ax.matshow(logit_groups[0][0], animated=True, vmin=0, vmax=1, cmap='Blues')
+    # Add colorbar once for the entire animation
+    colorbar = plt.colorbar(matrix_plot, ax=ax, label='Activation Level')
     logits_list = []
     masks = inputs["attention_mask"].sum(dim=1) / common.SAMPLING_RATE
     for i, chunk in enumerate(chunks):