Spaces:

hugofara
/

wavlm-phonemizer-word-detection

Sleeping

App Files Files Community

Hugo Farajallah committed on Sep 15, 2025

Commit

1b910a7

0 Parent(s):

feat(animation): initial animation of the logits with WavLM.

Browse files

Files changed (11) hide show

.gitattributes +2 -0
.gitignore +13 -0
.idea/.gitignore +3 -0
.python-version +1 -0
README.md +11 -0
ceci est un test.wav +0 -0
figures/.gitkeep +0 -0
main.py +73 -0
pyproject.toml +12 -0
uv.lock +0 -0
wavlm_phoneme_fr_it.py +188 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ *.png filter=lfs diff=lfs merge=lfs -text
2	+ *.gif filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,13 @@

+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+# Virtual environments
+.venv
+# Ignored generated figures
+figures/*.png

.idea/.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+# Default ignored files
+/shelf/
+/workspace.xml

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.12

README.md ADDED Viewed

	@@ -0,0 +1,11 @@

+# WavLM Demo
+A simple utility script showing how WavLM works and how to use it.
+It is all based on WavLM Base + Phonemizer FR-IT
+## Idea
+- [x] Show activation logits of WavLM (fake model for now)
+- [ ] Compare performance with Wav2Vec 2.0-Phonemizer-FR
+- [x] Animate activation logits over time.
+- [ ] Show the result from the feature encoder.

ceci est un test.wav ADDED Viewed

Binary file (33.9 kB). View file

figures/.gitkeep ADDED Viewed

File without changes

main.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import functools
+import matplotlib.animation
+import matplotlib.pyplot as plt
+import numpy as np
+import transformers
+# import wavlm_phoneme_fr_it
+SAMPLING_RATE = 16_000  # samples per unit of audio length; main() multiplies it by audio_length (presumably seconds, i.e. Hz - confirm)
+VOCAB_SIZE = 100  # number of columns in the random logits produced by fake_model()
+def get_model():
+    checkpoint = "hugofara/wavlm-base-plus-phonemizer-fr-it"
+    processor = transformers.AutoProcessor.from_pretrained(
+        checkpoint, unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|"
+    )
+    model = wavlm_phoneme_fr_it.WavLMPhonemeFrIt.from_pretrained(
+        checkpoint
+    )
+    return model, processor
+def fake_model(chunk):
+    output_length = int(chunk.shape[0] * 0.02)
+    return np.random.rand(output_length, VOCAB_SIZE)
+def update_frame(data, ax):
+    ax.matshow(data)
+    return ax,
+def main():
+    # model, processor = get_model()
+    audio_length = 5
+    split_length = 0.1
+    audio_file = np.random.rand(audio_length * SAMPLING_RATE)
+    # TODO: normalize audio
+    # Split audio
+    chunks = []
+    for i in np.linspace(0, audio_file.shape[0], int(audio_length / split_length), dtype=np.uint64):
+        if i == 0:
+            continue
+        chunks.append(audio_file[:i])
+    # Inference time
+    logit_groups = [
+        np.zeros((int(chunks[-1].shape[0] * 0.02), VOCAB_SIZE)) for _ in enumerate(chunks)
+    ]
+    fig, ax = plt.subplots(1, 1)
+    for i, chunk in enumerate(chunks):
+        logits = fake_model(chunk)
+        logit_groups[i][:logits.shape[0]] = logits
+        fig.savefig(f"figures/test{i}.png")
+    # Animate
+    global animation
+    animation = matplotlib.animation.FuncAnimation(
+        fig,
+        functools.partial(update_frame, ax=ax),
+        logit_groups,
+        # blit=True
+    )
+    animation.save("animated.webm")
+if __name__ == "__main__":
+    animation = None  # module-level name that main() rebinds through its ``global animation`` statement
+    main()

pyproject.toml ADDED Viewed

	@@ -0,0 +1,12 @@

+[project]
+name = "wavlm-demo"
+version = "0.1.0"
+description = "Demonstration project for WavLM"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "matplotlib>=3.10.6",
+    "numpy>=2.3.3",
+    "pyqt6>=6.9.1",
+    "transformers>=4.56.1",
+]

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

wavlm_phoneme_fr_it.py ADDED Viewed

	@@ -0,0 +1,188 @@

+import numpy as np
+import torch
+import transformers
+_HIDDEN_STATES_START_POSITION = 2  # forward() keeps the backbone outputs from this index onward in its tuple return
+def add_language_to_hidden(input_values, language):
+    if isinstance(language, str):
+        raise TypeError("Language should be None, list of torch.Tensor, not str")
+    input_batch = torch.empty(
+        (input_values.shape[0], input_values.shape[1], input_values.shape[2] + 1),
+        dtype=input_values.dtype,
+        device=input_values.device
+    )
+    input_batch[:, :, :-1] = input_values
+    input_batch[:, :, -1] = language
+    return input_batch
+    if language is None:
+        lang_val = torch.zeros((input_values.shape[1],))
+    elif isinstance(language, torch.Tensor) and len(language.shape) == 0:
+        lang_val = language
+    elif isinstance(language, np.ndarray):
+        lang_val = torch.tensor(language)
+    else:
+        lang_val = (
+            torch
+            .tensor([[lang] for lang in language])
+            .repeat((1, input_batch.shape[1]))
+        )
+    input_batch[:, :, -1] *= lang_val
+    return input_batch
+def language_classifer(language):
+    """
+    Return a float identifying each known language.
+    "fr" has value of 0, "it" a value of one.
+    Other languages will have a value increasing in lexicographic order.
+    :param str language: Language to identify, should be two letters.
+    :return float: Unique identifier, between 0 and 1.
+    """
+    if language == "fr":
+        return 0
+    if language == "it":
+        return 1
+    # Some random code to encode a two-letter language between 0 and 1
+    # "aa" should be 0+1=1 and "zz" should be 1+2=3
+    codes = (
+        (ord(letter) - ord("a")) / (ord("z") - ord("a")) + i
+        for i, letter in enumerate(language)
+    )
+    # Transform to [0, 1]
+    return (sum(codes) - 1) / 2
+class WavLMPhonemeFrIt(transformers.WavLMForCTC):
+    """
+    PhonemeRecognizer: WavLM + Linear layer for speech recognition.
+    It natively separates French and Italian.
+    For a more professional implementation, view
+    https://github.com/huggingface/transformers/blob/main/src/transformers/models/wav2vec2/modeling_wav2vec2.py
+    """
+    def __init__(self, config, tokenizer=None):
+        """
+        Create the new model out of a combination of both models.
+        :param config: Model config.
+        """
+        super().__init__(config)
+        output_hidden_size = (
+            config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
+        )
+        # Replace head and add multilingualism
+        self.lm_head = torch.nn.Linear(output_hidden_size + 1, config.vocab_size)
+        self.tokenizer = tokenizer
+    def forward(
+            self,
+            input_values: torch.Tensor,
+            attention_mask: torch.Tensor = None,
+            language: torch.Tensor = None,
+            output_attentions: bool = None,
+            output_hidden_states: bool = None,
+            return_dict: bool = None,
+            labels: torch.Tensor = None,
+    ):
+        """
+        Classify audio to a chain of phonemes of the same length.
+        Stolen from
+        https://github.com/huggingface/transformers/blob/6ba8a1ff4550b4450a22a0b0d907312955ce0fd5/src/transformers/models/wavlm/modeling_wavlm.py#L1196
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+        outputs = self.wavlm(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        hidden_states = self.dropout(hidden_states)
+        # hidden_with_lang = add_language_to_hidden(hidden_states, language)
+        hidden_with_lang = torch.cat(
+            [hidden_states, language.repeat(hidden_states.shape[1]).reshape((1, -1, 1))],
+            dim=2
+        )
+        logits = self.lm_head(hidden_with_lang)
+        loss = None
+        if labels is not None:
+            # retrieve loss input_lengths from attention_mask
+            attention_mask = (
+                attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
+            )
+            input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
+            # assuming that padded tokens are filled with -100
+            # when not being attended to
+            labels_mask = labels >= 0
+            target_lengths = labels_mask.sum(-1)
+            flattened_targets = labels.masked_select(labels_mask)
+            # ctc_loss doesn't support fp16
+            log_probs = torch.nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
+            with torch.backends.cudnn.flags(enabled=False):
+                loss = torch.nn.functional.ctc_loss(
+                    log_probs,
+                    flattened_targets,
+                    input_lengths,
+                    target_lengths,
+                    blank=self.config.pad_token_id,
+                    reduction=self.config.ctc_loss_reduction,
+                    zero_infinity=self.config.ctc_zero_infinity,
+                )
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+        return transformers.modeling_outputs.CausalLMOutput(
+            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+        )
+    def freeze_feature_encoder_only(self):
+        # Unfreeze base model
+        for param in self.wavlm.parameters():
+            param.requires_grad = True
+        # Now freeze the first layer
+        self.freeze_feature_encoder()
+def freeze_layer(layer, freeze=True):
+    for param in layer.parameters():
+        param.requires_grad = not freeze
+    layer._requires_grad = not freeze
+def get_wavlm_phoneme_fr_it(tokenizer, freeze_hidden_layers=False):
+    model = WavLMPhonemeFrIt.from_pretrained(
+        "microsoft/wavlm-base-plus",
+        ctc_loss_reduction="mean",
+        pad_token_id=tokenizer.pad_token_id,
+        vocab_size=len(tokenizer)
+    )
+    model.tokenizer = tokenizer
+    if freeze_hidden_layers:
+        model.freeze_base_model()
+    return model