chatterbox-turbo-DNXS

Paused

App Files Files Community

bobsackett committed on Dec 30, 2025

Commit

d2eb2ee

verified ·

1 Parent(s): 375176e

Upload tts_turbo.py

Browse files

added file upload code to allow for chunking large text

Files changed (1) hide show

chatterbox/tts_turbo.py +43 -0

chatterbox/tts_turbo.py CHANGED Viewed

@@ -20,6 +20,7 @@ from .models.voice_encoder import VoiceEncoder
 from .models.t3.modules.cond_enc import T3Cond
 from .models.t3.modules.t3_config import T3Config
 from .models.s3gen.const import S3GEN_SIL
 import logging
 logger = logging.getLogger(__name__)
@@ -266,6 +267,8 @@ class ChatterboxTurboTTS:
         temperature=0.8,
         top_k=1000,
         norm_loudness=True,
     ):
         if audio_prompt_path:
             self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration, norm_loudness=norm_loudness)
@@ -275,6 +278,46 @@ class ChatterboxTurboTTS:
         if cfg_weight > 0.0 or exaggeration > 0.0 or min_p > 0.0:
             logger.warning("CFG, min_p and exaggeration are not supported by Turbo version and will be ignored.")
         # Norm and tokenize text
         text = punc_norm(text)
         text_tokens = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)

 from .models.t3.modules.cond_enc import T3Cond
 from .models.t3.modules.t3_config import T3Config
 from .models.s3gen.const import S3GEN_SIL
+from .text_utils import chunk_text
 import logging
 logger = logging.getLogger(__name__)
         temperature=0.8,
         top_k=1000,
         norm_loudness=True,
+        chunk_text_enabled=True,
+        max_chunk_chars=300,
     ):
         if audio_prompt_path:
             self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration, norm_loudness=norm_loudness)
         if cfg_weight > 0.0 or exaggeration > 0.0 or min_p > 0.0:
             logger.warning("CFG, min_p and exaggeration are not supported by Turbo version and will be ignored.")
+        # Check if chunking is needed
+        if chunk_text_enabled and len(text) > max_chunk_chars:
+            text_chunks = chunk_text(text, max_chars=max_chunk_chars)
+            logger.info(f"Text split into {len(text_chunks)} chunks for processing")
+            # Generate audio for each chunk and concatenate
+            all_wavs = []
+            for i, chunk in enumerate(text_chunks):
+                logger.info(f"Processing chunk {i+1}/{len(text_chunks)}")
+                wav_chunk = self._generate_chunk(
+                    chunk,
+                    repetition_penalty=repetition_penalty,
+                    temperature=temperature,
+                    top_k=top_k,
+                    top_p=top_p,
+                )
+                all_wavs.append(wav_chunk.squeeze(0))
+            # Concatenate all audio chunks
+            final_wav = torch.cat(all_wavs, dim=-1)
+            return final_wav.unsqueeze(0)
+        else:
+            # Single text processing (original behavior)
+            return self._generate_chunk(
+                text,
+                repetition_penalty=repetition_penalty,
+                temperature=temperature,
+                top_k=top_k,
+                top_p=top_p,
+            )
+    def _generate_chunk(
+        self,
+        text,
+        repetition_penalty=1.2,
+        temperature=0.8,
+        top_k=1000,
+        top_p=0.95,
+    ):
+        """Generate audio for a single text chunk."""
         # Norm and tokenize text
         text = punc_norm(text)
         text_tokens = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)