Upload tts_turbo.py
Browse files — added file upload code to allow for chunking large text
- chatterbox/tts_turbo.py +43 -0
chatterbox/tts_turbo.py
CHANGED
|
@@ -20,6 +20,7 @@ from .models.voice_encoder import VoiceEncoder
|
|
| 20 |
from .models.t3.modules.cond_enc import T3Cond
|
| 21 |
from .models.t3.modules.t3_config import T3Config
|
| 22 |
from .models.s3gen.const import S3GEN_SIL
|
|
|
|
| 23 |
import logging
|
| 24 |
logger = logging.getLogger(__name__)
|
| 25 |
|
|
@@ -266,6 +267,8 @@ class ChatterboxTurboTTS:
|
|
| 266 |
temperature=0.8,
|
| 267 |
top_k=1000,
|
| 268 |
norm_loudness=True,
|
|
|
|
|
|
|
| 269 |
):
|
| 270 |
if audio_prompt_path:
|
| 271 |
self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration, norm_loudness=norm_loudness)
|
|
@@ -275,6 +278,46 @@ class ChatterboxTurboTTS:
|
|
| 275 |
if cfg_weight > 0.0 or exaggeration > 0.0 or min_p > 0.0:
|
| 276 |
logger.warning("CFG, min_p and exaggeration are not supported by Turbo version and will be ignored.")
|
| 277 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
# Norm and tokenize text
|
| 279 |
text = punc_norm(text)
|
| 280 |
text_tokens = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
|
|
|
|
| 20 |
from .models.t3.modules.cond_enc import T3Cond
|
| 21 |
from .models.t3.modules.t3_config import T3Config
|
| 22 |
from .models.s3gen.const import S3GEN_SIL
|
| 23 |
+
from .text_utils import chunk_text
|
| 24 |
import logging
|
| 25 |
logger = logging.getLogger(__name__)
|
| 26 |
|
|
|
|
| 267 |
temperature=0.8,
|
| 268 |
top_k=1000,
|
| 269 |
norm_loudness=True,
|
| 270 |
+
chunk_text_enabled=True,
|
| 271 |
+
max_chunk_chars=300,
|
| 272 |
):
|
| 273 |
if audio_prompt_path:
|
| 274 |
self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration, norm_loudness=norm_loudness)
|
|
|
|
| 278 |
if cfg_weight > 0.0 or exaggeration > 0.0 or min_p > 0.0:
|
| 279 |
logger.warning("CFG, min_p and exaggeration are not supported by Turbo version and will be ignored.")
|
| 280 |
|
| 281 |
+
# Check if chunking is needed
|
| 282 |
+
if chunk_text_enabled and len(text) > max_chunk_chars:
|
| 283 |
+
text_chunks = chunk_text(text, max_chars=max_chunk_chars)
|
| 284 |
+
logger.info(f"Text split into {len(text_chunks)} chunks for processing")
|
| 285 |
+
|
| 286 |
+
# Generate audio for each chunk and concatenate
|
| 287 |
+
all_wavs = []
|
| 288 |
+
for i, chunk in enumerate(text_chunks):
|
| 289 |
+
logger.info(f"Processing chunk {i+1}/{len(text_chunks)}")
|
| 290 |
+
wav_chunk = self._generate_chunk(
|
| 291 |
+
chunk,
|
| 292 |
+
repetition_penalty=repetition_penalty,
|
| 293 |
+
temperature=temperature,
|
| 294 |
+
top_k=top_k,
|
| 295 |
+
top_p=top_p,
|
| 296 |
+
)
|
| 297 |
+
all_wavs.append(wav_chunk.squeeze(0))
|
| 298 |
+
|
| 299 |
+
# Concatenate all audio chunks
|
| 300 |
+
final_wav = torch.cat(all_wavs, dim=-1)
|
| 301 |
+
return final_wav.unsqueeze(0)
|
| 302 |
+
else:
|
| 303 |
+
# Single text processing (original behavior)
|
| 304 |
+
return self._generate_chunk(
|
| 305 |
+
text,
|
| 306 |
+
repetition_penalty=repetition_penalty,
|
| 307 |
+
temperature=temperature,
|
| 308 |
+
top_k=top_k,
|
| 309 |
+
top_p=top_p,
|
| 310 |
+
)
|
| 311 |
+
|
| 312 |
+
def _generate_chunk(
|
| 313 |
+
self,
|
| 314 |
+
text,
|
| 315 |
+
repetition_penalty=1.2,
|
| 316 |
+
temperature=0.8,
|
| 317 |
+
top_k=1000,
|
| 318 |
+
top_p=0.95,
|
| 319 |
+
):
|
| 320 |
+
"""Generate audio for a single text chunk."""
|
| 321 |
# Norm and tokenize text
|
| 322 |
text = punc_norm(text)
|
| 323 |
text_tokens = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
|