bobsackett committed on
Commit
d2eb2ee
·
verified ·
1 Parent(s): 375176e

Upload tts_turbo.py

Browse files

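Uploaded an updated tts_turbo.py that adds chunking for large text: input longer than max_chunk_chars is split into chunks, audio is generated for each chunk, and the results are concatenated.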

Files changed (1) hide show
  1. chatterbox/tts_turbo.py +43 -0
chatterbox/tts_turbo.py CHANGED
@@ -20,6 +20,7 @@ from .models.voice_encoder import VoiceEncoder
20
  from .models.t3.modules.cond_enc import T3Cond
21
  from .models.t3.modules.t3_config import T3Config
22
  from .models.s3gen.const import S3GEN_SIL
 
23
  import logging
24
  logger = logging.getLogger(__name__)
25
 
@@ -266,6 +267,8 @@ class ChatterboxTurboTTS:
266
  temperature=0.8,
267
  top_k=1000,
268
  norm_loudness=True,
 
 
269
  ):
270
  if audio_prompt_path:
271
  self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration, norm_loudness=norm_loudness)
@@ -275,6 +278,46 @@ class ChatterboxTurboTTS:
275
  if cfg_weight > 0.0 or exaggeration > 0.0 or min_p > 0.0:
276
  logger.warning("CFG, min_p and exaggeration are not supported by Turbo version and will be ignored.")
277
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  # Norm and tokenize text
279
  text = punc_norm(text)
280
  text_tokens = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
 
20
  from .models.t3.modules.cond_enc import T3Cond
21
  from .models.t3.modules.t3_config import T3Config
22
  from .models.s3gen.const import S3GEN_SIL
23
+ from .text_utils import chunk_text
24
  import logging
25
  logger = logging.getLogger(__name__)
26
 
 
267
  temperature=0.8,
268
  top_k=1000,
269
  norm_loudness=True,
270
+ chunk_text_enabled=True,
271
+ max_chunk_chars=300,
272
  ):
273
  if audio_prompt_path:
274
  self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration, norm_loudness=norm_loudness)
 
278
  if cfg_weight > 0.0 or exaggeration > 0.0 or min_p > 0.0:
279
  logger.warning("CFG, min_p and exaggeration are not supported by Turbo version and will be ignored.")
280
 
281
+ # Check if chunking is needed
282
+ if chunk_text_enabled and len(text) > max_chunk_chars:
283
+ text_chunks = chunk_text(text, max_chars=max_chunk_chars)
284
+ logger.info(f"Text split into {len(text_chunks)} chunks for processing")
285
+
286
+ # Generate audio for each chunk and concatenate
287
+ all_wavs = []
288
+ for i, chunk in enumerate(text_chunks):
289
+ logger.info(f"Processing chunk {i+1}/{len(text_chunks)}")
290
+ wav_chunk = self._generate_chunk(
291
+ chunk,
292
+ repetition_penalty=repetition_penalty,
293
+ temperature=temperature,
294
+ top_k=top_k,
295
+ top_p=top_p,
296
+ )
297
+ all_wavs.append(wav_chunk.squeeze(0))
298
+
299
+ # Concatenate all audio chunks
300
+ final_wav = torch.cat(all_wavs, dim=-1)
301
+ return final_wav.unsqueeze(0)
302
+ else:
303
+ # Single text processing (original behavior)
304
+ return self._generate_chunk(
305
+ text,
306
+ repetition_penalty=repetition_penalty,
307
+ temperature=temperature,
308
+ top_k=top_k,
309
+ top_p=top_p,
310
+ )
311
+
312
+ def _generate_chunk(
313
+ self,
314
+ text,
315
+ repetition_penalty=1.2,
316
+ temperature=0.8,
317
+ top_k=1000,
318
+ top_p=0.95,
319
+ ):
320
+ """Generate audio for a single text chunk."""
321
  # Norm and tokenize text
322
  text = punc_norm(text)
323
  text_tokens = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)