Commit ba58def

Merge pull request #397 from idiap/dev
v0.26.2
2 parents d76ddbc + a365e80 commit ba58def

4 files changed: +11, -6 lines

TTS/demos/xtts_ft_demo/XTTS_finetune_colab.ipynb

Lines changed: 3 additions & 2 deletions
@@ -37,8 +37,9 @@
    },
    "outputs": [],
    "source": [
-    "!pip install coqui-tts\n",
-    "!pip install gradio==4.7.1 faster_whisper"
+    "# TODO: update demo to gradio 5 to be compatible with default packages\n",
+    "!pip uninstall -y google-genai google-cloud-aiplatform dataproc-spark-connect yfinance\n",
+    "!pip install pydantic==2.10.6 gradio==4.44.1 faster_whisper coqui-tts"
    ]
   },
   {

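The install cell now pins gradio and pydantic and removes a few preinstalled Colab packages that conflict with those pins. If you want to confirm the runtime ended up with the pinned versions after that cell runs, a small, purely illustrative check (not part of the notebook):

    import importlib.metadata as md

    # The two explicit pins from the install cell above.
    for pkg in ("gradio", "pydantic"):
        print(pkg, md.version(pkg))
    # expected: gradio 4.44.1 and pydantic 2.10.6
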
TTS/tts/models/xtts.py

Lines changed: 5 additions & 1 deletion
@@ -249,6 +249,7 @@ def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int =
             chunk_length (int): Length of the audio chunks in seconds. When `length == chunk_length`, the whole audio
                 is being used without chunking. It must be < `length`. Defaults to 6.
         """
+        MIN_AUDIO_SECONDS = 0.33
         if sr != 22050:
             audio = torchaudio.functional.resample(audio, sr, 22050)
         if length > 0:
@@ -259,7 +260,7 @@ def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int =
                 audio_chunk = audio[:, i : i + 22050 * chunk_length]

                 # if the chunk is too short ignore it
-                if audio_chunk.size(-1) < 22050 * 0.33:
+                if audio_chunk.size(-1) < 22050 * MIN_AUDIO_SECONDS:
                     continue

                 mel_chunk = wav_to_mel_cloning(
@@ -279,6 +280,9 @@ def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int =
                 style_embs.append(style_emb)

             # mean style embedding
+            if len(style_embs) == 0:
+                msg = f"Provided reference audio too short (minimum length: {MIN_AUDIO_SECONDS:.2f} seconds)."
+                raise RuntimeError(msg)
             cond_latent = torch.stack(style_embs).mean(dim=0)
         else:
             mel = wav_to_mel_cloning(

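The net effect of the xtts.py change: when the reference clip is so short that every chunk is skipped, the caller now gets an explicit "reference audio too short" error instead of the opaque failure torch.stack raises on an empty list. A hedged sketch of where that surfaces through the high-level API (the model id is the stock XTTS v2 entry; file names are illustrative):

    from TTS.api import TTS

    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
    try:
        tts.tts_to_file(
            text="Hello world.",
            speaker_wav="too_short_clip.wav",  # e.g. a ~0.2 s recording
            language="en",
            file_path="out.wav",
        )
    except RuntimeError as err:
        # With this commit the message names the 0.33 s minimum instead of
        # "stack expects a non-empty TensorList".
        print(err)
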
TTS/tts/utils/helpers.py

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ def sequence_mask(sequence_length: torch.Tensor, max_len: int | None = None) ->
         - mask: :math:`[B, T_max]`
     """
     if max_len is None:
-        max_len = int(sequence_length.max())
+        max_len = sequence_length.max()
     seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device)
     # B x T_max
     return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1)

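For reference, sequence_mask turns per-item lengths into a boolean padding mask of shape [B, T_max]; the change above only keeps the inferred max_len as a tensor instead of casting it to a Python int. A small usage sketch with made-up lengths:

    import torch
    from TTS.tts.utils.helpers import sequence_mask

    lengths = torch.tensor([2, 4, 3])
    mask = sequence_mask(lengths, max_len=4)  # [3, 4] boolean mask
    # tensor([[ True,  True, False, False],
    #         [ True,  True,  True,  True],
    #         [ True,  True,  True, False]])
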
pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -25,7 +25,7 @@ build-backend = "hatchling.build"

 [project]
 name = "coqui-tts"
-version = "0.26.1"
+version = "0.26.2"
 description = "Deep learning for Text to Speech."
 readme = "README.md"
 requires-python = ">=3.10, <3.13"
@@ -82,7 +82,7 @@ dependencies = [
     "gruut[de,es,fr]>=2.4.0",
     # Tortoise
     "einops>=0.6.0",
-    "transformers>=4.47.0",
+    "transformers>=4.47.0,<4.52",
     # Bark
     "encodec>=0.1.1",
     # XTTS