Commit ba58def

Merge pull request #397 from idiap/dev
v0.26.2
2 parents d76ddbc + a365e80 commit ba58def

4 files changed: +11, -6 lines

TTS/demos/xtts_ft_demo/XTTS_finetune_colab.ipynb

Lines changed: 3 additions & 2 deletions
@@ -37,8 +37,9 @@
    },
    "outputs": [],
    "source": [
-    "!pip install coqui-tts\n",
-    "!pip install gradio==4.7.1 faster_whisper"
+    "# TODO: update demo to gradio 5 to be compatible with default packages\n",
+    "!pip uninstall -y google-genai google-cloud-aiplatform dataproc-spark-connect yfinance\n",
+    "!pip install pydantic==2.10.6 gradio==4.44.1 faster_whisper coqui-tts"
    ]
   },
   {

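The install cell now pins gradio and pydantic and removes a few preinstalled Colab packages that conflict with those pins. If you want to confirm the runtime ended up with the pinned versions after that cell runs, a small, purely illustrative check (not part of the notebook):

    import importlib.metadata as md

    # The two explicit pins from the install cell above.
    for pkg in ("gradio", "pydantic"):
        print(pkg, md.version(pkg))
    # expected: gradio 4.44.1 and pydantic 2.10.6
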
TTS/tts/models/xtts.py

Lines changed: 5 additions & 1 deletion
@@ -249,6 +249,7 @@ def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int =
             chunk_length (int): Length of the audio chunks in seconds. When `length == chunk_length`, the whole audio
                 is being used without chunking. It must be < `length`. Defaults to 6.
         """
+        MIN_AUDIO_SECONDS = 0.33
         if sr != 22050:
             audio = torchaudio.functional.resample(audio, sr, 22050)
         if length > 0:
@@ -259,7 +260,7 @@ def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int =
                 audio_chunk = audio[:, i : i + 22050 * chunk_length]

                 # if the chunk is too short ignore it
-                if audio_chunk.size(-1) < 22050 * 0.33:
+                if audio_chunk.size(-1) < 22050 * MIN_AUDIO_SECONDS:
                     continue

                 mel_chunk = wav_to_mel_cloning(
@@ -279,6 +280,9 @@ def get_gpt_cond_latents(self, audio, sr, length: int = 30, chunk_length: int =
                 style_embs.append(style_emb)

             # mean style embedding
+            if len(style_embs) == 0:
+                msg = f"Provided reference audio too short (minimum length: {MIN_AUDIO_SECONDS:.2f} seconds)."
+                raise RuntimeError(msg)
             cond_latent = torch.stack(style_embs).mean(dim=0)
         else:
             mel = wav_to_mel_cloning(

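The net effect of the xtts.py change: when the reference clip is so short that every chunk is skipped, the caller now gets an explicit "reference audio too short" error instead of the opaque failure torch.stack raises on an empty list. A hedged sketch of where that surfaces through the high-level API (the model id is the stock XTTS v2 entry; file names are illustrative):

    from TTS.api import TTS

    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
    try:
        tts.tts_to_file(
            text="Hello world.",
            speaker_wav="too_short_clip.wav",  # e.g. a ~0.2 s recording
            language="en",
            file_path="out.wav",
        )
    except RuntimeError as err:
        # With this commit the message names the 0.33 s minimum instead of
        # "stack expects a non-empty TensorList".
        print(err)
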
TTS/tts/utils/helpers.py

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ def sequence_mask(sequence_length: torch.Tensor, max_len: int | None = None) ->
         - mask: :math:`[B, T_max]`
     """
     if max_len is None:
-        max_len = int(sequence_length.max())
+        max_len = sequence_length.max()
     seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device)
     # B x T_max
     return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1)

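For reference, sequence_mask turns per-item lengths into a boolean padding mask of shape [B, T_max]; the change above only keeps the inferred max_len as a tensor instead of casting it to a Python int. A small usage sketch with made-up lengths:

    import torch
    from TTS.tts.utils.helpers import sequence_mask

    lengths = torch.tensor([2, 4, 3])
    mask = sequence_mask(lengths, max_len=4)  # [3, 4] boolean mask
    # tensor([[ True,  True, False, False],
    #         [ True,  True,  True,  True],
    #         [ True,  True,  True, False]])
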
pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -25,7 +25,7 @@ build-backend = "hatchling.build"

 [project]
 name = "coqui-tts"
-version = "0.26.1"
+version = "0.26.2"
 description = "Deep learning for Text to Speech."
 readme = "README.md"
 requires-python = ">=3.10, <3.13"
@@ -82,7 +82,7 @@ dependencies = [
     "gruut[de,es,fr]>=2.4.0",
     # Tortoise
     "einops>=0.6.0",
-    "transformers>=4.47.0",
+    "transformers>=4.47.0,<4.52",
     # Bark
     "encodec>=0.1.1",
     # XTTS