idiap
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
Lines changed: 3 additions & 3 deletions
diff --git a/Dockerfile b/Dockerfile
Lines changed: 8 additions & 3 deletions
diff --git a/Makefile b/Makefile
Lines changed: 1 addition & 1 deletion
diff --git a/README.md b/README.md
Lines changed: 13 additions & 2 deletions
diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py
Lines changed: 13 additions & 1 deletion
diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py
Lines changed: 1 addition & 1 deletion
diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
Lines changed: 22 additions & 2 deletions
@@ -76,7 +76,7 @@ jobs:
           if [ "${{ matrix.python-version }}" == "3.10" ]; then
             resolution=lowest-direct
           fi
-          uv run --resolution=$resolution --extra server --extra languages make ${{ matrix.subset }}
+          uv run --resolution=$resolution --extra codec --extra server --extra languages make ${{ matrix.subset }}
       - name: Upload coverage data
         uses: actions/upload-artifact@v4
         with:
@@ -119,7 +119,7 @@ jobs:
           if [ "${{ matrix.python-version }}" == "3.10" ]; then
             resolution=lowest-direct
           fi
-          uv run --resolution=$resolution --extra languages coverage run -m pytest -x -v --durations=0 $shard_tests
+          uv run --resolution=$resolution --extra codec --extra languages coverage run -m pytest -x -v --durations=0 $shard_tests
       - name: Upload coverage data
         uses: actions/upload-artifact@v4
         with:
@@ -154,7 +154,7 @@ jobs:
             uv add git+https://github.com/idiap/coqui-ai-coqpit --branch ${{ github.event.inputs.coqpit_branch }}
           fi
       - name: Zoo tests
-        run: uv run --extra server --extra languages make test_zoo
+        run: uv run --extra codec --extra server --extra languages make test_zoo
         env:
           NUM_PARTITIONS: 3
           TEST_PARTITION: ${{ matrix.partition }}
 
@@ -1,6 +1,7 @@
 ARG BASE=nvidia/cuda:12.8.1-base-ubuntu24.04
 FROM ${BASE}
 
+ARG BASE=nvidia/cuda:12.8.1-base-ubuntu24.04
 RUN apt-get update && apt-get upgrade -y
 RUN apt-get install -y --no-install-recommends \
     gcc g++ make python3 python3-dev \
@@ -9,8 +10,7 @@ RUN apt-get install -y --no-install-recommends \
 
 # Install uv
 COPY --from=ghcr.io/astral-sh/uv:0.8.15 /uv /uvx /bin/
-ENV UV_NO_CACHE=1 \
-    UV_TORCH_BACKEND=auto
+ENV UV_NO_CACHE=1
 
 RUN uv venv /opt/venv
 ENV VIRTUAL_ENV=/opt/venv PATH="/opt/venv/bin:$PATH"
@@ -19,7 +19,12 @@ WORKDIR /app
 
 # Install dependencies first for better caching
 COPY pyproject.toml /app
-RUN uv pip install -r pyproject.toml --extra all
+RUN if echo "$BASE" | grep -q "cuda"; then \
+      UV_TORCH_BACKEND=cu128; \
+    else \
+      UV_TORCH_BACKEND=cpu; \
+    fi && \
+    uv pip install -r pyproject.toml --extra all --torch-backend=${UV_TORCH_BACKEND}
 
 # Copy the rest of the application
 COPY . /app
 
@@ -44,7 +44,7 @@ lint:	## run linters.
 	uv run --only-dev ruff format ${target_dirs} --check
 
 system-deps:	## install linux system deps
-	sudo apt-get install -y libsndfile1-dev
+	sudo apt-get install -y libsndfile1-dev ffmpeg
 
 install:	## install 🐸 TTS
 	uv sync --all-extras
 
@@ -24,7 +24,8 @@
 
 ## 📣 News
 - **Fork of the [original, unmaintained repository](https://github.com/coqui-ai/TTS). New PyPI package: [coqui-tts](https://pypi.org/project/coqui-tts)**
-- 0.25.0: [OpenVoice](https://github.com/myshell-ai/OpenVoice) models now available for voice conversion.
+- 0.27.0: [Caching mechanism](https://coqui-tts.readthedocs.io/en/latest/cloning.html) for cloned voices.
+- 0.25.2: [OpenVoice](https://github.com/myshell-ai/OpenVoice) and [kNN-VC](https://github.com/bshall/knn-vc) models now available for voice conversion.
 - 0.24.2: Prebuilt wheels are now also published for macOS and Windows (in addition to Linux as before) for easier installation across platforms.
 - 0.20.0: XTTSv2 is here with 17 languages and better performance across the board. XTTS can stream with <200ms latency.
 - 0.19.0: XTTS fine-tuning code is out. Check the [example recipes](https://github.com/idiap/coqui-ai-TTS/tree/dev/recipes/ljspeech).
@@ -117,7 +118,9 @@ You can also help us implement more models.
 ## Installation
 
 🐸TTS is tested on Ubuntu 24.04 with **python >= 3.10, < 3.14**, but should also
-work on Mac and Windows.
+work on Mac and Windows. Depending on your platform, you might first want to
+separately install PyTorch, `torchaudio`, and `torchcodec` following the
+[official instructions](https://pytorch.org/get-started/locally/).
 
 If you are only interested in [synthesizing speech](https://coqui-tts.readthedocs.io/en/latest/inference.html) with the pretrained 🐸TTS models, installing from PyPI is the easiest option.
 
@@ -140,6 +143,7 @@ The following extras allow the installation of optional dependencies:
 | Name | Description |
 |------|-------------|
 | `all` | All optional dependencies |
+| `codec` | Installs `torchcodec`, which is needed with PyTorch >= 2.9 |
 | `notebooks` | Dependencies only used in notebooks |
 | `server` | Dependencies to run the TTS server |
 | `bn` | Bangla G2P |
@@ -227,6 +231,10 @@ From version 0.27.0 you can [cache cloned
 voices](https://coqui-tts.readthedocs.io/en/latest/cloning.html) with a custom
 `speaker` ID, so you only need to pass audio files in `speaker_wav` once.
 
+> [!NOTE]
+> For more control or additional outputs, e.g. timestamps, use the lower-level
+> [Synthesizer API](https://coqui-tts.readthedocs.io/en/latest/main_classes/synthesizer.html).
+
 #### Single speaker model
 
 ```python
@@ -287,6 +295,9 @@ api.tts_to_file(
 )
 ```
 
+**Note:** Some Fairseq models need the romanization library `uroman` to be
+installed. For this you can install `coqui-tts` with the `languages` extra.
+
 ### Command-line interface `tts`
 
 <!-- begin-tts-readme -->
 
@@ -4,6 +4,7 @@
 
 import argparse
 import contextlib
+import importlib.metadata
 import logging
 import sys
 from argparse import RawTextHelpFormatter
@@ -288,7 +289,12 @@ def parse_args(arg_list: list[str] | None) -> argparse.Namespace:
         "--voice_dir",
         type=str,
         default=None,
-        help="Voice dir for tortoise model",
+        help="Custom directory for caching of cloned voices.",
+    )
+    parser.add_argument(
+        "--version",
+        action="store_true",
+        help="Print the Coqui TTS version number and exit.",
     )
 
     args = parser.parse_args(arg_list)
@@ -304,6 +310,7 @@ def parse_args(arg_list: list[str] | None) -> argparse.Namespace:
         args.model_info_by_name,
         args.source_wav,
         args.target_wav,
+        args.version,
     ]
     if not any(check_args):
         parser.parse_args(["-h"])
@@ -338,6 +345,11 @@ def main(arg_list: list[str] | None = None) -> None:
         vc_config_path = None
         model_dir = None
 
+        # 0) Print version number
+        if args.version:
+            logger.info(importlib.metadata.version("coqui-tts"))
+            sys.exit(0)
+
         # 1) List pre-trained TTS models
         if args.list_models:
             manager.list_models()
 
@@ -42,7 +42,7 @@ class VitsConfig(BaseTTSConfig):
             Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
 
         scheduler_after_epoch (bool):
-            If true, step the schedulers after each epoch else after each step. Defaults to `False`.
+            If true, step the schedulers after each epoch, otherwise after each step. Defaults to `True`.
 
         optimizer (str):
             Name of the optimizer to use with both the generator and the discriminator networks. One of the
 
@@ -3,18 +3,19 @@
 import logging
 import os
 import random
+from math import floor
 from typing import Any
 
 import numpy as np
 import numpy.typing as npt
 import torch
-import torchaudio
 import tqdm
 from torch.utils.data import Dataset
 
 from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.audio.numpy_transforms import compute_energy as calculate_energy
+from TTS.utils.generic_utils import is_pytorch_at_least_2_9
 
 logger = logging.getLogger(__name__)
 
@@ -47,6 +48,20 @@ def string2filename(string: str) -> str:
     return base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore")
 
 
+def _get_audio_size_torchcodec(audiopath: str | os.PathLike[Any]) -> int:  # Return the number of samples in `audiopath`, derived from torchcodec's stream metadata.
+    try:
+        from torchcodec.decoders import AudioDecoder  # imported lazily: torchcodec is optional (shipped via the `codec` extra)
+    except ImportError as e:
+        msg = "torchcodec not installed (available in the `codec` extra)"
+        raise ImportError(msg) from e
+    except RuntimeError as e:  # the import itself raised RuntimeError; surfaced to callers as ImportError
+        msg = "Error while importing torchcodec, see the stacktrace for details."
+        raise ImportError(msg) from e
+
+    metadata = AudioDecoder(audiopath).metadata  # only the metadata attribute is read here
+    return floor(metadata.duration_seconds_from_header * metadata.sample_rate)  # header duration (s) * sample rate, rounded down. NOTE(review): either field may be None for some files - confirm; the resulting TypeError would bypass the RuntimeError handler in get_audio_size
+
+
 def get_audio_size(audiopath: str | os.PathLike[Any]) -> int:
     """Return the number of samples in the audio file."""
     if not isinstance(audiopath, str):
@@ -57,7 +72,12 @@ def get_audio_size(audiopath: str | os.PathLike[Any]) -> int:
         raise RuntimeError(msg)
 
     try:
-        return torchaudio.info(audiopath).num_frames
+        if is_pytorch_at_least_2_9():
+            return _get_audio_size_torchcodec(audiopath)
+        else:
+            import torchaudio
+
+            return torchaudio.info(audiopath).num_frames
     except RuntimeError as e:
         msg = f"Failed to decode {audiopath}"
         raise RuntimeError(msg) from e