Merge pull request #525 from idiap/torch29

eginhard · web-flow · commit 3250eddb8bc2 · 2025-12-12T14:31:54.000+01:00
fix: support pytorch 2.9
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -76,7 +76,7 @@ jobs:
           if [ "${{ matrix.python-version }}" == "3.10" ]; then
             resolution=lowest-direct
           fi
-          uv run --resolution=$resolution --extra server --extra languages make ${{ matrix.subset }}
+          uv run --resolution=$resolution --extra codec --extra server --extra languages make ${{ matrix.subset }}
       - name: Upload coverage data
         uses: actions/upload-artifact@v4
         with:
@@ -119,7 +119,7 @@ jobs:
           if [ "${{ matrix.python-version }}" == "3.10" ]; then
             resolution=lowest-direct
           fi
-          uv run --resolution=$resolution --extra languages coverage run -m pytest -x -v --durations=0 $shard_tests
+          uv run --resolution=$resolution --extra codec --extra languages coverage run -m pytest -x -v --durations=0 $shard_tests
       - name: Upload coverage data
         uses: actions/upload-artifact@v4
         with:
@@ -154,7 +154,7 @@ jobs:
             uv add git+https://github.com/idiap/coqui-ai-coqpit --branch ${{ github.event.inputs.coqpit_branch }}
           fi
       - name: Zoo tests
-        run: uv run --extra server --extra languages make test_zoo
+        run: uv run --extra codec --extra server --extra languages make test_zoo
         env:
           NUM_PARTITIONS: 3
           TEST_PARTITION: ${{ matrix.partition }}
diff --git a/Makefile b/Makefile
@@ -44,7 +44,7 @@ lint:	## run linters.
 	uv run --only-dev ruff format ${target_dirs} --check
 
 system-deps:	## install linux system deps
-	sudo apt-get install -y libsndfile1-dev
+	sudo apt-get install -y libsndfile1-dev ffmpeg
 
 install:	## install 🐸 TTS
 	uv sync --all-extras
diff --git a/README.md b/README.md
@@ -118,7 +118,9 @@ You can also help us implement more models.
 ## Installation
 
 🐸TTS is tested on Ubuntu 24.04 with **python >= 3.10, < 3.14**, but should also
-work on Mac and Windows.
+work on Mac and Windows. Depending on your platform, you might first want to
+separately install PyTorch, `torchaudio`, and `torchcodec` following their
+[official instructions](https://pytorch.org/get-started/locally/).
 
 If you are only interested in [synthesizing speech](https://coqui-tts.readthedocs.io/en/latest/inference.html) with the pretrained 🐸TTS models, installing from PyPI is the easiest option.
 
@@ -141,6 +143,7 @@ The following extras allow the installation of optional dependencies:
 | Name | Description |
 |------|-------------|
 | `all` | All optional dependencies |
+| `codec` | Installs `torchcodec`, which is required with PyTorch >= 2.9 |
 | `notebooks` | Dependencies only used in notebooks |
 | `server` | Dependencies to run the TTS server |
 | `bn` | Bangla G2P |
diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py
@@ -42,7 +42,7 @@ class VitsConfig(BaseTTSConfig):
             Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
 
         scheduler_after_epoch (bool):
-            If true, step the schedulers after each epoch else after each step. Defaults to `False`.
+            If true, step the schedulers after each epoch else after each step. Defaults to `True`.
 
         optimizer (str):
             Name of the optimizer to use with both the generator and the discriminator networks. One of the
diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
@@ -3,18 +3,19 @@
 import logging
 import os
 import random
+from math import floor
 from typing import Any
 
 import numpy as np
 import numpy.typing as npt
 import torch
-import torchaudio
 import tqdm
 from torch.utils.data import Dataset
 
 from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.audio.numpy_transforms import compute_energy as calculate_energy
+from TTS.utils.generic_utils import is_pytorch_at_least_2_9
 
 logger = logging.getLogger(__name__)
 
@@ -47,6 +48,20 @@ def string2filename(string: str) -> str:
     return base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore")
 
 
+def _get_audio_size_torchcodec(audiopath: str | os.PathLike[Any]) -> int:
+    try:
+        from torchcodec.decoders import AudioDecoder
+    except ImportError as e:
+        msg = "torchcodec not installed (available in the `codec` extra)"
+        raise ImportError(msg) from e
+    except RuntimeError as e:
+        msg = "Error while importing torchcodec, see the stacktrace for details."
+        raise ImportError(msg) from e
+
+    metadata = AudioDecoder(audiopath).metadata
+    return floor(metadata.duration_seconds_from_header * metadata.sample_rate)
+
+
 def get_audio_size(audiopath: str | os.PathLike[Any]) -> int:
     """Return the number of samples in the audio file."""
     if not isinstance(audiopath, str):
@@ -57,7 +72,12 @@ def get_audio_size(audiopath: str | os.PathLike[Any]) -> int:
         raise RuntimeError(msg)
 
     try:
-        return torchaudio.info(audiopath).num_frames
+        if is_pytorch_at_least_2_9():
+            return _get_audio_size_torchcodec(audiopath)
+        else:
+            import torchaudio
+
+            return torchaudio.info(audiopath).num_frames
     except RuntimeError as e:
         msg = f"Failed to decode {audiopath}"
         raise RuntimeError(msg) from e
diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py
@@ -161,6 +161,11 @@ def is_pytorch_at_least_2_4() -> bool:
     return Version(torch.__version__) >= Version("2.4")
 
 
+def is_pytorch_at_least_2_9() -> bool:
+    """Check if the installed Pytorch version is 2.9 or higher."""
+    return Version(torch.__version__) >= Version("2.9")
+
+
 def optional_to_str(x: Any | None) -> str:
     """Convert input to string, using empty string if input is None."""
     return "" if x is None else str(x)
diff --git a/TTS/vc/configs/freevc_config.py b/TTS/vc/configs/freevc_config.py
@@ -172,9 +172,6 @@ class FreeVCConfig(BaseVCConfig):
         lr_scheduler_disc_params (dict):
             Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
 
-        scheduler_after_epoch (bool):
-            If true, step the schedulers after each epoch else after each step. Defaults to `False`.
-
         optimizer (str):
             Name of the optimizer to use with both the generator and the discriminator networks. One of the
             `torch.optim.*`. Defaults to `AdamW`.
diff --git a/pyproject.toml b/pyproject.toml
@@ -25,7 +25,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "coqui-tts"
-version = "0.27.2"
+version = "0.27.3"
 description = "Deep learning for Text to Speech."
 readme = "README.md"
 requires-python = ">=3.10, <3.14"
@@ -59,8 +59,8 @@ dependencies = [
     # Core
     "numpy>=1.26.0",
     "scipy>=1.13.0",
-    "torch>=2.2,<2.9",
-    "torchaudio>=2.2.0,<2.9",
+    "torch>=2.2",
+    "torchaudio>=2.2.0",
     "soundfile>=0.12.0",
     "librosa>=0.11.0",
     "numba>=0.58.0",
@@ -92,6 +92,10 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
+# torchcodec needed from torch>=2.9
+codec = [
+    "torchcodec>=0.8.0",
+]
 # Only used in notebooks
 notebooks = [
     "bokeh>=3.0.3",
@@ -128,7 +132,7 @@ languages = [
 ]
 # Installs all extras (except dev and docs)
 all = [
-    "coqui-tts[notebooks,server,bn,ja,ko,zh]",
+    "coqui-tts[codec,notebooks,server,bn,ja,ko,zh]",
 ]
 
 [dependency-groups]