idiap
diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py
Lines changed: 7 additions & 10 deletions
diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py
Lines changed: 4 additions & 4 deletions
diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py
Lines changed: 81 additions & 39 deletions
diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py
Lines changed: 1 addition & 26 deletions
diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py
Lines changed: 4 additions & 4 deletions
diff --git a/TTS/model.py b/TTS/model.py
Lines changed: 5 additions & 3 deletions
diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py
Lines changed: 3 additions & 3 deletions
diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py
Lines changed: 4 additions & 4 deletions
@@ -94,19 +94,19 @@ def parse_args(arg_list: list[str] | None) -> argparse.Namespace:
         help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
         default=None,
     )
-    return parser.parse_args()
+    return parser.parse_args(arg_list)
 
 
 def compute_embeddings(
     model_path,
     config_path,
     output_path,
-    old_speakers_file=None,
+    old_speakers_file: str | None = None,
     old_append=False,
     config_dataset_path=None,
-    formatter_name=None,
-    dataset_name=None,
-    dataset_path=None,
+    formatter_name: str | None = None,
+    dataset_name: str | None = None,
+    dataset_path: str | None = None,
     meta_file_train=None,
     meta_file_val=None,
     disable_cuda=False,
@@ -128,11 +128,7 @@ def compute_embeddings(
             c_dataset.meta_file_val = meta_file_val
         meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not no_eval)
 
-    if meta_data_eval is None:
-        samples = meta_data_train
-    else:
-        samples = meta_data_train + meta_data_eval
-
+    samples = meta_data_train + meta_data_eval
     encoder_manager = SpeakerManager(
         encoder_model_path=model_path,
         encoder_config_path=config_path,
@@ -182,6 +178,7 @@ def compute_embeddings(
 
         save_file(speaker_mapping, mapping_file_path)
         print("Speaker embeddings saved at:", mapping_file_path)
+    sys.exit(0)
 
 
 def main(arg_list: list[str] | None = None):
 
@@ -1,10 +1,9 @@
 #!/usr/bin/env python3
 
 import argparse
-import glob
 import logging
-import os
 import sys
+from pathlib import Path
 
 import numpy as np
 from tqdm import tqdm
@@ -19,7 +18,7 @@
 def parse_args(arg_list: list[str] | None) -> tuple[argparse.Namespace, list[str]]:
     parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.")
     parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.")
-    parser.add_argument("out_path", type=str, help="save path (directory and filename).")
+    parser.add_argument("out_path", type=str, help="save path (directory and filename).", default="scale_stats.npy")
     parser.add_argument(
         "--data_path",
         type=str,
@@ -46,7 +45,7 @@ def main(arg_list: list[str] | None = None):
 
     # load the meta data of target dataset
     if args.data_path:
-        dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
+        dataset_items = list(Path(args.data_path).rglob("*.wav"))
     else:
         dataset_items = load_tts_samples(CONFIG.datasets)[0]  # take only train data
     print(f" > There are {len(dataset_items)} files.")
@@ -95,6 +94,7 @@ def main(arg_list: list[str] | None = None):
     del CONFIG.audio.symmetric_norm
     del CONFIG.audio.clip_norm
     stats["audio_config"] = CONFIG.audio.to_dict()
+    Path(output_file_path).parent.mkdir(exist_ok=True, parents=True)
     np.save(output_file_path, stats, allow_pickle=True)
     print(f" > stats saved to {output_file_path}")
     sys.exit(0)
 
@@ -27,14 +27,72 @@
 
 
 def parse_args(arg_list: list[str] | None) -> argparse.Namespace:
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
-    parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
-    parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
-    parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
-    parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
-    parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
-    parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True)
+    parser = argparse.ArgumentParser(
+        description="""Extract mel spectrograms from audio using teacher forcing with a trained TTS model.
+
+This script loads a trained TTS model and extracts mel spectrograms by running the model with teacher forcing.
+This is useful for analyzing model predictions, creating training data for downstream models, or debugging
+model behavior. Supports Tacotron, Tacotron2, and Glow-TTS models.
+
+The script will create subdirectories in the output path:
+  - mel/: Extracted mel spectrograms (.npy files)
+  - wav/: Original audio files (if --save_audio is enabled)
+  - wav_gl/: Griffin-Lim reconstructed audio from mels (if --debug is enabled)
+  - quant/: Quantized audio files (if --quantize_bits > 0)""",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""Example usage:
+  python extract_tts_spectrograms.py \\
+    --config_path /path/to/config.json \\
+    --checkpoint_path /path/to/checkpoint.pth \\
+    --output_path /path/to/output""",
+    )
+    parser.add_argument(
+        "--config_path",
+        type=str,
+        help="Path to the model configuration file (JSON) used during training. "
+        "This config defines the model architecture, audio parameters, and dataset settings.",
+        required=True,
+    )
+    parser.add_argument(
+        "--checkpoint_path",
+        type=str,
+        help="Path to the trained model checkpoint file (.pth) to be loaded for inference.",
+        required=True,
+    )
+    parser.add_argument(
+        "--output_path",
+        type=str,
+        help="Directory path where extracted mel spectrograms and optional audio files will be saved. "
+        "Subdirectories will be created automatically.",
+        default="output_extract_tts_spectrograms",
+    )
+    parser.add_argument(
+        "--debug",
+        default=False,
+        action="store_true",
+        help="Enable debug mode: saves Griffin-Lim reconstructed audio files from the extracted mel spectrograms "
+        "to wav_gl/ subdirectory for quality inspection.",
+    )
+    parser.add_argument(
+        "--save_audio",
+        default=False,
+        action="store_true",
+        help="Save the original audio files to the wav/ subdirectory alongside the extracted mel spectrograms.",
+    )
+    parser.add_argument(
+        "--quantize_bits",
+        type=int,
+        default=0,
+        help="Bit depth for audio quantization (e.g., 8, 16). If set to a non-zero value, saves quantized versions "
+        "of audio files to the quant/ subdirectory. Set to 0 (default) to disable quantization.",
+    )
+    parser.add_argument(
+        "--eval",
+        action=argparse.BooleanOptionalAction,
+        help="Include evaluation split in processing. When enabled (default), processes both training and evaluation "
+        "samples. Use --no-eval to process only training samples.",
+        default=True,
+    )
     return parser.parse_args(arg_list)
 
 
@@ -75,19 +133,6 @@ def setup_loader(config: BaseTTSConfig, ap: AudioProcessor, r, speaker_manager:
     )
 
 
-def set_filename(wav_path: str, out_path: Path) -> tuple[Path, Path, Path, Path]:
-    wav_name = Path(wav_path).stem
-    (out_path / "quant").mkdir(exist_ok=True, parents=True)
-    (out_path / "mel").mkdir(exist_ok=True, parents=True)
-    (out_path / "wav_gl").mkdir(exist_ok=True, parents=True)
-    (out_path / "wav").mkdir(exist_ok=True, parents=True)
-    wavq_path = out_path / "quant" / wav_name
-    mel_path = out_path / "mel" / wav_name
-    wav_gl_path = out_path / "wav_gl" / f"{wav_name}.wav"
-    out_wav_path = out_path / "wav" / f"{wav_name}.wav"
-    return wavq_path, mel_path, wav_gl_path, out_wav_path
-
-
 def format_data(data):
     # setup input data
     text_input = data["token_id"]
@@ -213,34 +258,36 @@ def extract_spectrograms(
             d_vectors,
         )
 
+        (output_path / "mel").mkdir(exist_ok=True, parents=True)
         for idx in range(text_input.shape[0]):
-            wav_file_path = item_idx[idx]
+            wav_file_path = Path(item_idx[idx])
             wav = ap.load_wav(wav_file_path)
-            wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)
 
             # quantize and save wav
             if quantize_bits > 0:
-                wavq = quantize(wav, quantize_bits)
-                np.save(wavq_path, wavq)
+                wavq = quantize(x=wav, quantize_bits=quantize_bits)
+                (output_path / "quant").mkdir(exist_ok=True)
+                np.save(output_path / "quant" / wav_file_path.stem, wavq)
 
             # save TTS mel
             mel = model_output[idx]
             mel_length = mel_lengths[idx]
             mel = mel[:mel_length, :].T
-            np.save(mel_path, mel)
+            np.save(output_path / "mel" / wav_file_path.stem, mel)
 
-            export_metadata.append([wav_file_path, mel_path])
+            export_metadata.append(output_path / "mel" / wav_file_path.stem)
             if save_audio:
-                ap.save_wav(wav, wav_path)
+                (output_path / "wav").mkdir(exist_ok=True)
+                ap.save_wav(wav, output_path / "wav" / f"{wav_file_path.stem}.wav")
 
             if debug:
-                print("Audio for debug saved at:", wav_gl_path)
-                wav = ap.inv_melspectrogram(mel)
-                ap.save_wav(wav, wav_gl_path)
+                wav_gl = ap.inv_melspectrogram(mel)
+                (output_path / "wav_gl").mkdir(exist_ok=True)
+                ap.save_wav(wav_gl, output_path / "wav_gl" / f"{wav_file_path.stem}.wav")
 
     with (output_path / metadata_name).open("w") as f:
-        for data in export_metadata:
-            f.write(f"{data[0] / data[1]}.npy\n")
+        for path in export_metadata:
+            f.write(f"{path}.npy\n")
 
 
 def main(arg_list: list[str] | None = None) -> None:
@@ -264,12 +311,7 @@ def main(arg_list: list[str] | None = None) -> None:
     meta_data = meta_data_train + meta_data_eval
 
     # init speaker manager
-    if config.use_speaker_embedding:
-        speaker_manager = SpeakerManager(data_items=meta_data)
-    elif config.use_d_vector_file:
-        speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file)
-    else:
-        speaker_manager = None
+    speaker_manager = SpeakerManager.init_from_config(config)
 
     # setup model
     model = setup_model(config)
 
@@ -108,32 +108,7 @@ def load_config(config_path: str | os.PathLike[Any]) -> BaseTrainingConfig:
     return config
 
 
-def check_config_and_model_args(config, arg_name, value):
-    """Check the give argument in `config.model_args` if exist or in `config` for
-    the given value.
-
-    Return False if the argument does not exist in `config.model_args` or `config`.
-    This is to patch up the compatibility between models with and without `model_args`.
-
-    TODO: Remove this in the future with a unified approach.
-    """
-    if getattr(config, "model_args", None) is not None:
-        if arg_name in config.model_args:
-            return config.model_args[arg_name] == value
-    if hasattr(config, arg_name):
-        return config[arg_name] == value
-    return False
-
-
-def get_from_config_or_model_args(config, arg_name):
-    """Get the given argument from `config.model_args` if exist or in `config`."""
-    if getattr(config, "model_args", None) is not None:
-        if arg_name in config.model_args:
-            return config.model_args[arg_name]
-    return config[arg_name]
-
-
-def get_from_config_or_model_args_with_default(config, arg_name, def_val):
+def get_from_config_or_model_args(config: Coqpit, arg_name: str, def_val: Any = None) -> Any:
     """Get the given argument from `config.model_args` if exist or in `config`."""
     if getattr(config, "model_args", None) is not None:
         if arg_name in config.model_args:
 
@@ -222,11 +222,11 @@ class BaseDatasetConfig(Coqpit):
             train the duration predictor.
     """
 
-    formatter: str = ""
-    dataset_name: str = ""
-    path: str = ""
+    formatter: str | None = ""
+    dataset_name: str | None = ""
+    path: str | None = ""
     meta_file_train: str = ""
-    ignored_speakers: list[str] = None
+    ignored_speakers: list[str] | None = None
     language: str = ""
     phonemizer: str = ""
     meta_file_val: str = ""
 
@@ -7,8 +7,6 @@
 from trainer import TrainerModel
 from trainer.io import load_fsspec
 
-# pylint: skip-file
-
 
 class BaseTrainerModel(TrainerModel):
     """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS.
@@ -29,7 +27,7 @@ def init_from_config(config: Coqpit) -> "BaseTrainerModel":
     def inference(self, input: torch.Tensor, aux_input: dict[str, Any] = {}) -> dict[str, Any]:
         """Forward pass for inference.
 
-        It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs```
+        Must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs```
         is considered to be the main output and you can add any other auxiliary outputs as you want.
 
         We don't use `*kwargs` since it is problematic with the TorchScript API.
@@ -40,6 +38,7 @@ def inference(self, input: torch.Tensor, aux_input: dict[str, Any] = {}) -> dict
 
         Returns:
             Dict: [description]
+
         """
         outputs_dict = {"model_outputs": None}
         ...
@@ -53,6 +52,7 @@ def load_checkpoint(
         eval: bool = False,
         strict: bool = True,
         cache: bool = False,
+        **kwargs: Any,
     ) -> None:
         """Load a model checkpoint file and get ready for training or inference.
 
@@ -63,6 +63,7 @@ def load_checkpoint(
             strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
             cache (bool, optional): If True, cache the file locally for subsequent calls.
                 It is cached under `trainer.io.get_user_data_dir()/tts_cache`. Defaults to False.
+
         """
         state = load_fsspec(checkpoint_path, map_location="cpu", cache=cache)
         self.load_state_dict(state["model"], strict=strict)
@@ -71,4 +72,5 @@ def load_checkpoint(
 
     @property
     def device(self) -> torch.device:
+        """Return device of the model based on its parameters."""
         return next(self.parameters()).device
@@ -7,7 +7,7 @@
     BaseAudioConfig,
     BaseDatasetConfig,
     BaseTrainingConfig,
-    get_from_config_or_model_args_with_default,
+    get_from_config_or_model_args,
 )
 
 
@@ -357,6 +357,6 @@ class BaseTTSConfig(BaseTrainingConfig):
     @property
     def supports_cloning(self) -> bool:
         return self._supports_cloning or (
-            Path(get_from_config_or_model_args_with_default(self, "speaker_encoder_model_path", "")).is_file()
-            and Path(get_from_config_or_model_args_with_default(self, "speaker_encoder_config_path", "")).is_file()
+            Path(get_from_config_or_model_args(self, "speaker_encoder_model_path", "")).is_file()
+            and Path(get_from_config_or_model_args(self, "speaker_encoder_config_path", "")).is_file()
         )
@@ -159,15 +159,15 @@ class VitsConfig(BaseTTSConfig):
     # use speaker embedding layer
     num_speakers: int = 0
     use_speaker_embedding: bool = False
-    speakers_file: str = None
+    speakers_file: str | None = None
     speaker_embedding_channels: int = 256
-    language_ids_file: str = None
+    language_ids_file: str | None = None
     use_language_embedding: bool = False
 
     # use d-vectors
     use_d_vector_file: bool = False
-    d_vector_file: list[str] = None
-    d_vector_dim: int = None
+    d_vector_file: str | list[str] | None = None
+    d_vector_dim: int | None = None
 
     def __post_init__(self):
         for key, val in self.model_args.items():