2727
2828
2929def parse_args (arg_list : list [str ] | None ) -> argparse .Namespace :
30- parser = argparse .ArgumentParser ()
31- parser .add_argument ("--config_path" , type = str , help = "Path to config file for training." , required = True )
32- parser .add_argument ("--checkpoint_path" , type = str , help = "Model file to be restored." , required = True )
33- parser .add_argument ("--output_path" , type = str , help = "Path to save mel specs" , required = True )
34- parser .add_argument ("--debug" , default = False , action = "store_true" , help = "Save audio files for debug" )
35- parser .add_argument ("--save_audio" , default = False , action = "store_true" , help = "Save audio files" )
36- parser .add_argument ("--quantize_bits" , type = int , default = 0 , help = "Save quantized audio files if non-zero" )
37- parser .add_argument ("--eval" , action = argparse .BooleanOptionalAction , help = "compute eval." , default = True )
30+ parser = argparse .ArgumentParser (
31+ description = """Extract mel spectrograms from audio using teacher forcing with a trained TTS model.
32+
33+ This script loads a trained TTS model and extracts mel spectrograms by running the model with teacher forcing.
34+ This is useful for analyzing model predictions, creating training data for downstream models, or debugging
35+ model behavior. Supports Tacotron, Tacotron2, and Glow-TTS models.
36+
37+ The script will create subdirectories in the output path:
38+ - mel/: Extracted mel spectrograms (.npy files)
39+ - wav/: Original audio files (if --save_audio is enabled)
40+ - wav_gl/: Griffin-Lim reconstructed audio from mels (if --debug is enabled)
41+ - quant/: Quantized audio files (if --quantize_bits > 0)""" ,
42+ formatter_class = argparse .RawDescriptionHelpFormatter ,
43+ epilog = """Example usage:
44+ python extract_tts_spectrograms.py \\
45+ --config_path /path/to/config.json \\
46+ --checkpoint_path /path/to/checkpoint.pth \\
47+ --output_path /path/to/output""" ,
48+ )
49+ parser .add_argument (
50+ "--config_path" ,
51+ type = str ,
52+ help = "Path to the model configuration file (JSON) used during training. "
53+ "This config defines the model architecture, audio parameters, and dataset settings." ,
54+ required = True ,
55+ )
56+ parser .add_argument (
57+ "--checkpoint_path" ,
58+ type = str ,
59+ help = "Path to the trained model checkpoint file (.pth) to be loaded for inference." ,
60+ required = True ,
61+ )
62+ parser .add_argument (
63+ "--output_path" ,
64+ type = str ,
65+ help = "Directory path where extracted mel spectrograms and optional audio files will be saved. "
66+ "Subdirectories will be created automatically." ,
67+ default = "output_extract_tts_spectrograms" ,
68+ )
69+ parser .add_argument (
70+ "--debug" ,
71+ default = False ,
72+ action = "store_true" ,
73+ help = "Enable debug mode: saves Griffin-Lim reconstructed audio files from the extracted mel spectrograms "
74+ "to wav_gl/ subdirectory for quality inspection." ,
75+ )
76+ parser .add_argument (
77+ "--save_audio" ,
78+ default = False ,
79+ action = "store_true" ,
80+ help = "Save the original audio files to the wav/ subdirectory alongside the extracted mel spectrograms." ,
81+ )
82+ parser .add_argument (
83+ "--quantize_bits" ,
84+ type = int ,
85+ default = 0 ,
86+ help = "Bit depth for audio quantization (e.g., 8, 16). If set to a non-zero value, saves quantized versions "
87+ "of audio files to the quant/ subdirectory. Set to 0 (default) to disable quantization." ,
88+ )
89+ parser .add_argument (
90+ "--eval" ,
91+ action = argparse .BooleanOptionalAction ,
92+ help = "Include evaluation split in processing. When enabled (default), processes both training and evaluation "
93+ "samples. Use --no-eval to process only training samples." ,
94+ default = True ,
95+ )
3896 return parser .parse_args (arg_list )
3997
4098
@@ -75,19 +133,6 @@ def setup_loader(config: BaseTTSConfig, ap: AudioProcessor, r, speaker_manager:
75133 )
76134
77135
78- def set_filename (wav_path : str , out_path : Path ) -> tuple [Path , Path , Path , Path ]:
79- wav_name = Path (wav_path ).stem
80- (out_path / "quant" ).mkdir (exist_ok = True , parents = True )
81- (out_path / "mel" ).mkdir (exist_ok = True , parents = True )
82- (out_path / "wav_gl" ).mkdir (exist_ok = True , parents = True )
83- (out_path / "wav" ).mkdir (exist_ok = True , parents = True )
84- wavq_path = out_path / "quant" / wav_name
85- mel_path = out_path / "mel" / wav_name
86- wav_gl_path = out_path / "wav_gl" / f"{ wav_name } .wav"
87- out_wav_path = out_path / "wav" / f"{ wav_name } .wav"
88- return wavq_path , mel_path , wav_gl_path , out_wav_path
89-
90-
91136def format_data (data ):
92137 # setup input data
93138 text_input = data ["token_id" ]
@@ -213,34 +258,36 @@ def extract_spectrograms(
213258 d_vectors ,
214259 )
215260
261+ (output_path / "mel" ).mkdir (exist_ok = True , parents = True )
216262 for idx in range (text_input .shape [0 ]):
217- wav_file_path = item_idx [idx ]
263+ wav_file_path = Path ( item_idx [idx ])
218264 wav = ap .load_wav (wav_file_path )
219- wavq_path , mel_path , wav_gl_path , wav_path = set_filename (wav_file_path , output_path )
220265
221266 # quantize and save wav
222267 if quantize_bits > 0 :
223- wavq = quantize (wav , quantize_bits )
224- np .save (wavq_path , wavq )
268+ wavq = quantize (x = wav , quantize_bits = quantize_bits )
269+ (output_path / "quant" ).mkdir (exist_ok = True )
270+ np .save (output_path / "quant" / wav_file_path .stem , wavq )
225271
226272 # save TTS mel
227273 mel = model_output [idx ]
228274 mel_length = mel_lengths [idx ]
229275 mel = mel [:mel_length , :].T
230- np .save (mel_path , mel )
276+ np .save (output_path / "mel" / wav_file_path . stem , mel )
231277
232- export_metadata .append ([ wav_file_path , mel_path ] )
278+ export_metadata .append (output_path / "mel" / wav_file_path . stem )
233279 if save_audio :
234- ap .save_wav (wav , wav_path )
280+ (output_path / "wav" ).mkdir (exist_ok = True )
281+ ap .save_wav (wav , output_path / "wav" / f"{ wav_file_path .stem } .wav" )
235282
236283 if debug :
237- print ( "Audio for debug saved at:" , wav_gl_path )
238- wav = ap . inv_melspectrogram ( mel )
239- ap .save_wav (wav , wav_gl_path )
284+ wav_gl = ap . inv_melspectrogram ( mel )
285+ ( output_path / "wav_gl" ). mkdir ( exist_ok = True )
286+ ap .save_wav (wav_gl , output_path / "wav_gl" / f" { wav_file_path . stem } .wav" )
240287
241288 with (output_path / metadata_name ).open ("w" ) as f :
242- for data in export_metadata :
243- f .write (f"{ data [ 0 ] / data [ 1 ] } .npy\n " )
289+ for path in export_metadata :
290+ f .write (f"{ path } .npy\n " )
244291
245292
246293def main (arg_list : list [str ] | None = None ) -> None :
@@ -264,12 +311,7 @@ def main(arg_list: list[str] | None = None) -> None:
264311 meta_data = meta_data_train + meta_data_eval
265312
266313 # init speaker manager
267- if config .use_speaker_embedding :
268- speaker_manager = SpeakerManager (data_items = meta_data )
269- elif config .use_d_vector_file :
270- speaker_manager = SpeakerManager (d_vectors_file_path = config .d_vector_file )
271- else :
272- speaker_manager = None
314+ speaker_manager = SpeakerManager .init_from_config (config )
273315
274316 # setup model
275317 model = setup_model (config )
0 commit comments