"""Export a Mimic Recording Studio session as an LJSpeech-1.1 style dataset.

The script reads the ``mimicstudio.db`` SQLite database and the recorded WAV
files from a ``mimic-recording-studio`` checkout located next to this script's
parent directory, and writes ``dataset/LJSpeech-1.1/metadata.csv`` plus
``dataset/LJSpeech-1.1/wavs/*.wav`` (mono, 22050 Hz, 16 bit PCM) next to this
script.
"""

# This script generates the folder structure for ljspeech-1.1 processing from mimic-recording-studio database

# Changelog
# v1.0 - Initial release by Thorsten Müller (https://github.com/thorstenMueller/deep-learning-german-tts)
# v1.1 - Great improvements by Peter Schmalfeldt (https://github.com/manifestinteractive)
#      - Audio processing with ffmpeg (mono and samplerate of 22.050 Hz)
#      - Much better Python coding than my original version
#      - Greater logging output to command line
#      - See more details here: https://gist.github.com/manifestinteractive/6fd9be62d0ede934d4e1171e5e751aba
#      - Thanks Peter, it's a great contribution :-)
# v1.2 - Added choice for choosing which recording session should be exported as LJSpeech

import glob
import os
import sqlite3
from contextlib import closing
from shutil import copyfile
from shutil import rmtree

import ffmpeg

# Setup Directory Data
cwd = os.path.dirname(os.path.abspath(__file__))
mrs_dir = os.path.join(cwd, os.pardir, "mimic-recording-studio")
output_dir = os.path.join(cwd, "dataset")

# The three paths below are filled in by create_folders(); the other
# functions read them, so create_folders() has to run first.
output_dir_audio = ""
output_dir_audio_temp = ""
output_dir_speech = ""


def create_folders():
    """Create a clean ``LJSpeech-1.1`` folder tree below ``output_dir``.

    Any existing ``LJSpeech-1.1`` folder is deleted first. Sets the module
    globals ``output_dir_speech``, ``output_dir_audio`` and
    ``output_dir_audio_temp`` to the created paths.
    """
    global output_dir_audio
    global output_dir_audio_temp
    global output_dir_speech

    print('→ Creating Dataset Folders')

    output_dir_speech = os.path.join(output_dir, "LJSpeech-1.1")
    output_dir_audio = os.path.join(output_dir_speech, "wavs")
    output_dir_audio_temp = os.path.join(output_dir_speech, "temp")

    # Delete existing folder if exists for clean run
    if os.path.exists(output_dir_speech):
        rmtree(output_dir_speech)

    # Create Clean Folders (makedirs also creates the LJSpeech-1.1 parent)
    os.makedirs(output_dir_audio)
    os.makedirs(output_dir_audio_temp)


def convert_audio():
    """Convert every WAV in the temp folder into the ``wavs`` folder.

    Each file is re-encoded with ffmpeg to 16 bit PCM, mono, 22050 Hz under
    the same file name. The temp copy is removed after conversion and the
    temp folder is deleted at the end.

    Raises:
        ffmpeg.Error: if ffmpeg fails on one of the files.
    """
    # One list drives both the total and the loop, so the progress counter
    # cannot disagree with the files that are actually processed.
    wav_files = sorted(glob.glob(os.path.join(output_dir_audio_temp, "*.wav")))
    recordings = len(wav_files)

    print('→ Converting %s Audio Files to 22050 Hz, 16 Bit, Mono\n' % "{:,}".format(recordings))

    for position, wav in enumerate(wav_files, start=1):
        percent = position / recordings

        print('› \033[96m%s\033[0m \033[2m%s / %s (%s)\033[0m ' % (
            os.path.basename(wav),
            "{:,}".format(position),
            "{:,}".format(recordings),
            "{:.0%}".format(percent),
        ))

        # Convert WAV file to required format
        (
            ffmpeg
            .input(wav)
            .output(
                os.path.join(output_dir_audio, os.path.basename(wav)),
                acodec='pcm_s16le',
                ac=1,
                ar=22050,
                loglevel='error',
            )
            .overwrite_output()
            .run(capture_stdout=True)
        )

        # Delete Temp File
        os.remove(wav)

    # Remove Temp Folder
    rmtree(output_dir_audio_temp)


def create_meta_data():
    """Write ``metadata.csv`` and stage the chosen session's recordings.

    Lists the recording sessions found in the ``usermodel`` table, asks on
    stdin which one to export (the newest is the default), then for every
    ``audiomodel`` row of that session copies ``<audio_id>.wav`` into the
    temp folder and writes one ``audio_id|prompt|lowercased prompt`` line.

    Raises:
        SystemExit: if the database file is missing, no recording session
            exists, or the chosen session has no recordings.
    """
    print('→ Creating META Data')

    db_path = os.path.join(mrs_dir, "backend", "db", "mimicstudio.db")

    # sqlite3.connect() would silently create an empty database here.
    if not os.path.isfile(db_path):
        raise SystemExit('Database not found: ' + db_path)

    # closing() guarantees the connection is released even when a query or
    # the prompt raises; a bare sqlite3 connection used as a context manager
    # only manages the transaction and does not close the connection.
    with closing(sqlite3.connect(db_path)) as conn:
        c = conn.cursor()

        # List available recording sessions
        user_models = c.execute('SELECT uuid, user_name from usermodel ORDER BY created_date DESC').fetchall()

        if not user_models:
            raise SystemExit('No recording sessions found in ' + db_path)

        user_id = user_models[0][0]

        for session_uuid, session_name in user_models:
            print('%s -> %s' % (session_uuid, session_name))

        user_answer = input('Please choose ID of recording session to export (default is newest session) [' + user_id + ']: ').strip()

        if user_answer:
            user_id = user_answer

        # Bind the (user supplied) session id as a parameter instead of
        # concatenating it into the SQL text.
        recordings = c.execute(
            'SELECT audio_id, prompt FROM audiomodel WHERE user_id = ? ORDER BY length(prompt)',
            (user_id,),
        ).fetchall()

    if not recordings:
        raise SystemExit('No recordings found for recording session ' + user_id)

    session_audio_dir = os.path.join(mrs_dir, "backend", "audio_files", user_id)

    # Create metadata.csv for ljspeech
    with open(os.path.join(output_dir_speech, "metadata.csv"), mode="w", encoding="utf8") as metadata:
        for audio_id, prompt in recordings:
            # Copy first so a missing recording never leaves a metadata
            # line without its audio file.
            copyfile(
                os.path.join(session_audio_dir, audio_id + ".wav"),
                os.path.join(output_dir_audio_temp, audio_id + ".wav"),
            )
            # Lowercase in Python: SQLite's built-in lower() only folds
            # ASCII letters and would leave e.g. German umlauts uppercase.
            metadata.write(audio_id + "|" + prompt + "|" + prompt.lower() + "\n")


def main():
    """Run the export: create folders, write metadata, convert audio."""
    print('\n\033[48;5;22m  MRS to LJ Speech Processor  \033[0m\n')

    create_folders()
    create_meta_data()
    convert_audio()

    print('\n\033[38;5;86;1m✔\033[0m COMPLETE【ツ】\n')


if __name__ == '__main__':
    main()