From 33c030f844b9716914bc1133352b63d8857d0818 Mon Sep 17 00:00:00 2001
From: Thorsten Mueller
Date: Tue, 28 Sep 2021 06:10:21 +0200
Subject: [PATCH] Added two scripts for dataset analysis/cleaning.

---
 Review note: both scripts were revised during review; the blob ids on the
 "index" lines below still belong to the originally submitted versions.

 helperScripts/getDatasetSpeechRate.py   | 48 +++++++++++++++++++++++++
 helperScripts/removeFilesFromDataset.py | 64 +++++++++++++++++++++++++++++++++
 2 files changed, 112 insertions(+)
 create mode 100644 helperScripts/getDatasetSpeechRate.py
 create mode 100644 helperScripts/removeFilesFromDataset.py

diff --git a/helperScripts/getDatasetSpeechRate.py b/helperScripts/getDatasetSpeechRate.py
new file mode 100644
index 0000000..178bb19
--- /dev/null
+++ b/helperScripts/getDatasetSpeechRate.py
@@ -0,0 +1,48 @@
+# This script gets speech rate per audio recording from a voice dataset (ljspeech file and directory structure)
+# Written by Thorsten Müller (deep-learning-german@gmx.net) and provided without any warranty.
+# https://github.com/thorstenMueller/deep-learning-german-tts/
+# https://twitter.com/ThorstenVoice
+#
+# Reads metadata.csv in dataset_dir ("|" separated: file name without ".wav", phrase, ...) and the matching
+# recordings in the "wavs" subfolder, then writes one line per recording to speech_rate_report.csv.
+# The column "remove_from_dataset" is always written as "no"; change it to "yes" for every recording
+# that "removeFilesFromDataset" should take out of the dataset.
+
+# Changelog:
+# v0.1 - 26.09.2021 - Initial version
+
+import os
+import librosa
+import csv
+
+dataset_dir = "/Users/thorsten/Downloads/thorsten-export-20210909/" # Directory where metadata.csv is in
+out_csv_file = os.path.join(dataset_dir,"speech_rate_report.csv")
+decimal_use_comma = True # False: Splitting decimal value with a dot (.); True: Comma (,)
+
+# "with" closes the report even when a recording is missing or cannot be read.
+with open(out_csv_file,"w") as out_csv:
+    out_csv.write("filename;audiolength_sec;number_chars;chars_per_sec;remove_from_dataset\n")
+
+    # Open metadata.csv file (assumed to be UTF-8 encoded)
+    with open(os.path.join(dataset_dir,"metadata.csv"),encoding="utf-8") as csvfile:
+        # QUOTE_NONE: quotation marks inside a phrase are ordinary characters, not csv quoting.
+        reader = csv.reader(csvfile, delimiter='|', quoting=csv.QUOTE_NONE)
+        for row in reader:
+            if len(row) < 2:
+                # Blank or incomplete line without a phrase column.
+                continue
+
+            wav_file = os.path.join(dataset_dir,"wavs",row[0] + ".wav")
+
+            # Gather values for report.csv output
+            phrase_len = len(row[1]) - 1 # Do not count the punctuation mark (assumes exactly one per phrase).
+            # NOTE: newer librosa releases (0.10+) name this keyword "path" instead of "filename".
+            duration = round(librosa.get_duration(filename=wav_file),2)
+            # A recording that rounds to 0 seconds gets rate 0 instead of a ZeroDivisionError.
+            char_per_sec = round(phrase_len / duration,2) if duration > 0 else 0.0
+
+            if decimal_use_comma:
+                duration = str(duration).replace(".",",")
+                char_per_sec = str(char_per_sec).replace(".",",")
+
+            out_csv.write(row[0] + ".wav;" + str(duration) + ";" + str(phrase_len) + ";" + str(char_per_sec) + ";no\n")
diff --git a/helperScripts/removeFilesFromDataset.py b/helperScripts/removeFilesFromDataset.py
new file mode 100644
index 0000000..ef8789f
--- /dev/null
+++ b/helperScripts/removeFilesFromDataset.py
@@ -0,0 +1,64 @@
+# This script removes recordings from an ljspeech file/directory structured dataset based on CSV file from "getDatasetSpeechRate"
+# Written by Thorsten Müller (deep-learning-german@gmx.net) and provided without any warranty.
+# https://github.com/thorstenMueller/deep-learning-german-tts/
+# https://twitter.com/ThorstenVoice
+#
+# Every recording whose fifth column ("remove_from_dataset") in speech_rate_report.csv is "yes" is moved
+# from the "wavs" subfolder to the "___removed" subfolder and left out of metadata_cleaned.csv.
+# metadata.csv itself is only read, never changed.
+
+# Changelog:
+# v0.1 - 26.09.2021 - Initial version
+
+import os
+import csv
+import shutil
+
+dataset_dir = "/Users/thorsten/Downloads/thorsten-export-20210909/" # Directory where metadata.csv is in
+subfolder_removed = "___removed"
+in_csv_file = os.path.join(dataset_dir,"speech_rate_report.csv")
+to_remove = []
+
+# Open speech rate report; in_csv_file already includes dataset_dir, so it is not joined again.
+with open(in_csv_file) as csvfile:
+    reader = csv.reader(csvfile, delimiter=';')
+    for row in reader:
+        # Lines with fewer than five columns (e.g. blank lines) cannot be marked.
+        if len(row) > 4 and row[4] == "yes":
+            # Recording in that row should be removed from dataset
+            to_remove.append(row[0])
+            print("Recording " + row[0] + " will be removed from dataset.")
+
+print("\n" + str(len(to_remove)) + " recordings has been marked for deletion.")
+
+if to_remove:
+
+    # Set for constant-time lookups while filtering metadata.csv
+    to_remove_lookup = set(to_remove)
+
+    # "with" closes metadata_cleaned.csv even when moving a file fails.
+    with open(os.path.join(dataset_dir,"metadata_cleaned.csv"),"w",encoding="utf-8") as metadata_cleaned:
+
+        # Create new subdirectory for removed wav files
+        removed_dir = os.path.join(dataset_dir,subfolder_removed)
+        os.makedirs(removed_dir,exist_ok=True)
+
+        # Remove lines from metadata.csv and move wav files to new subdirectory
+        with open(os.path.join(dataset_dir,"metadata.csv"),encoding="utf-8") as csvfile:
+            # QUOTE_NONE keeps quotation marks in phrases so kept lines are written back unchanged.
+            reader = csv.reader(csvfile, delimiter='|', quoting=csv.QUOTE_NONE)
+            for row in reader:
+                if not row:
+                    # Skip blank lines
+                    continue
+
+                if (row[0] + ".wav") not in to_remove_lookup:
+                    # Write back every column, however many the line has.
+                    metadata_cleaned.write("|".join(row) + "\n")
+                else:
+                    # Move recording to new subfolder
+                    wav_file = os.path.join(dataset_dir,"wavs",row[0] + ".wav")
+                    if os.path.exists(wav_file):
+                        shutil.move(wav_file,removed_dir)
+                    else:
+                        print("Recording " + row[0] + ".wav not found in wavs folder, nothing to move.")