Thorsten-Voice/helperScripts/removeFilesFromDataset.py

49 lines
1.9 KiB
Python
Raw Permalink Normal View History

# This script removes recordings from an LJSpeech-style dataset (metadata.csv plus wavs/ directory) based on the CSV report produced by "getDatasetSpeechRate".
# Written by Thorsten Müller (deep-learning-german@gmx.net) and provided without any warranty.
# https://github.com/thorstenMueller/deep-learning-german-tts/
# https://twitter.com/ThorstenVoice
# Changelog:
# v0.1 - 26.09.2021 - Initial version
import os
import csv
import shutil
dataset_dir = "/Users/thorsten/Downloads/thorsten-export-20210909/"  # Directory where metadata.csv is in
subfolder_removed = "___removed"  # Subdirectory of dataset_dir that receives the removed wav files
in_csv_file = os.path.join(dataset_dir, "speech_rate_report.csv")  # Full path of the ';'-separated report


def _read_flagged_recordings(report_path):
    """Return the set of first-column values of report rows whose fifth column is "yes".

    The report is read as a ';'-delimited CSV file. Rows with fewer than five
    columns (blank lines, truncated rows) are ignored instead of raising
    IndexError.
    """
    flagged = set()
    with open(report_path) as csvfile:
        for row in csv.reader(csvfile, delimiter=';'):
            if len(row) > 4 and row[4] == "yes" and row[0] not in flagged:
                # Recording in that row should be removed from dataset
                flagged.add(row[0])
                print("Recording " + row[0] + " will be removed from dataset.")
    return flagged


def remove_flagged_recordings(dataset_root, report_path=None, removed_subfolder=subfolder_removed):
    """Move flagged recordings out of a dataset and write a cleaned metadata file.

    Reads the report, then copies every line of ``metadata.csv`` whose
    recording is not flagged into ``metadata_cleaned.csv`` (first three
    '|'-separated columns). The wav file of each flagged recording is moved
    from ``wavs/`` into ``removed_subfolder``. Nothing is written or moved
    when no recording is flagged.

    Args:
        dataset_root: Directory containing ``metadata.csv`` and ``wavs/``.
        report_path: Path of the ';'-delimited report. Defaults to
            ``speech_rate_report.csv`` inside ``dataset_root``.
        removed_subfolder: Name of the subdirectory of ``dataset_root`` that
            receives the removed wav files.

    Returns:
        The number of distinct recordings marked for deletion.
    """
    if report_path is None:
        report_path = os.path.join(dataset_root, "speech_rate_report.csv")

    # A set gives O(1) membership tests while scanning metadata.csv.
    to_remove = _read_flagged_recordings(report_path)
    print("\n" + str(len(to_remove)) + " recordings has been marked for deletion.")
    if not to_remove:
        return 0

    # Create new subdirectory for removed wav files
    removed_dir = os.path.join(dataset_root, removed_subfolder)
    os.makedirs(removed_dir, exist_ok=True)

    metadata_path = os.path.join(dataset_root, "metadata.csv")
    cleaned_path = os.path.join(dataset_root, "metadata_cleaned.csv")

    # Both files are managed by "with" so they are closed even if a move fails.
    with open(metadata_path) as csvfile, open(cleaned_path, "w") as metadata_cleaned:
        # QUOTE_NONE: the cleaned file is written without any CSV quoting, so
        # the input must be read the same way; otherwise double quotes inside
        # a transcript would be stripped or would merge several fields.
        reader = csv.reader(csvfile, delimiter='|', quoting=csv.QUOTE_NONE)
        for row in reader:
            if not row:
                # Blank line in metadata.csv: nothing to keep or to move
                continue
            wav_name = row[0] + ".wav"
            if wav_name not in to_remove:
                metadata_cleaned.write("|".join(row[:3]) + "\n")
                continue
            # Move recording to new subfolder
            wav_path = os.path.join(dataset_root, "wavs", wav_name)
            if os.path.exists(wav_path):
                shutil.move(wav_path, removed_dir)
            else:
                # Happens e.g. when the script is run a second time
                print("Recording " + wav_name + " not found in wavs directory, nothing to move.")
    return len(to_remove)


if __name__ == "__main__":
    remove_flagged_recordings(dataset_dir, in_csv_file, subfolder_removed)