DL-Art-School/codes/scripts/audio/preparation/phase_1_split_files.py
2022-03-08 15:49:51 -07:00

69 lines
2.8 KiB
Python

"""
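Master script that processes all audio files (as discovered by `find_audio_files`) in an input directory. Performs the following operations, per-file:
1. Splits the file on silence intervals, throwing out all clips that are too short or too long.
2. Exports each remaining clip as a single-channel MP3 and appends the source file to the progress file so that it is skipped on later runs.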
"""
import argparse
import functools
import os
from multiprocessing.pool import ThreadPool
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError
from pydub.silence import split_on_silence
from tqdm import tqdm
from data.util import find_audio_files
def report_progress(progress_file, file):
    """Record `file` as finished by appending it, on its own line, to `progress_file`.

    The progress file is opened in append mode (and created if missing), so
    entries written by earlier calls are preserved.
    """
    with open(progress_file, mode='a', encoding='utf-8') as progress_log:
        entry = f'{file}\n'
        progress_log.write(entry)
def _clip_output_dir(file, base_path, output_path):
    """Return the directory under `output_path` that receives the clips cut from `file`.

    The directory mirrors the location of `file` relative to `base_path`, with the file extension removed.
    """
    # splitext copes with extensions of any length (the old `[:-4]` slice assumed exactly three characters).
    relative_stem, _extension = os.path.splitext(os.path.relpath(file, base_path))
    # Dots and surrounding whitespace are removed from the mirrored part only; `output_path` is used verbatim so
    # that roots such as './out' or '/data/v1.2' are not rewritten into a different location.
    return os.path.join(output_path, relative_stem.replace('.', '').strip())


def process_file(file, base_path, output_path, progress_file, minimum_duration=4, maximum_duration=20):
    """Split one audio file on silence and export the acceptable clips as MP3 files.

    Args:
        file: Path of the audio file to split.
        base_path: Root of the input tree; the output directory mirrors `file`'s path relative to it.
        output_path: Root directory under which the per-file clip directory is created.
        progress_file: Path of the text file that `file` is appended to once it has been handled.
        minimum_duration: Clips shorter than this many seconds are discarded.
        maximum_duration: Clips longer than this many seconds are discarded.

    Returns:
        None. Clips are written to disk as `<index>.mp3`, where the index is the clip's position among all
        chunks returned by `split_on_silence` (so numbering has gaps wherever a clip was discarded).

    A file that pydub cannot decode is reported, recorded in the progress file so it is not retried, and skipped.
    """
    # Part 1 is to split a large file into chunks.
    try:
        speech = AudioSegment.from_file(file)
    except CouldntDecodeError as e:
        print(e)
        # Mark undecodable files as done as well; otherwise every re-run would attempt them again.
        report_progress(progress_file, file)
        return

    outdir = _clip_output_dir(file, base_path, output_path)
    os.makedirs(outdir, exist_ok=True)

    # Silence-detection settings are passed straight to pydub (see its documentation for units).
    chunks = split_on_silence(speech, min_silence_len=600, silence_thresh=-40, seek_step=100, keep_silence=50)
    for i, chunk in enumerate(chunks):
        if not minimum_duration <= chunk.duration_seconds <= maximum_duration:
            continue
        # "-ac 1" is forwarded to the encoder to request a single audio channel.
        chunk.export(os.path.join(outdir, f'{i:05d}.mp3'), format='mp3', parameters=["-ac", "1"])
    report_progress(progress_file, file)
if __name__ == '__main__':
    # Command-line entry point: discover audio files under -path, drop the ones already listed in
    # -progress_file, and split the remainder using a pool of worker threads.
    parser = argparse.ArgumentParser()
    parser.add_argument('-path', type=str, help='Path to search for files', default='Y:\\sources\\big_podcast')
    parser.add_argument('-progress_file', type=str, help='Place to store all files that have already been processed', default='Y:\\sources\\big_podcast\\already_processed.txt')
    parser.add_argument('-output_path', type=str, help='Path for output files', default='Y:\\split\\big_podcast')
    parser.add_argument('-num_threads', type=int, help='Number of concurrent workers processing files.', default=4)
    args = parser.parse_args()

    # Resume support: every line of the progress file names a file handled by a previous run.
    processed_files = set()
    if os.path.exists(args.progress_file):
        with open(args.progress_file, 'r', encoding='utf-8') as f:
            processed_files = {line.strip() for line in f}

    files = set(find_audio_files(args.path, include_nonwav=True))
    orig_len = len(files)
    files = files - processed_files
    if orig_len == 0:
        # Nothing was discovered; the percentage below would otherwise divide by zero.
        print(f"No audio files found under {args.path}; nothing to process.")
    else:
        print(f"Found {len(files)} files to process. Total processing is {100*(orig_len-len(files))/orig_len}% complete.")

    worker = functools.partial(process_file, output_path=args.output_path, base_path=args.path,
                               progress_file=args.progress_file)
    with ThreadPool(args.num_threads) as pool:
        # list() drains the imap iterator so every file is finished before the `with` block exits and the pool
        # is torn down; tqdm wraps the iterator to show a progress bar.
        list(tqdm(pool.imap(worker, files), total=len(files)))