DL-Art-School/codes/scripts/audio/preparation/phase_1_split_files.py



"""
Master script that processes all MP3 files found in an input directory. Performs the following operations, per-file:
1. Splits the file on silence intervals, throwing out all clips that are too short or long.
2. Records the file in the progress file so that it is skipped if the script is re-run.
"""
import argparse
import functools
import os
from multiprocessing.pool import ThreadPool

from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError
from pydub.silence import split_on_silence
from tqdm import tqdm

from data.util import find_audio_files


def report_progress(progress_file, file):
    """Append `file` as a new line at the end of `progress_file`.

    The progress file is opened in append mode (and therefore created when it
    does not exist yet) with UTF-8 encoding; one entry is written per call.
    """
    entry = f'{file}\n'
    with open(progress_file, mode='a', encoding='utf-8') as progress_log:
        progress_log.write(entry)


def _chunk_output_dir(file, base_path, output_path):
    """Return the directory that receives the clips split from `file`.

    The directory mirrors the location of `file` relative to `base_path`, with
    the file extension dropped and every '.' removed from the relative part.
    `output_path` itself is used verbatim, so dots in it (e.g. './out' or
    'v1.0') are preserved.
    """
    relative_stem = os.path.splitext(os.path.relpath(file, base_path))[0]
    cleaned = relative_stem.replace('.', '').rstrip()
    # Removing the dots turns a leading '..' component into an empty one; drop
    # the separator it leaves behind, otherwise os.path.join would treat the
    # remainder as absolute and discard output_path.
    cleaned = cleaned.lstrip(os.sep)
    return os.path.join(output_path, cleaned)


def process_file(file, base_path, output_path, progress_file, minimum_duration=4, maximum_duration=20):
    """Split one audio file on silence and export the usable clips as mono MP3s.

    Args:
        file: Path of the audio file to split.
        base_path: Root of the input tree; the output directory mirrors the
            position of `file` relative to this path.
        output_path: Root directory under which the clip directory is created.
        progress_file: File that `file` is appended to once it has been handled
            (including when it could not be decoded), so later runs skip it.
        minimum_duration: Clips shorter than this many seconds are discarded.
        maximum_duration: Clips longer than this many seconds are discarded.

    Returns:
        None. Clips are written to disk as `<index>.mp3`, where the index is
        the clip's position among all detected chunks (discarded chunks leave
        gaps in the numbering).
    """
    # Part 1 is to split a large file into chunks.
    try:
        speech = AudioSegment.from_file(file)
    except CouldntDecodeError as e:
        # Undecodable files are still marked as processed so they are not retried.
        print(e)
        report_progress(progress_file, file)
        return

    outdir = _chunk_output_dir(file, base_path, output_path)
    os.makedirs(outdir, exist_ok=True)

    # Silence-detection settings are passed straight through to pydub
    # (lengths in milliseconds, threshold in dBFS per the pydub documentation).
    chunks = split_on_silence(speech, min_silence_len=600, silence_thresh=-40, seek_step=100, keep_silence=50)
    for i, chunk in enumerate(chunks):
        if not (minimum_duration <= chunk.duration_seconds <= maximum_duration):
            continue
        # "-ac 1" asks the encoder for a single audio channel.
        chunk.export(f"{outdir}/{i:05d}.mp3", format='mp3', parameters=["-ac", "1"])
    report_progress(progress_file, file)


# Script entry point: find every audio file under -path, skip those already
# listed in -progress_file, and split the remainder using a pool of worker threads.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-path', type=str, help='Path to search for files', default='Y:\\sources\\big_podcast')
    parser.add_argument('-progress_file', type=str, help='Place to store all files that have already been processed', default='Y:\\sources\\big_podcast\\already_processed.txt')
    parser.add_argument('-output_path', type=str, help='Path for output files', default='Y:\\split\\big_podcast')
    parser.add_argument('-num_threads', type=int, help='Number of concurrent workers processing files.', default=4)
    args = parser.parse_args()

    # The progress file holds one already-processed path per line; it does not
    # exist on the first run.
    processed_files = set()
    if os.path.exists(args.progress_file):
        with open(args.progress_file, 'r', encoding='utf-8') as f:
            processed_files = {line.strip() for line in f}

    files = set(find_audio_files(args.path, include_nonwav=True))
    orig_len = len(files)
    files = files - processed_files
    # Guard against an empty search result, which would otherwise divide by
    # zero; with nothing found there is nothing left to do.
    percent_complete = 100*(orig_len-len(files))/orig_len if orig_len else 100.0
    print(f"Found {len(files)} files to process. Total processing is {percent_complete}% complete.")

    # imap is lazy, so the results are drained through list() to drive the
    # workers while tqdm displays progress.
    with ThreadPool(args.num_threads) as pool:
        list(tqdm(pool.imap(functools.partial(process_file, output_path=args.output_path, base_path=args.path, progress_file=args.progress_file), files), total=len(files)))