Add a better split_on_silence (processing_pipeline)

Going to extend this further going forward to support the entire pipeline.
James Betker 2022-02-03 20:00:26 -07:00
parent 1d29999648
commit 687393de59
2 changed files with 69 additions and 3 deletions


@@ -0,0 +1,68 @@
"""
Master script that processes all MP3 files found in an input directory. Performs the following operations, per-file:
1. Splits the file on silence intervals, throwing out all clips that are too short or long.
2.
"""
import argparse
import functools
import os
from multiprocessing.pool import ThreadPool
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError
from pydub.silence import split_on_silence
from tqdm import tqdm
from data.util import find_audio_files


def report_progress(progress_file, file):
    # Append the completed file to the progress log so reruns can skip it.
    with open(progress_file, 'a', encoding='utf-8') as f:
        f.write(f'{file}\n')


def process_file(file, base_path, output_path, progress_file):
    # Hyper-parameters; feel free to adjust.
    minimum_duration = 4
    maximum_duration = 20

    # Part 1 is to split a large file into chunks.
    try:
        speech = AudioSegment.from_file(file)
    except CouldntDecodeError as e:
        print(e)
        report_progress(progress_file, file)
        return

    # Mirror the file's relative path under the output directory; dots are stripped
    # so the extension remnant does not end up in the directory name.
    outdir = os.path.join(output_path, f'{os.path.relpath(file, base_path)[:-4]}').replace('.', '').strip()
    os.makedirs(outdir, exist_ok=True)
    chunks = split_on_silence(speech, min_silence_len=600, silence_thresh=-40, seek_step=100, keep_silence=50)
    for i in range(0, len(chunks)):
        # Discard clips outside the allowed duration window.
        if chunks[i].duration_seconds < minimum_duration or chunks[i].duration_seconds > maximum_duration:
            continue
        chunks[i].export(f"{outdir}/{i:05d}.mp3", format='mp3', parameters=["-ac", "1"])
    report_progress(progress_file, file)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-path', type=str, help='Path to search for files', default='Y:\\sources\\big_podcast')
    parser.add_argument('-progress_file', type=str, help='Place to store all files that have already been processed', default='Y:\\sources\\big_podcast\\already_processed.txt')
    parser.add_argument('-output_path', type=str, help='Path for output files', default='Y:\\split\\big_podcast')
    parser.add_argument('-num_threads', type=int, help='Number of concurrent workers processing files.', default=4)
    args = parser.parse_args()

    # Resume support: collect every file already recorded in the progress log.
    processed_files = set()
    if os.path.exists(args.progress_file):
        with open(args.progress_file, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                processed_files.add(line.strip())

    files = set(find_audio_files(args.path, include_nonwav=True))
    orig_len = len(files)
    files = files - processed_files
    print(f"Found {len(files)} files to process. Total processing is {100*(orig_len-len(files))/orig_len}% complete.")

    with ThreadPool(args.num_threads) as pool:
        list(tqdm(pool.imap(functools.partial(process_file, output_path=args.output_path, base_path=args.path, progress_file=args.progress_file), files), total=len(files)))
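
For reference, a minimal annotated sketch of the pydub split_on_silence call used above; the input path and printout are illustrative, not part of the commit:

from pydub import AudioSegment
from pydub.silence import split_on_silence

# Illustrative input; anything ffmpeg can decode works here.
speech = AudioSegment.from_file('podcast_episode.mp3')

chunks = split_on_silence(
    speech,
    min_silence_len=600,  # a pause must last at least 600ms to become a split point
    silence_thresh=-40,   # audio quieter than -40 dBFS is treated as silence
    seek_step=100,        # scan for silence in 100ms steps; coarser is faster but less precise
    keep_silence=50,      # keep 50ms of silence padding at each clip boundary
)
for i, chunk in enumerate(chunks):
    print(i, round(chunk.duration_seconds, 1))

Raising silence_thresh or lowering min_silence_len produces more, shorter chunks; the duration filter in process_file then decides which of them survive.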


@@ -19,8 +19,6 @@ def main():
     maximum_duration = 20
     files = find_audio_files(args.path, include_nonwav=True)
     for e, wav_file in enumerate(tqdm(files)):
-        if e < 12593:
-            continue
         print(f"Processing {wav_file}..")
         outdir = os.path.join(args.out, f'{e}_{os.path.basename(wav_file[:-4])}').replace('.', '').strip()
         os.makedirs(outdir, exist_ok=True)
@@ -36,7 +34,7 @@ def main():
     for i in range(0, len(chunks)):
         if chunks[i].duration_seconds < minimum_duration or chunks[i].duration_seconds > maximum_duration:
             continue
-        chunks[i].export(f"{outdir}/{i:05d}.wav", format='wav', parameters=["-ar", "22050", "-ac", "1"])
+        chunks[i].export(f"{outdir}/{i:05d}.mp3", format='mp3', parameters=["-ac", "1"])

 if __name__ == '__main__':
     main()
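
For comparison, a small sketch of the old and new export behavior; the clip and file names are illustrative. The parameters list is handed through to ffmpeg, so "-ac", "1" downmixes to mono, and dropping "-ar", "22050" means the mp3 keeps the source sample rate instead of being resampled:

from pydub import AudioSegment

chunk = AudioSegment.from_file('clip_source.mp3')  # illustrative input

# Old: wav resampled to 22050 Hz and downmixed to mono.
chunk.export('00000.wav', format='wav', parameters=["-ar", "22050", "-ac", "1"])

# New: mono mp3 at the source sample rate (no "-ar" given).
chunk.export('00000.mp3', format='mp3', parameters=["-ac", "1"])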