From 712ce4af5dca646a4b9cd3a8e149e7104cfa3cc8 Mon Sep 17 00:00:00 2001 From: mrq Date: Thu, 6 Feb 2025 12:37:18 -0600 Subject: [PATCH] maybe fixed errors with DAC backend, added option to limit by duration in emb.process (because I only really need short utterances right now and I'm not ready to spend a week on processing everything again) --- vall_e/emb/codecs/dac.py | 2 ++ vall_e/emb/process.py | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/vall_e/emb/codecs/dac.py b/vall_e/emb/codecs/dac.py index 9f545e4..ff6f660 100644 --- a/vall_e/emb/codecs/dac.py +++ b/vall_e/emb/codecs/dac.py @@ -4,6 +4,8 @@ from dac import DACFile from audiotools import AudioSignal from dac.utils import load_model as __load_dac_model +from typing import Union +from pathlib import Path """ Patch decode to skip things related to the metadata (namely the waveform trimming) So far it seems the raw waveform can just be returned without any post-processing diff --git a/vall_e/emb/process.py b/vall_e/emb/process.py index 49fb22f..78432bb 100644 --- a/vall_e/emb/process.py +++ b/vall_e/emb/process.py @@ -174,6 +174,7 @@ def process( stride_offset=0, slice="auto", batch_size=1, + max_duration=None, low_memory=False, @@ -326,6 +327,9 @@ def process( start = int((segment['start']-0.05) * sample_rate) end = int((segment['end']+0.5) * sample_rate) + if max_duration and (end - start) / sample_rate > max_duration: + continue + if not presliced: if start < 0: start = 0 @@ -364,6 +368,7 @@ def main(): parser.add_argument("--stride-offset", type=int, default=0) parser.add_argument("--slice", type=str, default="auto") parser.add_argument("--batch-size", type=int, default=0) + parser.add_argument("--max-duration", type=int, default=0) parser.add_argument("--device", type=str, default="cuda") parser.add_argument("--dtype", type=str, default="bfloat16") @@ -394,6 +399,7 @@ def main(): stride_offset=args.stride_offset, slice=args.slice, batch_size=args.batch_size, + max_duration=args.max_duration, 
low_memory=args.low_memory,