From 3e8365fdec216573e53d982878c92ba6133b1723 Mon Sep 17 00:00:00 2001
From: mrq
Date: Wed, 15 Feb 2023 05:49:28 +0000
Subject: [PATCH] voicefixed files do not overwrite, as my autism wants to
 hear the difference between them, incrementing file format fixed for real

---
 webui.py | 48 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 34 insertions(+), 14 deletions(-)

diff --git a/webui.py b/webui.py
index f7e5130..cb37416 100755
--- a/webui.py
+++ b/webui.py
@@ -151,13 +151,31 @@ def generate(
 
 	volume_adjust = torchaudio.transforms.Vol(gain=args.output_volume, gain_type="amplitude") if args.output_volume != 1 else None
 
-	idx = 1
+	idx = 0
+	idx_cache = {}
 	for i, file in enumerate(os.listdir(outdir)):
-		if file[-5:] == ".json":
-			idx = idx + 1
+		filename = os.path.basename(file)
+		if filename[-5:] == ".json":
+			match = re.findall(rf"^{voice}_(\d+)(?:.+?)\.json$", filename)
+		elif filename[-4:] == ".wav":
+			match = re.findall(rf"^{voice}_(\d+)(?:.+?)\.wav$", filename)
+		else:
+			continue
+		if match is None or len(match) == 0:
+			idx = idx + 1 # safety
+			continue
+		match = match[0]
+		key = match[0]
+		idx_cache[key] = True
+
+	idx = idx + len(idx_cache)
 
 	# I know there's something to pad I don't care
 	pad = ""
+	if idx < 10000:
+		pad = f"{pad}0"
+	if idx < 1000:
+		pad = f"{pad}0"
 	if idx < 100:
 		pad = f"{pad}0"
 	if idx < 10:
@@ -272,26 +290,28 @@ def generate(
 			f.write(json.dumps(info, indent='\t') )
 
 	if args.voice_fixer and voicefixer:
-		# we could do this on the pieces before they get stiched up anyways to save some compute
-		# but the stitching would need to read back from disk, defeating the point of caching the waveform
+		fixed_output_voices = []
 		for path in progress.tqdm(output_voices, desc="Running voicefix..."):
+			fixed = path.replace(".wav", "_fixed.wav")
 			voicefixer.restore(
 				input=path,
-				output=path,
+				output=fixed,
 				cuda=get_device_name() == "cuda" and args.voice_fixer_use_cuda,
 				#mode=mode,
 			)
+			fixed_output_voices.append(fixed)
+		output_voices = fixed_output_voices
 
 	if voice is not None and conditioning_latents is not None:
 		with open(f'{get_voice_dir()}/{voice}/cond_latents.pth', 'rb') as f:
 			info['latents'] = base64.b64encode(f.read()).decode("ascii")
 
 	if args.embed_output_metadata:
-		for path in progress.tqdm(audio_cache, desc="Embedding metadata..."):
-			info['text'] = audio_cache[path]['text']
-			info['time'] = audio_cache[path]['time']
+		for name in progress.tqdm(audio_cache, desc="Embedding metadata..."):
+			info['text'] = audio_cache[name]['text']
+			info['time'] = audio_cache[name]['time']
 
-			metadata = music_tag.load_file(f"{outdir}/{voice}_{path}.wav")
+			metadata = music_tag.load_file(f"{outdir}/{voice}_{name}.wav")
 			metadata['lyrics'] = json.dumps(info)
 			metadata.save()
 
@@ -354,7 +374,7 @@ def update_presets(value):
     else:
         return (gr.update(), gr.update())
 
-def read_generate_settings(file, read_latents=True):
+def read_generate_settings(file, read_latents=True, read_json=True):
 	j = None
 	latents = None
 
@@ -699,7 +719,7 @@ def setup_gradio():
 					inputs=None,
 					outputs=voice
 				)
-				voice_latents_chunks = gr.Slider(label="Voice Chunks", minimum=1, maximum=16, value=1, step=1)
+				voice_latents_chunks = gr.Slider(label="Voice Chunks", minimum=1, maximum=32, value=1, step=1)
 				recompute_voice_latents = gr.Button(value="(Re)Compute Voice Latents")
 				recompute_voice_latents.click(compute_latents,
 					inputs=[
@@ -816,7 +836,7 @@ def setup_gradio():
 				if file[-4:] != ".wav":
 					continue
 
-				metadata, _ = read_generate_settings(f"{outdir}/{file}", read_latents=False)
+				metadata, _ = read_generate_settings(f"{outdir}/{file}", read_latents=False, use_json=True)
 				if metadata is None:
 					continue
 
@@ -911,12 +931,12 @@ def setup_gradio():
 					gr.Checkbox(label="Slimmer Computed Latents", value=args.latents_lean_and_mean),
 					gr.Checkbox(label="Voice Fixer", value=args.voice_fixer),
 					gr.Checkbox(label="Use CUDA for Voice Fixer", value=args.voice_fixer_use_cuda),
+					gr.Checkbox(label="Force CPU for Conditioning Latents", value=args.force_cpu_for_conditioning_latents),
 				]
 				gr.Button(value="Check for Updates").click(check_for_updates)
 				gr.Button(value="Reload TTS").click(reload_tts)
 			with gr.Column():
 				exec_inputs = exec_inputs + [
-					gr.Number(label="Voice Latents Max Chunk Size", precision=0, value=args.force_cpu_for_conditioning_latents),
 					gr.Number(label="Sample Batch Size", precision=0, value=args.sample_batch_size),
 					gr.Number(label="Concurrency Count", precision=0, value=args.concurrency_count),
 					gr.Number(label="Ouptut Sample Rate", precision=0, value=args.output_sample_rate),