added another (somewhat adequate) example, added metadata storage to generated files (need to add in a viewer later)

mrq 2023-02-06 14:17:41 +00:00
parent b441a84615
commit 5affc777e0
3 changed files with 60 additions and 14 deletions


@@ -152,6 +152,19 @@ Output (The McDonalds building creepypasta, custom preset of 128 samples, 256 it
This took quite a while: over the course of a day, half paying attention at the command prompt to generate the next piece. I only had to regenerate one section that sounded funny, but compared to 11.AI requiring tons of regenerations for something usable, it's nice to just let this run and forget about it. Initially he sounds rather passable as Harry Mason, but it seems to kinda falter as it goes on. Sound effects and music are added in post and aren't generated by TorToiSe.
+Source (James Sunderland):
+* https://files.catbox.moe/ynoeld.mp3
+* https://files.catbox.moe/lxgbsm.mp3
+Output (The McDonalds building creepypasta, 256 samples, 256 iterations, 0.1 temp, pause size 8, DDIM, conditioning-free, seed 1675690127):
+* https://vocaroo.com/1nXmip0oJu8Z
+This took a while to generate while I slept (I even managed to wake up before it finished). Using the batch function, it took 6.919 hours on my 2060 to generate the 27 pieces, with zero editing on my end.
+I'm providing this even with its nasty warts to highlight the quirks: the weird gaps where there's a strange sound instead, the random pauses for "thought", etc.
+I think this also highlights how combining your entire source sample set gung-ho isn't a good idea: his delivery here isn't as high-pitched as it usually is throughout most of the game (it lands on a sort of average between his two ranges). I can't really gauge how well it reproduced him, since my ears are pretty much burnt out from listening to so many clips, but I think he's pretty believable as James Sunderland.
## Caveats (and Upsides)

I find a few problems with TorToiSe compared to 11.AI:
@@ -166,4 +179,4 @@ To me, I find a few problems with TorToiSe over 11.AI:
However, I can look past these as TorToiSe offers, in comparison to 11.AI:
* the "speaking too fast" issue does not exist with TorToiSe. I don't need to fight with it by pretending I'm a Gaia user in the early 2000s by sprinkling ellipses.
* the overall delivery seems very natural; sometimes small, dramatic pauses get added at legitimately the most convenient moments, and the inhales tend to be more natural. Many of the vocaroos from 11.AI just do not seem properly delivered.
* being able to run it locally means I do not have to worry about some Polack seeing me use the "dick" word. * being able to run it locally means I do not have to worry about some Polack seeing me use the "dick" word.

app.py

@@ -4,12 +4,15 @@ import gradio as gr
import torch
import torchaudio
import time
+import json
from datetime import datetime

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice, load_voices
from tortoise.utils.text import split_and_recombine_text
+import music_tag

def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, experimentals, progress=gr.Progress()):
    if voice != "microphone":
        voices = [voice]
@@ -88,13 +91,19 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
        if isinstance(gen, list):
            for j, g in enumerate(gen):
                audio = g.squeeze(0).cpu()
-               audio_cache[f"candidate_{j}/result_{line}.wav"] = audio
+               audio_cache[f"candidate_{j}/result_{line}.wav"] = {
+                   'audio': audio,
+                   'text': cut_text,
+               }
                os.makedirs(os.path.join(outdir, f'candidate_{j}'), exist_ok=True)
                torchaudio.save(os.path.join(outdir, f'candidate_{j}/result_{line}.wav'), audio, 24000)
        else:
            audio = gen.squeeze(0).cpu()
-           audio_cache[f"result_{line}.wav"] = audio
+           audio_cache[f"result_{line}.wav"] = {
+               'audio': audio,
+               'text': cut_text,
+           }
            torchaudio.save(os.path.join(outdir, f'result_{line}.wav'), audio, 24000)

        output_voice = None
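For reference, the reworked `audio_cache` entries now pair each clip with the line of text that produced it, which is what lets the tagging loop further down attach per-clip text. A minimal sketch of the new shape (the dummy tensor and text are placeholders, not repo code):

```python
import torch

# Assumed shape of an audio_cache entry after this commit: the waveform
# tensor plus the exact slice of text that generated it.
audio_cache = {
    "result_0.wav": {
        'audio': torch.zeros(1, 24000),  # placeholder: 1 channel, 1 second at 24 kHz
        'text': "The first line of the prompt.",  # placeholder text
    },
}

# Consumers now have to index ['audio'] explicitly, as the next hunk does:
clip = audio_cache["result_0.wav"]['audio']
```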
@@ -103,10 +112,10 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
        audio_clips = []
        for line in range(len(texts)):
            if isinstance(gen, list):
-               piece = audio_cache[f'candidate_{candidate}/result_{line}.wav']
+               audio = audio_cache[f'candidate_{candidate}/result_{line}.wav']['audio']
            else:
-               piece = audio_cache[f'result_{line}.wav']
+               audio = audio_cache[f'result_{line}.wav']['audio']
-           audio_clips.append(piece)
+           audio_clips.append(audio)

        audio_clips = torch.cat(audio_clips, dim=-1)
        torchaudio.save(os.path.join(outdir, f'combined_{candidate}.wav'), audio_clips, 24000)
@@ -118,17 +127,39 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c
    else:
        output_voice = gen

    output_voice = (24000, output_voice.squeeze().cpu().numpy())

-   info = f"{datetime.now()} | Voice: {','.join(voices)} | Text: {text} | Quality: {preset} preset / {num_autoregressive_samples} samples / {diffusion_iterations} iterations | Temperature: {temperature} | Time Taken (s): {time.time()-start_time} | Seed: {seed}\n"
+   info = {
+       'text': text,
+       'delimiter': delimiter,
+       'emotion': emotion,
+       'prompt': prompt,
+       'voice': voice,
+       'mic_audio': mic_audio,
+       'preset': preset,
+       'seed': seed,
+       'candidates': candidates,
+       'num_autoregressive_samples': num_autoregressive_samples,
+       'diffusion_iterations': diffusion_iterations,
+       'temperature': temperature,
+       'diffusion_sampler': diffusion_sampler,
+       'breathing_room': breathing_room,
+       'experimentals': experimentals,
+       'time': time.time()-start_time,
+   }

    with open(os.path.join(outdir, f'input.txt'), 'w', encoding="utf-8") as f:
-       f.write(info)
+       f.write(json.dumps(info, indent='\t'))
-   with open("results.log", "w", encoding="utf-8") as f:
-       f.write(info)

    print(f"Saved to '{outdir}'")

+   for path in audio_cache:
+       info['text'] = audio_cache[path]['text']
+       metadata = music_tag.load_file(os.path.join(outdir, path))
+       metadata['lyrics'] = json.dumps(info)
+       metadata.save()

    if sample_voice is not None:
        sample_voice = (22050, sample_voice.squeeze().cpu().numpy())
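Since the generation settings are stored as plain JSON in each file's lyrics tag, the viewer deferred in the commit message mostly just has to read them back. A minimal sketch of what that could look like, assuming a results folder layout like the one `outdir` produces (the path and glob are illustrative, not part of the repo):

```python
import glob
import json
import os

import music_tag

def view_metadata(outdir):
    # Print the settings embedded in every tagged WAV under the output folder.
    for path in sorted(glob.glob(os.path.join(outdir, '**', '*.wav'), recursive=True)):
        raw = music_tag.load_file(path)['lyrics'].value
        if not raw:
            continue  # combined_*.wav files aren't tagged by this commit
        info = json.loads(raw)
        print(f"{os.path.relpath(path, outdir)}: seed={info['seed']}, text={info['text']!r}")

view_metadata('results/james/')  # hypothetical output directory
```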
@@ -265,7 +296,7 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action='store_true', help="Lets Gradio return a public URL to use anywhere")
    parser.add_argument("--low-vram", action='store_true', help="Disables some optimizations that increase VRAM usage")
-   parser.add_argument("--cond-latent-max-chunk-size", type=int, default=None, help="Sets an upper limit to audio chunk size when computing conditioning latents")
+   parser.add_argument("--cond-latent-max-chunk-size", type=int, default=1000000, help="Sets an upper limit to audio chunk size when computing conditioning latents")
    args = parser.parse_args()

    tts = TextToSpeech(minor_optimizations=not args.low_vram)


@@ -13,4 +13,6 @@ threadpoolctl
appdirs
numpy
numba
gradio
+music-tag
+k-diffusion