diff --git a/README.md b/README.md index 4627f62..bd696f2 100644 --- a/README.md +++ b/README.md @@ -44,10 +44,18 @@ python do_tts.py --text "I'm going to speak this" --voice dotrice --preset fast ### read.py This script provides tools for reading large amounts of text. + ```shell python read.py --textfile <your text to be read> --voice dotrice ``` +This will break up the textfile into sentences, and then convert them to speech one at a time. It will output a series +of spoken clips as they are generated. Once all the clips are generated, it will combine them into a single file and +output that as well. + +Sometimes Tortoise screws up an output. You can re-generate any bad clips by re-running `read.py` with the `--regenerate` +argument, which takes a comma-separated list of the clip numbers to re-generate. + ### API Tortoise can be used programmatically, like so: diff --git a/read.py b/read.py index 04cd7a0..9506e97 100644 --- a/read.py +++ b/read.py @@ -35,6 +35,7 @@ if __name__ == '__main__': 'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='patrick_stewart') parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/longform/') parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='standard') + parser.add_argument('--regenerate', type=str, help='Comma-separated list of clip numbers to re-generate, or nothing.', default=None) parser.add_argument('--voice_diversity_intelligibility_slider', type=float, help='How to balance vocal diversity with the quality/intelligibility of the spoken text. 
0 means highly diverse voice (not recommended), 1 means maximize intellibility', default=.5) @@ -43,6 +44,9 @@ if __name__ == '__main__': outpath = args.output_path voices = get_voices() selected_voices = args.voice.split(',') + regenerate = args.regenerate + if regenerate is not None: + regenerate = [int(e) for e in regenerate.split(',')] for selected_voice in selected_voices: voice_outpath = os.path.join(outpath, selected_voice) os.makedirs(voice_outpath, exist_ok=True) @@ -71,6 +75,9 @@ if __name__ == '__main__': conds.append(c) all_parts = [] for j, text in enumerate(texts): + if regenerate is not None and j not in regenerate: + all_parts.append(load_audio(os.path.join(voice_outpath, f'{j}.wav'), 24000)) + continue gen = tts.tts_with_preset(text, conds, preset=args.preset, clvp_cvvp_slider=args.voice_diversity_intelligibility_slider) gen = gen.squeeze(0).cpu() torchaudio.save(os.path.join(voice_outpath, f'{j}.wav'), gen, 24000)