From 90916da81a6be9579e84dfa90169a2c596b94d58 Mon Sep 17 00:00:00 2001 From: faad3 <79604071+faad3@users.noreply.github.com> Date: Fri, 13 May 2022 12:19:32 +0300 Subject: [PATCH] Upgrade of text splitting Upgrade of text splitting in read.py. Text is now also split on '!', '?', and ';', and text inside double quotation marks ("") is kept together. Example: Original: They're often cooked in an unusual way, too. What they are doing? Take carpa regina in porchetta, one of the lake's signature dishes. "Carp baked like porchetta!" (herb-roasted pork) takes one of the lake's biggest fishes, slathers it in strong herbs, and roasts it - just as is done with Italy's classic meat, porchetta. Split: "I come to Trasimeno for the food - because it's like nowhere else in Italy," says Veronica Grechi, a B&B owner from Florence, and regular visitor to the lake. ["They're often cooked in an unusual way, too.", 'What they are doing?', "Take carpa regina in porchetta, one of the lake's signature dishes.", '"Carp baked like porchetta!" (herb-roasted pork) takes one of the lake\'s biggest fishes, slathers it in strong herbs, and roasts it - just as is done with Italy\'s classic meat, porchetta.', '"I come to Trasimeno for the food - because it\'s like nowhere else in Italy," says Veronica Grechi, a B&B owner from Florence, and regular visitor to the lake.'] --- tortoise/read.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tortoise/read.py b/tortoise/read.py index 9ee9ad6..100f06e 100644 --- a/tortoise/read.py +++ b/tortoise/read.py @@ -1,5 +1,6 @@ import argparse import os +import re import torch import torchaudio @@ -9,8 +10,8 @@ from tortoise.utils.audio import load_audio, get_voices, load_voices def split_and_recombine_text(texts, desired_length=200, max_len=300): - # TODO: also split across '!' and '?'. Attempt to keep quotations together. - texts = [s.strip() + "." 
for s in texts.split('.')] + split = re.split(r'([\!\?\;]|\.{1,3})+(?=(?:(?:[^\"\n]*\"){2})*[^\"\n]*$)',texts, flags=re.MULTILINE) + texts = [split[i-1].strip()+split[i].strip() for i in range(1,len(split),2)] i = 0 while i < len(texts):