From acc0891e8502799bdd0a7fb3a58cdc87ab7892ec Mon Sep 17 00:00:00 2001 From: Johan Nordberg Date: Fri, 27 May 2022 05:58:09 +0000 Subject: [PATCH] Improve sentence boundary detection --- tortoise/utils/text.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tortoise/utils/text.py b/tortoise/utils/text.py index 18bcebb..fb36311 100644 --- a/tortoise/utils/text.py +++ b/tortoise/utils/text.py @@ -50,14 +50,18 @@ def split_and_recombine_text(text, desired_length=200, max_length=300): commit() # check for sentence boundaries elif not in_quote and (c in '!?\n' or (c == '.' and next_c in '\n ')): + # seek forward if we have consecutive boundary markers but still within the max length + while len(current) < max_length and next_c in '!?.': + c, next_c = seek(1) + current += c split_pos.append(pos) if len(current) >= desired_length: commit() rv.append(current) - # clean up + # clean up, remove lines with only whitespace or punctuation rv = [s.strip() for s in rv] - rv = [s for s in rv if len(s) > 0] + rv = [s for s in rv if len(s) > 0 and not re.match(r'^[\s\.,;:!?]*$', s)] return rv @@ -81,4 +85,15 @@ if __name__ == '__main__': 'inthemiddlebutinotinthislongword.', '"Don\'t split my quote... please"']) + def test_split_and_recombine_text_2(self): + text = """ + When you are really angry sometimes you use consecutive exclamation marks!!!!!! Is this a good thing to do?!?!?! + I don't know but we should handle this situation.......................... + """ + self.assertEqual(split_and_recombine_text(text, desired_length=30, max_length=50), + ['When you are really angry sometimes you use', + 'consecutive exclamation marks!!!!!!', + 'Is this a good thing to do?!?!?!', + 'I don\'t know but we should handle this situation.']) + unittest.main()