From acc0891e8502799bdd0a7fb3a58cdc87ab7892ec Mon Sep 17 00:00:00 2001
From: Johan Nordberg <its@johan-nordberg.com>
Date: Fri, 27 May 2022 05:58:09 +0000
Subject: [PATCH] Improve sentence boundary detection

---
 tortoise/utils/text.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/tortoise/utils/text.py b/tortoise/utils/text.py
index 18bcebb..fb36311 100644
--- a/tortoise/utils/text.py
+++ b/tortoise/utils/text.py
@@ -50,14 +50,18 @@ def split_and_recombine_text(text, desired_length=200, max_length=300):
             commit()
         # check for sentence boundaries
         elif not in_quote and (c in '!?\n' or (c == '.' and next_c in '\n ')):
+            # seek forward over consecutive boundary markers while still within the max length
+            while len(current) < max_length and next_c in '!?.':
+                c, next_c = seek(1)
+                current += c
             split_pos.append(pos)
             if len(current) >= desired_length:
                 commit()
     rv.append(current)
 
-    # clean up
+    # clean up, remove lines with only whitespace or punctuation
     rv = [s.strip() for s in rv]
-    rv = [s for s in rv if len(s) > 0]
+    rv = [s for s in rv if len(s) > 0 and not re.match(r'^[\s\.,;:!?]*$', s)]
 
     return rv
 
@@ -81,4 +85,15 @@ if __name__ == '__main__':
                               'inthemiddlebutinotinthislongword.',
                               '"Don\'t split my quote... please"'])
 
+        def test_split_and_recombine_text_2(self):  # repeated '!', '?', '.' must not yield punctuation-only chunks
+            text = """
+            When you are really angry sometimes you use consecutive exclamation marks!!!!!! Is this a good thing to do?!?!?!
+            I don't know but we should handle this situation..........................
+            """
+            self.assertEqual(split_and_recombine_text(text, desired_length=30, max_length=50),
+                             ['When you are really angry sometimes you use',
+                              'consecutive exclamation marks!!!!!!',
+                              'Is this a good thing to do?!?!?!',
+                              'I don\'t know but we should handle this situation.'])
+
     unittest.main()