Improve sentence boundary detection
This commit is contained in:
parent
6206436e76
commit
acc0891e85
|
@ -50,14 +50,18 @@ def split_and_recombine_text(text, desired_length=200, max_length=300):
|
||||||
commit()
|
commit()
|
||||||
# check for sentence boundaries
|
# check for sentence boundaries
|
||||||
elif not in_quote and (c in '!?\n' or (c == '.' and next_c in '\n ')):
|
elif not in_quote and (c in '!?\n' or (c == '.' and next_c in '\n ')):
|
||||||
|
# seek forward if we have consecutive boundary markers but still within the max length
|
||||||
|
while len(current) < max_length and next_c in '!?.':
|
||||||
|
c, next_c = seek(1)
|
||||||
|
current += c
|
||||||
split_pos.append(pos)
|
split_pos.append(pos)
|
||||||
if len(current) >= desired_length:
|
if len(current) >= desired_length:
|
||||||
commit()
|
commit()
|
||||||
rv.append(current)
|
rv.append(current)
|
||||||
|
|
||||||
# clean up
|
# clean up, remove lines with only whitespace or punctuation
|
||||||
rv = [s.strip() for s in rv]
|
rv = [s.strip() for s in rv]
|
||||||
rv = [s for s in rv if len(s) > 0]
|
rv = [s for s in rv if len(s) > 0 and not re.match(r'^[\s\.,;:!?]*$', s)]
|
||||||
|
|
||||||
return rv
|
return rv
|
||||||
|
|
||||||
|
@ -81,4 +85,15 @@ if __name__ == '__main__':
|
||||||
'inthemiddlebutinotinthislongword.',
|
'inthemiddlebutinotinthislongword.',
|
||||||
'"Don\'t split my quote... please"'])
|
'"Don\'t split my quote... please"'])
|
||||||
|
|
||||||
|
def test_split_and_recombine_text_2(self):
|
||||||
|
text = """
|
||||||
|
When you are really angry sometimes you use consecutive exclamation marks!!!!!! Is this a good thing to do?!?!?!
|
||||||
|
I don't know but we should handle this situation..........................
|
||||||
|
"""
|
||||||
|
self.assertEqual(split_and_recombine_text(text, desired_length=30, max_length=50),
|
||||||
|
['When you are really angry sometimes you use',
|
||||||
|
'consecutive exclamation marks!!!!!!',
|
||||||
|
'Is this a good thing to do?!?!?!',
|
||||||
|
'I don\'t know but we should handle this situation.'])
|
||||||
|
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|
Loading…
Reference in New Issue
Block a user