Improve splitting on text that has many quotes

This commit is contained in:
Johan Nordberg 2022-05-28 01:22:21 +00:00
parent cf26074fa5
commit 069e7001ad

View File

@ -13,18 +13,25 @@ def split_and_recombine_text(text, desired_length=200, max_length=300):
current = "" current = ""
split_pos = [] split_pos = []
pos = -1 pos = -1
end_pos = len(text) - 1
def seek(delta): def seek(delta):
nonlocal pos, in_quote, text nonlocal pos, in_quote, current
is_neg = delta < 0 is_neg = delta < 0
for _ in range(abs(delta)): for _ in range(abs(delta)):
if is_neg: if is_neg:
pos -= 1 pos -= 1
current = current[:-1]
else: else:
pos += 1 pos += 1
current += text[pos]
if text[pos] == '"': if text[pos] == '"':
in_quote = not in_quote in_quote = not in_quote
return text[pos], text[pos+1] if pos < len(text)-1 else "" return text[pos]
def peek(delta):
p = pos + delta
return text[p] if p < end_pos and p >= 0 else ""
def commit(): def commit():
nonlocal rv, current, split_pos nonlocal rv, current, split_pos
@ -32,31 +39,31 @@ def split_and_recombine_text(text, desired_length=200, max_length=300):
current = "" current = ""
split_pos = [] split_pos = []
while pos < len(text) - 1: while pos < end_pos:
c, next_c = seek(1) c = seek(1)
current += c
# do we need to force a split? # do we need to force a split?
if len(current) >= max_length: if len(current) >= max_length:
if len(split_pos) > 0 and len(current) > (desired_length / 2): if len(split_pos) > 0 and len(current) > (desired_length / 2):
# we have at least one sentence and we are over half the desired length, seek back to the last split # we have at least one sentence and we are over half the desired length, seek back to the last split
d = pos - split_pos[-1] d = pos - split_pos[-1]
seek(-d) seek(-d)
current = current[:-d]
else: else:
# no full sentences, seek back until we are not in the middle of a word and split there # no full sentences, seek back until we are not in the middle of a word and split there
while c not in '!?.\n ' and pos > 0 and len(current) > desired_length: while c not in '!?.\n ' and pos > 0 and len(current) > desired_length:
c, _ = seek(-1) c = seek(-1)
current = current[:-1]
commit() commit()
# check for sentence boundaries # check for sentence boundaries
elif not in_quote and (c in '!?\n' or (c == '.' and next_c in '\n ')): elif not in_quote and (c in '!?\n' or (c == '.' and peek(1) in '\n ')):
# seek forward if we have consecutive boundary markers but still within the max length # seek forward if we have consecutive boundary markers but still within the max length
while pos < len(text) - 1 and len(current) < max_length and next_c in '!?.': while pos < len(text) - 1 and len(current) < max_length and peek(1) in '!?.':
c, next_c = seek(1) c = seek(1)
current += c
split_pos.append(pos) split_pos.append(pos)
if len(current) >= desired_length: if len(current) >= desired_length:
commit() commit()
# treat end of quote as a boundary if it is followed by a space or newline
elif in_quote and peek(1) == '"' and peek(2) in '\n ':
seek(2)
split_pos.append(pos)
rv.append(current) rv.append(current)
# clean up, remove lines with only whitespace or punctuation # clean up, remove lines with only whitespace or punctuation
@ -103,21 +110,23 @@ if __name__ == '__main__':
text = f.read() text = f.read()
self.assertEqual( self.assertEqual(
split_and_recombine_text(text), split_and_recombine_text(text),
['Once upon a time there lived in a certain village a little country girl, the prettiest creature who was ever seen. Her mother was excessively fond of her; and her grandmother doted on her still more. This good woman had a little red riding hood made for her.', [
'It suited the girl so extremely well that everybody called her Little Red Riding Hood.', 'Once upon a time there lived in a certain village a little country girl, the prettiest creature who was ever seen. Her mother was excessively fond of her; and her grandmother doted on her still more. This good woman had a little red riding hood made for her.',
'One day her mother, having made some cakes, said to her, "Go, my dear, and see how your grandmother is doing, for I hear she has been very ill. Take her a cake, and this little pot of butter." Little Red Riding Hood set out immediately to go to her grandmother, who lived in another village.', 'It suited the girl so extremely well that everybody called her Little Red Riding Hood. One day her mother, having made some cakes, said to her, "Go, my dear, and see how your grandmother is doing, for I hear she has been very ill. Take her a cake, and this little pot of butter."',
'As she was going through the wood, she met with a wolf, who had a very great mind to eat her up, but he dared not, because of some woodcutters working nearby in the forest. He asked her where she was going.', 'Little Red Riding Hood set out immediately to go to her grandmother, who lived in another village. As she was going through the wood, she met with a wolf, who had a very great mind to eat her up, but he dared not, because of some woodcutters working nearby in the forest.',
'The poor child, who did not know that it was dangerous to stay and talk to a wolf, said to him, "I am going to see my grandmother and carry her a cake and a little pot of butter from my mother." "Does she live far off?" said the wolf "Oh I say," answered Little Red Riding Hood; "it is beyond that', 'He asked her where she was going. The poor child, who did not know that it was dangerous to stay and talk to a wolf, said to him, "I am going to see my grandmother and carry her a cake and a little pot of butter from my mother." "Does she live far off?" said the wolf "Oh I say,"',
'mill you see there, at the first house in the village." "Well," said the wolf, "and I\'ll go and see her too. I\'ll go this way and go you that, and we shall see who will be there first." The wolf ran as fast as he could, taking the shortest path, and the little girl took a roundabout way,', 'answered Little Red Riding Hood; "it is beyond that mill you see there, at the first house in the village." "Well," said the wolf, "and I\'ll go and see her too. I\'ll go this way and go you that, and we shall see who will be there first."',
"entertaining herself by gathering nuts, running after butterflies, and gathering bouquets of little flowers. It was not long before the wolf arrived at the old woman's house. He knocked at the door: tap, tap.", 'The wolf ran as fast as he could, taking the shortest path, and the little girl took a roundabout way, entertaining herself by gathering nuts, running after butterflies, and gathering bouquets of little flowers.',
'"Who\'s there?" "Your grandchild, Little Red Riding Hood," replied the wolf, counterfeiting her voice; "who has brought you a cake and a little pot of butter sent you by mother." The good grandmother, who was in bed, because she was somewhat ill, cried out, "Pull the bobbin, and the latch will go', 'It was not long before the wolf arrived at the old woman\'s house. He knocked at the door: tap, tap. "Who\'s there?" "Your grandchild, Little Red Riding Hood," replied the wolf, counterfeiting her voice; "who has brought you a cake and a little pot of butter sent you by mother."',
'up." The wolf pulled the bobbin, and the door opened, and then he immediately fell upon the good woman and ate her up in a moment, for it been more than three days since he had eaten.', 'The good grandmother, who was in bed, because she was somewhat ill, cried out, "Pull the bobbin, and the latch will go up."',
"He then shut the door and got into the grandmother's bed, expecting Little Red Riding Hood, who came some time afterwards and knocked at the door: tap, tap.", 'The wolf pulled the bobbin, and the door opened, and then he immediately fell upon the good woman and ate her up in a moment, for it been more than three days since he had eaten.',
'"Who\'s there?" Little Red Riding Hood, hearing the big voice of the wolf, was at first afraid; but believing her grandmother had a cold and was hoarse, answered, "It is your grandchild Little Red Riding Hood, who has brought you a cake and a little pot of butter mother sends you." The wolf cried', 'He then shut the door and got into the grandmother\'s bed, expecting Little Red Riding Hood, who came some time afterwards and knocked at the door: tap, tap. "Who\'s there?"',
'out to her, softening his voice as much as he could, "Pull the bobbin, and the latch will go up." Little Red Riding Hood pulled the bobbin, and the door opened.', 'Little Red Riding Hood, hearing the big voice of the wolf, was at first afraid; but believing her grandmother had a cold and was hoarse, answered, "It is your grandchild Little Red Riding Hood, who has brought you a cake and a little pot of butter mother sends you."',
'The wolf cried out to her, softening his voice as much as he could, "Pull the bobbin, and the latch will go up." Little Red Riding Hood pulled the bobbin, and the door opened.',
'The wolf, seeing her come in, said to her, hiding himself under the bedclothes, "Put the cake and the little pot of butter upon the stool, and come get into bed with me." Little Red Riding Hood took off her clothes and got into bed.', 'The wolf, seeing her come in, said to her, hiding himself under the bedclothes, "Put the cake and the little pot of butter upon the stool, and come get into bed with me." Little Red Riding Hood took off her clothes and got into bed.',
'She was greatly amazed to see how her grandmother looked in her nightclothes, and said to her, "Grandmother, what big arms you have!" "All the better to hug you with, my dear." "Grandmother, what big legs you have!" "All the better to run with, my child." "Grandmother, what big ears you have!"', 'She was greatly amazed to see how her grandmother looked in her nightclothes, and said to her, "Grandmother, what big arms you have!" "All the better to hug you with, my dear." "Grandmother, what big legs you have!" "All the better to run with, my child." "Grandmother, what big ears you have!"',
'"All the better to hear with, my child." "Grandmother, what big eyes you have!" "All the better to see with, my child." "Grandmother, what big teeth you have got!" "All the better to eat you up with." And, saying these words, this wicked wolf fell upon Little Red Riding Hood, and ate her all up.'] '"All the better to hear with, my child." "Grandmother, what big eyes you have!" "All the better to see with, my child." "Grandmother, what big teeth you have got!" "All the better to eat you up with." And, saying these words, this wicked wolf fell upon Little Red Riding Hood, and ate her all up.',
]
) )
unittest.main() unittest.main()