@@ -384,20 +384,62 @@ def _merge_short_chunks(chunks: list[str], *, min_length: int = 50) -> list[str]
384384# Trailing words that signal a chunk was cut mid-clause.
385385# When a chunk ends with one of these, the next chunk continues the same
386386# sentence, even if that next chunk starts with a capital letter.
387- _TRAILING_WORDS = frozenset ({
388- # prepositions / particles
389- "of" , "for" , "in" , "to" , "with" , "from" , "by" , "at" , "on" , "into" ,
390- "about" , "between" , "through" , "within" , "without" , "including" ,
391- # articles / determiners
392- "the" , "a" , "an" , "all" , "any" , "each" , "every" , "this" , "that" ,
393- # conjunctions
394- "and" , "or" , "nor" , "but" ,
395- # auxiliary / modal verbs
396- "be" , "is" , "are" , "was" , "were" , "been" , "being" ,
397- "must" , "shall" , "should" , "will" , "would" , "can" , "could" , "may" ,
398- # common mid-clause endings
399- "not" , "also" , "than" ,
400- })
387+ _TRAILING_WORDS = frozenset (
388+ {
389+ # prepositions / particles
390+ "of" ,
391+ "for" ,
392+ "in" ,
393+ "to" ,
394+ "with" ,
395+ "from" ,
396+ "by" ,
397+ "at" ,
398+ "on" ,
399+ "into" ,
400+ "about" ,
401+ "between" ,
402+ "through" ,
403+ "within" ,
404+ "without" ,
405+ "including" ,
406+ # articles / determiners
407+ "the" ,
408+ "a" ,
409+ "an" ,
410+ "all" ,
411+ "any" ,
412+ "each" ,
413+ "every" ,
414+ "this" ,
415+ "that" ,
416+ # conjunctions
417+ "and" ,
418+ "or" ,
419+ "nor" ,
420+ "but" ,
421+ # auxiliary / modal verbs
422+ "be" ,
423+ "is" ,
424+ "are" ,
425+ "was" ,
426+ "were" ,
427+ "been" ,
428+ "being" ,
429+ "must" ,
430+ "shall" ,
431+ "should" ,
432+ "will" ,
433+ "would" ,
434+ "can" ,
435+ "could" ,
436+ "may" ,
437+ # common mid-clause endings
438+ "not" ,
439+ "also" ,
440+ "than" ,
441+ }
442+ )
401443
402444
403445def _ends_mid_clause (text : str ) -> bool :
0 commit comments