-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
48 changed files
with
2,065 additions
and
14 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from .segmenter import Segmenter | ||
from .about import __version__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
# -*- coding: utf-8 -*- | ||
import re | ||
from pysbd.utils import Text | ||
|
||
|
||
def replace_pre_number_abbr(txt, abbr):
    """Protect the period after an abbreviation that introduces a number.

    The ``.`` directly following ``abbr`` is replaced with the placeholder
    ``∯`` when the abbreviation is preceded by whitespace (or starts the
    string) and the period is followed either by one whitespace character and
    a digit, or by whitespace and an opening parenthesis.

    Args:
        txt: Text to scan.
        abbr: Abbreviation to look for; surrounding whitespace is ignored.

    Returns:
        ``txt`` with every matching period replaced by ``∯``.
    """
    # Escape the abbreviation so regex metacharacters in it (for example the
    # dot in "a.b") are matched literally instead of acting as wildcards.
    pattern = r"(?<=\s{abbr})\.(?=(\s\d|\s+\())".format(
        abbr=re.escape(abbr.strip())
    )
    # Prepend a space so an abbreviation at the very start of the string also
    # satisfies the whitespace lookbehind, then drop that space again.
    return re.sub(pattern, "∯", " " + txt)[1:]
|
||
|
||
def replace_prepositive_abbr(txt, abbr):
    """Protect the period after a prepositive abbreviation.

    The ``.`` directly following ``abbr`` is replaced with the placeholder
    ``∯`` when the abbreviation is preceded by whitespace (or starts the
    string) and the period is followed by a whitespace character or by a
    colon and at least one digit.

    Args:
        txt: Text to scan.
        abbr: Abbreviation to look for; surrounding whitespace is ignored.

    Returns:
        ``txt`` with every matching period replaced by ``∯``.
    """
    # Escape the abbreviation so regex metacharacters in it are matched
    # literally instead of being interpreted as part of the pattern.
    pattern = r"(?<=\s{abbr})\.(?=(\s|:\d+))".format(
        abbr=re.escape(abbr.strip())
    )
    # Prepend a space so an abbreviation at the very start of the string also
    # satisfies the whitespace lookbehind, then drop that space again.
    return re.sub(pattern, "∯", " " + txt)[1:]
|
||
|
||
class AbbreviationReplacer(object):
    """Replace periods that belong to abbreviations with the ``∯`` placeholder.

    ``lang`` supplies the language data read by this class:
    ``PossessiveAbbreviationRule``, ``KommanditgesellschaftRule``,
    ``SingleLetterAbbreviationRules.All``, ``AmPmRules.All``,
    ``MULTI_PERIOD_ABBREVIATION_REGEX`` and ``Abbreviation`` (with
    ``ABBREVIATIONS``, ``PREPOSITIVE_ABBREVIATIONS`` and
    ``NUMBER_ABBREVIATIONS``).

    ``SENTENCE_STARTERS`` is read from ``self`` but is not defined in this
    class; presumably language-specific subclasses provide it -- confirm.
    """

    def __init__(self, text, lang):
        """Store the text to process and the language definition."""
        self.text = text
        self.lang = lang

    def replace(self):
        """Run the full abbreviation pass and return the processed text.

        Steps, in order: language rules, per-line abbreviation scan,
        multi-period abbreviations, AM/PM rules, and finally restoring the
        period where an abbreviation is followed by a sentence starter.
        """
        self.text = Text(self.text).apply(
            self.lang.PossessiveAbbreviationRule,
            self.lang.KommanditgesellschaftRule,
            *self.lang.SingleLetterAbbreviationRules.All
        )
        # splitlines(True) keeps the line endings, so joining the processed
        # lines reproduces the original line structure.
        self.text = "".join(
            self.search_for_abbreviations_in_string(line)
            for line in self.text.splitlines(True)
        )
        self.replace_multi_period_abbreviations()
        self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
        self.text = self.replace_abbreviation_as_sentence_boundary()
        return self.text

    def replace_abbreviation_as_sentence_boundary(self):
        """Turn ``∯`` back into ``.`` before a sentence starter.

        Applies to the fixed set of abbreviations listed in the regex when
        the placeholder is followed by whitespace, one of
        ``self.SENTENCE_STARTERS`` and whitespace. Updates ``self.text`` and
        also returns it.
        """
        sent_starters = "|".join(
            r"(?=\s{}\s)".format(word) for word in self.SENTENCE_STARTERS
        )
        regex = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯({})".format(sent_starters)
        self.text = re.sub(regex, '\\1.', self.text)
        return self.text

    def replace_multi_period_abbreviations(self):
        """Replace every ``.`` inside multi-period abbreviations with ``∯``.

        Matches are found case-insensitively with
        ``self.lang.MULTI_PERIOD_ABBREVIATION_REGEX``; ``self.text`` is
        updated in place and nothing is returned.
        """
        def mpa_replace(match):
            return match.group().replace(".", "∯")

        self.text = re.sub(
            self.lang.MULTI_PERIOD_ABBREVIATION_REGEX,
            mpa_replace,
            self.text,
            flags=re.IGNORECASE
        )

    def replace_period_of_abbr(self, txt, abbr):
        """Protect the period after ``abbr`` when no sentence seems to end.

        The period is replaced with ``∯`` when the abbreviation is preceded
        by whitespace (or starts the string) and the period is followed by
        one of ``. : - ? ,`` or by a whitespace character and then a
        lowercase ASCII letter, ``I`` plus whitespace, ``I'm``, ``I'll``,
        a digit or ``(``.
        """
        pattern = r"(?<=\s{abbr})\.(?=((\.|\:|-|\?|,)|(\s([a-z]|I\s|I'm|I'll|\d|\())))".format(
            abbr=re.escape(abbr.strip())
        )
        # Prepend a space so an abbreviation at the very start of the string
        # also satisfies the whitespace lookbehind, then drop it again.
        return re.sub(pattern, "∯", " " + txt)[1:]

    def search_for_abbreviations_in_string(self, text):
        """Scan ``text`` for every known abbreviation and protect its period.

        Returns the text after all replacements for all abbreviations in
        ``self.lang.Abbreviation.ABBREVIATIONS`` have been applied.
        """
        lowered = text.lower()
        for abbr in self.lang.Abbreviation.ABBREVIATIONS:
            stripped = abbr.strip()
            # Cheap substring test before running any regex.
            if stripped not in lowered:
                continue
            # NOTE(review): ``stripped`` is interpolated without re.escape,
            # so a "." inside an abbreviation matches any character here.
            # Left unchanged because later steps receive the matched text.
            abbrev_match = re.findall(
                r"(?:^|\s|\r|\n){}".format(stripped), text, flags=re.IGNORECASE
            )
            if not abbrev_match:
                continue
            # NOTE(review): the braces below are literal regex characters,
            # not interpolation, so this only matches text that contains
            # "{<abbr>} ". In practice ``char_array`` is therefore empty and
            # scan_for_replacements falls back to char == "". Kept as is,
            # because changing it would alter segmentation results.
            next_word_start = r"(?<={" + str(re.escape(stripped)) + "} ).{1}"
            char_array = re.findall(next_word_start, text)
            for ind, match in enumerate(abbrev_match):
                text = self.scan_for_replacements(
                    text, match, ind, char_array
                )
        return text

    def scan_for_replacements(self, txt, am, ind, char_array):
        """Apply the replacement that fits one abbreviation match.

        Args:
            txt: Text being processed.
            am: The matched abbreviation, possibly with leading whitespace.
            ind: Index of this match; used to look up ``char_array``.
            char_array: Characters following each match; a missing entry is
                treated as an empty string.

        Returns:
            The text after the replacement, or ``txt`` unchanged when the
            following character is uppercase and the abbreviation is not
            prepositive.
        """
        try:
            char = char_array[ind]
        except IndexError:
            char = ""
        prepositive = self.lang.Abbreviation.PREPOSITIVE_ABBREVIATIONS
        number_abbr = self.lang.Abbreviation.NUMBER_ABBREVIATIONS
        abbr_key = am.strip().lower()
        # Prepositive abbreviations are handled whatever follows them.
        if abbr_key in prepositive:
            return replace_prepositive_abbr(txt, am)
        # An uppercase follower is left alone as a possible sentence start.
        if str(char).isupper():
            return txt
        if abbr_key in number_abbr:
            return replace_pre_number_abbr(txt, am)
        return self.replace_period_of_abbr(txt, am)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# Package metadata kept in one place (single-source version), inspired from:
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/

__title__ = "pysbd"
__version__ = "0.3.4"
__summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
__uri__ = "http://nipunsadvilkar.github.io/"
__author__ = "Nipun Sadvilkar"
__email__ = "nipunsadvilkar@gmail.com"
__license__ = "MIT"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
# -*- coding: utf-8 -*- | ||
import re | ||
from functools import partial | ||
from pysbd.punctuation_replacer import replace_punctuation | ||
|
||
|
||
class BetweenPunctuation(object):
    """Process punctuation found inside quotes, brackets, parens and dashes.

    Each ``sub_punctuation_between_*`` method locates delimited spans with
    one of the class regexes and passes every match to
    ``replace_punctuation`` (imported from ``pysbd.punctuation_replacer``;
    its behaviour is not shown here).

    The regexes without a ``_2`` suffix use atomic groups ``(?>...)`` and are
    not used by any method of this class. The ``_2`` variants emulate an
    atomic group with a lookahead capture plus backreference,
    ``(?=(?P<tmp>...))(?P=tmp)``. The repetition sits inside the captured
    group so that ``tmp`` holds the whole delimited body; with the ``*`` on
    the group itself only the last iteration would be captured, and spans
    with an escape sequence or an empty body could never match.
    """

    # Rubular: http://rubular.com/r/2YFrKWQUYi
    BETWEEN_SINGLE_QUOTES_REGEX = r"(?<=\s)'(?:[^']|'[a-zA-Z])*'"

    BETWEEN_SINGLE_QUOTE_SLANTED_REGEX = r"(?<=\s)‘(?:[^’]|’[a-zA-Z])*’"

    # Rubular: http://rubular.com/r/3Pw1QlXOjd
    BETWEEN_DOUBLE_QUOTES_REGEX = r'"(?>[^"\\]+|\\{2}|\\.)*"'

    # https://regex101.com/r/r6I1bW/1
    # https://stackoverflow.com/questions/13577372/do-python-regular-expressions-have-an-equivalent-to-rubys-atomic-grouping?noredirect=1&lq=1
    BETWEEN_DOUBLE_QUOTES_REGEX_2 = r'"(?=(?P<tmp>(?:[^\"\\]+|\\{2}|\\.)*))(?P=tmp)"'

    # Rubular: http://rubular.com/r/x6s4PZK8jc
    BETWEEN_QUOTE_ARROW_REGEX = r'«(?>[^»\\]+|\\{2}|\\.)*»'

    BETWEEN_QUOTE_ARROW_REGEX_2 = r"\«(?=(?P<tmp>(?:[^»\\]+|\\{2}|\\.)*))(?P=tmp)\»"

    # Rubular: http://rubular.com/r/JbAIpKdlSq
    BETWEEN_QUOTE_SLANTED_REGEX = r"“(?>[^”\\]+|\\{2}|\\.)*”"
    BETWEEN_QUOTE_SLANTED_REGEX_2 = r"\“(?=(?P<tmp>(?:[^”\\]+|\\{2}|\\.)*))(?P=tmp)\”"

    # Rubular: http://rubular.com/r/WX4AvnZvlX
    BETWEEN_SQUARE_BRACKETS_REGEX = r"\[(?>[^\]\\]+|\\{2}|\\.)*\]"

    BETWEEN_SQUARE_BRACKETS_REGEX_2 = r'\[(?=(?P<tmp>(?:[^\]\\]+|\\{2}|\\.)*))(?P=tmp)\]'

    # Rubular: http://rubular.com/r/6tTityPflI
    BETWEEN_PARENS_REGEX = r"\((?>[^\(\)\\]+|\\{2}|\\.)*\)"

    BETWEEN_PARENS_REGEX_2 = r"\((?=(?P<tmp>(?:[^\(\)\\]+|\\{2}|\\.)*))(?P=tmp)\)"

    # Rubular: http://rubular.com/r/mXf8cW025o
    WORD_WITH_LEADING_APOSTROPHE = r"(?<=\s)'(?:[^']|'[a-zA-Z])*'\S"

    # Rubular: http://rubular.com/r/jTtDKfjxzr
    BETWEEN_EM_DASHES_REGEX = r"\-\-(?>[^\-\-])*\-\-"

    BETWEEN_EM_DASHES_REGEX_2 = r"--(?=(?P<tmp>[^--]*))(?P=tmp)--"

    def __init__(self, text):
        """Store the text to process."""
        self.text = text

    def replace(self):
        """Return ``self.text`` after all between-punctuation passes."""
        return self.sub_punctuation_between_quotes_and_parens(self.text)

    def sub_punctuation_between_quotes_and_parens(self, txt):
        """Run every ``sub_punctuation_between_*`` pass over ``txt`` in turn."""
        # Each pass works on the output of the previous one; the order is
        # kept exactly as originally written.
        passes = (
            self.sub_punctuation_between_single_quotes,
            self.sub_punctuation_between_single_quote_slanted,
            self.sub_punctuation_between_double_quotes,
            self.sub_punctuation_between_square_brackets,
            self.sub_punctuation_between_parens,
            self.sub_punctuation_between_quotes_arrow,
            self.sub_punctuation_between_em_dashes,
            self.sub_punctuation_between_quotes_slanted,
        )
        for apply_pass in passes:
            txt = apply_pass(txt)
        return txt

    def sub_punctuation_between_parens(self, txt):
        """Process spans delimited by ``(`` and ``)``."""
        return re.sub(self.BETWEEN_PARENS_REGEX_2, replace_punctuation, txt)

    def sub_punctuation_between_square_brackets(self, txt):
        """Process spans delimited by ``[`` and ``]``."""
        return re.sub(self.BETWEEN_SQUARE_BRACKETS_REGEX_2, replace_punctuation,
                      txt)

    def sub_punctuation_between_single_quotes(self, txt):
        """Process spans delimited by straight single quotes.

        The text is returned untouched when it contains a quoted span
        immediately followed by a non-space character
        (``WORD_WITH_LEADING_APOSTROPHE``) and no apostrophe followed by
        whitespace anywhere.
        """
        has_leading_apostrophe_word = re.search(
            self.WORD_WITH_LEADING_APOSTROPHE, txt)
        if has_leading_apostrophe_word and not re.search(r"'\s", txt):
            return txt
        return re.sub(self.BETWEEN_SINGLE_QUOTES_REGEX,
                      partial(replace_punctuation, match_type='single'), txt)

    def sub_punctuation_between_single_quote_slanted(self, txt):
        """Process spans delimited by ``‘`` and ``’``."""
        return re.sub(self.BETWEEN_SINGLE_QUOTE_SLANTED_REGEX,
                      replace_punctuation, txt)

    def sub_punctuation_between_double_quotes(self, txt):
        """Process spans delimited by straight double quotes."""
        return re.sub(self.BETWEEN_DOUBLE_QUOTES_REGEX_2, replace_punctuation,
                      txt)

    def sub_punctuation_between_quotes_arrow(self, txt):
        """Process spans delimited by ``«`` and ``»``."""
        return re.sub(self.BETWEEN_QUOTE_ARROW_REGEX_2, replace_punctuation, txt)

    def sub_punctuation_between_em_dashes(self, txt):
        """Process spans delimited by ``--`` on both sides."""
        return re.sub(self.BETWEEN_EM_DASHES_REGEX_2, replace_punctuation, txt)

    def sub_punctuation_between_quotes_slanted(self, txt):
        """Process spans delimited by ``“`` and ``”``."""
        return re.sub(self.BETWEEN_QUOTE_SLANTED_REGEX_2, replace_punctuation,
                      txt)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
# -*- coding: utf-8 -*- | ||
from pysbd.utils import Rule | ||
|
||
|
||
class CleanRules(object):
    """Regex rules and raw patterns for cleaning up text.

    Most members are ``Rule(pattern, replacement)`` objects; a few are plain
    regex strings or keyword lists. How and in which order they are applied
    is decided by the caller (not shown here).
    """

    # NOTE: patterns written as raw strings (r'...') must not double their
    # backslashes; non-raw patterns need \\ for special characters.

    # Newline followed by one or two letters and another newline.
    # Rubular: http://rubular.com/r/V57WnM9Zut
    NewLineInMiddleOfWordRule = Rule(r'\n(?=[a-zA-Z]{1,2}\n)', '')

    # Two newlines separated by a single space -> carriage return.
    # Rubular: http://rubular.com/r/dMxp5MixFS
    DoubleNewLineWithSpaceRule = Rule(r'\n \n', "\r")

    # Two consecutive newlines -> carriage return.
    # Rubular: http://rubular.com/r/H6HOJeA8bq
    DoubleNewLineRule = Rule(r'\n\n', "\r")

    # Newline directly before a period that is followed by whitespace.
    # Rubular: http://rubular.com/r/FseyMiiYFT
    NewLineFollowedByPeriodRule = Rule(r'\n(?=\.(\s|\n))', '')

    ReplaceNewlineWithCarriageReturnRule = Rule(r'\n', "\r")

    # A literal backslash followed by "n" / "r" -> the real control character.
    EscapedNewLineRule = Rule(r'\\n', "\n")

    EscapedCarriageReturnRule = Rule(r'\\r', "\r")

    # Same as above, but with a space between the backslash and the letter.
    TypoEscapedNewLineRule = Rule(r'\\\ n', "\n")

    TypoEscapedCarriageReturnRule = Rule(r'\\\ r', "\r")

    # Inline formatting markers of the form {b^>N<b^}, either with the angle
    # brackets HTML-escaped (&gt; / &lt;) or written literally. The first
    # alternative previously duplicated the second one, which made it a
    # no-op; it now covers the entity-escaped spelling.
    # Rubular: http://rubular.com/r/bAJrhyLNeZ
    InlineFormattingRule = Rule(r'{b\^&gt;\d*&lt;b\^}|{b\^>\d*<b\^}', '')

    # Dot leaders (four or more periods) followed by a page number or range.
    # Rubular: http://rubular.com/r/8mc1ArOIGy
    TableOfContentsRule = Rule(r'\.{4,}\s*\d+-*\d*', "\r")

    # Five or more consecutive periods -> a single space.
    # Rubular: http://rubular.com/r/DwNSuZrNtk
    ConsecutivePeriodsRule = Rule(r'\.{5,}', ' ')

    # Exactly three forward slashes in a row are removed.
    # Rubular: http://rubular.com/r/IQ4TPfsbd8
    ConsecutiveForwardSlashRule = Rule(r'\/{3}', '')

    # Period between a lowercase and an uppercase letter -> period + space.
    # Rubular: http://rubular.com/r/6dt98uI76u
    NO_SPACE_BETWEEN_SENTENCES_REGEX = r'(?<=[a-z])\.(?=[A-Z])'
    NoSpaceBetweenSentencesRule = Rule(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')

    # Period between a digit and an uppercase letter -> period + space.
    # Rubular: http://rubular.com/r/l6KN6rH5XE
    NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX = r'(?<=\d)\.(?=[A-Z])'
    NoSpaceBetweenSentencesDigitRule = Rule(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')

    # Substrings presumably used by callers to spot URLs and e-mail
    # addresses -- confirm against the caller.
    URL_EMAIL_KEYWORDS = ['@', 'http', '.com', 'net', 'www', '//']

    # Newline after whitespace and before a lowercase letter or "(".
    # Rubular: http://rubular.com/r/3GiRiP2IbD
    NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = r'(?<=\s)\n(?=([a-z]|\())'

    # NOTE(review): the lookahead requires the bullet to be followed by an
    # apostrophe; the trailing ' looks like a stray character -- confirm.
    # Rubular: http://rubular.com/r/Gn18aAnLdZ
    NewLineFollowedByBulletRule = Rule(r"\n(?=•')", "\r")

    # Normalise '' and `` to a straight double quote.
    QuotationsFirstRule = Rule(r"''", '"')
    QuotationsSecondRule = Rule(r'``', '"')
|
||
|
||
class HTML(object):
    """Rules that strip HTML tags, both literal and entity-escaped."""

    # Literal tags such as <b>, </b> or <a href="...">.
    # Rubular: http://rubular.com/r/9d0OVOEJWj
    HTMLTagRule = Rule(r"<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[\^'\">\s]+))?)+\s*|\s*)\/?>", '')

    # Entity-escaped tags, i.e. text that starts with "&lt;" and ends with
    # "gt;". The opening delimiter is the escaped "&lt;"; a raw "<" here
    # would never pair with the escaped closing "gt;".
    # NOTE(review): [^gt;] is a character class, so the tag body may not
    # contain the letters "g" or "t" or a semicolon -- confirm intended.
    # Rubular: http://rubular.com/r/XZVqMPJhea
    EscapedHTMLTagRule = Rule(r'&lt;\/?[^gt;]*gt;', '')

    All = [HTMLTagRule, EscapedHTMLTagRule]
|
||
|
||
class PDF(object):
    """Rules for newlines inside sentences, grouped under the name PDF."""

    # Newline that follows a whitespace character (itself preceded by a
    # non-newline character) and precedes a non-space character: removed.
    # Rubular: http://rubular.com/r/UZAVcwqck8
    NewLineInMiddleOfSentenceRule = Rule(r'(?<=[^\n]\s)\n(?=\S)', '')

    # Newline directly before a lowercase letter: replaced by a space.
    # Rubular: http://rubular.com/r/eaNwGavmdo
    NewLineInMiddleOfSentenceNoSpacesRule = Rule(r"\n(?=[a-z])", ' ')
Oops, something went wrong.