Added pysbd to the repo

shhossain · Aug 27, 2023 · 14405ac · 14405ac
1 parent b55544c
commit 14405ac
Show file tree

Hide file tree

Showing 48 changed files with 2,065 additions and 14 deletions.
diff --git a/.gitmodules b/.gitmodules
diff --git a/README.md b/README.md
@@ -1,11 +1,11 @@
-# BanglaTranslationKit
+# BanglaTranslationKit (bntrans)
 
 BanglaTranslationKit is a collaborative, open-source language translation package meticulously designed for smooth offline translation between Bangla and English (English to Bangla and Bangla to English).
 
 ## Installation
 
 ```bash
-pip install bangla-translation-kit
+pip install bntrans
 ```
 
 ## Usage

diff --git a/bntrans/pysbd/__init__.py b/bntrans/pysbd/__init__.py
@@ -0,0 +1,2 @@
+from .segmenter import Segmenter
+from .about import __version__
diff --git a/bntrans/pysbd/abbreviation_replacer.py b/bntrans/pysbd/abbreviation_replacer.py
@@ -0,0 +1,112 @@
+# -*- coding: utf-8 -*-
+import re
+from pysbd.utils import Text
+
+
+def replace_pre_number_abbr(txt, abbr):
+    """Protect the period of an abbreviation that precedes a number.
+
+    The period directly after ``abbr`` is replaced with the placeholder
+    ``∯`` when it is followed by whitespace and a digit, or by whitespace
+    and an opening parenthesis (for example ``No. 5`` -> ``No∯ 5``).
+
+    Args:
+        txt: Text to scan.
+        abbr: Abbreviation to look for; surrounding whitespace is stripped
+            before it is used.
+
+    Returns:
+        ``txt`` with every qualifying period replaced by ``∯``.
+    """
+    # Escape the abbreviation so that regex metacharacters in it are matched
+    # literally (same treatment as AbbreviationReplacer.replace_period_of_abbr)
+    # and the look-behind stays fixed-width.
+    pattern = r"(?<=\s{abbr})\.(?=(\s\d|\s+\())".format(
+        abbr=re.escape(abbr.strip())
+    )
+    # Prepend a space so an abbreviation at the very start of the string
+    # satisfies the ``\s`` of the look-behind, then drop that space again.
+    return re.sub(pattern, "∯", " " + txt)[1:]
+
+
+def replace_prepositive_abbr(txt, abbr):
+    """Protect the period of a prepositive abbreviation.
+
+    The period directly after ``abbr`` is replaced with the placeholder
+    ``∯`` when it is followed by a whitespace character, or by a colon and
+    at least one digit (for example ``Mr. Smith`` -> ``Mr∯ Smith``).
+
+    Args:
+        txt: Text to scan.
+        abbr: Abbreviation to look for; surrounding whitespace is stripped
+            before it is used.
+
+    Returns:
+        ``txt`` with every qualifying period replaced by ``∯``.
+    """
+    # Escape the abbreviation so that regex metacharacters in it are matched
+    # literally (same treatment as AbbreviationReplacer.replace_period_of_abbr)
+    # and the look-behind stays fixed-width.
+    pattern = r"(?<=\s{abbr})\.(?=(\s|:\d+))".format(
+        abbr=re.escape(abbr.strip())
+    )
+    # Prepend a space so an abbreviation at the very start of the string
+    # satisfies the ``\s`` of the look-behind, then drop that space again.
+    return re.sub(pattern, "∯", " " + txt)[1:]
+
+
+class AbbreviationReplacer(object):
+    """Replace the periods of abbreviations with the placeholder ``∯``.
+
+    ``lang`` supplies the rules, abbreviation lists and regexes used by the
+    methods below. ``SENTENCE_STARTERS`` is read from ``self`` but is not
+    defined in this class, so it has to be provided elsewhere (for example
+    by a subclass).
+    """
+
+    def __init__(self, text, lang):
+        self.text = text
+        self.lang = lang
+
+    def replace(self):
+        """Run every abbreviation pass over ``self.text`` and return it."""
+        self.text = Text(self.text).apply(
+            self.lang.PossessiveAbbreviationRule,
+            self.lang.KommanditgesellschaftRule,
+            *self.lang.SingleLetterAbbreviationRules.All
+        )
+        # Work line by line; keepends=True preserves the line terminators so
+        # the joined result keeps the original layout.
+        self.text = "".join(
+            self.search_for_abbreviations_in_string(line)
+            for line in self.text.splitlines(True)
+        )
+        self.replace_multi_period_abbreviations()
+        self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
+        self.text = self.replace_abbreviation_as_sentence_boundary()
+        return self.text
+
+    def replace_abbreviation_as_sentence_boundary(self):
+        """Restore ``.`` after selected abbreviations before a sentence starter.
+
+        A ``∯`` that follows one of the hard-coded abbreviations is turned
+        back into ``.`` when a word from ``self.SENTENCE_STARTERS`` comes
+        next, surrounded by whitespace. Updates and returns ``self.text``.
+        """
+        lookaheads = [
+            r"(?=\s{}\s)".format(word) for word in self.SENTENCE_STARTERS
+        ]
+        regex = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯({})".format(
+            "|".join(lookaheads)
+        )
+        self.text = re.sub(regex, '\\1.', self.text)
+        return self.text
+
+    def replace_multi_period_abbreviations(self):
+        """Swap every ``.`` inside multi-period abbreviations for ``∯``.
+
+        Matches of ``lang.MULTI_PERIOD_ABBREVIATION_REGEX`` are found
+        case-insensitively; ``self.text`` is updated in place.
+        """
+        def mpa_replace(match):
+            return match.group().replace(".", "∯")
+
+        self.text = re.sub(
+            self.lang.MULTI_PERIOD_ABBREVIATION_REGEX,
+            mpa_replace,
+            self.text,
+            flags=re.IGNORECASE
+        )
+
+    def replace_period_of_abbr(self, txt, abbr):
+        """Replace the period after ``abbr`` with ``∯`` in mid-sentence use.
+
+        The period qualifies when it is followed by ``.``, ``:``, ``-``,
+        ``?`` or ``,``, or by whitespace plus a lowercase letter, ``I``
+        followed by whitespace, ``I'm``, ``I'll``, a digit or ``(``.
+        """
+        pattern = r"(?<=\s{abbr})\.(?=((\.|\:|-|\?|,)|(\s([a-z]|I\s|I'm|I'll|\d|\())))".format(
+            abbr=re.escape(abbr.strip())
+        )
+        # A leading space lets an abbreviation at the start of the string
+        # satisfy the ``\s`` of the look-behind; it is sliced off afterwards.
+        return re.sub(pattern, "∯", " " + txt)[1:]
+
+    def search_for_abbreviations_in_string(self, text):
+        """Apply the abbreviation replacements for every known abbreviation.
+
+        Each entry of ``lang.Abbreviation.ABBREVIATIONS`` that occurs in
+        ``text`` (case-insensitively, at the start or after whitespace) is
+        passed to ``scan_for_replacements`` once per occurrence.
+        """
+        # The cheap substring pre-check runs against the text as passed in.
+        haystack = text.lower()
+        for abbr in self.lang.Abbreviation.ABBREVIATIONS:
+            needle = abbr.strip()
+            if needle in haystack:
+                occurrences = re.findall(
+                    r"(?:^|\s|\r|\n){}".format(needle), text, flags=re.IGNORECASE
+                )
+                if occurrences:
+                    # NOTE: the braces are literal characters here (the
+                    # pattern is built by concatenation, not str.format), so
+                    # it only matches text of the form "{abbr} X".
+                    next_word_start = r"(?<={" + str(re.escape(needle)) + "} ).{1}"
+                    following_chars = re.findall(next_word_start, text)
+                    for position, occurrence in enumerate(occurrences):
+                        text = self.scan_for_replacements(
+                            text, occurrence, position, following_chars
+                        )
+        return text
+
+    def scan_for_replacements(self, txt, am, ind, char_array):
+        """Pick the replacement helper for one abbreviation occurrence.
+
+        ``am`` is the matched abbreviation and ``char_array[ind]`` the
+        character recorded for it (empty string when there is none).
+        Prepositive abbreviations are always handled; anything else is left
+        untouched when that character is upper-case.
+        """
+        try:
+            char = char_array[ind]
+        except IndexError:
+            char = ""
+        prepositive = self.lang.Abbreviation.PREPOSITIVE_ABBREVIATIONS
+        number_abbr = self.lang.Abbreviation.NUMBER_ABBREVIATIONS
+        followed_by_upper = str(char).isupper()
+        key = am.strip().lower()
+        if key in prepositive:
+            return replace_prepositive_abbr(txt, am)
+        if followed_by_upper:
+            return txt
+        if key in number_abbr:
+            return replace_pre_number_abbr(txt, am)
+        return self.replace_period_of_abbr(txt, am)
diff --git a/bntrans/pysbd/about.py b/bntrans/pysbd/about.py
@@ -0,0 +1,10 @@
+# Package metadata kept in one place as module-level dunder constants;
+# the package __init__ imports __version__ from this module.
+# inspired from:
+# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
+
+__title__ = "pysbd"
+__version__ = "0.3.4"
+__summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
+__uri__ = "http://nipunsadvilkar.github.io/"
+__author__ = "Nipun Sadvilkar"
+__email__ = "nipunsadvilkar@gmail.com"
+__license__ = "MIT"
diff --git a/bntrans/pysbd/between_punctuation.py b/bntrans/pysbd/between_punctuation.py
@@ -0,0 +1,94 @@
+# -*- coding: utf-8 -*-
+import re
+from functools import partial
+from pysbd.punctuation_replacer import replace_punctuation
+
+
+class BetweenPunctuation(object):
+    """Apply ``replace_punctuation`` to text enclosed in quotes or brackets.
+
+    Each ``sub_punctuation_between_*`` method runs one regex over the text
+    and hands every match to ``replace_punctuation``. Only the ``*_REGEX_2``
+    variants (and the two single-quote patterns) are used by the methods;
+    the patterns written with ``(?>...)`` are kept alongside them.
+    """
+
+    # Rubular: http://rubular.com/r/2YFrKWQUYi
+    BETWEEN_SINGLE_QUOTES_REGEX = r"(?<=\s)'(?:[^']|'[a-zA-Z])*'"
+
+    BETWEEN_SINGLE_QUOTE_SLANTED_REGEX = r"(?<=\s)‘(?:[^’]|’[a-zA-Z])*’"
+
+    # Rubular: http://rubular.com/r/3Pw1QlXOjd
+    BETWEEN_DOUBLE_QUOTES_REGEX = r'"(?>[^"\\]+|\\{2}|\\.)*"'
+
+    # Look-ahead plus back-reference stand in for the atomic group above.
+    # https://regex101.com/r/r6I1bW/1
+    # https://stackoverflow.com/questions/13577372/do-python-regular-expressions-have-an-equivalent-to-rubys-atomic-grouping?noredirect=1&lq=1
+    BETWEEN_DOUBLE_QUOTES_REGEX_2 = r'"(?=(?P<tmp>[^\"\\]+|\\{2}|\\.)*)(?P=tmp)"'
+
+    # Rubular: http://rubular.com/r/x6s4PZK8jc
+    BETWEEN_QUOTE_ARROW_REGEX = r'«(?>[^»\\]+|\\{2}|\\.)*»'
+
+    BETWEEN_QUOTE_ARROW_REGEX_2 = r"\«(?=(?P<tmp>[^»\\]+|\\{2}|\\.)*)(?P=tmp)\»"
+
+    # Rubular: http://rubular.com/r/JbAIpKdlSq
+    BETWEEN_QUOTE_SLANTED_REGEX = r"“(?>[^”\\]+|\\{2}|\\.)*”"
+    BETWEEN_QUOTE_SLANTED_REGEX_2 = r"\“(?=(?P<tmp>[^”\\]+|\\{2}|\\.)*)(?P=tmp)\”"
+
+    # Rubular: http://rubular.com/r/WX4AvnZvlX
+    BETWEEN_SQUARE_BRACKETS_REGEX = r"\[(?>[^\]\\]+|\\{2}|\\.)*\]"
+
+    BETWEEN_SQUARE_BRACKETS_REGEX_2 = r'\[(?=(?P<tmp>[^\]\\]+|\\{2}|\\.)*)(?P=tmp)\]'
+
+    # Rubular: http://rubular.com/r/6tTityPflI
+    BETWEEN_PARENS_REGEX = r"\((?>[^\(\)\\]+|\\{2}|\\.)*\)"
+
+    BETWEEN_PARENS_REGEX_2 = r"\((?=(?P<tmp>[^\(\)\\]+|\\{2}|\\.)*)(?P=tmp)\)"
+
+    # Rubular: http://rubular.com/r/mXf8cW025o
+    WORD_WITH_LEADING_APOSTROPHE = r"(?<=\s)'(?:[^']|'[a-zA-Z])*'\S"
+
+    # Rubular: http://rubular.com/r/jTtDKfjxzr
+    BETWEEN_EM_DASHES_REGEX = r"\-\-(?>[^\-\-])*\-\-"
+
+    BETWEEN_EM_DASHES_REGEX_2 = r"--(?=(?P<tmp>[^--]*))(?P=tmp)--"
+
+    def __init__(self, text):
+        self.text = text
+
+    def replace(self):
+        """Return ``self.text`` after all between-punctuation passes."""
+        return self.sub_punctuation_between_quotes_and_parens(self.text)
+
+    def sub_punctuation_between_quotes_and_parens(self, txt):
+        """Run every pass over ``txt``, feeding each result into the next."""
+        passes = (
+            self.sub_punctuation_between_single_quotes,
+            self.sub_punctuation_between_single_quote_slanted,
+            self.sub_punctuation_between_double_quotes,
+            self.sub_punctuation_between_square_brackets,
+            self.sub_punctuation_between_parens,
+            self.sub_punctuation_between_quotes_arrow,
+            self.sub_punctuation_between_em_dashes,
+            self.sub_punctuation_between_quotes_slanted,
+        )
+        for run_pass in passes:
+            txt = run_pass(txt)
+        return txt
+
+    def sub_punctuation_between_parens(self, txt):
+        """Handle text matched by ``BETWEEN_PARENS_REGEX_2``."""
+        return re.sub(
+            pattern=self.BETWEEN_PARENS_REGEX_2,
+            repl=replace_punctuation,
+            string=txt,
+        )
+
+    def sub_punctuation_between_square_brackets(self, txt):
+        """Handle text matched by ``BETWEEN_SQUARE_BRACKETS_REGEX_2``."""
+        return re.sub(
+            pattern=self.BETWEEN_SQUARE_BRACKETS_REGEX_2,
+            repl=replace_punctuation,
+            string=txt,
+        )
+
+    def sub_punctuation_between_single_quotes(self, txt):
+        """Handle text matched by ``BETWEEN_SINGLE_QUOTES_REGEX``.
+
+        ``txt`` is returned unchanged when it contains a match for
+        ``WORD_WITH_LEADING_APOSTROPHE`` and no apostrophe that is directly
+        followed by whitespace.
+        """
+        leading_apostrophe_word = re.search(self.WORD_WITH_LEADING_APOSTROPHE, txt)
+        if leading_apostrophe_word and not re.search(r"'\s", txt):
+            return txt
+        single_quote_replacer = partial(replace_punctuation, match_type='single')
+        return re.sub(self.BETWEEN_SINGLE_QUOTES_REGEX, single_quote_replacer, txt)
+
+    def sub_punctuation_between_single_quote_slanted(self, txt):
+        """Handle text matched by ``BETWEEN_SINGLE_QUOTE_SLANTED_REGEX``."""
+        return re.sub(
+            pattern=self.BETWEEN_SINGLE_QUOTE_SLANTED_REGEX,
+            repl=replace_punctuation,
+            string=txt,
+        )
+
+    def sub_punctuation_between_double_quotes(self, txt):
+        """Handle text matched by ``BETWEEN_DOUBLE_QUOTES_REGEX_2``."""
+        return re.sub(
+            pattern=self.BETWEEN_DOUBLE_QUOTES_REGEX_2,
+            repl=replace_punctuation,
+            string=txt,
+        )
+
+    def sub_punctuation_between_quotes_arrow(self, txt):
+        """Handle text matched by ``BETWEEN_QUOTE_ARROW_REGEX_2``."""
+        return re.sub(
+            pattern=self.BETWEEN_QUOTE_ARROW_REGEX_2,
+            repl=replace_punctuation,
+            string=txt,
+        )
+
+    def sub_punctuation_between_em_dashes(self, txt):
+        """Handle text matched by ``BETWEEN_EM_DASHES_REGEX_2``."""
+        return re.sub(
+            pattern=self.BETWEEN_EM_DASHES_REGEX_2,
+            repl=replace_punctuation,
+            string=txt,
+        )
+
+    def sub_punctuation_between_quotes_slanted(self, txt):
+        """Handle text matched by ``BETWEEN_QUOTE_SLANTED_REGEX_2``."""
+        return re.sub(
+            pattern=self.BETWEEN_QUOTE_SLANTED_REGEX_2,
+            repl=replace_punctuation,
+            string=txt,
+        )
diff --git a/bntrans/pysbd/clean/__init__.py b/bntrans/pysbd/clean/__init__.py
diff --git a/bntrans/pysbd/clean/rules.py b/bntrans/pysbd/clean/rules.py
@@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-
+from pysbd.utils import Rule
+
+
+class CleanRules(object):
+    """Regex rules and patterns for cleaning raw text.
+
+    Most attributes are ``Rule`` objects built from a regex and a second
+    string (presumably the replacement text; confirm against ``Rule``). The
+    upper-case attributes are plain patterns or keyword lists.
+    """
+
+    # NOTE: Caution: Might require \\ for special characters
+    # if regex is defined with r'' then dont
+    # add extra \\ for special characters
+    # Rubular: http://rubular.com/r/V57WnM9Zut
+    # A newline followed by one or two letters and another newline.
+    NewLineInMiddleOfWordRule = Rule(r'\n(?=[a-zA-Z]{1,2}\n)', '')
+
+    # Rubular: http://rubular.com/r/dMxp5MixFS
+    DoubleNewLineWithSpaceRule = Rule(r'\n \n', "\r")
+
+    # Rubular: http://rubular.com/r/H6HOJeA8bq
+    DoubleNewLineRule = Rule(r'\n\n', "\r")
+
+    # Rubular: http://rubular.com/r/FseyMiiYFT
+    # A newline directly before a period that is followed by whitespace.
+    NewLineFollowedByPeriodRule = Rule(r'\n(?=\.(\s|\n))', '')
+
+    ReplaceNewlineWithCarriageReturnRule = Rule(r'\n', "\r")
+
+    # A literal backslash followed by "n" / "r" (an escaped sequence that
+    # was left in the text) paired with the real control character.
+    EscapedNewLineRule = Rule(r'\\n', "\n")
+
+    EscapedCarriageReturnRule = Rule(r'\\r', "\r")
+
+    # Same as above, with a stray space between the backslash and the letter.
+    TypoEscapedNewLineRule = Rule(r'\\\ n', "\n")
+
+    TypoEscapedCarriageReturnRule = Rule(r'\\\ r', "\r")
+
+    # Rubular: http://rubular.com/r/bAJrhyLNeZ
+    # Markers of the form {b^>N<b^}, plain or with &gt;/&lt; entities.
+    InlineFormattingRule = Rule(r'{b\^&gt;\d*&lt;b\^}|{b\^>\d*<b\^}', '')
+
+    # Rubular: http://rubular.com/r/8mc1ArOIGy
+    # Four or more periods followed by a number or number range.
+    TableOfContentsRule = Rule(r'\.{4,}\s*\d+-*\d*', "\r")
+
+    # Rubular: http://rubular.com/r/DwNSuZrNtk
+    ConsecutivePeriodsRule = Rule(r'\.{5,}', ' ')
+
+    # Rubular: http://rubular.com/r/IQ4TPfsbd8
+    ConsecutiveForwardSlashRule = Rule(r'\/{3}', '')
+
+    # Rubular: http://rubular.com/r/6dt98uI76u
+    # A period between a lowercase letter and an uppercase letter.
+    NO_SPACE_BETWEEN_SENTENCES_REGEX = r'(?<=[a-z])\.(?=[A-Z])'
+    # NO_SPACE_BETWEEN_SENTENCES_REGEX = r'[a-z]\.[A-Z]'
+    NoSpaceBetweenSentencesRule = Rule(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')
+
+    # Rubular: http://rubular.com/r/l6KN6rH5XE
+    # A period between a digit and an uppercase letter.
+    NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX = r'(?<=\d)\.(?=[A-Z])'
+    NoSpaceBetweenSentencesDigitRule = Rule(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')
+
+    # NOTE(review): presumably used to recognise URLs and e-mail addresses;
+    # the consumer is not part of this module.
+    URL_EMAIL_KEYWORDS = ['@', 'http', '.com', 'net', 'www', '//']
+
+    # Rubular: http://rubular.com/r/3GiRiP2IbD
+    # A newline after whitespace and before a lowercase letter or "(".
+    NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = r'(?<=\s)\n(?=([a-z]|\())'
+
+    # Rubular: http://rubular.com/r/Gn18aAnLdZ
+    # A newline directly before a bullet character. The look-ahead used to
+    # contain a stray apostrophe after the bullet, which limited the rule to
+    # bullets followed by "'"; it now matches any bullet.
+    NewLineFollowedByBulletRule = Rule(r"\n(?=•)", "\r")
+
+    # Two apostrophes / two backticks are paired with a double quote.
+    QuotationsFirstRule = Rule(r"''", '"')
+    QuotationsSecondRule = Rule(r'``', '"')
+
+
+class HTML(object):
+    """Rules for HTML markup; both pair their pattern with an empty string."""
+
+    # Rubular: http://rubular.com/r/9d0OVOEJWj
+    # An opening or closing tag written with literal "<" and ">", including
+    # optional attributes and an optional trailing "/".
+    HTMLTagRule = Rule(r"<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[\^'\">\s]+))?)+\s*|\s*)\/?>", '')
+
+    # Rubular: http://rubular.com/r/XZVqMPJhea
+    # An entity-escaped tag: "&lt;", an optional "/", any run of characters
+    # other than "g", "t" and ";", then "gt;".
+    EscapedHTMLTagRule = Rule(r'&lt;\/?[^gt;]*gt;', '')
+
+    # Both rules, literal tags first.
+    All = [HTMLTagRule, EscapedHTMLTagRule]
+
+
+class PDF(object):
+    """Newline rules for PDF text (assumed from the class name; usage is not shown here)."""
+
+    # Rubular: http://rubular.com/r/UZAVcwqck8
+    # A newline that follows a non-newline character plus one whitespace
+    # character and is followed by a non-whitespace character; paired with ''.
+    NewLineInMiddleOfSentenceRule = Rule(r'(?<=[^\n]\s)\n(?=\S)', '')
+
+    # Rubular: http://rubular.com/r/eaNwGavmdo
+    # A newline directly before a lowercase letter; paired with a space.
+    NewLineInMiddleOfSentenceNoSpacesRule = Rule(r"\n(?=[a-z])", ' ')