Skip to content

Commit

Permalink
added pybsd in repo
Browse files Browse the repository at this point in the history
  • Loading branch information
shhossain committed Aug 27, 2023
1 parent b55544c commit 14405ac
Show file tree
Hide file tree
Showing 48 changed files with 2,065 additions and 14 deletions.
Empty file added .gitmodules
Empty file.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# BanglaTranslationKit
# BanglaTranslationKit (bntrans)

BanglaTranslationKit is a collaborative, open-source language translation package meticulously designed for smooth offline translation between Bangla and English in both directions (English to Bangla and Bangla to English).

## Installation

```bash
pip install bangla-translation-kit
pip install bntrans
```

## Usage
Expand Down
2 changes: 2 additions & 0 deletions bntrans/pysbd/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .segmenter import Segmenter
from .about import __version__
112 changes: 112 additions & 0 deletions bntrans/pysbd/abbreviation_replacer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# -*- coding: utf-8 -*-
import re
from pysbd.utils import Text


def replace_pre_number_abbr(txt, abbr):
    """Mask the period after an abbreviation that precedes a number.

    The period directly following ``abbr`` is replaced with the placeholder
    "∯" when it is followed by one whitespace character and a digit, or by
    whitespace and an opening parenthesis.

    Args:
        txt: Text to process.
        abbr: Abbreviation exactly as it appears in ``txt``; surrounding
            whitespace is ignored.

    Returns:
        ``txt`` with every matching period replaced by "∯".
    """
    # Escape the abbreviation so characters such as "." are matched literally
    # instead of being interpreted as regex syntax.
    pattern = r"(?<=\s{abbr})\.(?=(\s\d|\s+\())".format(
        abbr=re.escape(abbr.strip())
    )
    # A space is prepended so an abbreviation at the very start of the string
    # satisfies the whitespace lookbehind; it is sliced off again afterwards.
    return re.sub(pattern, "∯", " " + txt)[1:]


def replace_prepositive_abbr(txt, abbr):
    """Mask the period after a prepositive abbreviation.

    The period directly following ``abbr`` is replaced with the placeholder
    "∯" when it is followed by a whitespace character, or by a colon and at
    least one digit.

    Args:
        txt: Text to process.
        abbr: Abbreviation exactly as it appears in ``txt``; surrounding
            whitespace is ignored.

    Returns:
        ``txt`` with every matching period replaced by "∯".
    """
    # Escape the abbreviation so characters such as "." are matched literally
    # instead of being interpreted as regex syntax.
    pattern = r"(?<=\s{abbr})\.(?=(\s|:\d+))".format(
        abbr=re.escape(abbr.strip())
    )
    # A space is prepended so an abbreviation at the very start of the string
    # satisfies the whitespace lookbehind; it is sliced off again afterwards.
    return re.sub(pattern, "∯", " " + txt)[1:]


class AbbreviationReplacer(object):
    """Mask the periods of abbreviations in a text with the placeholder "∯".

    The language object passed to the constructor supplies the rules, the
    abbreviation lists and the multi-period abbreviation regex.

    ``SENTENCE_STARTERS`` is read by
    ``replace_abbreviation_as_sentence_boundary`` but is not defined on this
    class, so it has to be provided by a subclass or set on the instance.
    """

    def __init__(self, text, lang):
        """Store the text to process and the language definition to use."""
        self.text = text
        self.lang = lang

    def replace(self):
        """Run the complete abbreviation pass and return the processed text.

        Steps, in order: apply the possessive, Kommanditgesellschaft and
        single-letter rules; mask abbreviation periods line by line; mask
        multi-period abbreviations; apply the AM/PM rules; finally restore a
        real period where an abbreviation is followed by a sentence starter.
        """
        self.text = Text(self.text).apply(
            self.lang.PossessiveAbbreviationRule,
            self.lang.KommanditgesellschaftRule,
            *self.lang.SingleLetterAbbreviationRules.All
        )
        # keepends=True preserves the original line terminators when the
        # processed lines are joined back together.
        self.text = "".join(
            self.search_for_abbreviations_in_string(line)
            for line in self.text.splitlines(True)
        )
        self.replace_multi_period_abbreviations()
        self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
        self.text = self.replace_abbreviation_as_sentence_boundary()
        return self.text

    def replace_abbreviation_as_sentence_boundary(self):
        """Turn "∯" back into "." when a sentence starter follows.

        Only applies to the fixed set of abbreviations listed in the regex
        below, and only when the placeholder is followed by whitespace, one of
        ``self.SENTENCE_STARTERS`` and whitespace again.

        Returns:
            The updated text, which is also stored on ``self.text``.
        """
        sent_starters = "|".join(
            r"(?=\s{}\s)".format(word) for word in self.SENTENCE_STARTERS
        )
        regex = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯({})".format(sent_starters)
        self.text = re.sub(regex, '\\1.', self.text)
        return self.text

    def replace_multi_period_abbreviations(self):
        """Mask every period inside multi-period abbreviations, in place.

        Matches of ``lang.MULTI_PERIOD_ABBREVIATION_REGEX`` (case-insensitive)
        have all of their "." characters replaced with "∯".
        """
        def mpa_replace(match):
            return match.group().replace(".", "∯")

        self.text = re.sub(
            self.lang.MULTI_PERIOD_ABBREVIATION_REGEX,
            mpa_replace,
            self.text,
            flags=re.IGNORECASE
        )

    def replace_period_of_abbr(self, txt, abbr):
        """Mask the period after ``abbr`` when the sentence clearly continues.

        The period is replaced with "∯" when it is followed by one of
        ``. : - ? ,`` or by whitespace and then a lower-case ASCII letter,
        "I" plus whitespace, "I'm", "I'll", a digit or "(".

        Args:
            txt: Text to process.
            abbr: Abbreviation exactly as it appears in ``txt``; surrounding
                whitespace is ignored.

        Returns:
            ``txt`` with the matching periods replaced by "∯".
        """
        pattern = r"(?<=\s{abbr})\.(?=((\.|\:|-|\?|,)|(\s([a-z]|I\s|I'm|I'll|\d|\())))".format(
            abbr=re.escape(abbr.strip())
        )
        # A space is prepended so an abbreviation at the very start of the
        # string satisfies the whitespace lookbehind; it is removed again.
        return re.sub(pattern, "∯", " " + txt)[1:]

    def search_for_abbreviations_in_string(self, text):
        """Mask abbreviation periods for every known abbreviation in ``text``.

        Args:
            text: A single chunk of text (``replace`` passes one line at a
                time).

        Returns:
            ``text`` with the periods of recognised abbreviations masked.
        """
        lowered = text.lower()
        for abbr in self.lang.Abbreviation.ABBREVIATIONS:
            stripped = abbr.strip()
            # Cheap substring test before running any regex.
            if stripped not in lowered:
                continue
            # The abbreviation is escaped so that e.g. "a.d" only matches a
            # literal "a.d" and not unrelated words such as "and".
            abbrev_match = re.findall(
                r"(?:^|\s|\r|\n){}".format(re.escape(stripped)),
                text,
                flags=re.IGNORECASE,
            )
            if not abbrev_match:
                continue
            # NOTE(review): the braces below are literal characters in the
            # pattern (it looks for "{abbr} " in the text), so ``char_array``
            # is normally empty and ``scan_for_replacements`` falls back to
            # an empty following character. Left unchanged on purpose to
            # preserve the existing segmentation behaviour.
            next_word_start = r"(?<={" + str(re.escape(stripped)) + "} ).{1}"
            char_array = re.findall(next_word_start, text)
            for ind, match in enumerate(abbrev_match):
                text = self.scan_for_replacements(
                    text, match, ind, char_array
                )
        return text

    def scan_for_replacements(self, txt, am, ind, char_array):
        """Mask the period of one abbreviation occurrence where appropriate.

        Nothing is replaced when the character following the abbreviation is
        upper-case and the abbreviation is not prepositive. Otherwise the
        replacement strategy depends on the abbreviation's category:
        prepositive, number abbreviation, or any other abbreviation.

        Args:
            txt: Text to process.
            am: The abbreviation as matched in the text (may carry leading
                whitespace).
            ind: Index of this occurrence, used to look up ``char_array``.
            char_array: Characters following each occurrence; a missing entry
                is treated as an empty string.

        Returns:
            The possibly updated text.
        """
        try:
            char = char_array[ind]
        except IndexError:
            char = ""
        prepositive = self.lang.Abbreviation.PREPOSITIVE_ABBREVIATIONS
        number_abbr = self.lang.Abbreviation.NUMBER_ABBREVIATIONS
        key = am.strip().lower()
        is_prepositive = key in prepositive
        if str(char).isupper() and not is_prepositive:
            return txt
        if is_prepositive:
            return replace_prepositive_abbr(txt, am)
        if key in number_abbr:
            return replace_pre_number_abbr(txt, am)
        return self.replace_period_of_abbr(txt, am)
10 changes: 10 additions & 0 deletions bntrans/pysbd/about.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Package metadata constants for pysbd; ``__version__`` is imported by the
# package ``__init__``.
# Layout inspired by:
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/

__title__ = "pysbd"
__version__ = "0.3.4"
__summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
__uri__ = "http://nipunsadvilkar.github.io/"
__author__ = "Nipun Sadvilkar"
__email__ = "nipunsadvilkar@gmail.com"
__license__ = "MIT"
94 changes: 94 additions & 0 deletions bntrans/pysbd/between_punctuation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
import re
from functools import partial
from pysbd.punctuation_replacer import replace_punctuation


class BetweenPunctuation(object):
    """Hand text found between paired delimiters to ``replace_punctuation``.

    Each ``sub_punctuation_between_*`` method runs one regex over the text and
    passes every match to ``replace_punctuation``. The patterns without the
    ``_2`` suffix use atomic groups and are not referenced by any method of
    this class; the ``_2`` variants (lookahead plus backreference) are the
    ones actually applied.
    """

    # Rubular: http://rubular.com/r/2YFrKWQUYi
    BETWEEN_SINGLE_QUOTES_REGEX = r"(?<=\s)'(?:[^']|'[a-zA-Z])*'"

    BETWEEN_SINGLE_QUOTE_SLANTED_REGEX = r"(?<=\s)‘(?:[^’]|’[a-zA-Z])*’"

    # Rubular: http://rubular.com/r/3Pw1QlXOjd
    BETWEEN_DOUBLE_QUOTES_REGEX = r'"(?>[^"\\]+|\\{2}|\\.)*"'

    # Atomic-group substitute, see:
    # https://regex101.com/r/r6I1bW/1
    # https://stackoverflow.com/questions/13577372/do-python-regular-expressions-have-an-equivalent-to-rubys-atomic-grouping?noredirect=1&lq=1
    BETWEEN_DOUBLE_QUOTES_REGEX_2 = r'"(?=(?P<tmp>[^\"\\]+|\\{2}|\\.)*)(?P=tmp)"'

    # Rubular: http://rubular.com/r/x6s4PZK8jc
    BETWEEN_QUOTE_ARROW_REGEX = r'«(?>[^»\\]+|\\{2}|\\.)*»'

    BETWEEN_QUOTE_ARROW_REGEX_2 = r"\«(?=(?P<tmp>[^»\\]+|\\{2}|\\.)*)(?P=tmp)\»"

    # Rubular: http://rubular.com/r/JbAIpKdlSq
    BETWEEN_QUOTE_SLANTED_REGEX = r"“(?>[^”\\]+|\\{2}|\\.)*”"
    BETWEEN_QUOTE_SLANTED_REGEX_2 = r"\“(?=(?P<tmp>[^”\\]+|\\{2}|\\.)*)(?P=tmp)\”"

    # Rubular: http://rubular.com/r/WX4AvnZvlX
    BETWEEN_SQUARE_BRACKETS_REGEX = r"\[(?>[^\]\\]+|\\{2}|\\.)*\]"

    BETWEEN_SQUARE_BRACKETS_REGEX_2 = r'\[(?=(?P<tmp>[^\]\\]+|\\{2}|\\.)*)(?P=tmp)\]'

    # Rubular: http://rubular.com/r/6tTityPflI
    BETWEEN_PARENS_REGEX = r"\((?>[^\(\)\\]+|\\{2}|\\.)*\)"

    BETWEEN_PARENS_REGEX_2 = r"\((?=(?P<tmp>[^\(\)\\]+|\\{2}|\\.)*)(?P=tmp)\)"

    # Rubular: http://rubular.com/r/mXf8cW025o
    WORD_WITH_LEADING_APOSTROPHE = r"(?<=\s)'(?:[^']|'[a-zA-Z])*'\S"

    # Rubular: http://rubular.com/r/jTtDKfjxzr
    BETWEEN_EM_DASHES_REGEX = r"\-\-(?>[^\-\-])*\-\-"

    BETWEEN_EM_DASHES_REGEX_2 = r"--(?=(?P<tmp>[^--]*))(?P=tmp)--"

    def __init__(self, text):
        """Remember the text that ``replace`` will process."""
        self.text = text

    def replace(self):
        """Process the stored text through every delimiter pass."""
        return self.sub_punctuation_between_quotes_and_parens(self.text)

    def sub_punctuation_between_quotes_and_parens(self, txt):
        """Apply all delimiter passes to ``txt`` in a fixed order."""
        passes = (
            self.sub_punctuation_between_single_quotes,
            self.sub_punctuation_between_single_quote_slanted,
            self.sub_punctuation_between_double_quotes,
            self.sub_punctuation_between_square_brackets,
            self.sub_punctuation_between_parens,
            self.sub_punctuation_between_quotes_arrow,
            self.sub_punctuation_between_em_dashes,
            self.sub_punctuation_between_quotes_slanted,
        )
        for apply_pass in passes:
            txt = apply_pass(txt)
        return txt

    def sub_punctuation_between_parens(self, txt):
        """Process text enclosed in round parentheses."""
        pattern = self.BETWEEN_PARENS_REGEX_2
        return re.sub(pattern, replace_punctuation, txt)

    def sub_punctuation_between_square_brackets(self, txt):
        """Process text enclosed in square brackets."""
        pattern = self.BETWEEN_SQUARE_BRACKETS_REGEX_2
        return re.sub(pattern, replace_punctuation, txt)

    def sub_punctuation_between_single_quotes(self, txt):
        """Process text enclosed in straight single quotes.

        The text is returned untouched when it contains a quoted span that is
        immediately followed by a non-whitespace character and no apostrophe
        in the text is followed by whitespace.
        """
        leading_apostrophe_word = re.search(self.WORD_WITH_LEADING_APOSTROPHE, txt)
        if leading_apostrophe_word and re.search(r"'\s", txt) is None:
            return txt
        single_quote_handler = partial(replace_punctuation, match_type='single')
        return re.sub(self.BETWEEN_SINGLE_QUOTES_REGEX, single_quote_handler, txt)

    def sub_punctuation_between_single_quote_slanted(self, txt):
        """Process text enclosed in slanted single quotes."""
        pattern = self.BETWEEN_SINGLE_QUOTE_SLANTED_REGEX
        return re.sub(pattern, replace_punctuation, txt)

    def sub_punctuation_between_double_quotes(self, txt):
        """Process text enclosed in straight double quotes."""
        pattern = self.BETWEEN_DOUBLE_QUOTES_REGEX_2
        return re.sub(pattern, replace_punctuation, txt)

    def sub_punctuation_between_quotes_arrow(self, txt):
        """Process text enclosed in « » quotes."""
        pattern = self.BETWEEN_QUOTE_ARROW_REGEX_2
        return re.sub(pattern, replace_punctuation, txt)

    def sub_punctuation_between_em_dashes(self, txt):
        """Process text enclosed between two double-hyphen dashes."""
        pattern = self.BETWEEN_EM_DASHES_REGEX_2
        return re.sub(pattern, replace_punctuation, txt)

    def sub_punctuation_between_quotes_slanted(self, txt):
        """Process text enclosed in slanted double quotes."""
        pattern = self.BETWEEN_QUOTE_SLANTED_REGEX_2
        return re.sub(pattern, replace_punctuation, txt)
Empty file added bntrans/pysbd/clean/__init__.py
Empty file.
80 changes: 80 additions & 0 deletions bntrans/pysbd/clean/rules.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# -*- coding: utf-8 -*-
from pysbd.utils import Rule


class CleanRules(object):
    """Regex patterns and ``Rule`` objects used for cleaning text.

    Every ``Rule`` here is built from a regex pattern and a replacement
    string. Patterns written as raw strings (``r''``) must not carry extra
    backslash escaping for special characters.
    """

    # Newline followed by one or two letters and another newline.
    # Rubular: http://rubular.com/r/V57WnM9Zut
    NewLineInMiddleOfWordRule = Rule(r'\n(?=[a-zA-Z]{1,2}\n)', '')

    # Two newlines separated by a single space -> carriage return.
    # Rubular: http://rubular.com/r/dMxp5MixFS
    DoubleNewLineWithSpaceRule = Rule(r'\n \n', "\r")

    # Two consecutive newlines -> carriage return.
    # Rubular: http://rubular.com/r/H6HOJeA8bq
    DoubleNewLineRule = Rule(r'\n\n', "\r")

    # Newline directly before a period that is followed by whitespace.
    # Rubular: http://rubular.com/r/FseyMiiYFT
    NewLineFollowedByPeriodRule = Rule(r'\n(?=\.(\s|\n))', '')

    # Any newline -> carriage return.
    ReplaceNewlineWithCarriageReturnRule = Rule(r'\n', "\r")

    # Literal backslash-n / backslash-r sequences -> real control characters.
    EscapedNewLineRule = Rule(r'\\n', "\n")

    EscapedCarriageReturnRule = Rule(r'\\r', "\r")

    # Same as above, but with a space between the backslash and the letter.
    TypoEscapedNewLineRule = Rule(r'\\\ n', "\n")

    TypoEscapedCarriageReturnRule = Rule(r'\\\ r', "\r")

    # Inline formatting markers, in HTML-escaped and plain form.
    # Rubular: http://rubular.com/r/bAJrhyLNeZ
    InlineFormattingRule = Rule(r'{b\^&gt;\d*&lt;b\^}|{b\^>\d*<b\^}', '')

    # Four or more periods followed by a number or number range.
    # Rubular: http://rubular.com/r/8mc1ArOIGy
    TableOfContentsRule = Rule(r'\.{4,}\s*\d+-*\d*', "\r")

    # Five or more periods -> single space.
    # Rubular: http://rubular.com/r/DwNSuZrNtk
    ConsecutivePeriodsRule = Rule(r'\.{5,}', ' ')

    # Exactly three forward slashes are removed.
    # Rubular: http://rubular.com/r/IQ4TPfsbd8
    ConsecutiveForwardSlashRule = Rule(r'\/{3}', '')

    # Period between a lower-case and an upper-case letter -> ". ".
    # Rubular: http://rubular.com/r/6dt98uI76u
    NO_SPACE_BETWEEN_SENTENCES_REGEX = r'(?<=[a-z])\.(?=[A-Z])'
    NoSpaceBetweenSentencesRule = Rule(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')

    # Period between a digit and an upper-case letter -> ". ".
    # Rubular: http://rubular.com/r/l6KN6rH5XE
    NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX = r'(?<=\d)\.(?=[A-Z])'
    NoSpaceBetweenSentencesDigitRule = Rule(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')

    # Substrings that presumably mark a token as a URL or e-mail address;
    # how they are used is not visible here.
    URL_EMAIL_KEYWORDS = ['@', 'http', '.com', 'net', 'www', '//']

    # Newline after whitespace and before a lower-case letter or "(".
    # Rubular: http://rubular.com/r/3GiRiP2IbD
    NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = r'(?<=\s)\n(?=([a-z]|\())'

    # NOTE(review): the lookahead requires an apostrophe right after the
    # bullet; that looks like a typo for a plain bullet lookahead - confirm
    # before changing.
    # Rubular: http://rubular.com/r/Gn18aAnLdZ
    NewLineFollowedByBulletRule = Rule(r"\n(?=•')", "\r")

    # Two apostrophes or two backticks -> straight double quote.
    QuotationsFirstRule = Rule(r"''", '"')
    QuotationsSecondRule = Rule(r'``', '"')


class HTML(object):
    """Rules that strip HTML tags, in plain and entity-escaped form."""

    # Opening, closing or self-closing tag, optionally with attributes.
    # Rubular: http://rubular.com/r/9d0OVOEJWj
    HTMLTagRule = Rule(r"<\/?\w+((\s+\w+(\s*=\s*(?:\".*?\"|'.*?'|[\^'\">\s]+))?)+\s*|\s*)\/?>", '')

    # Tag whose opening bracket is written as the "&lt;" entity.
    # Rubular: http://rubular.com/r/XZVqMPJhea
    EscapedHTMLTagRule = Rule(r'&lt;\/?[^gt;]*gt;', '')

    # Both rules, in the order listed.
    All = [HTMLTagRule, EscapedHTMLTagRule]


class PDF(object):
    """Rules for newlines that fall in the middle of a sentence."""

    # Newline preceded by a non-newline character plus whitespace and followed
    # by a non-whitespace character is removed.
    # Rubular: http://rubular.com/r/UZAVcwqck8
    NewLineInMiddleOfSentenceRule = Rule(r'(?<=[^\n]\s)\n(?=\S)', '')

    # Newline directly before a lower-case letter becomes a space.
    # Rubular: http://rubular.com/r/eaNwGavmdo
    NewLineInMiddleOfSentenceNoSpacesRule = Rule(r"\n(?=[a-z])", ' ')
Loading

0 comments on commit 14405ac

Please sign in to comment.