
Commit f4e0135

Merge pull request #21 from anetschka/dev
2 parents 1b427b2 + 62f9e0b commit f4e0135

4 files changed: +53 −13 lines changed

README.md

Lines changed: 5 additions & 0 deletions

@@ -14,6 +14,7 @@ This python package is being developed as a `TextBlob` **Language
 Extension**. See [Extension
 Guidelines](https://textblob.readthedocs.org/en/dev/contributing.html)
 for details.
+This repo is my personal fork. It contains German-language adaptations that I consider useful. For the main textblob-de repo, visit [Markus Killer's repo](https://github.com/markuskiller/textblob-de).
 
 Features
 --------
@@ -55,6 +56,10 @@ for details):
     $ pip install -U git+https://github.com/markuskiller/textblob-de.git@dev
     $ python -m textblob.download_corpora
 
+To install this fork, just change the repo URL:
+
+    $ pip install -U git+https://github.com/anetschka/textblob-de.git@dev
+
 Note
 
 `TextBlob` will be installed/upgraded automatically when running
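A quick way to verify the fork installed correctly is a small smoke test. This is a minimal sketch, not part of the diff; it assumes the package exposes `TextBlobDE` at the top level, as documented in the upstream textblob-de README.

    # smoke_test.py -- minimal check after installing the fork (sketch, not in this commit)
    from textblob_de import TextBlobDE

    blob = TextBlobDE(u"Heute ist der 3. Mai 2014 und Dr. Meier feiert seinen 43. Geburtstag.")
    print(blob.tokens)     # word tokens from the German tokenizer
    print(blob.sentences)  # sentence segmentation with German-aware defaults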

tests/test_blob.py

Lines changed: 1 addition & 0 deletions

@@ -293,6 +293,7 @@ def test_translate(self):
         assert_true(translated in ["This is a sentence.",
                                    "This is a sentence .",
                                    "That's a sentence.",
+                                   "That is a sentence.",
                                    "That's a sentence ."])
 
     @expected_failure

tests/test_tokenizers.py

Lines changed: 35 additions & 10 deletions

@@ -191,7 +191,7 @@ class TestPatternTokenizer(unittest.TestCase):
     def setUp(self):
         self.tokenizer = PatternTokenizer()
         self.text = "Heute ist der 3. Mai 2014 und Dr. Meier feiert seinen 43. " \
-                    "Geburtstag."
+                    "Geburtstag. Er wünscht sich z. B. einen Pullover von Frau v. Stein."
         self.snt1 = "Heute ist der 3 ."
 
     def tearDown(self):
@@ -202,8 +202,7 @@ def test_tokenize(self):
                     ['Heute',
                      'ist',
                      'der',
-                     '3',
-                     '.',
+                     '3.',
                      'Mai',
                      '2014',
                      'und',
@@ -214,6 +213,18 @@ def test_tokenize(self):
                      '43',
                      '.',
                      'Geburtstag',
+                     '.',
+                     'Er',
+                     'wünscht',
+                     'sich',
+                     'z.',
+                     'B.',
+                     'einen',
+                     'Pullover',
+                     'von',
+                     'Frau',
+                     'v.',
+                     'Stein',
                      '.'])
 
     def test_exclude_punc(self):
@@ -231,12 +242,23 @@ def test_exclude_punc(self):
                      'feiert',
                      'seinen',
                      '43',
-                     'Geburtstag'])
+                     'Geburtstag',
+                     'Er',
+                     'wünscht',
+                     'sich',
+                     'z',
+                     'B',
+                     'einen',
+                     'Pullover',
+                     'von',
+                     'Frau',
+                     'v',
+                     'Stein'])
 
     def test_tokenize_nested(self):
         assert_equal(self.tokenizer.tokenize(self.text, nested=True),
-                     [['Heute', 'ist', 'der', '3', '.'],
-                      ['Mai',
+                     [['Heute', 'ist', 'der', '3.',
+                       'Mai',
                        '2014',
                        'und',
                        'Dr.',
@@ -245,7 +267,9 @@ def test_tokenize_nested(self):
                        'seinen',
                        '43',
                        '.'],
-                      ['Geburtstag', '.']])
+                      ['Geburtstag', '.'],
+                      ['Er', 'wünscht', 'sich', 'z.', 'B.', 'einen', 'Pullover', 'von', 'Frau', 'v.', 'Stein', '.']
+                      ])
 
     def test_itokenize(self):
         gen = self.tokenizer.itokenize(self.text)
@@ -255,9 +279,10 @@ def test_itokenize(self):
 
     def test_sent_tokenize(self):
         sents = self.tokenizer.sent_tokenize(self.text)
-        assert_equal(sents, ['Heute ist der 3 .',
-                             'Mai 2014 und Dr. Meier feiert seinen 43 .',
-                             'Geburtstag .'])
+        assert_equal(sents, ['Heute ist der 3. Mai 2014 und Dr. Meier feiert seinen 43 .',
+                             'Geburtstag .',
+                             'Er wünscht sich z. B. einen Pullover von Frau v. Stein .'
+                             ])
 
     def test_word_tokenize(self):
         tokens = self.tokenizer.word_tokenize(self.snt1)
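The behaviour these updated tests pin down can also be checked interactively. The snippet below is a sketch that mirrors the test fixture; it assumes `PatternTokenizer` is importable from `textblob_de.tokenizers`, as in the test suite.

    # sketch mirroring tests/test_tokenizers.py (not part of this commit)
    from textblob_de.tokenizers import PatternTokenizer

    tokenizer = PatternTokenizer()
    text = ("Heute ist der 3. Mai 2014 und Dr. Meier feiert seinen 43. "
            "Geburtstag. Er wünscht sich z. B. einen Pullover von Frau v. Stein.")

    # per the tests, '3.' stays one token and 'z.', 'B.', 'v.' keep their periods
    print(tokenizer.tokenize(text))
    # 'z. B.' and 'Frau v. Stein' no longer trigger spurious sentence breaks
    print(tokenizer.sent_tokenize(text))
    # nested=True returns one token list per sentence
    print(tokenizer.tokenize(text, nested=True))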

textblob_de/ext/_pattern/text/de/__init__.py

Lines changed: 12 additions & 3 deletions

@@ -171,16 +171,25 @@ def stts2universal(token, tag):
         return (token, PRON)
     return penntreebank2universal(*stts2penntreebank(token, tag))
 
+# let's add some legal abbreviations, too
+# let's also completely rule out at least simple ordinals
+# let's also rule out anything that could be a date
 ABBREVIATIONS = set((
     "Abs.", "Abt.", "Ass.", "Br.", "Ch.", "Chr.", "Cie.", "Co.", "Dept.", "Diff.",
     "Dr.", "Eidg.", "Exp.", "Fam.", "Fr.", "Hrsg.", "Inc.", "Inv.", "Jh.", "Jt.", "Kt.",
     "Mio.", "Mrd.", "Mt.", "Mte.", "Nr.", "Nrn.", "Ord.", "Ph.", "Phil.", "Pkt.",
     "Prof.", "Pt.", " S.", "St.", "Stv.", "Tit.", "VII.", "al.", "begr.", "bzw.",
-    "chem.", "dent.", "dipl.", "e.g.", "ehem.", "etc.", "excl.", "exkl.", "hum.",
+    "chem.", "dent.", "dipl.", "e.g.", "ehem.", "etc.", "excl.", "exkl.", "gem.", "hum.",
     "i.e.", "incl.", "ing.", "inkl.", "int.", "iur.", "lic.", "med.", "no.", "oec.",
     "phil.", "phys.", "pp.", "psych.", "publ.", "rer.", "sc.", "soz.", "spez.", "stud.",
-    "theol.", "usw.", "vet.", "vgl.", "vol.", "wiss.",
-    "d.h.", "h.c.", u"o.ä.", "u.a.", "z.B.", "z.T.", "z.Zt."
+    "theol.", "usw.", "v.", "vet.", "vgl.", "vol.", "wiss.",
+    "d.h.", "h.c.", u"o.ä.", "u.a.", "z.B.", "z.T.", "z.Zt.", "z. B.", "d. h.", "h. c.",
+    u"o. ä.", "u. a.", "z. B.", "z. T.", "z. Zt.",
+    "BGBl.", "ABl.", "Bundesgesetzbl.",
+    "0.", "1.", "2.", "3.", "4.", "5.", "6.", "7.", "8.", "9.", "10.", "11.", "12.", "13.",
+    "14.", "15.", "16.", "17.", "18.", "19.", "20.", "21.", "22.", "23.", "24.", "25.", "26.",
+    "27.", "28.", "29.", "30.", "31."
+
 ))
 
 def find_lemmata(tokens):
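The three comments added above state the intent: entries in ABBREVIATIONS keep the tokenizer from treating the trailing period of strings like "z. B.", a legal source such as "BGBl.", or a day ordinal in a date like "3. Mai 2014" as a sentence boundary, which is what the updated tokenizer tests assert. The literal "0." through "31." entries could also be generated rather than listed; the following is a hypothetical refactoring sketch, not part of this commit.

    # Hypothetical alternative to the hand-written "0.".."31." entries (not in the diff):
    # extend the existing ABBREVIATIONS set with programmatically generated day ordinals.
    DAY_ORDINALS = set("%d." % day for day in range(0, 32))
    ABBREVIATIONS |= DAY_ORDINALS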
