Skip to content

Commit 994eb2f

Browse files
committed
0.3.1
1 parent a23516a commit 994eb2f

18 files changed

+18518
-0
lines changed

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
.idea/
2+
*.pyc
3+
*.egg-info
4+
*.swp
5+
PKG-INFO
6+
/dist
7+
/build

CHANGELOG.rst

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
rurecoder
2+
=========
3+
4+
Changelog
5+
---------
6+
7+
v0.1.0
8+
- Реализация базовой функциональности.
9+
10+
v0.2.0
11+
- Добавление декодеров. Теперь умеет декодить такие кракозябры (взял примеры на 2cyr.com):
12+
- - èðèëèöà
13+
- - %D0%A2%D0%BE%D0%B2%D0%B0+%D0%B5+%D0%BA
14+
- - åäíà ãîäè
15+
- - ирилица
16+
17+
v0.3.0
18+
- Добавлена поддержка python3.
19+
20+
v0.3.1
21+
- Удаление из зависимостей пакета regex.
22+
- Минорные фиксы.

MANIFEST.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
include README.rst *.py
2+
recursive-include rurecoder *.py *.json

README.rst

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
rurecoder
2+
=========
3+
4+
5+
Назначение
6+
----------
7+
8+
Пакет нужен, чтобы чинить "кракозябры" (или "краказябры"), превращая их в читаемый текст. Например: "õîğîøèé òåêñò" => "хороший текст".
9+
10+
11+
Установка
12+
---------
13+
::
14+
15+
$ git clone https://github.com/zvezdochiot/python-rurecoder #(based https://bitbucket.org/dkuryakin/recoder.git)
16+
$ cd python-rurecoder && python setup.py install
17+
18+
или
19+
::
20+
21+
$ pip install rurecoder
22+
23+
Полезные команды
24+
----------------
25+
26+
Использование как консольная тулза.
27+
::
28+
29+
$ echo "Îñíîâíàÿ Îëèìïèéñêàÿ äåðåâíÿ â" | python -mrurecoder [coding]
30+
31+
По умолчанию, coding=utf-8.
32+
33+
Использование в коде
34+
--------------------
35+
36+
Чаще всего с кракозябрами справится такой базовый пример:
37+
38+
.. code-block:: python
39+
40+
from rurecoder.cyrillic import Recoder
41+
rec = Recoder()
42+
broken_text = u'Îñíîâíàÿ Îëèìïèéñêàÿ äåðåâíÿ â'
43+
fixed_text = rec.fix_common(broken_text)
44+
print fixed_text.encode('utf-8')
45+
46+
47+
Если базовый пример не справился, можно поиграться с настройками:
48+
49+
.. code-block:: python
50+
51+
from rurecoder.cyrillic import Recoder
52+
rec = Recoder(depth=4)
53+
broken_text = u'...'
54+
fixed_text = rec.fix(broken_text) # fix работает дольше и сложнее чем fix_common
55+
...
56+
57+
58+
Можно использовать частоупотребимые слова (и, на, к, в, ...) как индикатор успеха перекодировки. Но в этом случае текст починится только если в нём есть эти слова:
59+
60+
.. code-block:: python
61+
62+
from rurecoder.cyrillic import Recoder
63+
rec = Recoder(use_plus_words=True)
64+
...
65+
66+
67+
Замечания
68+
---------
69+
70+
В данный момент поддерживается только кириллица.
71+
72+
Расширение
73+
----------
74+
75+
Если хочется расширить библиотеку не только кириллицей, предусмотрена удобная тулза:
76+
::
77+
78+
$ cat some_learning_text.txt | python -mrurecoder.builder [coding]
79+
80+
По умолчанию coding=utf-8. На stdin подавать текстовку для обучения. На выходе получится 2 файлика: 3grams.json и plus_words.json. Далее всё делается по аналогии с rurecoder.cyrillic.
81+
82+
Тесты
83+
-----
84+
85+
Тут всё просто:
86+
::
87+
88+
$ git clone https://github.com/zvezdochiot/python-rurecoder #(based https://bitbucket.org/dkuryakin/recoder.git)
89+
$ cd python-rurecoder && python setup.py test
90+
91+
See also CHANGELOG.rst

bin/rurecoder

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/usr/bin/env python
2+
3+
import sys, os
4+
5+
# Try to detect where the script is run from and set the prefix and the
# module search path accordingly.
# It is assumed that the user installed rurecoder using the --prefix= option.
prefix, bin_dir = os.path.split(sys.path[0])

if bin_dir == 'bin' and prefix != sys.prefix:
    sys.prefix = prefix
    sys.exec_prefix = prefix

    # Make packages installed under the custom prefix importable.
    major, minor = sys.version_info[0:2]
    local_path = [os.path.join(prefix, 'lib', 'python'),
                  os.path.join(prefix, 'lib', 'python%s.%s' % (major, minor)),
                  os.path.join(prefix, 'lib', 'python%s.%s' % (major, minor),
                               'site-packages')]
    sys.path = local_path + sys.path

# Imported here, after sys.path has been adjusted, like the package itself.
import runpy

if __name__ == '__main__':
    # Execute the package exactly as ``python -m rurecoder`` would.
    # rurecoder/__main__.py does its work at module level and exposes no
    # ``main`` callable, so ``from rurecoder.__main__ import main`` raised
    # ImportError; running the module under the name '__main__' works
    # whether or not that file guards its entry point.
    runpy.run_module('rurecoder', run_name='__main__')

rurecoder/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from __future__ import absolute_import
2+
3+
from . import tests
4+
from . import cyrillic
5+
from . import base_recoder

rurecoder/__main__.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# -*- coding: utf-8 -*-
2+
3+
from __future__ import unicode_literals
4+
5+
import sys
6+
from . import cyrillic
7+
from .pyver import *
8+
9+
def main():
    """Read broken text from stdin, repair it and write it to stdout.

    The optional first command-line argument names the codec used to decode
    stdin and encode stdout; it defaults to 'utf-8'.  On Python 3, when no
    codec is given, the text layer of ``sys.stdin`` / ``sys.stdout`` is used
    as-is instead of the binary buffers.
    """
    coding = sys.argv[1] if len(sys.argv) > 1 else 'utf-8'
    explicit_coding = len(sys.argv) != 1

    if pyver == 2:
        input_data = sys.stdin.read().decode(coding, errors='ignore')
    elif pyver == 3:
        if explicit_coding:
            input_data = sys.stdin.buffer.read().decode(coding, errors='ignore')
        else:
            input_data = sys.stdin.read()

    output_data = cyrillic.Recoder().fix_common(input_data)

    if pyver == 2:
        sys.stdout.write(output_data.encode(coding, errors='ignore'))
    elif pyver == 3:
        if explicit_coding:
            sys.stdout.buffer.write(output_data.encode(coding, errors='ignore'))
        else:
            sys.stdout.write(output_data)


# Guarded so that importing this module (e.g. ``from rurecoder.__main__
# import main`` in bin/rurecoder) does not consume stdin as a side effect.
if __name__ == '__main__':
    main()

rurecoder/base_recoder.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# -*- coding: utf-8 -*-
2+
3+
from __future__ import with_statement
4+
5+
from .pyver import *
6+
import json
7+
8+
# Only BaseRecoder is exported by ``from <this module> import *``.
__all__ = ['BaseRecoder']
9+
10+
# Single module-level parser instance; BaseRecoder.funcs calls its
# ``unescape`` method to decode HTML entities.
# NOTE(review): ``HTMLParser`` is presumably provided by the star import from
# .pyver (not visible here) - confirm.
html_parser = HTMLParser.HTMLParser()
11+
12+
class BaseRecoder(object):
    """Base class for recoders that repair wrongly decoded text.

    Candidate texts are scored by the mean weight of their character 3-grams,
    looked up in a table loaded from ``file_3grams``; a candidate replaces
    the current best only when its score is strictly higher.

    The class attributes below are ``None`` here and must be set (for
    example by a subclass) before instantiation, because ``__init__`` opens
    both files and the fix methods iterate over ``codings``.
    """

    # Not used by this class itself.
    # NOTE(review): presumably a helper for subclasses building the file
    # paths below - confirm.
    base_dir = None
    # Path to a JSON object mapping 3-character strings to numeric weights.
    file_3grams = None
    # Path to a JSON array of "plus words" (see ``use_plus_words``).
    file_plus_words = None
    # Iterable of codec names tried when encoding / decoding.
    codings = None

    # Each entry is [callable, coding_dependent, allowed_input_types].
    # The callable is invoked with the keyword arguments ``text``, ``coding``
    # and ``errors``; coding-dependent entries are tried once per codec in
    # ``codings``, the others once with a placeholder codec name.
    funcs = [
        [lambda *args, **kwargs: kwargs['text'].encode(kwargs['coding'], errors=kwargs['errors']), True, (unicode_type,)],
        [lambda *args, **kwargs: kwargs['text'].decode(kwargs['coding'], errors=kwargs['errors']), True, (encoded_type,)],
        [lambda *args, **kwargs: unquote_plus(kwargs['text']), False, (encoded_type,)],
        [lambda *args, **kwargs: unquote_plus(kwargs['text'].replace('=', '%')), False, (encoded_type,)],
        [lambda *args, **kwargs: html_parser.unescape(kwargs['text']), False, (unicode_type,)],
    ]

    # Exceptions that merely mean "this conversion is not applicable".
    regular_error_classes = (
        UnicodeError,
        AttributeError,  # for py3 support
    )

    def __init__(self, depth=2, errors='ignore', use_plus_words=False):
        """Load the 3-gram table and the plus-word list.

        :param depth: maximum number of chained conversions tried by ``fix``.
        :param errors: error handler passed to ``encode`` / ``decode``.
        :param use_plus_words: when true, a candidate is accepted only if it
            contains at least one plus word surrounded by spaces.
        """
        self.depth = depth
        self.errors = errors
        self.use_plus_words = use_plus_words
        # Callable reproducing the most recent accepted fix, or None.
        self.last_transform = None

        with open(self.file_3grams) as f:
            self.grams = json.load(f)

        with open(self.file_plus_words) as f:
            self.plus_words = set(json.load(f))

    @staticmethod
    def _compose(func, transform, coding, errors):
        """Return a callable that applies ``transform`` and then ``func``.

        All arguments are bound when the callable is created, so it keeps
        working after the loops that produced them have moved on.
        """
        def composed(_text):
            return func(text=transform(_text), coding=coding, errors=errors)
        return composed

    @staticmethod
    def _make_recode(ce, cd, errors):
        """Return a callable that encodes with ``ce`` and decodes with ``cd``."""
        def recode(text):
            return text.encode(ce, errors=errors).decode(cd, errors=errors)
        return recode

    def _contains_plus_word(self, text):
        """Return True if any plus word occurs in ``text`` between spaces."""
        for word in self.plus_words:
            if (' ' + word + ' ') in text:
                return True
        return False

    def _iter(self, text, depth, transform=lambda _text: _text):
        """Yield ``(converted_text, transform)`` for conversion chains.

        Chains of up to ``depth`` entries of ``funcs`` are applied to
        ``text``.  ``transform`` is the chain that produced ``text`` from the
        original input; each yielded transform extends it by one step.
        Conversions raising one of ``regular_error_classes`` are skipped.
        """
        if depth <= 0:
            # A plain return ends the generator.  ``raise StopIteration``
            # inside a generator is turned into RuntimeError on
            # Python 3.7+ (PEP 479).
            return
        for func, coding_dependent, allowed_types in self.funcs:
            if not isinstance(text, allowed_types):
                continue
            for coding in (self.codings if coding_dependent else ['fake_coding']):
                try:
                    fixed_text = func(text=text, coding=coding, errors=self.errors)
                    # Bind func/coding now: a lambda closing over the loop
                    # variables would see their *later* values when it is
                    # replayed through ``last_transform``.
                    new_transform = self._compose(func, transform, coding, self.errors)
                    yield fixed_text, new_transform
                    for sub_fixed_text, sub_new_transform in self._iter(fixed_text, depth - 1, new_transform):
                        yield sub_fixed_text, sub_new_transform
                except self.regular_error_classes:
                    pass

    def _calc_weight(self, text):
        """Return the mean 3-gram weight of ``text``.

        3-grams missing from the table count as 0.0; a text shorter than
        three characters scores 0.0.
        """
        weight = 0.0
        count = 0

        for i in range_iterator(len(text) - 2):
            gram = text[i:i+3]
            weight += self.grams.get(gram, 0.0)
            count += 1
        return (weight / count) if count else 0.0

    def fix(self, unicode_text):
        """Return the best-scoring repair of ``unicode_text``.

        Every conversion chain produced by ``_iter`` is tried; results that
        are not unicode text are ignored.  Scoring is done on the lowercased
        candidate, but the returned text keeps its case.  The input is
        returned unchanged when no candidate scores higher than it.  The
        winning chain is stored in ``last_transform``.
        """
        max_weight = self._calc_weight(unicode_text.lower())
        max_text = unicode_text
        for fixed_text, transform in self._iter(unicode_text, self.depth):
            if not isinstance(fixed_text, unicode_type):
                continue
            lowered = fixed_text.lower()
            weight = self._calc_weight(lowered)
            if weight > max_weight and (not self.use_plus_words or self._contains_plus_word(lowered)):
                max_weight = weight
                # fixed_text is by construction transform(unicode_text), so
                # the chain does not need to be run a second time.
                max_text = fixed_text
                self.last_transform = transform
        return max_text

    def fix_common(self, unicode_text):
        """Return the best repair using a single encode/decode round trip.

        Every ordered pair of distinct codecs from ``codings`` is tried:
        the text is encoded with the first and decoded with the second.
        Scoring, the plus-word condition and ``last_transform`` behave as in
        ``fix``.
        """
        max_weight = self._calc_weight(unicode_text.lower())
        max_text = unicode_text

        for ce in self.codings:
            for cd in self.codings:
                if ce == cd:
                    continue
                try:
                    candidate = unicode_text.encode(ce, errors=self.errors).decode(cd, errors=self.errors)
                    lowered = candidate.lower()
                    weight = self._calc_weight(lowered)
                    if weight > max_weight and (not self.use_plus_words or self._contains_plus_word(lowered)):
                        max_weight = weight
                        max_text = candidate
                        # Bind ce/cd now; a lambda closing over the loop
                        # variables would end up using the last codec pair.
                        self.last_transform = self._make_recode(ce, cd, self.errors)
                except self.regular_error_classes:
                    pass
        return max_text

rurecoder/builder/__init__.py

Whitespace-only changes.

rurecoder/builder/__main__.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# -*- coding: utf-8 -*-
2+
3+
from __future__ import with_statement
4+
5+
import sys
6+
import json
7+
from .builder import Builder
8+
from ..pyver import *
9+
10+
# The codec name is the optional first command-line argument.
if len(sys.argv) > 1:
    coding = sys.argv[1]
else:
    coding = 'utf-8'

# Read the training text from stdin.  On Python 3 without an explicit codec
# the text layer of sys.stdin is used instead of the binary buffer.
if pyver == 2:
    input_data = sys.stdin.read().decode(coding, errors='ignore')
elif pyver == 3:
    if len(sys.argv) != 1:
        input_data = sys.stdin.buffer.read().decode(coding, errors='ignore')
    else:
        input_data = sys.stdin.read()

builder = Builder()

# Each output file is opened first and then filled with the data built from
# the training text, in this order.
for out_name, build in (('3grams.json', builder.build_grams),
                        ('plus_words.json', builder.build_plus_words)):
    with open(out_name, 'w') as f:
        json.dump(build(input_data), f, indent=4)

0 commit comments

Comments
 (0)