-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Decent preprocessing for Google Translate input
- Loading branch information
1 parent
25d0fc7
commit d3b8660
Showing
2 changed files
with
160 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
rust, रस्ट, noun, | ||
variable, चर, noun, | ||
mutable, परिवर्तनशील, adjective, | ||
mutability, परिवर्तनशीलता, noun, | ||
immutable, अपरिवर्तनशील, adjective, | ||
immutability, अपरिवर्तनशीलता, noun, | ||
safety, सुरक्षा, noun, | ||
concurrent, सहवर्ती, adjective, | ||
concurrency, सहवर्तीता, noun, | ||
directory, फ़ोल्डर, noun, | |
code, कोड, noun, | |
compile, कंपाइल, noun, | ||
compile, कंपाइल, verb, | ||
compiler, कंपाइलर, noun, | ||
save, सेव, verb, | ||
program, प्रोग्राम, verb, | ||
programming, प्रोग्रामिंग, adjective, | ||
error, एरर, noun, | ||
error message, गलती की सूचना, noun, | ||
output, आउटपुट, noun, | ||
programmer, प्रोग्रामर, noun, | ||
rustacean, रस्ट के उपयोगकर्ता, noun, | |
value, जानकारी, noun, | ||
bug, बग, noun, | ||
implement, औजारित, noun, | ||
instance, इंस्टेंस, noun, | ||
string, स्ट्रिंग, noun, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
from bs4 import BeautifulSoup | ||
import markdown | ||
|
||
import re | ||
import sys | ||
|
||
#import nltk | ||
|
||
def read_glossary(filename="glossary.csv"):
    """Return a dictionary mapping English terms to Hindi terms.

    Each entry line of *filename* has comma-separated fields whose first two
    are the English term and its Hindi translation; any further fields
    (word type, description) are ignored.  Whitespace around both terms is
    stripped.  Lines starting with ``#`` and blank lines are skipped.  When a
    term occurs more than once, the last occurrence wins.

    Raises:
        ValueError: if an entry line has fewer than two fields.
        OSError: if the file cannot be opened.
    """
    glossary = {}
    # The glossary holds Devanagari text, so do not depend on the locale's
    # default encoding; ``with`` guarantees the handle is closed.
    with open(filename, 'r', encoding='utf-8') as file:
        for line_no, line in enumerate(file, start=1):
            if line.startswith('#') or not line.strip():
                # Ignore comments and blank lines
                continue
            # Only the first two fields matter, so a description that itself
            # contains commas must not break the parse.
            fields = line.split(',')
            if len(fields) < 2:
                raise ValueError(
                    "%s:%d: expected 'english, hindi, ...', got %r"
                    % (filename, line_no, line))
            eng = fields[0].strip()
            if not eng:
                # An empty term could never be looked up meaningfully.
                continue
            glossary[eng] = fields[1].strip()
    return glossary
|
||
def replace_with_placeholder(text, reg, holder_id, name):
    """Replace every match of the compiled regex *reg* in *text* with a
    placeholder of the form ``_HOLDER_<id>``.

    Ids are handed out sequentially starting at *holder_id*.  @name gives
    the name of the placeholder; it is stored next to the matched text.

    Returns a tuple ``(out, md, holder_id)``:
      * ``out`` -- *text* with each match replaced by its placeholder,
      * ``md`` -- dict mapping each id used to ``(name, matched_text)``,
      * ``holder_id`` -- the next unused placeholder id.
    """
    # Collect the pieces and join once instead of growing a string with +=.
    pieces = []
    md = {}
    prev_end = 0
    for match in reg.finditer(text):
        pieces.append(text[prev_end:match.start()])
        pieces.append("_HOLDER_%d" % holder_id)
        md[holder_id] = (name, match.group())
        holder_id += 1
        prev_end = match.end()
    # Whatever follows the last match (or the whole text if nothing matched)
    pieces.append(text[prev_end:])
    return ("".join(pieces), md, holder_id)
|
||
|
||
def strip_markdown(input_text, holder_id=0):
    """Process markdown text, returning simpler text with placeholders and
    some metadata.  The simple text, after machine translation, can be
    combined with the placeholders to produce final text.

    The following constructs are replaced by ``_HOLDER_<id>`` placeholders:
      * fenced code blocks (lines from an opening ``` line up to and
        including the closing ``` line), recorded as ``"CodeBlock"``;
      * heading markers (the leading ``#`` characters and the space after
        them), recorded as ``"Head"``; the heading text itself is kept and
        processed recursively;
      * inline code between backticks, recorded as ``"Code"``;
      * ``<span ...</span>`` runs, recorded as ``"Span"``.

    holder_id is used internally.  It is the placeholder id, and is
    incremented each time a placeholder is emitted.

    Returns a tuple ``(out, md, holder_id)`` where ``out`` is the simplified
    text, ``md`` maps each placeholder id to ``(kind, original_text)`` and
    ``holder_id`` is the next unused id.
    """
    out_parts = []
    md = {}

    # Heading: one or more '#' followed by a space, then the heading text
    re_head = re.compile(r'(#+ )(.*)')

    # Lines of the code block currently being collected, or None when we
    # are not inside a code block
    code_block = None

    for line in input_text.split('\n'):
        if code_block is not None:
            code_block.append(line + '\n')
            if line.startswith('```'):
                # Code block finished (closing fence included). Add placeholder
                out_parts.append("_HOLDER_%d\n\n" % holder_id)
                md[holder_id] = ("CodeBlock", "".join(code_block))
                holder_id += 1
                code_block = None
            continue

        if line.startswith('```'):
            code_block = [line + '\n']
            continue

        # A line such as "#hashtag" or a bare "#" starts with '#' but is not
        # a heading; it falls through and is kept as ordinary text.
        head_match = re_head.match(line) if line.startswith('#') else None
        if head_match is not None:
            head, text = head_match.groups()
            # Add placeholder for the heading marker only
            out_parts.append("_HOLDER_%d\n\n" % holder_id)
            md[holder_id] = ("Head", head)
            holder_id += 1

            # Parse the heading text recursively
            o, m, holder_id = strip_markdown(text, holder_id)
            out_parts.append(o)
            md.update(m)
            continue

        out_parts.append(line + '\n')

    if code_block is not None:
        # The input ended inside an unterminated code block: keep what was
        # collected behind a placeholder rather than silently dropping it.
        out_parts.append("_HOLDER_%d\n\n" % holder_id)
        md[holder_id] = ("CodeBlock", "".join(code_block))
        holder_id += 1

    out = "".join(out_parts)

    # Replace all single-quoted pieces of code
    re_single_code = re.compile(r'(`[^`]+`)')
    out, new_md, holder_id = replace_with_placeholder(out, re_single_code, holder_id, 'Code')
    md.update(new_md)

    # Replace all spans.  NOTE: '.+' is greedy, so several spans on one line
    # are captured, together with the text between them, as one placeholder.
    re_span = re.compile(r'(<span.+</span>)')
    out, new_md, holder_id = replace_with_placeholder(out, re_span, holder_id, 'Span')
    md.update(new_md)

    return (out, md, holder_id)
|
||
def replacement_word(word):
    """Return the placeholder token for the glossary term *word*.

    Spaces are replaced with underscores, the result is upper-cased and
    wrapped in single underscores, e.g. ``"error message"`` becomes
    ``"_ERROR_MESSAGE_"``.
    """
    return '_%s_' % word.replace(' ', '_').upper()
|
||
def replace_words(text, glossary):
    """Replace glossary terms in *text* with placeholder tokens and join
    wrapped lines.

    Every case-insensitive occurrence of a key of *glossary*, optionally
    followed by a plural ``s``, that is not directly adjacent to another
    word character is replaced by ``replacement_word(<matched text>)``.
    Afterwards every single newline that sits between two non-newline
    characters is replaced with a space, so blank-line paragraph breaks are
    kept.

    Returns the rewritten text.
    """
    # Longest terms first, so a multi-word entry ("error message") is
    # preferred over an entry that is a prefix of it ("error").  Empty keys
    # are dropped: they would match between any two characters.
    terms = sorted((term for term in glossary if term), key=len, reverse=True)
    if terms:
        # One pass over the text with all terms in a single alternation.
        # Lookarounds (rather than consuming \W on both sides) also match
        # terms at the very start/end of the text and terms separated by a
        # single character.  re.escape keeps terms such as "c++" literal.
        pattern = re.compile(
            r'(?<!\w)((?:%s)s?)(?!\w)' % '|'.join(re.escape(term) for term in terms),
            re.IGNORECASE)
        text = pattern.sub(lambda match: replacement_word(match.group(1)), text)

    # Replace all single newlines with a space.  The neighbours are only
    # looked at, not consumed, so consecutive one-character lines are joined
    # as well.
    text = re.sub(r'(?<=[^\n])\n(?=[^\n])', ' ', text)
    return text
|
||
|
||
def main(argv=None):
    """Preprocess the markdown file named on the command line and print the
    result.

    The file given as ``argv[1]`` (``sys.argv`` when *argv* is None) is
    stripped of markdown, glossary terms from the default glossary file are
    replaced with placeholder tokens, and the resulting text is printed to
    standard output.  Exits with a usage message if no file is given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: %s MARKDOWN_FILE" % (argv[0] if argv else "preprocess"))

    # First parse markdown
    with open(argv[1], 'r', encoding='utf-8') as in_file:
        # The placeholder metadata and the next id are not used here.
        simple_text, _md, _holder_id = strip_markdown(in_file.read())

    glossary = read_glossary()

    final = replace_words(simple_text, glossary)
    print(final)


if __name__ == "__main__":
    main()