Decent preprocessing for Google Translate input

venkatarun95 · Feb 17, 2019 · d3b8660 · d3b8660
1 parent 25d0fc7
commit d3b8660
Show file tree

Hide file tree

Showing 2 changed files with 160 additions and 0 deletions.
diff --git a/auto_translate/glossary.csv b/auto_translate/glossary.csv
@@ -0,0 +1,27 @@
+rust, रस्ट, noun,
+variable, चर, noun,
+mutable, परिवर्तनशील, adjective,
+mutability, परिवर्तनशीलता, noun,
+immutable, अपरिवर्तनशील, adjective,
+immutability, अपरिवर्तनशीलता, noun,
+safety, सुरक्षा, noun,
+concurrent, सहवर्ती, adjective,
+concurrency, सहवर्तीता, noun,
+directory, फ़ोल्ङर, noun,
+code, कोङ, noun,
+compile, कंपाइल, noun,
+compile, कंपाइल, verb,
+compiler, कंपाइलर, noun,
+save, सेव, verb,
+program, प्रोग्राम, verb,
+programming, प्रोग्रामिंग, adjective,
+error, एरर, noun,
+error message, गलती की सूचना, noun,
+output, आउटपुट, noun,
+programmer, प्रोग्रामर, noun,
+rustacean, रस्ट के उपयोगिता, noun,
+value, जानकारी, noun,
+bug, बग, noun,
+implement, औजारित, noun,
+instance, इंस्टेंस, noun,
+string, स्ट्रिंग, noun,
diff --git a/auto_translate/prep.py b/auto_translate/prep.py
@@ -0,0 +1,133 @@
+from bs4 import BeautifulSoup
+import markdown
+
+import re
+import sys
+
+#import nltk
+
+def read_glossary(filename="glossary.csv"):
+    '''Return dictionary of english words -> hindi words'''
+    file = open(filename, 'r')
+    glossary = {}
+    for line in file.readlines():
+        if line.startswith('#'):
+            # Ignore comment
+            continue
+        eng, hin, _type, _descr = line.split(',')
+        glossary[eng.strip()] = hin.strip()
+    return glossary
+
+def replace_with_placeholder(text, reg, holder_id, name):
+    '''Replace all instances of the given regex with placeholders. @name gives
+    the name of the placeholder.'''
+
+    out = ""
+    md = {}
+    prev_start = 0
+    for match in reg.finditer(text):
+        out += text[prev_start:match.span()[0]] + ("_HOLDER_%d" % holder_id)
+        prev_start = match.span()[1]
+        md[holder_id] = (name, match.group())
+        holder_id += 1
+    out += text[prev_start:]
+    return (out, md, holder_id)
+
+
+def strip_markdown(input_text, holder_id = 0):
+    '''Process markdown text, returning simpler text with placeholders and some
+    metadata. The simple text, after machine translation, can be combined with
+    the placeholders to produce final text.
+
+    holder_id is used internally. It is the placeholder id, and is incremented
+    each time'''
+
+    out = ""
+    md = {}
+
+    # Some regular expressions
+    re_head = re.compile('(#+ )(.*)')
+    re_comment = re.compile('(<!--.*-->)')
+
+    # Whether or not we are currently processing a code-block
+    code_block_mode = None
+
+    for line in input_text.split('\n'):
+        if code_block_mode != None:
+            if line.startswith('```'):
+                # Code block finished. Add placeholder
+                # Add placeholder
+                out += "_HOLDER_%d\n\n" % holder_id
+                code_block_mode += line + '\n'
+                md[holder_id] = ("CodeBlock", code_block_mode)
+                holder_id += 1
+                code_block_mode = None
+            else:
+                code_block_mode += line + '\n'
+            continue
+
+        if line.startswith('```'):
+            assert(code_block_mode == None)
+            code_block_mode = line + '\n'
+            continue
+        elif line.startswith('#'):
+            # Add placeholder
+            head, text = re_head.match(line).groups()
+            out += "_HOLDER_%d\n\n" % holder_id
+            md[holder_id] = ("Head", head)
+            holder_id += 1
+
+            # Parse recursively
+            o, m, holder_id = strip_markdown(text, holder_id)
+            out += o
+            md.update(m)
+            continue
+
+        out += line + '\n'
+
+    # Replace all single-quoted pieces of code
+    re_single_code = re.compile('(\`[^`]+\`)')
+    out, new_md, holder_id = replace_with_placeholder(out, re_single_code, holder_id, 'Code')
+    md.update(new_md)
+
+    # Replace all spans
+    re_span = re.compile('(\<span.+\</span\>)')
+    out, new_md, holder_id = replace_with_placeholder(out, re_span, holder_id, 'Span')
+    md.update(new_md)
+
+    return (out, md, holder_id)
+
+def replacement_word(word):
+    return '_%s_' % (word.replace(' ', '_').upper())
+
+def replace_words(text, glossary):
+    # The most horrible, inefficient way to do this
+
+    for en_word in glossary:
+        new_out = ""
+        md = {}
+        prev_start = 0
+        for match in re.compile('\W(%ss?)\W' % en_word).finditer(text.lower()):
+            new_out += text[prev_start:match.span()[0] + 1] \
+            + replacement_word(match.groups()[0])
+            prev_start = match.span()[1] - 1
+        new_out += text[prev_start:]
+        text = new_out
+
+    # Replace all single newlines with a space
+
+    text = re.sub(r'([^\n])\n([^\n])', r'\1 \2', text)
+    return text
+
+
+# First parse markdown
+in_file = open(sys.argv[1], 'r')
+#encode_html(markdown.markdown(in_file.read()))
+simple_text, md, holder_id = strip_markdown(in_file.read())
+#print(simple_text, md)
+
+glossary = read_glossary()
+
+final = replace_words(simple_text, glossary)
+print(final)
+#print(unmd_text, md, holder_id)