organize files

yoosif0 · Sep 3, 2017 · 9e37c17 · 9e37c17
1 parent 5269c1e
commit 9e37c17
Show file tree

Hide file tree

Showing 12 changed files with 15,801 additions and 36 deletions.
diff --git a/README.md b/README.md
@@ -4,6 +4,12 @@ A python tool that converts Arabic diacritised text to a sequence of phonemes an
 
 This code is based on https://github.com/nawarhalabi/Arabic-Phonetiser
 
-Modifications mainly make the code in https://github.com/nawarhalabi/Arabic-Phonetiser compatible with python 3, and provide easy to use cmd tool to build the pronunciation dictionary.   
+Modifications mainly make the code in https://github.com/nawarhalabi/Arabic-Phonetiser compatible with Python 3, and provide an easy-to-use command-line tool to build the pronunciation dictionary.
+
+
+
+```
+python phonetise_Arabic.py nawar_corpus_tashkeel.txt
+```  
 
 
diff --git a/corpus2dict.py b/corpus2dict.py
@@ -2,7 +2,8 @@
 import operator
 import re
 
-from nawar_phonotizer import phonetise_Arabic
+import phonetise_Arabic
+from arutils import arabic_utils
 
 parser = argparse.ArgumentParser(description='extracts dictionary and phones from a corpus')
 parser.add_argument('-i', '--input', type=argparse.FileType(mode='r', encoding='utf-8'),
@@ -15,7 +16,8 @@
 
 def corpus2dictionary(corpus, project_name):
     pronunciation_dict = {}
-    phones_list = list()
+    phones_list = set()
+    phones_list.add('SIL')
     repeated_words = 0
     for line in corpus:
         if args.s_tag:
@@ -25,23 +27,38 @@ def corpus2dictionary(corpus, project_name):
         words = sentence.split()
         for word in words:
             if word == '-':
+                if word == "+MUSIC+" or word == "+NONTRANS+":
+                    continue
+                if not arabic_utils.remove_diacritics(word):
+                    continue
+                if word.isdigit():
+                    continue
                 phonetic = 'SIL'
                 pronunciation_dict[word] = phonetic
             else:
+                # u1: utterances_pronunciations_with_boundaries
+                # u2: utterances_pronunciations
+                # dic
                 u1, u2, dic = phonetise_Arabic.my_phonetise(word)
                 if len(u2) > 1:
                     repeated_words += 1
                 phonetic = ' '.join(u2).replace('sil', '').strip()
+                phonetic = phonetic.replace('TTTT', 'TT')
+                phonetic = phonetic.replace('tttt', 'tt')
+                phonetic = phonetic.replace('i0i0', 'i0')
+                phonetic = phonetic.replace('nnnn', 'nn')
                 if len(phonetic.replace(' ', '')) < len(word):
                     phonetic = ' '.join(list(phonetise_Arabic.arabicToBuckwalter(word)))
-                if word in pronunciation_dict:
-                    if pronunciation_dict[word] != phonetic:
-                        repeated_words += 1
+                clean_word = arabic_utils.remove_diacritics(word)
+                if clean_word in pronunciation_dict:
+                    if pronunciation_dict[clean_word] != phonetic:
+                        pronunciation_dict[clean_word].append(phonetic)
+                else:
+                    pronunciation_dict[clean_word] = [phonetic]
                 if word not in pronunciation_dict:
                     pronunciation_dict[word] = phonetic
             for ph in phonetic.split():
-                if ph not in phones_list:
-                    phones_list.append(ph)
+                phones_list.add(ph)
 
     sorted_dict = sorted(pronunciation_dict.items(), key=operator.itemgetter(1))
     phones_list = sorted(phones_list)