Skip to content
This repository has been archived by the owner on Apr 17, 2021. It is now read-only.

Commit

Permalink
organize files
Browse files Browse the repository at this point in the history
  • Loading branch information
motazsaad committed Sep 3, 2017
1 parent 5269c1e commit 9e37c17
Show file tree
Hide file tree
Showing 12 changed files with 15,801 additions and 36 deletions.
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ A python tool that converts Arabic diacritised text to a sequence of phonemes an

This code is based on https://github.com/nawarhalabi/Arabic-Phonetiser

Modifications mainly make the code in https://github.com/nawarhalabi/Arabic-Phonetiser compatible with python 3, and provide easy to use cmd tool to build the pronunciation dictionary.
Modifications mainly make the code in https://github.com/nawarhalabi/Arabic-Phonetiser compatible with Python 3, and provide an easy-to-use command-line tool to build the pronunciation dictionary.



```
python phonetise_Arabic.py nawar_corpus_tashkeel.txt
```


31 changes: 24 additions & 7 deletions corpus2dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import operator
import re

from nawar_phonotizer import phonetise_Arabic
import phonetise_Arabic
from arutils import arabic_utils

parser = argparse.ArgumentParser(description='extracts dictionary and phones from a corpus')
parser.add_argument('-i', '--input', type=argparse.FileType(mode='r', encoding='utf-8'),
Expand All @@ -15,7 +16,8 @@

def corpus2dictionary(corpus, project_name):
pronunciation_dict = {}
phones_list = list()
phones_list = set()
phones_list.add('SIL')
repeated_words = 0
for line in corpus:
if args.s_tag:
Expand All @@ -25,23 +27,38 @@ def corpus2dictionary(corpus, project_name):
words = sentence.split()
for word in words:
if word == '-':
if word == "+MUSIC+" or word == "+NONTRANS+":
continue
if not arabic_utils.remove_diacritics(word):
continue
if word.isdigit():
continue
phonetic = 'SIL'
pronunciation_dict[word] = phonetic
else:
# u1: utterances_pronunciations_with_boundaries
# u2: utterances_pronunciations
# dic
u1, u2, dic = phonetise_Arabic.my_phonetise(word)
if len(u2) > 1:
repeated_words += 1
phonetic = ' '.join(u2).replace('sil', '').strip()
phonetic = phonetic.replace('TTTT', 'TT')
phonetic = phonetic.replace('tttt', 'tt')
phonetic = phonetic.replace('i0i0', 'i0')
phonetic = phonetic.replace('nnnn', 'nn')
if len(phonetic.replace(' ', '')) < len(word):
phonetic = ' '.join(list(phonetise_Arabic.arabicToBuckwalter(word)))
if word in pronunciation_dict:
if pronunciation_dict[word] != phonetic:
repeated_words += 1
clean_word = arabic_utils.remove_diacritics(word)
if clean_word in pronunciation_dict:
if pronunciation_dict[clean_word] != phonetic:
pronunciation_dict[clean_word].append(phonetic)
else:
pronunciation_dict[clean_word] = [phonetic]
if word not in pronunciation_dict:
pronunciation_dict[word] = phonetic
for ph in phonetic.split():
if ph not in phones_list:
phones_list.append(ph)
phones_list.add(ph)

sorted_dict = sorted(pronunciation_dict.items(), key=operator.itemgetter(1))
phones_list = sorted(phones_list)
Expand Down
Loading

0 comments on commit 9e37c17

Please sign in to comment.