1 change: 1 addition & 0 deletions docs/api/transliterate.rst
@@ -9,6 +9,7 @@ Modules

.. autofunction:: romanize
.. autofunction:: transliterate
.. autofunction:: pronunciate

Romanize Engines
----------------
3 changes: 2 additions & 1 deletion pythainlp/transliterate/__init__.py
@@ -6,6 +6,7 @@
__all__ = [
"romanize",
"transliterate",
"pronunciate"
]

from pythainlp.transliterate.core import romanize, transliterate
from pythainlp.transliterate.core import romanize, transliterate, pronunciate
70 changes: 52 additions & 18 deletions pythainlp/transliterate/core.py
@@ -2,6 +2,7 @@

DEFAULT_ROMANIZE_ENGINE = "royin"
DEFAULT_TRANSLITERATE_ENGINE = "thaig2p"
DEFAULT_PRONUNCIATE_ENGINE = "w2p"


def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:
@@ -46,7 +47,7 @@ def romanize(text: str, engine: str = DEFAULT_ROMANIZE_ENGINE) -> str:

if engine == "thai2rom":
from .thai2rom import romanize
else: # use default engine "royin"
else: # use default engine: "royin"
from .royin import romanize

return romanize(text)
@@ -59,51 +60,84 @@ def transliterate(
This function transliterates Thai text.

:param str text: Thai text to be transliterated
:param str engine: 'icu', 'ipa' (default), or 'thaig2p'
:param str engine: 'icu', 'ipa', or 'thaig2p' (default)

:return: A string of phonetic alphabets indicating
how the input text should be pronounced.
:rtype: str

:Options for engines:
* *icu* - International Components for Unicode (ICU)
* *ipa* - International Phonetic Alphabet (IPA) by epitran
* *thaig2p* - (default) Thai Grapheme to Phoneme by deep learning
output is International Phonetic Alphabet (IPA)
(require PyTorch)
* *icu* - pyicu, based on International Components for Unicode (ICU)
* *ipa* - epitran, output is International Phonetic Alphabet (IPA)
* *thaig2p* - (default) Thai Grapheme-to-Phoneme,
output is IPA (requires PyTorch)

:Example:
::

from pythainlp.transliterate import transliterate

transliterate("สามารถ", engine="thaig2p")
# output: 's aː ˩˩˦ . m aː t̚ ˥˩'
transliterate("สามารถ", engine="icu")
# output: 's̄āmārt̄h'

transliterate("สามารถ", engine="ipa")
# output: 'saːmaːrot'

transliterate("สามารถ", engine="icu")
# output: 's̄āmārt̄h'
transliterate("สามารถ", engine="thaig2p")
# output: 's aː ˩˩˦ . m aː t̚ ˥˩'

transliterate("ภาพยนตร์", engine="thaig2p")
# output: 'pʰ aː p̚ ˥˩ . pʰ a ˦˥ . j o n ˧'
transliterate("ภาพยนตร์", engine="icu")
# output: 'p̣hāphyntr̒'

transliterate("ภาพยนตร์", engine="ipa")
# output: 'pʰaːpjanot'

transliterate("ภาพยนตร์", engine="icu")
# output: 'p̣hāphyntr̒'
transliterate("ภาพยนตร์", engine="thaig2p")
# output: 'pʰ aː p̚ ˥˩ . pʰ a ˦˥ . j o n ˧'
"""

if not text or not isinstance(text, str):
return ""

if engine == "icu" or engine == "pyicu":
from .pyicu import transliterate
elif engine == "thaig2p":
from .thaig2p import transliterate
else:
elif engine == "ipa":
from .ipa import transliterate
else: # use default engine: "thaig2p"
from .thaig2p import transliterate

return transliterate(text)


def pronunciate(word: str, engine: str = DEFAULT_PRONUNCIATE_ENGINE) -> str:
"""
This function converts a Thai word to its pronunciation (in Thai letters).

:param str word: Thai word whose pronunciation is wanted
:param str engine: 'w2p' (default)

:return: A string of Thai letters indicating
how the input word should be pronounced.
:rtype: str

:Options for engines:
* *w2p* - Thai Word-to-Phoneme

:Example:
::

from pythainlp.transliterate import pronunciate

pronunciate("สามารถ", engine="w2p")
# output: 'สา-มาด'

pronunciate("ภาพยนตร์", engine="w2p")
# output: 'พาบ-พะ-ยน'
"""
if not word or not isinstance(word, str):
return ""

# if engine == "w2p": # has only one engine
from .w2p import pronunciate

return pronunciate(word)
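Taken together with the __init__.py change above, pronunciate becomes part of the public pythainlp.transliterate API next to romanize and transliterate. A minimal usage sketch, assuming a PyThaiNLP build that includes this pull request (the thai_w2p model weights are downloaded on first use):

from pythainlp.transliterate import pronunciate, romanize, transliterate

word = "สามารถ"
print(romanize(word))  # default engine "royin"
print(transliterate(word))  # default engine "thaig2p", IPA output
print(pronunciate(word, engine="w2p"))  # 'สา-มาด', per the docstring above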
9 changes: 4 additions & 5 deletions pythainlp/transliterate/thaig2p.py
@@ -18,12 +18,11 @@


class ThaiG2P:
def __init__(self):
"""
Transliteration of Thai words.
"""
Latin transliteration of Thai words, using International Phonetic Alphabet
"""

Now supports Thai to Latin (romanization)
"""
def __init__(self):
# get the model, will download if it's not available locally
self.__model_filename = get_corpus_path(_MODEL_NAME)

205 changes: 205 additions & 0 deletions pythainlp/transliterate/w2p.py
@@ -0,0 +1,205 @@
# -*- coding: utf-8 -*-
"""
Thai Word-to-Phoneme (Thai W2P)
GitHub : https://github.com/wannaphong/Thai_W2P
"""

import codecs
import os
import re
from typing import Union

import numpy as np
from pythainlp.corpus import download, get_corpus_path

_GRAPHEMES = list(
"พจใงต้ืฮแาฐฒฤๅูศฅถฺฎหคสุขเึดฟำฝยลอ็ม"
+ " ณิฑชฉซทรฏฬํัฃวก่ป์ผฆบี๊ธญฌษะไ๋นโภ?"
)
_PHONEMES = list(
"-พจใงต้ืฮแาฐฒฤูศฅถฺฎหคสุขเึดฟำฝยลอ็ม"
+ " ณิฑชฉซทรํฬฏ–ัฃวก่ปผ์ฆบี๊ธฌญะไษ๋นโภ?"
)

_MODEL_NAME = "thai_w2p"


class _Hparams:
batch_size = 256
enc_maxlen = 30 * 2
dec_maxlen = 40 * 2
num_epochs = 50 * 2
hidden_units = 64 * 8
emb_units = 64 * 4
graphemes = ["<pad>", "<unk>", "</s>"] + _GRAPHEMES
phonemes = ["<pad>", "<unk>", "<s>", "</s>"] + _PHONEMES
lr = 0.001


hp = _Hparams()


def _load_vocab():
g2idx = {g: idx for idx, g in enumerate(hp.graphemes)}
idx2g = {idx: g for idx, g in enumerate(hp.graphemes)}

p2idx = {p: idx for idx, p in enumerate(hp.phonemes)}
idx2p = {idx: p for idx, p in enumerate(hp.phonemes)}
# note that g and p mean grapheme and phoneme, respectively.
return g2idx, idx2g, p2idx, idx2p


class Thai_W2P(object):
def __init__(self):
super().__init__()
self.graphemes = hp.graphemes
self.phonemes = hp.phonemes
self.g2idx, self.idx2g, self.p2idx, self.idx2p = _load_vocab()
self.checkpoint = get_corpus_path(_MODEL_NAME)
if self.checkpoint is None:
download(_MODEL_NAME)
self.checkpoint = get_corpus_path(_MODEL_NAME)
self._load_variables()

def _load_variables(self):
self.variables = np.load(self.checkpoint, allow_pickle=True)
# (29, 64). (len(graphemes), emb)
self.enc_emb = self.variables.item().get("encoder.emb.weight")
# (3*128, 64)
self.enc_w_ih = self.variables.item().get("encoder.rnn.weight_ih_l0")
# (3*128, 128)
self.enc_w_hh = self.variables.item().get("encoder.rnn.weight_hh_l0")
# (3*128,)
self.enc_b_ih = self.variables.item().get("encoder.rnn.bias_ih_l0")
# (3*128,)
self.enc_b_hh = self.variables.item().get("encoder.rnn.bias_hh_l0")

# (74, 64). (len(phonemes), emb)
self.dec_emb = self.variables.item().get("decoder.emb.weight")
# (3*128, 64)
self.dec_w_ih = self.variables.item().get("decoder.rnn.weight_ih_l0")
# (3*128, 128)
self.dec_w_hh = self.variables.item().get("decoder.rnn.weight_hh_l0")
# (3*128,)
self.dec_b_ih = self.variables.item().get("decoder.rnn.bias_ih_l0")
# (3*128,)
self.dec_b_hh = self.variables.item().get("decoder.rnn.bias_hh_l0")
# (74, 128)
self.fc_w = self.variables.item().get("decoder.fc.weight")
# (74,)
self.fc_b = self.variables.item().get("decoder.fc.bias")

def _sigmoid(self, x):
return 1 / (1 + np.exp(-x))

def _grucell(self, x, h, w_ih, w_hh, b_ih, b_hh):
# One GRU step. Weights follow the PyTorch layout: the three gates
# (reset r, update z, candidate n) are stacked along the first axis.
rzn_ih = np.matmul(x, w_ih.T) + b_ih
rzn_hh = np.matmul(h, w_hh.T) + b_hh

# first two thirds -> reset/update gates, last third -> candidate
rz_ih, n_ih = (
rzn_ih[:, : rzn_ih.shape[-1] * 2 // 3],
rzn_ih[:, rzn_ih.shape[-1] * 2 // 3:],
)
rz_hh, n_hh = (
rzn_hh[:, : rzn_hh.shape[-1] * 2 // 3],
rzn_hh[:, rzn_hh.shape[-1] * 2 // 3:],
)

rz = self._sigmoid(rz_ih + rz_hh)
r, z = np.split(rz, 2, -1)

n = np.tanh(n_ih + r * n_hh)  # candidate hidden state
h = (1 - z) * n + z * h  # blend candidate and previous state

return h

def _gru(self, x, steps, w_ih, w_hh, b_ih, b_hh, h0=None) -> np.ndarray:
if h0 is None:
h0 = np.zeros((x.shape[0], w_hh.shape[1]), np.float32)
h = h0 # initial hidden state

outputs = np.zeros((x.shape[0], steps, w_hh.shape[1]), np.float32)
for t in range(steps):
h = self._grucell(x[:, t, :], h, w_ih, w_hh, b_ih, b_hh) # (b, h)
outputs[:, t, ::] = h

return outputs

def _encode(self, word: str) -> np.ndarray:
chars = list(word) + ["</s>"]
x = [self.g2idx.get(char, self.g2idx["<unk>"]) for char in chars]
# look up grapheme embeddings -> shape (1, len(word) + 1, emb_units)
x = np.take(self.enc_emb, np.expand_dims(x, 0), axis=0)

return x

def _short_word(self, word: str) -> Union[str, None]:
# Abbreviations ending with "." are spelled out letter by letter,
# e.g. "มข." -> "มอ-ขอ"; ordinary words fall through and return None.
self.word = word
if self.word.endswith("."):
self.word = self.word.replace(".", "")
self.word = "-".join([i + "อ" for i in list(self.word)])
return self.word
return None

def _predict(self, word: str) -> Union[str, list]:
short_word = self._short_word(word)
if short_word is not None:
return short_word

# encoder
enc = self._encode(word)
enc = self._gru(
enc,
len(word) + 1,
self.enc_w_ih,
self.enc_w_hh,
self.enc_b_ih,
self.enc_b_hh,
h0=np.zeros((1, self.enc_w_hh.shape[-1]), np.float32),
)
last_hidden = enc[:, -1, :]

# decoder
dec = np.take(self.dec_emb, [2], axis=0) # 2: <s>
h = last_hidden

preds = []
for _ in range(20):  # decode greedily, at most 20 output phonemes
h = self._grucell(
dec,
h,
self.dec_w_ih,
self.dec_w_hh,
self.dec_b_ih,
self.dec_b_hh,
) # (b, h)
logits = np.matmul(h, self.fc_w.T) + self.fc_b
pred = logits.argmax()
if pred == 3:  # 3: </s>, end of sequence
break
preds.append(pred)
dec = np.take(self.dec_emb, [pred], axis=0)

preds = [self.idx2p.get(idx, "<unk>") for idx in preds]

return preds

def __call__(self, word: str) -> str:
if not any(letter in word for letter in self.graphemes):
pron = [word]  # contains no known Thai graphemes; return unchanged
else: # predict for oov
pron = self._predict(word)

return "".join(pron)


_THAI_W2P = Thai_W2P()


def pronunciate(text: str) -> str:
"""
Convert a Thai word to its pronunciation in Thai letters.

Input should be one single word.
"""
global _THAI_W2P
return _THAI_W2P(text)
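A note on the decoder internals: _grucell above is a plain-NumPy implementation of the standard (PyTorch-style) GRU update, with the reset, update, and candidate gates stacked along the first axis of the weight matrices, which is why the code splits rzn_* at two thirds of its width. The equations it evaluates, where sigma is the logistic sigmoid and ⊙ is element-wise multiplication:

r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{t-1} + b_{hr})
z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{t-1} + b_{hz})
n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{t-1} + b_{hn}))
h_t = (1 - z_t) \odot n_t + z_t \odot h_{t-1}

_gru unrolls this cell over the input sequence; _predict runs one encoder pass, then decodes greedily (argmax at every step) until it emits the </s> token or reaches 20 phonemes.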
10 changes: 9 additions & 1 deletion tests/test_transliterate.py
@@ -3,7 +3,7 @@
import unittest

import torch
from pythainlp.transliterate import romanize, transliterate
from pythainlp.transliterate import romanize, transliterate, pronunciate
from pythainlp.transliterate.ipa import trans_list, xsampa_list
from pythainlp.transliterate.thai2rom import ThaiTransliterator

@@ -134,3 +134,11 @@ def test_transliterate(self):
self.assertIsNotNone(transliterate("แมว", engine="thaig2p"))
self.assertIsNotNone(trans_list("คน"))
self.assertIsNotNone(xsampa_list("คน"))

def test_pronunciate(self):
self.assertEqual(pronunciate(""), "")
self.assertIsNotNone(pronunciate("คน", engine="w2p"))
self.assertIsNotNone(pronunciate("แมว", engine="w2p"))
self.assertIsNotNone(pronunciate("มข.", engine="w2p"))
self.assertIsNotNone(pronunciate("มช.", engine="w2p"))
self.assertIsNotNone(pronunciate("jks", engine="w2p"))