Skip to content

Commit 259daf1

Browse files
authored
Merge pull request #579 from PyThaiNLP/add-text-generator
Add pythainlp.generate
2 parents 1b288f2 + 5da36f0 commit 259daf1

File tree

12 files changed

+724
-5
lines changed

12 files changed

+724
-5
lines changed

docs/api/corpus.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,21 @@ TNC
3636
---
3737

3838
.. autofunction:: pythainlp.corpus.tnc.word_freqs
39+
.. autofunction:: pythainlp.corpus.tnc.unigram_word_freqs
40+
.. autofunction:: pythainlp.corpus.tnc.bigram_word_freqs
41+
.. autofunction:: pythainlp.corpus.tnc.trigram_word_freqs
3942

4043
TTC
4144
---
4245

4346
.. autofunction:: pythainlp.corpus.ttc.word_freqs
47+
.. autofunction:: pythainlp.corpus.ttc.unigram_word_freqs
48+
49+
OSCAR
50+
-----
51+
52+
.. autofunction:: pythainlp.corpus.oscar.word_freqs
53+
.. autofunction:: pythainlp.corpus.oscar.unigram_word_freqs
4454

4555
Util
4656
----

docs/api/generate.rst

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
.. currentmodule:: pythainlp.generate
2+
3+
pythainlp.generate
4+
==================
5+
The :mod:`pythainlp.generate` module provides Thai text generation with PyThaiNLP.
6+
7+
Modules
8+
-------
9+
10+
.. autoclass:: Unigram
11+
:members:
12+
.. autoclass:: Bigram
13+
:members:
14+
.. autoclass:: Trigram
15+
:members:
16+
.. autofunction:: pythainlp.generate.thai2fit.gen_sentence

pythainlp/corpus/oscar.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Thai unigram word frequency from OSCAR Corpus (icu word tokenize)
4+
5+
Credit: Korakot Chaovavanich
6+
https://web.facebook.com/groups/colab.thailand/permalink/1524070061101680/
7+
"""
8+
9+
__all__ = [
10+
"word_freqs",
11+
"unigram_word_freqs"
12+
]
13+
14+
from collections import defaultdict
15+
from typing import List, Tuple
16+
17+
from pythainlp.corpus import get_corpus_path
18+
19+
_FILENAME = "oscar_icu"
20+
21+
22+
def word_freqs() -> List[Tuple[str, int]]:
    """
    Get word frequency from OSCAR Corpus (icu word tokenize)

    Reads the file located by ``get_corpus_path(_FILENAME)`` as
    comma-separated rows, skipping the first (header) row.

    :return: ``(word, count)`` tuples in file order, where ``count`` is the
             second field of the row. A word that is a single space is
             reported as ``"<s/>"``. Rows with fewer than two fields, or
             whose first field contains a double quote, are skipped.
    :rtype: List[Tuple[str, int]]
    :raises ValueError: if a kept row's second field is not an integer
    """
    freqs = []
    _path = get_corpus_path(_FILENAME)
    # utf-8-sig is what unigram_word_freqs() uses; it also reads files
    # without a BOM, so this stays compatible with plain UTF-8 data.
    with open(_path, "r", encoding="utf-8-sig") as f:
        next(f, None)  # drop the header row; no IndexError on an empty file
        for line in f:
            # Remove only the line terminator. A full strip() would also
            # eat a leading space, so the first field could never equal
            # " " and the "<s/>" branch below would be unreachable.
            _temp = line.rstrip("\r\n").split(",")
            if len(_temp) < 2:
                continue
            if _temp[0] == " ":
                freqs.append(("<s/>", int(_temp[1])))
            elif '"' not in _temp[0]:
                freqs.append((_temp[0], int(_temp[1])))

    return freqs
40+
41+
42+
def unigram_word_freqs() -> defaultdict:
    """
    Get unigram word frequency from OSCAR Corpus (icu word tokenize)

    Reads the file located by ``get_corpus_path(_FILENAME)`` as
    comma-separated rows, skipping the first (header) row.

    :return: ``defaultdict(int)`` mapping each word to the integer in the
             last field of its row. A word that is a single space is stored
             under the key ``"<s/>"``. Rows with fewer than two fields, or
             whose first field contains a double quote, are skipped.
    :rtype: defaultdict
    :raises ValueError: if a kept row's last field is not an integer
    """
    _path = get_corpus_path(_FILENAME)
    _word_freqs = defaultdict(int)
    with open(_path, "r", encoding="utf-8-sig") as fh:
        next(fh, None)  # drop the header row; no IndexError on an empty file
        for line in fh:
            # Remove only the line terminator. A full strip() would also
            # eat a leading space, so the first field could never equal
            # " " and the "<s/>" branch below would be unreachable.
            _temp = line.rstrip("\r\n").split(",")
            if len(_temp) < 2:
                # Blank or malformed row: without this guard int("") raises.
                continue
            if _temp[0] == " ":
                _word_freqs["<s/>"] = int(_temp[-1])
            elif '"' not in _temp[0]:
                _word_freqs[_temp[0]] = int(_temp[-1])

    return _word_freqs

pythainlp/corpus/tnc.py

Lines changed: 55 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,34 @@
11
# -*- coding: utf-8 -*-
22
"""
33
Thai National Corpus word frequency
4-
5-
Credit: Korakot Chaovavanich‎
6-
https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
74
"""
85

9-
__all__ = ["word_freqs"]
6+
__all__ = [
7+
"word_freqs",
8+
"unigram_word_freqs",
9+
"bigram_word_freqs",
10+
"trigram_word_freqs"
11+
]
1012

13+
from collections import defaultdict
1114
from typing import List, Tuple
1215

1316
from pythainlp.corpus import get_corpus
17+
from pythainlp.corpus import get_corpus_path
18+
1419

1520
_FILENAME = "tnc_freq.txt"
21+
_BIGRAM = "tnc_bigram_word_freqs"
22+
_TRIGRAM = "tnc_trigram_word_freqs"
1623

1724

1825
def word_freqs() -> List[Tuple[str, int]]:
1926
"""
2027
Get word frequency from Thai National Corpus (TNC)
2128
\n(See: `dev/pythainlp/corpus/tnc_freq.txt\
2229
<https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/tnc_freq.txt>`_)
30+
31+
Credit: Korakot Chaovavanich https://bit.ly/3wSkZsF
2332
"""
2433
lines = list(get_corpus(_FILENAME))
2534
word_freqs = []
@@ -29,3 +38,45 @@ def word_freqs() -> List[Tuple[str, int]]:
2938
word_freqs.append((word_freq[0], int(word_freq[1])))
3039

3140
return word_freqs
41+
42+
43+
def unigram_word_freqs() -> defaultdict:
    """
    Get unigram word frequency from Thai National Corpus (TNC)

    :return: ``defaultdict(int)`` mapping the first field of each corpus
             line to the integer in its last field; lines with fewer than
             two fields are ignored.
    :rtype: defaultdict
    """
    freq_table = defaultdict(int)
    for entry in get_corpus(_FILENAME):
        fields = entry.strip().split(" ")
        if len(fields) < 2:
            continue
        freq_table[fields[0]] = int(fields[-1])

    return freq_table
55+
56+
57+
def bigram_word_freqs() -> defaultdict:
    """
    Get bigram word frequency from Thai National Corpus (TNC)

    Reads the file located by ``get_corpus_path(_BIGRAM)`` line by line.

    :return: ``defaultdict(int)`` mapping ``(word1, word2)`` to the integer
             in the last field of the line. Lines with fewer than three
             fields (for example blank lines) are skipped.
    :rtype: defaultdict
    :raises ValueError: if a kept line's last field is not an integer
    """
    _path = get_corpus_path(_BIGRAM)
    _word_freqs = defaultdict(int)
    with open(_path, "r", encoding="utf-8-sig") as fh:
        # Iterate the file lazily; readlines() would load it all at once.
        for line in fh:
            # NOTE(review): separator kept exactly as in the original code;
            # confirm it matches the delimiter used by the corpus file.
            _temp = line.strip().split(" ")
            # Two words plus a count are required. Without this guard a
            # blank or short line raises IndexError on _temp[1].
            if len(_temp) < 3:
                continue
            _word_freqs[(_temp[0], _temp[1])] = int(_temp[-1])

    return _word_freqs
69+
70+
71+
def trigram_word_freqs() -> defaultdict:
    """
    Get trigram word frequency from Thai National Corpus (TNC)

    Reads the file located by ``get_corpus_path(_TRIGRAM)`` line by line.

    :return: ``defaultdict(int)`` mapping ``(word1, word2, word3)`` to the
             integer in the last field of the line. Lines with fewer than
             four fields (for example blank lines) are skipped.
    :rtype: defaultdict
    :raises ValueError: if a kept line's last field is not an integer
    """
    _path = get_corpus_path(_TRIGRAM)
    _word_freqs = defaultdict(int)
    with open(_path, "r", encoding="utf-8-sig") as fh:
        # Iterate the file lazily; readlines() would load it all at once.
        for line in fh:
            # NOTE(review): separator kept exactly as in the original code;
            # confirm it matches the delimiter used by the corpus file.
            _temp = line.strip().split(" ")
            # Three words plus a count are required. Without this guard a
            # blank or short line raises IndexError on _temp[2].
            if len(_temp) < 4:
                continue
            _word_freqs[(_temp[0], _temp[1], _temp[2])] = int(_temp[-1])

    return _word_freqs

pythainlp/corpus/ttc.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,12 @@
66
https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
77
"""
88

9-
__all__ = ["word_freqs"]
9+
__all__ = [
10+
"word_freqs",
11+
"unigram_word_freqs"
12+
]
1013

14+
from collections import defaultdict
1115
from typing import List, Tuple
1216

1317
from pythainlp.corpus import get_corpus
@@ -29,3 +33,17 @@ def word_freqs() -> List[Tuple[str, int]]:
2933
word_freqs.append((word_freq[0], int(word_freq[1])))
3034

3135
return word_freqs
36+
37+
38+
def unigram_word_freqs() -> defaultdict:
    """
    Get unigram word frequency from Thai Textbook Corpus (TTC)

    :return: ``defaultdict(int)`` mapping the first field of each corpus
             line to the integer in its last field; lines with fewer than
             two fields are ignored.
    :rtype: defaultdict
    """
    freq_table = defaultdict(int)
    for entry in get_corpus(_FILENAME):
        fields = entry.strip().split(" ")
        if len(fields) < 2:
            continue
        freq_table[fields[0]] = int(fields[-1])

    return freq_table

pythainlp/generate/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Thai text generation
4+
"""
5+
6+
__all__ = [
7+
"Unigram",
8+
"Bigram",
9+
"Trigram"
10+
]
11+
12+
from pythainlp.generate.core import Unigram, Bigram, Trigram

0 commit comments

Comments
 (0)