11# -*- coding: utf-8 -*-
22"""
33Thai National Corpus word frequency
4-
5- Credit: Korakot Chaovavanich
6- https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
74"""
85
9- __all__ = ["word_freqs" ]
6+ __all__ = [
7+ "word_freqs" ,
8+ "unigram_word_freqs" ,
9+ "bigram_word_freqs" ,
10+ "trigram_word_freqs"
11+ ]
1012
13+ from collections import defaultdict
1114from typing import List , Tuple
1215
1316from pythainlp .corpus import get_corpus
17+ from pythainlp .corpus import get_corpus_path
18+
1419
1520_FILENAME = "tnc_freq.txt"
21+ _BIGRAM = "tnc_bigram_word_freqs"
22+ _TRIGRAM = "tnc_trigram_word_freqs"
1623
1724
1825def word_freqs () -> List [Tuple [str , int ]]:
1926 """
2027 Get word frequency from Thai National Corpus (TNC)
2128 \n (See: `dev/pythainlp/corpus/tnc_freq.txt\
2229 <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/tnc_freq.txt>`_)
30+
31+ Credit: Korakot Chaovavanich https://bit.ly/3wSkZsF
2332 """
2433 lines = list (get_corpus (_FILENAME ))
2534 word_freqs = []
@@ -29,3 +38,45 @@ def word_freqs() -> List[Tuple[str, int]]:
2938 word_freqs .append ((word_freq [0 ], int (word_freq [1 ])))
3039
3140 return word_freqs
41+
42+
def unigram_word_freqs() -> defaultdict:
    """
    Get unigram word frequency from Thai National Corpus (TNC)

    :return: mapping from a word to its frequency; the mapping is a
             ``defaultdict(int)``, so a word that is not present reads as 0
    :rtype: defaultdict
    """
    freq_by_word = defaultdict(int)
    for raw_line in get_corpus(_FILENAME):
        # NOTE(review): separator literal kept exactly as it appears in the
        # source; confirm it matches the delimiter used by the corpus file.
        fields = raw_line.strip().split(" ")
        # A usable line needs at least a word and a count.
        if len(fields) < 2:
            continue
        word, count = fields[0], fields[-1]
        freq_by_word[word] = int(count)

    return freq_by_word
55+
56+
def bigram_word_freqs() -> defaultdict:
    """
    Get bigram word frequency from Thai National Corpus (TNC)

    :return: mapping from a ``(word1, word2)`` tuple to its frequency; the
             mapping is a ``defaultdict(int)``, so a bigram that is not
             present reads as 0
    :rtype: defaultdict
    """
    _path = get_corpus_path(_BIGRAM)
    _word_freqs = defaultdict(int)
    # "utf-8-sig" drops a leading byte-order mark if the file has one.
    with open(_path, "r", encoding="utf-8-sig") as fh:
        # Iterate the file object directly so lines are read lazily instead
        # of materialising the whole file with readlines().
        for line in fh:
            # NOTE(review): separator literal kept exactly as it appears in
            # the source; confirm it matches the corpus file's delimiter.
            _temp = line.strip().split(" ")
            # Two words plus a count are required. Skip blank or truncated
            # lines instead of failing with IndexError, in the same way
            # unigram_word_freqs() skips short lines.
            if len(_temp) < 3:
                continue
            _word_freqs[(_temp[0], _temp[1])] = int(_temp[-1])

    return _word_freqs
69+
70+
def trigram_word_freqs() -> defaultdict:
    """
    Get trigram word frequency from Thai National Corpus (TNC)

    :return: mapping from a ``(word1, word2, word3)`` tuple to its
             frequency; the mapping is a ``defaultdict(int)``, so a trigram
             that is not present reads as 0
    :rtype: defaultdict
    """
    _path = get_corpus_path(_TRIGRAM)
    _word_freqs = defaultdict(int)
    # "utf-8-sig" drops a leading byte-order mark if the file has one.
    with open(_path, "r", encoding="utf-8-sig") as fh:
        # Iterate the file object directly so lines are read lazily instead
        # of materialising the whole file with readlines().
        for line in fh:
            # NOTE(review): separator literal kept exactly as it appears in
            # the source; confirm it matches the corpus file's delimiter.
            _temp = line.strip().split(" ")
            # Three words plus a count are required. Skip blank or truncated
            # lines instead of failing with IndexError, in the same way
            # unigram_word_freqs() skips short lines.
            if len(_temp) < 4:
                continue
            _word_freqs[(_temp[0], _temp[1], _temp[2])] = int(_temp[-1])

    return _word_freqs
0 commit comments