1+ # Author: Alejandro Nolla - z0mbiehunt3r
2+ # Purpose: Example for detecting language using a stopwords based approach
3+ # Created: 15/05/13
4+ # original code: http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/
5+
6+ # Edit: Athanasios Giannakopoulos
7+ # Date: 13/01/2017
8+
9+ import sys , numpy
10+ from nltk import wordpunct_tokenize
11+ from nltk .corpus import stopwords
12+
13+
def _calculate_languages_ratios(text):
    """
    Score a text against the stopword lists of a fixed set of languages.

    The text is tokenized with nltk's wordpunct_tokenize and lowercased; the
    score of each candidate language ('english', 'german', 'french') is the
    number of distinct tokens that also appear in that language's nltk
    stopword list.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary mapping each candidate language to its score, e.g.
             {'english': 4, 'german': 0, 'french': 2}, or the string 'nan'
             when every candidate language scores 0
    @rtype: dict or str
    """
    # The set of distinct lowercased tokens is the same for every language,
    # so build it once instead of once per loop iteration.
    unique_words = {token.lower() for token in wordpunct_tokenize(text)}

    # Candidate order is kept (english, german, french) so that callers that
    # iterate the dictionary see the same ordering as before.
    languages_ratios = {
        language: len(unique_words & set(stopwords.words(language)))
        for language in ('english', 'german', 'french')
    }

    # Scores are non-negative ints, so truthiness means "at least one hit".
    if any(languages_ratios.values()):
        return languages_ratios

    # No stopword of any candidate language was seen: string sentinel that
    # callers compare against.
    return 'nan'
45+
def detect_language(text):
    """
    Guess the language of a text and return it as a two-letter code.

    Relies on _calculate_languages_ratios, which counts how many unique
    stopwords of each candidate language occur in the text; the language
    with the highest count wins.

    @param text: Text whose language is to be detected
    @type text: str

    @return: 'en', 'de' or 'fr' for the best scoring language, or numpy.nan
             when the scoring helper reports 'nan' (no stopwords found)
    @rtype: str or float
    """
    scores = _calculate_languages_ratios(text)

    # The helper signals "nothing matched" with the string 'nan'.
    if scores == 'nan':
        return numpy.nan

    best_language = max(scores, key=lambda language: scores[language])

    # Map nltk language names to two-letter codes; any other name yields None.
    language_codes = {'english': 'en', 'german': 'de', 'french': 'fr'}
    return language_codes.get(best_language)
0 commit comments