Skip to content

Commit 1d09417

Browse files
steliosstelios
authored and committed
Add libraries
1 parent 4a51fec commit 1d09417

File tree

3 files changed

+112
-3
lines changed

3 files changed

+112
-3
lines changed

.gitignore

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,3 @@ code/local/parser.py
99
# Remove unnecessary data files
1010
data/tweets_*
1111
data/processed_tweets*
12-
13-
code/local/language_detector.py
14-
code/local/libraries.py

code/local/language_detector.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# Author: Alejandro Nolla - z0mbiehunt3r
2+
# Purpose: Example for detecting language using a stopwords based approach
3+
# Created: 15/05/13
4+
# original code: http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/
5+
6+
# Edit: Athanasios Giannakopoulos
7+
# Date: 13/01/2017
8+
9+
import sys, numpy
10+
from nltk import wordpunct_tokenize
11+
from nltk.corpus import stopwords
12+
13+
14+
def _calculate_languages_ratios(text):
    """
    Count, per candidate language, how many unique stopwords of that
    language appear in the analyzed text.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary mapping language name to its stopword "score",
        e.g. {'english': 2, 'german': 4, 'french': 0}; the string 'nan'
        when no stopword of any candidate language occurs in the text.
        Callers (detect_language) rely on this string sentinel.
    @rtype: dict or str
    """
    languages_ratios = {}

    tokens = wordpunct_tokenize(text)
    # Build the lowercase word set once, outside the loop — it does not
    # depend on the language being scored.
    words_set = {word.lower() for word in tokens}

    # Compute, per language included in nltk, the number of unique
    # stopwords appearing in the analyzed text; choose only between
    # these 3 languages.
    for language in ['english', 'german', 'french']:
        stopwords_set = set(stopwords.words(language))
        languages_ratios[language] = len(words_set & stopwords_set)  # language "score"

    # Sentinel: no stopword of any language matched the text.
    if any(languages_ratios.values()):
        return languages_ratios
    return 'nan'
46+
def detect_language(text):
    """
    Guess the language of the given text and return its two-letter code.

    Uses a stopwords-based approach: the candidate language whose unique
    stopwords occur most often in the analyzed text wins.

    @param text: Text whose language is to be detected
    @type text: str

    @return: 'en', 'de' or 'fr' for the best-scoring language, or
        numpy.nan when no candidate language matched at all.
    @rtype: str
    """
    ratios = _calculate_languages_ratios(text)

    # The helper signals "no stopwords matched" with the string 'nan'.
    if ratios == 'nan':
        return numpy.nan

    winner = max(ratios, key=ratios.get)

    # Map the full nltk language name to its ISO-style short code.
    codes = {'english': 'en', 'german': 'de', 'french': 'fr'}
    return codes.get(winner)

code/local/libraries.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Convenience module: pulls in every library the project's notebooks and
# scripts use, and applies project-wide pandas/warnings configuration.

# data handling
import glob
import json
import pickle
import os.path
import numpy as np
import pandas as pd
from datetime import datetime

# visualization
import folium
import networkx as nx
import seaborn as sns
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt

# location data
import googlemaps

# NLP
from nltk.corpus import stopwords
import string
import re

# API requests
import requests

# translation
from yandex_translate import YandexTranslate

# data structures
from collections import defaultdict
from collections import Counter

# warnings — silence pandas' FutureWarnings and the chained-assignment
# SettingWithCopyWarning project-wide (default='warn').
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'

0 commit comments

Comments
 (0)