Add libraries

stelios · stelios · commit 1d09417018ef · 2017-02-01T16:24:34.000+01:00
diff --git a/.gitignore b/.gitignore
@@ -9,6 +9,3 @@ code/local/parser.py
 # Remove unnecessary data files
 data/tweets_*
 data/processed_tweets*
-
-code/local/language_detector.py
-code/local/libraries.py
diff --git a/code/local/language_detector.py b/code/local/language_detector.py
@@ -0,0 +1,73 @@
+# Author: Alejandro Nolla - z0mbiehunt3r
+# Purpose: Example for detecting language using a stopwords based approach
+# Created: 15/05/13
+# original code: http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/
+
+# Edit: Athanasios Giannakopoulos
+# Date: 13/01/2017
+
+import sys, numpy
+from nltk import wordpunct_tokenize
+from nltk.corpus import stopwords
+
+
+def _calculate_languages_ratios(text):
+    """
+    Count, for English, German and French, how many distinct NLTK stopwords
+    occur in the given text, e.g. {'english': 4, 'german': 0, 'french': 2}
+    
+    @param text: Text whose language is to be detected
+    @type text: str
+    
+    @return: Dictionary mapping each language to its stopword count, or the string 'nan' if every count is 0
+    @rtype: dict or str
+    """
+
+    languages_ratios = {}
+
+    tokens = wordpunct_tokenize(text)
+    words = [word.lower() for word in tokens]
+
+    # Score each language by the number of its distinct NLTK stopwords that appear in the analyzed text;
+    # only these 3 languages are considered
+    for language in ['english', 'german', 'french']:
+        stopwords_set = set(stopwords.words(language))
+        words_set = set(words)
+        common_elements = words_set.intersection(stopwords_set)
+
+        languages_ratios[language] = len(common_elements) # language "score"
+
+    for value in languages_ratios.values():  # one non-zero score is enough to return the scores
+        if value != 0:
+            return languages_ratios
+    
+    return 'nan'  # sentinel string: no stopword of any of the 3 languages was found
+
+def detect_language(text):
+    """
+    Guess whether the given text is English, German or French and return the
+    code of the highest scored language.
+    
+    It uses a stopwords based approach: the score of a language is the number
+    of its distinct stopwords seen in the analyzed text.
+    
+    @param text: Text whose language is to be detected
+    @type text: str
+    
+    @return: 'en', 'de' or 'fr', or numpy.nan when no stopword of any language is found
+    @rtype: str or float
+    """
+
+    ratios = _calculate_languages_ratios(text)
+
+    if ratios == 'nan':  # sentinel returned by _calculate_languages_ratios when all scores are 0
+        return numpy.nan
+
+    most_rated_language = max(ratios, key=ratios.get)  # language name with the largest score
+
+    if most_rated_language == 'english':  # translate the NLTK language name into a two-letter code
+        return 'en'
+    if most_rated_language == 'german':
+        return 'de'
+    if most_rated_language == 'french':
+        return 'fr'
diff --git a/code/local/libraries.py b/code/local/libraries.py
@@ -0,0 +1,39 @@
+# data handling
+import glob
+import json
+import pickle
+import os.path
+import numpy as np
+import pandas as pd
+from datetime import datetime
+
+# visualization
+import folium
+import networkx as nx
+import seaborn as sns
+import matplotlib as mpl
+import matplotlib.cm as cm
+import matplotlib.pyplot as plt
+
+# location data
+import googlemaps
+
+# NLP
+from nltk.corpus import stopwords
+import string
+import re
+
+# API requests
+import requests
+
+# translation
+from yandex_translate import YandexTranslate
+
+# data structures
+from collections import defaultdict
+from collections import Counter
+
+# warnings: ignore FutureWarning and turn off the pandas chained-assignment warning
+import warnings
+warnings.simplefilter(action = "ignore", category = FutureWarning)
+pd.options.mode.chained_assignment = None  # None turns the warning off; default='warn'