python 3 migration
halolimat committed Oct 9, 2018
1 parent 54ed270 commit 1737c81
Showing 9 changed files with 194 additions and 32 deletions.
2 changes: 1 addition & 1 deletion LNEx/Language_Modeling.py
@@ -157,7 +157,7 @@ def __init__(self, geo_locations):

lm = GazBasedModel(geo_locations)

-print (lm.phrase_probability("new"))
+print(lm.phrase_probability("new"))

lm.phrase_probability("new avadi")
lm.phrase_probability("new avadi road")
6 changes: 3 additions & 3 deletions LNEx/__init__.py
@@ -5,12 +5,12 @@
v3.0 License.
#############################################################################"""

-import core
-import osm_gazetteer
import os, json

import elasticsearch

+from . import core
+from . import osm_gazetteer

################################################################################
################################################################################

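Why the import changes in this file: Python 3 removed implicit relative imports (PEP 328), so inside a package a bare import core no longer resolves against the package directory and fails with ModuleNotFoundError. A minimal sketch of the two forms, assuming the LNEx package layout:

# Assumed layout, for illustration only:
#   LNEx/
#       __init__.py
#       core.py
#       osm_gazetteer.py
#
# Python 2 resolved this against the package directory first;
# Python 3 fails unless LNEx/ itself happens to be on sys.path:
#   import core
#
# Explicit relative imports work under Python 3 (and Python 2.6+):
from . import core
from . import osm_gazetteer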
10 changes: 5 additions & 5 deletions LNEx/core.py
@@ -20,8 +20,8 @@
load()

# importing local modules
-import Language_Modeling
-from tokenizer import Twokenize
+from . import Language_Modeling
+from .tokenizer import Twokenize

################################################################################
################################################################################
@@ -245,7 +245,7 @@ def build_tree(glm, ts):
flattened = list(flatten(i))

# remove consecutive duplicates
-final_list = map(itemgetter(0), groupby(flattened))
+final_list = list(map(itemgetter(0), groupby(flattened)))

# prune based on the probability from the language model
p = " ".join(final_list)
@@ -684,7 +684,7 @@ def filterout_overlaps(valid_ngrams):
def find_ngrams(input_list, n):
'''Generates grams of length (n) from the list of unigrams (input_list)'''

-return zip(*[input_list[i:] for i in range(n)])
+return list(zip(*[input_list[i:] for i in range(n)]))

################################################################################

@@ -817,7 +817,7 @@ def __init__(self, geo_locations, extended_words3):
########################################################################

# list of unigrams
-unigrams = self.glm.unigrams["words"].keys()
+unigrams = list(self.glm.unigrams["words"].keys())

self.stopwords_notin_gazetteer = set(
self.extended_words3) - set(unigrams)
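The three hunks above share one root cause: in Python 3, map() and zip() return one-shot iterators rather than lists, and dict.keys() returns a view, so any result that gets joined, indexed, or traversed more than once must be materialized with list(). A standalone sketch of the failure mode, using illustrative tokens rather than LNEx internals:

from itertools import groupby
from operator import itemgetter

flattened = ["new", "new", "avadi", "road", "road"]

# Python 3: map() yields a one-shot iterator.
final_list = map(itemgetter(0), groupby(flattened))
print(" ".join(final_list))   # new avadi road
print(" ".join(final_list))   # empty line: the iterator is already exhausted

# Wrapping in list() restores Python 2's reusable-list behavior.
final_list = list(map(itemgetter(0), groupby(flattened)))
print(" ".join(final_list))   # new avadi road, every time

# zip() is lazy too, hence list(zip(...)) in find_ngrams():
print(list(zip(*[flattened[i:] for i in range(2)])))
# [('new', 'new'), ('new', 'avadi'), ('avadi', 'road'), ('road', 'road')]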
10 changes: 5 additions & 5 deletions LNEx/gaz_augmentation_and_filtering.py
@@ -193,7 +193,7 @@ def preprocess_name(loc_name):
def find_ngrams(unigrams, n):
'''Created ngrams of length n from the unigrams list'''

-return zip(*[unigrams[i:] for i in range(n)])
+return list(zip(*[unigrams[i:] for i in range(n)]))

################################################################################

@@ -231,7 +231,7 @@ def high_precision_filtering(geo_locations):
new_geo_locations = defaultdict(lambda: defaultdict(set))

for text in geo_locations:

original_text = text

text = text.replace("( ", "(").replace(" )", ")").lower()
@@ -252,7 +252,7 @@ def high_precision_filtering(geo_locations):

text = re.sub('\s{2,}', ' ', text).strip()

-text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
+text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('UTF-8')
text = str(text.strip())

# skip few names
@@ -481,7 +481,7 @@ def augment(geo_locations):
base_name[1:1] = f_name

# remove consecutive duplicate tokens
-base_name = map(itemgetter(0), groupby(base_name))
+base_name = list(map(itemgetter(0), groupby(base_name)))

flexi_grams.append(" ".join(base_name))

@@ -504,4 +504,4 @@ def augment(geo_locations):
new_geo_locations[new_name]["meta"] = \
set(new_geo_locations[name]["meta"]).union(new_geo_locations[new_name]["meta"])

-return new_geo_locations, get_extended_words3(new_geo_locations.keys())
+return new_geo_locations, get_extended_words3(list(new_geo_locations.keys()))
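The added .decode('UTF-8') reflects Python 3's strict split between text (str) and binary data (bytes): str.encode() now returns bytes, and the string operations applied afterwards (replace, regular expressions, use as dictionary keys) raise TypeError on bytes. Decoding the ASCII output as UTF-8 is safe because ASCII is a subset of UTF-8. A small round-trip sketch with an illustrative place name:

import unicodedata

text = "Pondichéry Road"

# NFKD decomposes accented characters into base letter + combining mark;
# encode('ascii', 'ignore') then drops the non-ASCII combining marks.
folded = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
print(type(folded))             # <class 'bytes'> under Python 3

# Decoding back to str keeps the downstream string code working.
print(folded.decode('UTF-8'))   # Pondichery Road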
2 changes: 1 addition & 1 deletion LNEx/geo_calculations.py
@@ -56,4 +56,4 @@ def is_bb_acceptable(bb):

texas_bb = [25.8371638, -106.6456461, 36.5007041, -93.5080389]

-print (is_bb_acceptable(texas_bb))
+print(is_bb_acceptable(texas_bb))
22 changes: 10 additions & 12 deletions LNEx/osm_gazetteer.py
@@ -5,17 +5,15 @@
v3.0 License.
#############################################################################"""

+import json
from collections import defaultdict

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from elasticsearch_dsl.connections import connections

import geo_calculations
import gaz_augmentation_and_filtering

-import json

################################################################################
################################################################################

@@ -51,16 +49,16 @@ def search_index(bb):

if connection_string == '' or index_name == '':

print ("\n###########################################################")
print ("Global ERROR: Elastic host and port or index name not defined")
print ("#############################################################\n")
print("\n###########################################################")
print("Global ERROR: Elastic host and port or index name not defined")
print("#############################################################\n")
exit()

if not geo_calculations.is_bb_acceptable(bb) or bb[0] > bb[2] or bb[1] > bb[3]:

print ("\n##########################################################")
print ("Global ERROR: Bounding Box is too big, choose a smaller one!")
print ("############################################################\n")
print("\n##########################################################")
print("Global ERROR: Bounding Box is too big, choose a smaller one!")
print("############################################################\n")
exit()

connections.create_connection(hosts=[connection_string], timeout=60)
@@ -199,11 +197,11 @@ def build_bb_gazetteer(bb, augmentType):

elif augmentType == "NA": # None
new_geo_locations = gaz_augmentation_and_filtering.filter_geo_locations(geo_locations)
-extended_words3 = gaz_augmentation_and_filtering.get_extended_words3(new_geo_locations.keys())
+extended_words3 = gaz_augmentation_and_filtering.get_extended_words3(list(new_geo_locations.keys()))

elif augmentType == "HP": # High Precision Filtering
new_geo_locations = gaz_augmentation_and_filtering.high_precision_filtering(geo_locations)
-extended_words3 = gaz_augmentation_and_filtering.get_extended_words3(new_geo_locations.keys())
+extended_words3 = gaz_augmentation_and_filtering.get_extended_words3(list(new_geo_locations.keys()))

# for serialization
geo_info = dict(geo_info)
@@ -230,4 +228,4 @@ def build_bb_gazetteer(bb, augmentType):

geo_locations, geo_info, extended_words3 = build_bb_gazetteer(bb)

-print (json.dumps(dict(geo_locations), indent=2))
+print(json.dumps(dict(geo_locations), indent=2))
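The list(...) wrappers around .keys() in this file follow the same theme: in Python 3, dict.keys() returns a lazy view object instead of a list. Views support iteration and membership tests but not indexing, and they track later changes to the dictionary, so call sites that need a stable, indexable sequence must materialize one. A quick sketch with illustrative data:

geo_names = {"chennai": 1, "avadi": 2, "saidapet": 3}

keys = geo_names.keys()
print("avadi" in keys)     # True: membership tests still work on a view

try:
    keys[0]                # views are not indexable in Python 3
except TypeError as err:
    print(err)

keys = list(geo_names.keys())
print(keys[0])             # chennai: indexing works once materialized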
2 changes: 1 addition & 1 deletion LNEx/tokenizer/Twokenize.py
@@ -195,7 +195,7 @@ def unprotected_tokenize(s):
return s.split()

if __name__=='__main__':
print (tokenize("RT @im_ursbro: #ChennaiFloods #'Saidapet', food available for 700-people;no.4,pilliyarkoil street,Jones road subway.call Dinesh Thomas @ 04"))
print(tokenize("RT @im_ursbro: #ChennaiFloods #'Saidapet', food available for 700-people;no.4,pilliyarkoil street,Jones road subway.call Dinesh Thomas @ 04"))

#for line in sys.stdin:
# print u" ".join(tokenize(line[:-1])).encode('utf-8')
164 changes: 164 additions & 0 deletions pytest.ipynb
@@ -0,0 +1,164 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright 2017 Hussein S. Al-Olimat, hussein@knoesis.org\n",
"\n",
"This software is released under the GNU Affero General Public License (AGPL) v3.0 License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# pytest is an example usecase of using LNEx in Python 3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install wordsegment\n",
"!pip install shapely\n",
"!pip install nltk\n",
"!pip install elasticsearch\n",
"!pip install elasticsearch_dsl\n",
"!pip install geopy"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import json, re\n",
"from shapely.geometry import MultiPoint\n",
"\n",
"import sys \n",
"sys.path.append(\"LNEx\")\n",
"import LNEx as lnex"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def read_tweets():\n",
" tweets_file = \"_Data/sample_tweets.txt\"\n",
" # read tweets from file to list\n",
" with open(tweets_file) as f:\n",
" tweets = f.read().splitlines()\n",
" return tweets\n",
"\n",
"def init_using_elasticindex(bb, cache, augmentType, dataset, capital_word_shape):\n",
" lnex.elasticindex(conn_string='localhost:9200', index_name=\"photon\")\n",
"\n",
" geo_info = lnex.initialize( bb, augmentType=augmentType,\n",
" cache=cache,\n",
" dataset_name=dataset,\n",
" capital_word_shape=capital_word_shape)\n",
" return geo_info"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initializing LNEx ...\n",
"Done Initialization ...\n"
]
}
],
"source": [
"bbs = { \"chennai\": [12.74, 80.066986084, 13.2823848224, 80.3464508057],\n",
" \"louisiana\": [29.4563, -93.3453, 31.4521, -89.5276],\n",
" \"houston\": [29.4778611958, -95.975189209, 30.1463147381, -94.8889160156],\n",
" \"columbus\": [39.808631, -83.2102799, 40.1572719, -82.7713781],\n",
" \"test\": [41.6187434973, -83.7106928844, 41.6245055116, -83.7017216664]}\n",
"\n",
"dataset = \"chennai\"\n",
"\n",
"geo_info = init_using_elasticindex(bbs[dataset], cache=False, augmentType=\"HP\", \n",
" dataset=dataset, capital_word_shape=False)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tambaram (43, 51) tambaram ['7066', '13478']\n",
"Mudichur (29, 37) mudichur ['3205']\n",
"##################################################\n",
"Jones road (64, 74) jones road ['9569', '6472']\n",
"Saidapet (0, 8) saidapet ['1180', '3771', '11613', '13880', '133', '13201']\n",
"##################################################\n",
"Chennai Central (12, 27) chennai central ['7267', '7347']\n",
"Chennai Egmore (28, 42) chennai egmore ['5346', '7768']\n",
"##################################################\n",
"New Avadi road (20, 34) new avadi road ['2741', '7133', '16966', '16786', '15324', '16791', '8', '14795', '2288']\n",
"Water tank road (39, 54) water tank rd ['5773']\n",
"##################################################\n",
"##################################################\n",
"mambalam (29, 37) mambalam ['12606']\n",
"new avadi rd (8, 20) new avadi road ['2741', '7133', '16966', '16786', '15324', '16791', '8', '14795', '2288']\n",
"chennai (21, 28) chennai ['10301', '10318']\n",
"##################################################\n",
"##################################################\n",
"avadi (4, 9) avadi ['14979', '607']\n",
"##################################################\n",
"pathur (21, 27) pathur ['10359']\n",
"##################################################\n"
]
}
],
"source": [
"for tweet in read_tweets():\n",
" # remove hashtags, urls, etc...\n",
" tweet = ' '.join(re.sub(\"(#[A-Za-z0-9]+)|([^0-9A-Za-z \\t])|(\\w+:\\/\\/\\S+)\",\" \", tweet).split())\n",
" for output in lnex.extract(tweet):\n",
" print(output[0], output[1], output[2], output[3][\"main\"])\n",
" print(\"#\"*50)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
8 changes: 4 additions & 4 deletions pytest.py
@@ -123,8 +123,8 @@ def init_using_elasticindex(bb, cache, dataset, capital_word_shape):

row = output[0], output[1], output[2], geo_point
rows.append(row)

print "-" * 120
print tabulate(rows, headers=header)
print "#" * 120
print("-" * 120)
print(tabulate(rows, headers=header))
print("#" * 120)
break
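Unlike the print (...) calls with a space before the parenthesis elsewhere in this commit, which parse in both major versions, the bare statement form fixed here is a hard SyntaxError under Python 3: pytest.py would not even import before this change. A before/after sketch:

# Python 2 statement form, a SyntaxError under Python 3:
#   print "-" * 120
#   print tabulate(rows, headers=header)

# Function form, valid in Python 3 (and in Python 2.7 for a single argument):
print("-" * 120)

# As a function, print also gains keyword arguments:
print("tambaram", "mudichur", sep=" | ", end="\n\n")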
