python 3 migration
halolimat committed Oct 9, 2018
1 parent 54ed270 commit 1737c81
Showing 9 changed files with 194 additions and 32 deletions.
2 changes: 1 addition & 1 deletion LNEx/Language_Modeling.py
@@ -157,7 +157,7 @@ def __init__(self, geo_locations):

lm = GazBasedModel(geo_locations)

-print (lm.phrase_probability("new"))
+print(lm.phrase_probability("new"))

lm.phrase_probability("new avadi")
lm.phrase_probability("new avadi road")
6 changes: 3 additions & 3 deletions LNEx/__init__.py
@@ -5,12 +5,12 @@
v3.0 License.
#############################################################################"""

-import core
-import osm_gazetteer
import os, json

import elasticsearch

+from . import core
+from . import osm_gazetteer

################################################################################
################################################################################

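Why the import changes in this file: Python 3 removed implicit relative imports (PEP 328), so inside a package a bare import core no longer resolves against the package directory and fails with ModuleNotFoundError. A minimal sketch of the two forms, assuming the LNEx package layout:

# Assumed layout, for illustration only:
#   LNEx/
#       __init__.py
#       core.py
#       osm_gazetteer.py
#
# Python 2 resolved this against the package directory first;
# Python 3 fails unless LNEx/ itself happens to be on sys.path:
#   import core
#
# Explicit relative imports work under Python 3 (and Python 2.6+):
from . import core
from . import osm_gazetteer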
10 changes: 5 additions & 5 deletions LNEx/core.py
@@ -20,8 +20,8 @@
load()

# importing local modules
-import Language_Modeling
-from tokenizer import Twokenize
+from . import Language_Modeling
+from .tokenizer import Twokenize

################################################################################
################################################################################
@@ -245,7 +245,7 @@ def build_tree(glm, ts):
flattened = list(flatten(i))

# remove consecutive duplicates
-final_list = map(itemgetter(0), groupby(flattened))
+final_list = list(map(itemgetter(0), groupby(flattened)))

# prune based on the probability from the language model
p = " ".join(final_list)
@@ -684,7 +684,7 @@ def filterout_overlaps(valid_ngrams):
def find_ngrams(input_list, n):
'''Generates grams of length (n) from the list of unigrams (input_list)'''

-return zip(*[input_list[i:] for i in range(n)])
+return list(zip(*[input_list[i:] for i in range(n)]))

################################################################################

@@ -817,7 +817,7 @@ def __init__(self, geo_locations, extended_words3):
########################################################################

# list of unigrams
-unigrams = self.glm.unigrams["words"].keys()
+unigrams = list(self.glm.unigrams["words"].keys())

self.stopwords_notin_gazetteer = set(
self.extended_words3) - set(unigrams)
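The three hunks above share one root cause: in Python 3, map() and zip() return one-shot iterators rather than lists, and dict.keys() returns a view, so any result that gets joined, indexed, or traversed more than once must be materialized with list(). A standalone sketch of the failure mode, using illustrative tokens rather than LNEx internals:

from itertools import groupby
from operator import itemgetter

flattened = ["new", "new", "avadi", "road", "road"]

# Python 3: map() yields a one-shot iterator.
final_list = map(itemgetter(0), groupby(flattened))
print(" ".join(final_list))   # new avadi road
print(" ".join(final_list))   # empty line: the iterator is already exhausted

# Wrapping in list() restores Python 2's reusable-list behavior.
final_list = list(map(itemgetter(0), groupby(flattened)))
print(" ".join(final_list))   # new avadi road, every time

# zip() is lazy too, hence list(zip(...)) in find_ngrams():
print(list(zip(*[flattened[i:] for i in range(2)])))
# [('new', 'new'), ('new', 'avadi'), ('avadi', 'road'), ('road', 'road')]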
10 changes: 5 additions & 5 deletions LNEx/gaz_augmentation_and_filtering.py
@@ -193,7 +193,7 @@ def preprocess_name(loc_name):
def find_ngrams(unigrams, n):
'''Created ngrams of length n from the unigrams list'''

-return zip(*[unigrams[i:] for i in range(n)])
+return list(zip(*[unigrams[i:] for i in range(n)]))

################################################################################

@@ -231,7 +231,7 @@ def high_precision_filtering(geo_locations):
new_geo_locations = defaultdict(lambda: defaultdict(set))

for text in geo_locations:

original_text = text

text = text.replace("( ", "(").replace(" )", ")").lower()
@@ -252,7 +252,7 @@ def high_precision_filtering(geo_locations):

text = re.sub('\s{2,}', ' ', text).strip()

-text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
+text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('UTF-8')
text = str(text.strip())

# skip few names
@@ -481,7 +481,7 @@ def augment(geo_locations):
base_name[1:1] = f_name

# remove consecutive duplicate tokens
-base_name = map(itemgetter(0), groupby(base_name))
+base_name = list(map(itemgetter(0), groupby(base_name)))

flexi_grams.append(" ".join(base_name))

@@ -504,4 +504,4 @@ def augment(geo_locations):
new_geo_locations[new_name]["meta"] = \
set(new_geo_locations[name]["meta"]).union(new_geo_locations[new_name]["meta"])

-return new_geo_locations, get_extended_words3(new_geo_locations.keys())
+return new_geo_locations, get_extended_words3(list(new_geo_locations.keys()))
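The added .decode('UTF-8') reflects Python 3's strict split between text (str) and binary data (bytes): str.encode() now returns bytes, and the string operations applied afterwards (replace, regular expressions, use as dictionary keys) raise TypeError on bytes. Decoding the ASCII output as UTF-8 is safe because ASCII is a subset of UTF-8. A small round-trip sketch with an illustrative place name:

import unicodedata

text = "Pondichéry Road"

# NFKD decomposes accented characters into base letter + combining mark;
# encode('ascii', 'ignore') then drops the non-ASCII combining marks.
folded = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
print(type(folded))             # <class 'bytes'> under Python 3

# Decoding back to str keeps the downstream string code working.
print(folded.decode('UTF-8'))   # Pondichery Road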
2 changes: 1 addition & 1 deletion LNEx/geo_calculations.py
@@ -56,4 +56,4 @@ def is_bb_acceptable(bb):

texas_bb = [25.8371638, -106.6456461, 36.5007041, -93.5080389]

-print (is_bb_acceptable(texas_bb))
+print(is_bb_acceptable(texas_bb))
22 changes: 10 additions & 12 deletions LNEx/osm_gazetteer.py
@@ -5,17 +5,15 @@
v3.0 License.
#############################################################################"""

+import json
from collections import defaultdict

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from elasticsearch_dsl.connections import connections

import geo_calculations
import gaz_augmentation_and_filtering

-import json

################################################################################
################################################################################

@@ -51,16 +49,16 @@ def search_index(bb):

if connection_string == '' or index_name == '':

print ("\n###########################################################")
print ("Global ERROR: Elastic host and port or index name not defined")
print ("#############################################################\n")
print("\n###########################################################")
print("Global ERROR: Elastic host and port or index name not defined")
print("#############################################################\n")
exit()

if not geo_calculations.is_bb_acceptable(bb) or bb[0] > bb[2] or bb[1] > bb[3]:

print ("\n##########################################################")
print ("Global ERROR: Bounding Box is too big, choose a smaller one!")
print ("############################################################\n")
print("\n##########################################################")
print("Global ERROR: Bounding Box is too big, choose a smaller one!")
print("############################################################\n")
exit()

connections.create_connection(hosts=[connection_string], timeout=60)
@@ -199,11 +197,11 @@ def build_bb_gazetteer(bb, augmentType):

elif augmentType == "NA": # None
new_geo_locations = gaz_augmentation_and_filtering.filter_geo_locations(geo_locations)
-extended_words3 = gaz_augmentation_and_filtering.get_extended_words3(new_geo_locations.keys())
+extended_words3 = gaz_augmentation_and_filtering.get_extended_words3(list(new_geo_locations.keys()))

elif augmentType == "HP": # High Precision Filtering
new_geo_locations = gaz_augmentation_and_filtering.high_precision_filtering(geo_locations)
-extended_words3 = gaz_augmentation_and_filtering.get_extended_words3(new_geo_locations.keys())
+extended_words3 = gaz_augmentation_and_filtering.get_extended_words3(list(new_geo_locations.keys()))

# for serialization
geo_info = dict(geo_info)
@@ -230,4 +228,4 @@ def build_bb_gazetteer(bb, augmentType):

geo_locations, geo_info, extended_words3 = build_bb_gazetteer(bb)

-print (json.dumps(dict(geo_locations), indent=2))
+print(json.dumps(dict(geo_locations), indent=2))
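The list(...) wrappers around .keys() in this file follow the same theme: in Python 3, dict.keys() returns a lazy view object instead of a list. Views support iteration and membership tests but not indexing, and they track later changes to the dictionary, so call sites that need a stable, indexable sequence must materialize one. A quick sketch with illustrative data:

geo_names = {"chennai": 1, "avadi": 2, "saidapet": 3}

keys = geo_names.keys()
print("avadi" in keys)     # True: membership tests still work on a view

try:
    keys[0]                # views are not indexable in Python 3
except TypeError as err:
    print(err)

keys = list(geo_names.keys())
print(keys[0])             # chennai: indexing works once materialized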
2 changes: 1 addition & 1 deletion LNEx/tokenizer/Twokenize.py
@@ -195,7 +195,7 @@ def unprotected_tokenize(s):
return s.split()

if __name__=='__main__':
print (tokenize("RT @im_ursbro: #ChennaiFloods #'Saidapet', food available for 700-people;no.4,pilliyarkoil street,Jones road subway.call Dinesh Thomas @ 04"))
print(tokenize("RT @im_ursbro: #ChennaiFloods #'Saidapet', food available for 700-people;no.4,pilliyarkoil street,Jones road subway.call Dinesh Thomas @ 04"))

#for line in sys.stdin:
# print u" ".join(tokenize(line[:-1])).encode('utf-8')
164 changes: 164 additions & 0 deletions pytest.ipynb
@@ -0,0 +1,164 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright 2017 Hussein S. Al-Olimat, hussein@knoesis.org\n",
"\n",
"This software is released under the GNU Affero General Public License (AGPL) v3.0 License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# pytest is an example usecase of using LNEx in Python 3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install wordsegment\n",
"!pip install shapely\n",
"!pip install nltk\n",
"!pip install elasticsearch\n",
"!pip install elasticsearch_dsl\n",
"!pip install geopy"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import json, re\n",
"from shapely.geometry import MultiPoint\n",
"\n",
"import sys \n",
"sys.path.append(\"LNEx\")\n",
"import LNEx as lnex"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def read_tweets():\n",
" tweets_file = \"_Data/sample_tweets.txt\"\n",
" # read tweets from file to list\n",
" with open(tweets_file) as f:\n",
" tweets = f.read().splitlines()\n",
" return tweets\n",
"\n",
"def init_using_elasticindex(bb, cache, augmentType, dataset, capital_word_shape):\n",
" lnex.elasticindex(conn_string='localhost:9200', index_name=\"photon\")\n",
"\n",
" geo_info = lnex.initialize( bb, augmentType=augmentType,\n",
" cache=cache,\n",
" dataset_name=dataset,\n",
" capital_word_shape=capital_word_shape)\n",
" return geo_info"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initializing LNEx ...\n",
"Done Initialization ...\n"
]
}
],
"source": [
"bbs = { \"chennai\": [12.74, 80.066986084, 13.2823848224, 80.3464508057],\n",
" \"louisiana\": [29.4563, -93.3453, 31.4521, -89.5276],\n",
" \"houston\": [29.4778611958, -95.975189209, 30.1463147381, -94.8889160156],\n",
" \"columbus\": [39.808631, -83.2102799, 40.1572719, -82.7713781],\n",
" \"test\": [41.6187434973, -83.7106928844, 41.6245055116, -83.7017216664]}\n",
"\n",
"dataset = \"chennai\"\n",
"\n",
"geo_info = init_using_elasticindex(bbs[dataset], cache=False, augmentType=\"HP\", \n",
" dataset=dataset, capital_word_shape=False)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tambaram (43, 51) tambaram ['7066', '13478']\n",
"Mudichur (29, 37) mudichur ['3205']\n",
"##################################################\n",
"Jones road (64, 74) jones road ['9569', '6472']\n",
"Saidapet (0, 8) saidapet ['1180', '3771', '11613', '13880', '133', '13201']\n",
"##################################################\n",
"Chennai Central (12, 27) chennai central ['7267', '7347']\n",
"Chennai Egmore (28, 42) chennai egmore ['5346', '7768']\n",
"##################################################\n",
"New Avadi road (20, 34) new avadi road ['2741', '7133', '16966', '16786', '15324', '16791', '8', '14795', '2288']\n",
"Water tank road (39, 54) water tank rd ['5773']\n",
"##################################################\n",
"##################################################\n",
"mambalam (29, 37) mambalam ['12606']\n",
"new avadi rd (8, 20) new avadi road ['2741', '7133', '16966', '16786', '15324', '16791', '8', '14795', '2288']\n",
"chennai (21, 28) chennai ['10301', '10318']\n",
"##################################################\n",
"##################################################\n",
"avadi (4, 9) avadi ['14979', '607']\n",
"##################################################\n",
"pathur (21, 27) pathur ['10359']\n",
"##################################################\n"
]
}
],
"source": [
"for tweet in read_tweets():\n",
" # remove hashtags, urls, etc...\n",
" tweet = ' '.join(re.sub(\"(#[A-Za-z0-9]+)|([^0-9A-Za-z \\t])|(\\w+:\\/\\/\\S+)\",\" \", tweet).split())\n",
" for output in lnex.extract(tweet):\n",
" print(output[0], output[1], output[2], output[3][\"main\"])\n",
" print(\"#\"*50)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
8 changes: 4 additions & 4 deletions pytest.py
@@ -123,8 +123,8 @@ def init_using_elasticindex(bb, cache, dataset, capital_word_shape):

row = output[0], output[1], output[2], geo_point
rows.append(row)

print "-" * 120
print tabulate(rows, headers=header)
print "#" * 120
print("-" * 120)
print(tabulate(rows, headers=header))
print("#" * 120)
break
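Unlike the print (...) calls with a space before the parenthesis elsewhere in this commit, which parse in both major versions, the bare statement form fixed here is a hard SyntaxError under Python 3: pytest.py would not even import before this change. A before/after sketch:

# Python 2 statement form, a SyntaxError under Python 3:
#   print "-" * 120
#   print tabulate(rows, headers=header)

# Function form, valid in Python 3 (and in Python 2.7 for a single argument):
print("-" * 120)

# As a function, print also gains keyword arguments:
print("tambaram", "mudichur", sep=" | ", end="\n\n")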
