diff --git a/LNEx/Language_Modeling.py b/LNEx/Language_Modeling.py
index 870f133..cce9c54 100644
--- a/LNEx/Language_Modeling.py
+++ b/LNEx/Language_Modeling.py
@@ -157,7 +157,7 @@ def __init__(self, geo_locations):
     lm = GazBasedModel(geo_locations)
-    print (lm.phrase_probability("new"))
+    print(lm.phrase_probability("new"))
     lm.phrase_probability("new avadi")
     lm.phrase_probability("new avadi road")
diff --git a/LNEx/__init__.py b/LNEx/__init__.py
index 1de0f8d..3576b16 100644
--- a/LNEx/__init__.py
+++ b/LNEx/__init__.py
@@ -5,12 +5,12 @@ v3.0 License.
 #############################################################################"""
-import core
-import osm_gazetteer
 import os, json
-
 import elasticsearch
+from . import core
+from . import osm_gazetteer
+
 ################################################################################
 ################################################################################
diff --git a/LNEx/core.py b/LNEx/core.py
index ebfbdec..89ca7e1 100644
--- a/LNEx/core.py
+++ b/LNEx/core.py
@@ -20,8 +20,8 @@ load()
 # importing local modules
-import Language_Modeling
-from tokenizer import Twokenize
+from . import Language_Modeling
+from .tokenizer import Twokenize
 ################################################################################
 ################################################################################
@@ -245,7 +245,7 @@ def build_tree(glm, ts):
         flattened = list(flatten(i))
         # remove consecutive duplicates
-        final_list = map(itemgetter(0), groupby(flattened))
+        final_list = list(map(itemgetter(0), groupby(flattened)))
         # prune based on the probability from the language model
         p = " ".join(final_list)
@@ -684,7 +684,7 @@ def filterout_overlaps(valid_ngrams):
 def find_ngrams(input_list, n):
     '''Generates grams of length (n) from the list of unigrams (input_list)'''
-    return zip(*[input_list[i:] for i in range(n)])
+    return list(zip(*[input_list[i:] for i in range(n)]))
 ################################################################################
@@ -817,7 +817,7 @@ def __init__(self, geo_locations, extended_words3):
         ########################################################################
         # list of unigrams
-        unigrams = self.glm.unigrams["words"].keys()
+        unigrams = list(self.glm.unigrams["words"].keys())
         self.stopwords_notin_gazetteer = set(
             self.extended_words3) - set(unigrams)
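Note: the list(map(...)), list(zip(...)), and list(dict.keys()) wrappers in core.py above, and in the files below, all address the same Python 3 change: map(), zip(), and dict.keys() no longer return lists but lazy, single-pass iterators or views. Results that get indexed, serialized, or traversed twice must be materialized first. A minimal standalone sketch of the pitfall (illustrative only, not part of the patch):

    from itertools import groupby
    from operator import itemgetter

    tokens = ["new", "new", "avadi", "road", "road"]

    # Python 3: map() yields a lazy, single-pass iterator
    dedup = map(itemgetter(0), groupby(tokens))
    print(" ".join(dedup))  # new avadi road
    print(list(dedup))      # [] -- the iterator is already exhausted

    # wrapping in list() restores the reusable Python 2 behavior
    dedup = list(map(itemgetter(0), groupby(tokens)))
    print(" ".join(dedup))  # new avadi road
    print(dedup[0])         # 'new' -- indexable and reusable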
diff --git a/LNEx/gaz_augmentation_and_filtering.py b/LNEx/gaz_augmentation_and_filtering.py
index 718e3b5..326d3f1 100644
--- a/LNEx/gaz_augmentation_and_filtering.py
+++ b/LNEx/gaz_augmentation_and_filtering.py
@@ -193,7 +193,7 @@ def preprocess_name(loc_name):
 def find_ngrams(unigrams, n):
     '''Created ngrams of length n from the unigrams list'''
-    return zip(*[unigrams[i:] for i in range(n)])
+    return list(zip(*[unigrams[i:] for i in range(n)]))
 ################################################################################
@@ -231,7 +231,7 @@ def high_precision_filtering(geo_locations):
     new_geo_locations = defaultdict(lambda: defaultdict(set))
     for text in geo_locations:
-
+
         original_text = text
         text = text.replace("( ", "(").replace(" )", ")").lower()
@@ -252,7 +252,7 @@ def high_precision_filtering(geo_locations):
         text = re.sub('\s{2,}', ' ', text).strip()
-        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
+        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('UTF-8')
         text = str(text.strip())
         # skip few names
@@ -481,7 +481,7 @@ def augment(geo_locations):
             base_name[1:1] = f_name
             # remove consecutive duplicate tokens
-            base_name = map(itemgetter(0), groupby(base_name))
+            base_name = list(map(itemgetter(0), groupby(base_name)))
             flexi_grams.append(" ".join(base_name))
@@ -504,4 +504,4 @@ def augment(geo_locations):
         new_geo_locations[new_name]["meta"] = \
             set(new_geo_locations[name]["meta"]).union(new_geo_locations[new_name]["meta"])
-    return new_geo_locations, get_extended_words3(new_geo_locations.keys())
+    return new_geo_locations, get_extended_words3(list(new_geo_locations.keys()))
diff --git a/LNEx/geo_calculations.py b/LNEx/geo_calculations.py
index 3353573..db57c07 100644
--- a/LNEx/geo_calculations.py
+++ b/LNEx/geo_calculations.py
@@ -56,4 +56,4 @@ def is_bb_acceptable(bb):
     texas_bb = [25.8371638, -106.6456461, 36.5007041, -93.5080389]
-    print (is_bb_acceptable(texas_bb))
+    print(is_bb_acceptable(texas_bb))
diff --git a/LNEx/osm_gazetteer.py b/LNEx/osm_gazetteer.py
index 05844af..0d99f38 100644
--- a/LNEx/osm_gazetteer.py
+++ b/LNEx/osm_gazetteer.py
@@ -5,17 +5,15 @@ v3.0 License.
 #############################################################################"""
+import json
 from collections import defaultdict
-from elasticsearch import Elasticsearch
 from elasticsearch_dsl import Search, Q
 from elasticsearch_dsl.connections import connections
 import geo_calculations
 import gaz_augmentation_and_filtering
-import json
-
 ################################################################################
 ################################################################################
@@ -51,16 +49,16 @@ def search_index(bb):
     if connection_string == '' or index_name == '':
-        print ("\n###########################################################")
-        print ("Global ERROR: Elastic host and port or index name not defined")
-        print ("#############################################################\n")
+        print("\n###########################################################")
+        print("Global ERROR: Elastic host and port or index name not defined")
+        print("#############################################################\n")
         exit()
     if not geo_calculations.is_bb_acceptable(bb) or bb[0] > bb[2] or bb[1] > bb[3]:
-        print ("\n##########################################################")
-        print ("Global ERROR: Bounding Box is too big, choose a smaller one!")
-        print ("############################################################\n")
+        print("\n##########################################################")
+        print("Global ERROR: Bounding Box is too big, choose a smaller one!")
+        print("############################################################\n")
         exit()
     connections.create_connection(hosts=[connection_string], timeout=60)
@@ -199,11 +197,11 @@ def build_bb_gazetteer(bb, augmentType):
     elif augmentType == "NA": # None
         new_geo_locations = gaz_augmentation_and_filtering.filter_geo_locations(geo_locations)
-        extended_words3 = gaz_augmentation_and_filtering.get_extended_words3(new_geo_locations.keys())
+        extended_words3 = gaz_augmentation_and_filtering.get_extended_words3(list(new_geo_locations.keys()))
     elif augmentType == "HP": # High Precision Filtering
         new_geo_locations = gaz_augmentation_and_filtering.high_precision_filtering(geo_locations)
-        extended_words3 = gaz_augmentation_and_filtering.get_extended_words3(new_geo_locations.keys())
+        extended_words3 = gaz_augmentation_and_filtering.get_extended_words3(list(new_geo_locations.keys()))
     # for serialization
     geo_info = dict(geo_info)
@@ -230,4 +228,4 @@ def build_bb_gazetteer(bb, augmentType):
     geo_locations, geo_info, extended_words3 = build_bb_gazetteer(bb)
-    print (json.dumps(dict(geo_locations), indent=2))
+    print(json.dumps(dict(geo_locations), indent=2))
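Note: the .decode('UTF-8') added in high_precision_filtering above fixes a second Python 3 change: str.encode() now returns bytes, so the subsequent str(text.strip()) would bake the b'...' repr into the location name. Decoding back to str keeps the ASCII-folded name clean. A standalone sketch (the sample string is illustrative):

    import unicodedata

    name = "Anna Salai \u2013 Chennai"  # contains a non-ASCII en dash

    # without decoding, the bytes repr leaks into the name
    folded = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore')
    print(str(folded))  # b'Anna Salai  Chennai'

    # decoding restores a clean str, matching the patched line
    folded = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('UTF-8')
    print(str(folded))  # Anna Salai  Chennai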
diff --git a/LNEx/tokenizer/Twokenize.py b/LNEx/tokenizer/Twokenize.py
index d8e5b1c..d00f7ce 100644
--- a/LNEx/tokenizer/Twokenize.py
+++ b/LNEx/tokenizer/Twokenize.py
@@ -195,7 +195,7 @@ def unprotected_tokenize(s):
     return s.split()
 if __name__=='__main__':
-    print (tokenize("RT @im_ursbro: #ChennaiFloods #'Saidapet', food available for 700-people;no.4,pilliyarkoil street,Jones road subway.call Dinesh Thomas @ 04"))
+    print(tokenize("RT @im_ursbro: #ChennaiFloods #'Saidapet', food available for 700-people;no.4,pilliyarkoil street,Jones road subway.call Dinesh Thomas @ 04"))
     #for line in sys.stdin:
     #    print u" ".join(tokenize(line[:-1])).encode('utf-8')
diff --git a/pytest.ipynb b/pytest.ipynb
new file mode 100644
index 0000000..2eabdad
--- /dev/null
+++ b/pytest.ipynb
@@ -0,0 +1,164 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Copyright 2017 Hussein S. Al-Olimat, hussein@knoesis.org\n",
+    "\n",
+    "This software is released under the GNU Affero General Public License (AGPL) v3.0 License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# pytest is an example use case of using LNEx in Python 3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install wordsegment\n",
+    "!pip install shapely\n",
+    "!pip install nltk\n",
+    "!pip install elasticsearch\n",
+    "!pip install elasticsearch_dsl\n",
+    "!pip install geopy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json, re\n",
+    "from shapely.geometry import MultiPoint\n",
+    "\n",
+    "import sys \n",
+    "sys.path.append(\"LNEx\")\n",
+    "import LNEx as lnex"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def read_tweets():\n",
+    "    tweets_file = \"_Data/sample_tweets.txt\"\n",
+    "    # read tweets from file to list\n",
+    "    with open(tweets_file) as f:\n",
+    "        tweets = f.read().splitlines()\n",
+    "    return tweets\n",
+    "\n",
+    "def init_using_elasticindex(bb, cache, augmentType, dataset, capital_word_shape):\n",
+    "    lnex.elasticindex(conn_string='localhost:9200', index_name=\"photon\")\n",
+    "\n",
+    "    geo_info = lnex.initialize( bb, augmentType=augmentType,\n",
+    "                                cache=cache,\n",
+    "                                dataset_name=dataset,\n",
+    "                                capital_word_shape=capital_word_shape)\n",
+    "    return geo_info"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Initializing LNEx ...\n",
+      "Done Initialization ...\n"
+     ]
+    }
+   ],
+   "source": [
+    "bbs = { \"chennai\": [12.74, 80.066986084, 13.2823848224, 80.3464508057],\n",
+    "        \"louisiana\": [29.4563, -93.3453, 31.4521, -89.5276],\n",
+    "        \"houston\": [29.4778611958, -95.975189209, 30.1463147381, -94.8889160156],\n",
+    "        \"columbus\": [39.808631, -83.2102799, 40.1572719, -82.7713781],\n",
+    "        \"test\": [41.6187434973, -83.7106928844, 41.6245055116, -83.7017216664]}\n",
+    "\n",
+    "dataset = \"chennai\"\n",
+    "\n",
+    "geo_info = init_using_elasticindex(bbs[dataset], cache=False, augmentType=\"HP\", \n",
+    "                                   dataset=dataset, capital_word_shape=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tambaram (43, 51) tambaram ['7066', '13478']\n",
+      "Mudichur (29, 37) mudichur ['3205']\n",
+      "##################################################\n",
+      "Jones road (64, 74) jones road ['9569', '6472']\n",
+      "Saidapet (0, 8) saidapet ['1180', '3771', '11613', '13880', '133', '13201']\n",
+      "##################################################\n",
+      "Chennai Central (12, 27) chennai central ['7267', '7347']\n",
+      "Chennai Egmore (28, 42) chennai egmore ['5346', '7768']\n",
+      "##################################################\n",
+      "New Avadi road (20, 34) new avadi road ['2741', '7133', '16966', '16786', '15324', '16791', '8', '14795', '2288']\n",
+      "Water tank road (39, 54) water tank rd ['5773']\n",
+      "##################################################\n",
+      "##################################################\n",
+      "mambalam (29, 37) mambalam ['12606']\n",
+      "new avadi rd (8, 20) new avadi road ['2741', '7133', '16966', '16786', '15324', '16791', '8', '14795', '2288']\n",
+      "chennai (21, 28) chennai ['10301', '10318']\n",
+      "##################################################\n",
+      "##################################################\n",
+      "avadi (4, 9) avadi ['14979', '607']\n",
+      "##################################################\n",
+      "pathur (21, 27) pathur ['10359']\n",
+      "##################################################\n"
+     ]
+    }
+   ],
+   "source": [
+    "for tweet in read_tweets():\n",
+    "    # remove hashtags, urls, etc...\n",
+    "    tweet = ' '.join(re.sub(\"(#[A-Za-z0-9]+)|([^0-9A-Za-z \\t])|(\\w+:\\/\\/\\S+)\",\" \", tweet).split())\n",
+    "    for output in lnex.extract(tweet):\n",
+    "        print(output[0], output[1], output[2], output[3][\"main\"])\n",
+    "    print(\"#\"*50)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/pytest.py b/pytest.py
index 300d916..e205734 100644
--- a/pytest.py
+++ b/pytest.py
@@ -123,8 +123,8 @@ def init_using_elasticindex(bb, cache, dataset, capital_word_shape):
             row = output[0], output[1], output[2], geo_point
             rows.append(row)
-
-            print "-" * 120
-            print tabulate(rows, headers=header)
-            print "#" * 120
+
+            print("-" * 120)
+            print(tabulate(rows, headers=header))
+            print("#" * 120)
             break
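Note: one more Python 3 semantic behind the .keys() wrappers in core.py, gaz_augmentation_and_filtering.py, and osm_gazetteer.py above: dict.keys() returns a dynamic view. Set algebra on views still works (as in set(self.extended_words3) - set(unigrams)), but indexing or JSON-serializing a view does not, hence the list() calls at those sites. A minimal sketch:

    unigrams = {"new": 3, "avadi": 2, "road": 5}

    keys = unigrams.keys()        # dict_keys view in Python 3, not a list
    # keys[0]                     # TypeError: 'dict_keys' object is not subscriptable

    keys = list(unigrams.keys())  # materialize before indexing or serializing
    print(keys[0])                # new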