-
Notifications
You must be signed in to change notification settings - Fork 0
/
category_lookup_table.py
227 lines (190 loc) · 8.71 KB
/
category_lookup_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
from pathlib import Path
from typing import Optional, Mapping, List
import nltk
from nltk.tag.perceptron import PerceptronTagger
from ruamel.yaml import YAML
from category import Category
# Shared ruamel.yaml handle constructed with typ="safe"; the classes in this module use it
# for both loading and dumping their config files.
yaml = YAML(typ="safe")
# Type alias: a mapping from a string key to a Category.
CategoryMap = Mapping[str, Category]
# Static lookup from a category string to a Category member. The section comments below
# label which strings are Chase strings and which are AMEX strings (presumably the category
# text found in each bank's exported history -- confirm against the callers). The empty
# string maps to Category.Unknown.
STRING_CATEGORY_MAP: CategoryMap = {
    "": Category.Unknown,
    # Chase strings
    "Automotive": Category.Automotive,
    "Bills & Utilities": Category.BillsUtilities,
    "Education": Category.Education,
    "Entertainment": Category.Entertainment,
    "Fees & Adjustments": Category.FeesAdjustments,
    "Food & Drink": Category.Food,
    "Gas": Category.Gas,
    "Gifts & Donations": Category.Gifts,
    "Groceries": Category.Groceries,
    "Health & Wellness": Category.Health,
    "Home": Category.Home,
    "Miscellaneous": Category.Miscellaneous,
    "Personal": Category.Personal,
    "Professional Services": Category.ProfessionalServices,
    "Shopping": Category.Shopping,
    "Travel": Category.Travel,
    # AMEX strings
    "Business Services-Advertising Services": Category.Entertainment,
    "Business Services-Health Care Services": Category.Health,
    "Business Services-Mailing & Shipping": Category.ProfessionalServices,
    "Business Services-Other Services": Category.Miscellaneous,
    "Business Services-Printing & Publishing": Category.ProfessionalServices,
    "Business Services-Professional Services": Category.ProfessionalServices,
    "Communications-Cable & Internet Comm": Category.Entertainment,
    "Entertainment-Associations": Category.Entertainment,
    "Entertainment-General Attractions": Category.Entertainment,
    "Entertainment-General Events": Category.Entertainment,
    "Entertainment-Other Entertainment": Category.Entertainment,
    "Entertainment-Theatrical Events": Category.Entertainment,
    "Entertainment-Theme Parks": Category.Entertainment,
    "Fees & Adjustments-Fees & Adjustments": Category.FeesAdjustments,
    "Merchandise & Supplies-Appliance Stores": Category.Shopping,
    "Merchandise & Supplies-Arts & Jewelry": Category.Shopping,
    "Merchandise & Supplies-Book Stores": Category.Shopping,
    "Merchandise & Supplies-Clothing Stores": Category.Shopping,
    "Merchandise & Supplies-Computer Supplies": Category.Shopping,
    "Merchandise & Supplies-Department Stores": Category.Shopping,
    "Merchandise & Supplies-Florists & Garden": Category.Shopping,
    "Merchandise & Supplies-Furnishing": Category.Shopping,
    "Merchandise & Supplies-General Retail": Category.Shopping,
    "Merchandise & Supplies-Groceries": Category.Groceries,
    "Merchandise & Supplies-Hardware Supplies": Category.Shopping,
    "Merchandise & Supplies-Internet Purchase": Category.Shopping,
    "Merchandise & Supplies-Mail Order": Category.Entertainment,
    "Merchandise & Supplies-Pharmacies": Category.Health,
    "Merchandise & Supplies-Sporting Goods Stores": Category.Shopping,
    "Merchandise & Supplies-Wholesale Stores": Category.Shopping,
    "Other-Government Services": Category.ProfessionalServices,
    "Other-Miscellaneous": Category.Miscellaneous,
    "Restaurant-Bar & Café": Category.Food,
    "Restaurant-Restaurant": Category.Food,
    "Transportation-Auto Services": Category.Automotive,
    # NOTE(review): the Chase "Gas" string maps to Category.Gas while this fuel entry maps to
    # Category.Automotive -- confirm the difference is intentional.
    "Transportation-Fuel": Category.Automotive,
    "Transportation-Other Transportation": Category.Travel,
    "Transportation-Parking Charges": Category.ProfessionalServices,
    "Transportation-Taxis & Coach": Category.Travel,
    "Travel-Airline": Category.Travel,
    "Travel-Lodging": Category.Travel,
    "Travel-Travel Agencies": Category.Travel,
}
class CategoryHinter:
    """
    Maintains a config of keywords to category mapping.

    These categories can be learned either directly from a bank history or manually by the user.
    The config is read from a YAML file on construction and written back by ``flush``, which is
    also invoked when a ``with`` block is exited.
    """

    def __init__(self, config_path: Path) -> None:
        """Initializes the config.

        Params:
            config_path: The path to the config file that this class maintains.
        """
        self._config_path = config_path
        if not config_path.exists():
            # Create an empty file so that the load below has something to read.
            with config_path.open("w", encoding="utf-8"):
                pass
        # A file with no content loads as a falsy value; fall back to an empty dict.
        self._config = yaml.load(config_path) or {}

    def __enter__(self) -> "CategoryHinter":
        """Enter context."""
        return self

    def __exit__(self, *_args) -> None:
        """Exit context; flush contents to file."""
        self.flush()

    def flush(self) -> None:
        """Flushes the table to file."""
        yaml.dump(self._config, self._config_path)

    def _split_description(self, description: str) -> List[str]:
        """Tokenizes the description and strips non alphanumeric characters/whitespaces.

        Every character that is not alphanumeric, whitespace or "&" is replaced by a space,
        then the text is split on any run of whitespace (spaces, tabs, newlines, ...).
        A token consisting solely of "&" is dropped.

        Returns:
            The list of words, in the order they appear in the description.
        """
        cleaned = "".join(
            c if c.isalnum() or c.isspace() or c == "&" else " " for c in description
        )
        # split() without arguments splits on every whitespace character and never yields
        # empty strings, so each token here is made only of alphanumerics and "&".
        return [word for word in cleaned.split() if word != "&"]

    def hint(self, description: str) -> Optional[CategoryMap]:
        """
        Scans the words in the description to determine if there are any key words that are in the config.
        If there are, return the categories mapped to each key word found.

        Returns:
            A map of each matching word to its configured category; empty when nothing matches.
        """
        return {
            word: self._config[word]
            for word in self._split_description(description)
            if word in self._config
        }

    def store(self, key: str, category: Category) -> None:
        """Stores a key to category mapping in the table.

        Raises:
            RuntimeError: If the key contains any whitespace character. Lookups in ``hint``
                are done per whitespace-free word, so such a key could never be matched.
        """
        if any(c.isspace() for c in key):
            raise RuntimeError(f"{key} can not contain whitespaces, it must be 1 word")
        self._config[key] = category

    def build_hints(self, description_category_map: CategoryMap, do_flush: bool) -> None:
        """
        Builds a category map of individual words from the input category map of descriptions.

        Params:
            description_category_map : A map of descriptions to categories.
            do_flush : True to write to file, false to print results.
        """
        # Ensure the necessary databases are downloaded
        nltk.download("punkt", quiet=True)
        nltk.download("averaged_perceptron_tagger", quiet=True)

        # The part of speech tagger instance
        tagger = PerceptronTagger()

        def criteria(word: str) -> bool:
            """
            Each word must meet this criteria to be eligible for hinting.
            1. The word must not be a single letter.
            2. The word must be a noun.
            """
            # Cheap length check first so the tagger only runs on candidate words.
            if len(word) <= 1:
                return False
            tag = tagger.tag([word])[0][1]
            # https://stackoverflow.com/questions/15388831/what-are-all-possible-pos-tags-of-nltk/15389153
            return tag.startswith("NN")

        # When the same word occurs in several descriptions, the last category seen wins.
        hints: dict[str, Category] = {}
        for description, category in description_category_map.items():
            for word in self._split_description(description):
                if criteria(word):
                    hints[word] = category

        new_count = sum(1 for word in hints if word not in self._config)
        print(f"{new_count} new hints discovered")

        # Merge the current set of hints with the new set
        self._config.update(hints)
        print(f"{len(self._config)} total hints")

        if not do_flush:
            print(self._config)
            return

        # Write to file
        self.flush()
class CategoryLookupTable:
    """
    Builds and maintains a table of keywords to category mapping.

    The table is read from a YAML file on construction and written back by ``flush``,
    which also runs when a ``with`` block is exited.
    """

    def __init__(self, config_path: Path) -> None:
        """Loads the table from file, creating an empty file first when none exists.

        Params:
            config_path: The path to the config file that this class maintains.
        """
        self._config_path = config_path
        if not config_path.exists():
            # Opening for writing and closing straight away leaves an empty file behind.
            config_path.open("w", encoding="utf-8").close()
        loaded = yaml.load(config_path)
        # A falsy load result (e.g. from an empty file) is replaced by an empty table.
        self._table: CategoryMap = loaded if loaded else {}

    def __enter__(self) -> "CategoryLookupTable":
        """Enter context."""
        return self

    def __exit__(self, *_exc_info) -> None:
        """Exit context; flush contents to file."""
        self.flush()

    def flush(self) -> None:
        """Writes the current table to the config file."""
        yaml.dump(self._table, self._config_path)

    def load(self, key: str) -> Optional[Category]:
        """Returns the category stored for the key, or None when the key is absent."""
        return self._table.get(key, None)

    def store(self, key: str, category: Category) -> None:
        """Records a key to category mapping in the table.

        Raises:
            RuntimeError: If the category is Category.Unknown.
        """
        is_unknown = category == Category.Unknown
        if is_unknown:
            raise RuntimeError(f"Trying to store {key} with an unknown category")
        self._table[key] = category