-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkenpom.py
executable file
·323 lines (261 loc) · 10.9 KB
/
kenpom.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
#!/usr/bin/env python
"""Scrape KenPom data for quick display.
TODO:
* Use conf list for input validation? Maybe generate list via an arg (--conf-list).
* Provide configuration object to drive display of columns? Default list is
pretty useful (but lacks tempo, luck, SOS).
"""
import argparse
import logging
import sys
from typing import List, Tuple
from urllib.parse import unquote_plus
from bs4 import BeautifulSoup, SoupStrainer
from cachetools import TTLCache, cached
import requests
from datastructures import (
CONF_NAMES,
KenPom,
KenPomDict,
MetaData,
SCHOOL_DATA_BY_ABBREV,
SCHOOL_DATA_BY_NAME,
)
# Module-level logger; parse_data() uses it to report rows it cannot map to a school.
log = logging.getLogger(__name__)
# Page downloaded by fetch_and_parse_data().
URL = 'https://kenpom.com/'
# NOTE(review): not referenced by any code visible in this module.
NUM_SCHOOLS = 363 # Total number of NCAA D1 schools
# parse_data() skips every `tr` whose child count differs from this value.
DATA_ROW_COL_COUNT = 22 # Number of data elements in tr elements w/ data we want
# write_to_console() adds this to the longest school-name length to size the rule.
HEADER_LEN = 37 # Number of `-` chars to print underneath the output header text
# Time-to-live, in seconds, of the cached fetch_and_parse_data() result; also
# reported (in minutes) by the usage text in parse_args().
CACHE_IN_SECS = 600
def main():
    """Run the fetch / filter / display cycle until the user asks to stop.

    The first filter is the command-line one when it is non-empty; otherwise
    the user is prompted for it. After every display the cycle ends when
    `--once` was given or the command-line filter was empty. In all other
    cases the user is prompted for the next filter, and entering `q`, `quit`
    or `exit` stops the loop.
    """
    args = parse_args()
    user_input = args.filter if args.filter else get_input(args.indent)

    while user_input not in ('q', 'quit', 'exit'):
        as_of, raw_data = fetch_and_parse_data()
        data, meta_data = filter_data(raw_data, user_input)
        write_to_console(data, meta_data, as_of, args.indent)

        # Nothing more to do for a one-shot run or an empty CLI filter.
        if args.only_once or not args.filter:
            break
        user_input = get_input(args.indent)
def parse_args():
    """Define the command-line interface and parse `sys.argv`.

    Returns the argparse namespace with three attributes:

    * `filter`    -- optional positional search term(s), `'25'` when omitted
    * `indent`    -- number of spaces used to offset console output (int, 0)
    * `only_once` -- True when `--once` was passed
    """
    usage_text = f"""
List, in KenPom ranked order, Division 1 men's college basketball
teams given some filter. Filters include top-n teams, conference,
team name, team abbrev (aka ESPN ticker symbol).
If no filter is provided, the program will go into a loop prompting
you for a new filter after each run. While running with in the loop
we only update data about every {CACHE_IN_SECS // 60} minutes.
Example filters:
7 List top 7 teams
acc,sec List all teams from the ACC and SEC conferences
vt,wof Compare teams by abbrev: Virginia Tech and Wofford
Valley List all teams with `valley` in the school name
School names with spaces in them can be quoted or use the + sign in
lieu of a space. That is, both of the following will work.
"virginia tech",wofford
virginia+tech, wofford"""
    # A custom usage string replaces the one argparse would generate.
    cli = argparse.ArgumentParser(usage=usage_text)

    # Optional positional: the filter expression.
    cli.add_argument(
        'filter',
        nargs='?',
        default='25',
        help='one or more (comma-separated) search terms, defaults to 25',
    )
    cli.add_argument(
        '--indent',
        type=int,
        metavar='N',
        default=0,
        help='offset console output by `N` spaces',
    )
    cli.add_argument(
        '--once',
        dest='only_once',
        action='store_true',
        help='run once and quit, bypassing the interactive loop',
    )
    return cli.parse_args()
# `maxsize` is a number of cache entries, not a byte budget (TTLCache counts
# each entry as size 1 unless a `getsizeof` callable is supplied). This
# function takes no arguments, so there is only ever one entry to keep.
@cached(cache=TTLCache(maxsize=1, ttl=CACHE_IN_SECS))
def fetch_and_parse_data():
    """Download and parse the KenPom page, reusing the result for a while.

    The parsed result is cached for `CACHE_IN_SECS` seconds so that a
    long-running process (such as PyTo on the phone) gets relatively
    up-to-date results without re-fetching the page on every query.

    Returns:
        A `(as_of, raw_data)` tuple: the "as of" text taken from the page and
        the parsed school data. Note the order is the reverse of what
        `parse_data` returns.
    """
    page_content = fetch_content(URL)
    raw_data, as_of = parse_data(page_content)
    return as_of, raw_data
def get_input(indent: int) -> str:
    """Prompt the user for a filter and return it in normalized form.

    The answer is kept as a string; `_get_filters` decides later whether it is
    a number or a list of names.

    Args:
        indent: number of spaces to put in front of the prompt.

    Returns:
        The answer with surrounding whitespace removed and lower-cased.
        An empty (or whitespace-only) answer becomes `'25'`, `'all'` becomes
        `'0'`, and a closed input stream (EOF) becomes `'quit'` so the caller's
        loop ends instead of crashing.
    """
    left_pad = indent * ' ' if indent else ''
    prompt = f'\n{left_pad}Top `n`, abbrev(s), conference(s), or school(s) [25]: '
    try:
        answer = input(prompt)
    except EOFError:
        # Ctrl-D or exhausted piped input: treat it as a request to stop.
        return 'quit'

    # Strip first so that " q " quits and "   " falls back to the default.
    user_input = answer.strip().lower() or '25'
    # Convert All input to our numerical/str equivalent
    if user_input == 'all':
        user_input = '0'
    return user_input
def fetch_content(url: str, timeout: float = 10.0) -> str:
    """Fetch the HTML content from the URL.

    Args:
        url: address of the page to download.
        timeout: seconds to wait for the server before giving up. Without a
            timeout `requests` waits indefinitely, which would hang the
            interactive loop on a stalled connection.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.Timeout: the server did not answer within `timeout` seconds.
    """
    # A browser-like User-Agent is sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:102.0) ' 'Gecko/20100101 Firefox/102.0',
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.content.decode('utf-8')
def parse_data(html_content: str) -> Tuple[KenPomDict, str]:
    """Turn the raw page HTML into a dict of `KenPom` records plus an "as of" string.

    Each usable table row becomes one `KenPom` built from the row's non-empty
    text cells, with one extra trailing item: the upper-cased school `abbrev`
    (KU, UK, UMBC, aka score ticker symbol) so users can search on it. The
    dict is keyed by the abbrev as stored in `SCHOOL_DATA_BY_NAME`.

    Returns:
        `(data, as_of)` where `as_of` is the text of the first element with
        class `update` (newlines replaced by spaces), or `''` if there is none.
    """
    update_tags = BeautifulSoup(html_content, 'lxml').find_all(class_='update')
    as_of = ''
    if update_tags:
        as_of = update_tags[0].text.strip()
    # Put the game count and the date on a single line.
    as_of = as_of.replace('\n', ' ')

    rows = BeautifulSoup(html_content, 'lxml', parse_only=SoupStrainer('tr'))
    data = {}
    for row in rows:
        # Only rows with the known number of children carry the data we want.
        if len(row) != DATA_ROW_COL_COUNT:
            continue

        # Collect the non-blank text of every child that exposes `.text`.
        text_items = []
        for cell in row:
            if not hasattr(cell, 'text'):
                continue
            cell_text = cell.text.strip()
            if cell_text:
                text_items.append(cell_text)

        # The cleaned name is written back because text_items is passed
        # positionally to the KenPom constructor below.
        text_items[1] = _massage_school_name(text_items[1])

        # The abbrev is both the dict key and an extra searchable field.
        school_data = SCHOOL_DATA_BY_NAME.get(text_items[1].lower(), {})
        if not (school_data and school_data.get('abbrev')):
            log.info(f'Bad data? text_items content: {text_items}')
            continue

        school_abbrev = school_data['abbrev']
        text_items.append(school_abbrev.upper())
        data[school_abbrev] = KenPom(*text_items)
    return data, as_of
def _massage_school_name(school_name: str) -> str:
    """Given a school name, massage the text for various peculiarities.

    * Remove the dots from school names like `Boise St.` (every dot is
      removed, not only a trailing one) so right-justified text looks better.
    * Handle "Tourney Mode": during (and after, till the start of the next
      season) the NCAA tourney KenPom puts the seed into the school name, so
      `Gonzaga` shows up as `Gonzaga 1`. The trailing seed is dropped.

    Only a final token made purely of ASCII digits counts as a seed, so tokens
    such as `-1` or `1_0` are left alone. Any amount of whitespace between the
    name and the seed is removed with it, and a name that consists of nothing
    but digits is returned unchanged rather than collapsing to an empty string.

    Args:
        school_name: the name as scraped from the page.

    Returns:
        The cleaned-up school name.
    """
    cleaned = school_name.replace('.', '').strip()

    # "NC State 1" -> ["NC State", "1"]; a single-word name yields one part.
    parts = cleaned.rsplit(None, 1)
    if len(parts) == 2:
        name_part, last_token = parts
        if last_token.isascii() and last_token.isdigit():
            return name_part
    return cleaned
def _get_filters(user_input: str) -> Tuple[List[str], int]:
    """Return filters based on user input.

    The input is either a single number (a top-N filter) or a comma-separated
    list of names (conference, school, or abbrev).

    Args:
        user_input: the raw filter text from the command line or the prompt.

    Returns:
        `([], n)` when the input is the integer `n` (0 means "all teams"),
        otherwise `(names, -1)` where `names` are the normalized search terms:
        quotes removed, `+`/percent-encoding decoded, surrounding whitespace
        stripped, lower-cased, and empty terms dropped.

    Raises:
        ValueError: the input is a negative number.
    """
    try:
        top_filter = int(user_input)
    except ValueError:
        top_filter = None

    if top_filter is not None:
        # Raised explicitly (not asserted) so the check survives `python -O`.
        if top_filter < 0:
            raise ValueError('Top `n` must be zero or greater.')
        return [], top_filter

    names = []
    for raw_term in user_input.split(','):
        # Remove any quotes used in school name input
        term = raw_term.replace('"', '').replace("'", '')
        # Decode any encoded input (mostly + for space) because sometimes we
        # start typing and don't want to go back and surround input with quotes
        term = unquote_plus(term)
        # "a, b" must behave like "a,b"; an empty term (e.g. from a trailing
        # comma) is a substring of every school name, so it is dropped.
        term = term.strip().lower()
        if term:
            names.append(term)
    return names, -1
def filter_data(data: KenPomDict, user_input: str) -> Tuple[KenPomDict, MetaData]:
    """Pick the schools to display for the given user input.

    Selection rules, first match wins:

    1. top filter of 0     -> everything (`data` itself is returned)
    2. top filter above 0  -> schools whose rank is at most that number
    3. any term is a known abbrev     -> schools with those abbrevs
    4. any term is a conference name  -> schools in those conferences
    5. otherwise -> schools whose lower-cased name contains any of the terms

    Returns:
        `(filtered_data, meta_data)`; `meta_data` carries `max_name_len`,
        `names`, `num_teams` and `top_filter` for the display code.
    """
    names, top_filter = _get_filters(user_input)

    if top_filter == 0:
        filtered_data = data
    elif top_filter > 0:
        filtered_data = {
            abbrev: team for abbrev, team in data.items() if team.rank <= top_filter
        }
    else:
        wanted = set(names)
        abbrevs = SCHOOL_DATA_BY_ABBREV.keys() & wanted
        if abbrevs:
            filtered_data = {
                abbrev: team for abbrev, team in data.items() if abbrev in abbrevs
            }
        else:
            conf_names = CONF_NAMES.intersection(wanted)
            if conf_names:
                filtered_data = {
                    abbrev: team
                    for abbrev, team in data.items()
                    if team.conf.lower() in conf_names
                }
            else:
                # Fall back to a substring match on the full school name.
                filtered_data = {
                    abbrev: team
                    for abbrev, team in data.items()
                    if any(term in team.name.lower() for term in names)
                }

    # The longest school name drives the right-justified console layout;
    # bogus input leaves nothing to measure, hence the default of 0.
    longest_name = max((len(team.name) for team in filtered_data.values()), default=0)

    meta_data = {
        'max_name_len': longest_name,
        'names': names,
        'num_teams': len(filtered_data),
        'top_filter': top_filter,
    }
    return filtered_data, meta_data
def write_to_console(
    data: KenPomDict, meta: MetaData, as_of: str, indent: int = 0
) -> Tuple[KenPomDict, MetaData]:
    """Print the filtered teams as a right-justified table on standard out.

    Output is a header row, a dashed rule, one row per team in `data`, and a
    footer with the `as_of` text. Every line is offset by `indent` spaces.
    The `data` and `meta` arguments are returned unchanged.
    """
    pad = indent * ' ' if indent else ''
    name_width = meta['max_name_len']
    row_template = (
        '{left_pad}{team:>{len}} {abbrev:>5} {rank:>5} {off_rank:>3} /{def_rank:>4} '
        '{record:>6} {conf:>5}'
    )

    def print_row(**columns):
        """Print one table line using the shared width and left padding."""
        print(row_template.format(len=name_width, left_pad=pad, **columns))

    # Header text ...
    print_row(
        abbrev='Abbrev',
        team='Team',
        rank='Rank',
        off_rank='Off',
        def_rank='Def',
        record='Rec',
        conf='Conf',
    )
    # ... underlined by a rule sized to the longest school name.
    print(pad + '-' * (name_width + HEADER_LEN))

    # Data ...
    for entry in data.values():
        print_row(
            abbrev=entry.abbrev,
            team=entry.name,
            rank=entry.rank,
            off_rank=entry.off_rank,
            def_rank=entry.def_rank,
            record=entry.record,
            conf=entry.conf,
        )

    # Footer (as of date)
    print(f'\n{pad}{as_of}\n')
    return data, meta
if __name__ == '__main__':
    # Refuse to run on anything older than 3.8. Comparing the version tuple
    # covers every older interpreter, not only 3.x releases below 3.8.
    if sys.version_info < (3, 8):
        print('This requires Python 3.8 or higher.')
        sys.exit(1)
    main()