Skip to content

Commit 7b190b1

Browse files
authored
Add notebook
1 parent 2173563 commit 7b190b1

File tree

1 file changed

+372
-0
lines changed

1 file changed

+372
-0
lines changed

Outbreaks_Headlines_1.ipynb

Lines changed: 372 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,372 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 4,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
"import pandas as pd\n",
"\n",
"# Assumes data/headlines.txt holds one headline per line.\n",
"# Read the file directly instead of using pd.read_fwf: read_fwf infers a header\n",
"# row by default, so the first headline would be consumed as the column name and\n",
"# then lost when the column is renamed. It also guesses fixed-width column\n",
"# breaks, which is not meaningful for free-text headlines.\n",
"with open('data/headlines.txt', encoding='utf-8') as headline_file:\n",
"    headlines = [line.strip() for line in headline_file if line.strip()]\n",
"\n",
"df = pd.DataFrame({'headline': headlines})"
11+
]
12+
},
13+
{
14+
"cell_type": "code",
15+
"execution_count": 29,
16+
"metadata": {},
17+
"outputs": [
18+
{
19+
"data": {
20+
"text/html": [
21+
"<div>\n",
22+
"<style scoped>\n",
23+
" .dataframe tbody tr th:only-of-type {\n",
24+
" vertical-align: middle;\n",
25+
" }\n",
26+
"\n",
27+
" .dataframe tbody tr th {\n",
28+
" vertical-align: top;\n",
29+
" }\n",
30+
"\n",
31+
" .dataframe thead th {\n",
32+
" text-align: right;\n",
33+
" }\n",
34+
"</style>\n",
35+
"<table border=\"1\" class=\"dataframe\">\n",
36+
" <thead>\n",
37+
" <tr style=\"text-align: right;\">\n",
38+
" <th></th>\n",
39+
" <th>headline</th>\n",
40+
" </tr>\n",
41+
" </thead>\n",
42+
" <tbody>\n",
43+
" <tr>\n",
44+
" <td>0</td>\n",
45+
" <td>Could Zika Reach New York City?</td>\n",
46+
" </tr>\n",
47+
" <tr>\n",
48+
" <td>1</td>\n",
49+
" <td>First Case of Zika in Miami Beach</td>\n",
50+
" </tr>\n",
51+
" <tr>\n",
52+
" <td>2</td>\n",
53+
" <td>Mystery Virus Spreads in Recife, Brazil</td>\n",
54+
" </tr>\n",
55+
" <tr>\n",
56+
" <td>3</td>\n",
57+
" <td>Dallas man comes down with case of Zika</td>\n",
58+
" </tr>\n",
59+
" <tr>\n",
60+
" <td>4</td>\n",
61+
" <td>Trinidad confirms first Zika case</td>\n",
62+
" </tr>\n",
63+
" </tbody>\n",
64+
"</table>\n",
65+
"</div>"
66+
],
67+
"text/plain": [
68+
" headline\n",
69+
"0 Could Zika Reach New York City?\n",
70+
"1 First Case of Zika in Miami Beach\n",
71+
"2 Mystery Virus Spreads in Recife, Brazil\n",
72+
"3 Dallas man comes down with case of Zika\n",
73+
"4 Trinidad confirms first Zika case"
74+
]
75+
},
76+
"execution_count": 29,
77+
"metadata": {},
78+
"output_type": "execute_result"
79+
}
80+
],
81+
"source": [
82+
"df.head()"
83+
]
84+
},
85+
{
86+
"cell_type": "code",
87+
"execution_count": 46,
88+
"metadata": {},
89+
"outputs": [],
90+
"source": [
"import geonamescache\n",
"\n",
"# Fetch the country and city lookup tables from geonamescache.\n",
"gc = geonamescache.GeonamesCache()\n",
"countries, cities = gc.get_countries(), gc.get_cities()"
96+
]
97+
},
98+
{
99+
"cell_type": "code",
100+
"execution_count": 58,
101+
"metadata": {},
102+
"outputs": [],
103+
"source": [
"# Collect the 'name' field of every country record, in key order.\n",
"country_ids = list(countries.keys())\n",
"country_names = [countries[country_id]['name'] for country_id in country_ids]"
108+
]
109+
},
110+
{
111+
"cell_type": "code",
112+
"execution_count": 60,
113+
"metadata": {},
114+
"outputs": [
115+
{
116+
"data": {
117+
"text/plain": [
118+
"252"
119+
]
120+
},
121+
"execution_count": 60,
122+
"metadata": {},
123+
"output_type": "execute_result"
124+
}
125+
],
126+
"source": [
127+
"len(country_names)"
128+
]
129+
},
130+
{
131+
"cell_type": "code",
132+
"execution_count": 73,
133+
"metadata": {},
134+
"outputs": [
135+
{
136+
"data": {
137+
"text/plain": [
138+
"['Andorra',\n",
139+
" 'United Arab Emirates',\n",
140+
" 'Afghanistan',\n",
141+
" 'Antigua and Barbuda',\n",
142+
" 'Anguilla',\n",
143+
" 'Albania',\n",
144+
" 'Armenia',\n",
145+
" 'Angola',\n",
146+
" 'Antarctica',\n",
147+
" 'Argentina',\n",
148+
" 'American Samoa',\n",
149+
" 'Austria',\n",
150+
" 'Australia',\n",
151+
" 'Aruba',\n",
152+
" 'Aland Islands',\n",
153+
" 'Azerbaijan',\n",
154+
" 'Bosnia and Herzegovina',\n",
155+
" 'Barbados',\n",
156+
" 'Bangladesh',\n",
157+
" 'Belgium']"
158+
]
159+
},
160+
"execution_count": 73,
161+
"metadata": {},
162+
"output_type": "execute_result"
163+
}
164+
],
165+
"source": [
166+
"country_names[:20]"
167+
]
168+
},
169+
{
170+
"cell_type": "code",
171+
"execution_count": 68,
172+
"metadata": {},
173+
"outputs": [],
174+
"source": [
"# Collect the 'name' field of every city record, in key order.\n",
"city_ids = list(cities.keys())\n",
"city_names = [cities[city_id]['name'] for city_id in city_ids]"
179+
]
180+
},
181+
{
182+
"cell_type": "code",
183+
"execution_count": 70,
184+
"metadata": {},
185+
"outputs": [
186+
{
187+
"data": {
188+
"text/plain": [
189+
"24336"
190+
]
191+
},
192+
"execution_count": 70,
193+
"metadata": {},
194+
"output_type": "execute_result"
195+
}
196+
],
197+
"source": [
198+
"len(city_names)"
199+
]
200+
},
201+
{
202+
"cell_type": "code",
203+
"execution_count": 94,
204+
"metadata": {},
205+
"outputs": [
206+
{
207+
"data": {
208+
"text/plain": [
209+
"['Andorra la Vella',\n",
210+
" 'Umm Al Quwain City',\n",
211+
" 'Ras Al Khaimah City',\n",
212+
" 'Zayed City',\n",
213+
" 'Khawr Fakkān',\n",
214+
" 'Dubai',\n",
215+
" 'Dibba Al-Fujairah',\n",
216+
" 'Dibba Al-Hisn',\n",
217+
" 'Sharjah',\n",
218+
" 'Ar Ruways',\n",
219+
" 'Al Fujairah City',\n",
220+
" 'Al Ain City',\n",
221+
" 'Ajman City',\n",
222+
" 'Adh Dhayd',\n",
223+
" 'Abu Dhabi',\n",
224+
" 'Khalifah A City',\n",
225+
" 'Bani Yas City',\n",
226+
" 'Musaffah',\n",
227+
" 'Al Shamkhah City',\n",
228+
" 'Reef Al Fujairah City']"
229+
]
230+
},
231+
"execution_count": 94,
232+
"metadata": {},
233+
"output_type": "execute_result"
234+
}
235+
],
236+
"source": [
237+
"city_names[:20]"
238+
]
239+
},
240+
{
241+
"cell_type": "code",
242+
"execution_count": 89,
243+
"metadata": {},
244+
"outputs": [],
245+
"source": [
"from unidecode import unidecode\n",
"\n",
"# ASCII-fold every city name (e.g. 'Khawr Fakkān' -> 'Khawr Fakkan').\n",
"city_names_unidecoded = list(map(unidecode, city_names))"
249+
]
250+
},
251+
{
252+
"cell_type": "code",
253+
"execution_count": 93,
254+
"metadata": {},
255+
"outputs": [
256+
{
257+
"data": {
258+
"text/plain": [
259+
"['Andorra la Vella',\n",
260+
" 'Umm Al Quwain City',\n",
261+
" 'Ras Al Khaimah City',\n",
262+
" 'Zayed City',\n",
263+
" 'Khawr Fakkan',\n",
264+
" 'Dubai',\n",
265+
" 'Dibba Al-Fujairah',\n",
266+
" 'Dibba Al-Hisn',\n",
267+
" 'Sharjah',\n",
268+
" 'Ar Ruways',\n",
269+
" 'Al Fujairah City',\n",
270+
" 'Al Ain City',\n",
271+
" 'Ajman City',\n",
272+
" 'Adh Dhayd',\n",
273+
" 'Abu Dhabi',\n",
274+
" 'Khalifah A City',\n",
275+
" 'Bani Yas City',\n",
276+
" 'Musaffah',\n",
277+
" 'Al Shamkhah City',\n",
278+
" 'Reef Al Fujairah City']"
279+
]
280+
},
281+
"execution_count": 93,
282+
"metadata": {},
283+
"output_type": "execute_result"
284+
}
285+
],
286+
"source": [
287+
"city_names_unidecoded[:20]"
288+
]
289+
},
290+
{
291+
"cell_type": "code",
292+
"execution_count": 104,
293+
"metadata": {},
294+
"outputs": [],
295+
"source": [
"import re\n",
"\n",
"\n",
"def build_name_pattern(names):\n",
"    \"\"\"Build one regex that matches any of `names` as a whole word or phrase.\n",
"\n",
"    Names are regex-escaped so characters such as '.' or '(' match literally,\n",
"    and ordered longest-first so that a longer name (e.g. 'New York City') is\n",
"    preferred over a shorter name that is its prefix (e.g. 'New York').\n",
"    \"\"\"\n",
"    unique_names = sorted({name for name in names if name}, key=lambda name: (-len(name), name))\n",
"    if not unique_names:\n",
"        return '(?!)'  # never matches, so every headline maps to 'NA'\n",
"    alternatives = '|'.join(re.escape(name) for name in unique_names)\n",
"    # Lookarounds keep short names from matching inside longer, unrelated words\n",
"    # while still allowing names that begin or end with punctuation.\n",
"    return r'(?<!\\w)(?:' + alternatives + r')(?!\\w)'\n",
"\n",
"\n",
"def pattern_searcher(search_str: str, search_list: str):\n",
"    \"\"\"Return the first substring of `search_str` matched by the regex `search_list`.\n",
"\n",
"    Returns the string 'NA' when nothing matches.\n",
"    \"\"\"\n",
"    search_obj = re.search(search_list, search_str)\n",
"    return search_obj.group(0) if search_obj else 'NA'\n",
"\n",
"\n",
"pattern_country = build_name_pattern(country_names)\n",
"# Match both the original and the ASCII-folded spelling of each city, since a\n",
"# headline may be written without accents.\n",
"pattern_city = build_name_pattern(city_names + city_names_unidecoded)\n",
"\n",
"df['country'] = df['headline'].apply(pattern_searcher, search_list=pattern_country)\n",
"df['city'] = df['headline'].apply(pattern_searcher, search_list=pattern_city)"
309+
]
310+
},
311+
{
312+
"cell_type": "code",
313+
"execution_count": 107,
314+
"metadata": {},
315+
"outputs": [],
316+
"source": [
"import numpy as np\n",
"\n",
"# pattern_searcher marks a failed lookup with the string 'NA'; convert that\n",
"# sentinel to a real missing value so isnull() can count the misses.\n",
"df = df.replace('NA', np.nan)"
318+
]
319+
},
320+
{
321+
"cell_type": "code",
322+
"execution_count": 108,
323+
"metadata": {},
324+
"outputs": [
325+
{
326+
"data": {
327+
"text/plain": [
328+
"headline 0\n",
329+
"country 633\n",
330+
"city 40\n",
331+
"dtype: int64"
332+
]
333+
},
334+
"execution_count": 108,
335+
"metadata": {},
336+
"output_type": "execute_result"
337+
}
338+
],
339+
"source": [
340+
"df.isnull().sum()"
341+
]
342+
},
343+
{
344+
"cell_type": "code",
345+
"execution_count": null,
346+
"metadata": {},
347+
"outputs": [],
348+
"source": []
349+
}
350+
],
351+
"metadata": {
352+
"kernelspec": {
353+
"display_name": "Python 3",
354+
"language": "python",
355+
"name": "python3"
356+
},
357+
"language_info": {
358+
"codemirror_mode": {
359+
"name": "ipython",
360+
"version": 3
361+
},
362+
"file_extension": ".py",
363+
"mimetype": "text/x-python",
364+
"name": "python",
365+
"nbconvert_exporter": "python",
366+
"pygments_lexer": "ipython3",
367+
"version": "3.6.7"
368+
}
369+
},
370+
"nbformat": 4,
371+
"nbformat_minor": 2
372+
}

0 commit comments

Comments
 (0)