Skip to content

Commit d3aff44

Browse files
committed
Add top scorers CSV and mini project notebook for web scraping with BeautifulSoup
1 parent b7a8cea commit d3aff44

File tree

3 files changed

+893
-0
lines changed

3 files changed

+893
-0
lines changed
Lines changed: 333 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,333 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 2,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import requests\n",
10+
"from bs4 import BeautifulSoup\n",
11+
"import pandas as pd"
12+
]
13+
},
14+
{
15+
"cell_type": "code",
16+
"execution_count": 3,
17+
"metadata": {},
18+
"outputs": [],
19+
"source": [
"# Page that is scraped: the Premier League top-scorers table on bbc.com.\n",
"base_url = (\n",
"    'https://www.bbc.com/sport/football/'\n",
"    'premier-league/top-scorers'\n",
")"
21+
]
22+
},
23+
{
24+
"cell_type": "code",
25+
"execution_count": 24,
26+
"metadata": {},
27+
"outputs": [],
28+
"source": [
"# One accumulator list per scraped column; the scraping cell appends to these.\n",
"player_names, team_names, goals = [], [], []\n",
"assists, num_matches, shots = [], [], []"
35+
]
36+
},
37+
{
38+
"cell_type": "code",
39+
"execution_count": 29,
40+
"metadata": {},
41+
"outputs": [
42+
{
43+
"name": "stdout",
44+
"output_type": "stream",
45+
"text": [
46+
"Status code: 200\n",
47+
"Done\n"
48+
]
49+
}
50+
],
51+
"source": [
"# Fetch the top-scorers page and rebuild df_players from its table rows.\n",
"\n",
"# Empty the accumulator lists in place so re-running this cell does not\n",
"# append the same players a second time.\n",
"for _column in (player_names, team_names, goals, assists, num_matches, shots):\n",
"    _column.clear()\n",
"\n",
"try:\n",
"    # Without a timeout, requests can wait indefinitely on an unresponsive server.\n",
"    response = requests.get(base_url, timeout=10)\n",
"    response.raise_for_status()\n",
"except requests.exceptions.RequestException as err:\n",
"    # RequestException is the base class of HTTPError, ConnectionError and\n",
"    # Timeout, so network failures stop the cell like bad status codes do.\n",
"    raise SystemExit(err)\n",
"else:\n",
"    status_code = response.status_code\n",
"    print(f'Status code: {status_code}')\n",
"    if status_code == 200:\n",
"        soup = BeautifulSoup(response.text, 'html.parser')\n",
"        # NOTE(review): the ssrcss-* class names look auto-generated; confirm\n",
"        # they still match the live page before trusting an empty result.\n",
"        players = soup.find('tbody').find_all('tr', class_='ssrcss-dhlz6k-TableRowBody e1icz100')\n",
"        for player in players:\n",
"            player_name = player.find('div', class_='ssrcss-m6ah29-PlayerName e1n8xy5b1').get_text(strip=True)\n",
"            team_name = player.find('div', class_='ssrcss-qvpga1-TeamsSummary e1n8xy5b0').get_text(strip=True)\n",
"            # Convert goals to int like the other stats; left as text, the column\n",
"            # is non-numeric and describe() leaves it out of the summary.\n",
"            goals_scored = int(player.find('div', class_='ssrcss-8k20kk-CellWrapper ef9ipf0').get_text(strip=True))\n",
"\n",
"            # Positions this scraper reads from the remaining stat cells:\n",
"            # first = assists, third = matches played, third from the end = shots.\n",
"            stats = player.find_all('div', class_='ssrcss-150z8d-CellWrapper ef9ipf0')\n",
"            assists_made = int(stats[0].get_text(strip=True))\n",
"            matches_played = int(stats[2].get_text(strip=True))\n",
"            shots_taken = int(stats[-3].get_text(strip=True))\n",
"\n",
"            player_names.append(player_name)\n",
"            team_names.append(team_name)\n",
"            goals.append(goals_scored)\n",
"            assists.append(assists_made)\n",
"            num_matches.append(matches_played)\n",
"            shots.append(shots_taken)\n",
"\n",
"        data = {\n",
"            'player': player_names,\n",
"            'team': team_names,\n",
"            'matches': num_matches,\n",
"            'goals': goals,\n",
"            'assists': assists,\n",
"            'shots': shots\n",
"        }\n",
"        df_players = pd.DataFrame(data)\n",
"    else:\n",
"        # Reached for non-200 responses that raise_for_status() lets through;\n",
"        # df_players is not (re)built in that case.\n",
"        print('Failed to fetch data')\n",
"finally:\n",
"    print('Done')"
93+
]
94+
},
95+
{
96+
"cell_type": "code",
97+
"execution_count": null,
98+
"metadata": {},
99+
"outputs": [],
100+
"source": [
"# Save the scraped table to disk, leaving out the positional row index.\n",
"df_players.to_csv(path_or_buf='top_scorers.csv', index=False)"
102+
]
103+
},
104+
{
105+
"cell_type": "code",
106+
"execution_count": 36,
107+
"metadata": {},
108+
"outputs": [
109+
{
110+
"data": {
111+
"text/html": [
112+
"<div>\n",
113+
"<style scoped>\n",
114+
" .dataframe tbody tr th:only-of-type {\n",
115+
" vertical-align: middle;\n",
116+
" }\n",
117+
"\n",
118+
" .dataframe tbody tr th {\n",
119+
" vertical-align: top;\n",
120+
" }\n",
121+
"\n",
122+
" .dataframe thead th {\n",
123+
" text-align: right;\n",
124+
" }\n",
125+
"</style>\n",
126+
"<table border=\"1\" class=\"dataframe\">\n",
127+
" <thead>\n",
128+
" <tr style=\"text-align: right;\">\n",
129+
" <th></th>\n",
130+
" <th>player</th>\n",
131+
" <th>team</th>\n",
132+
" <th>matches</th>\n",
133+
" <th>goals</th>\n",
134+
" <th>assists</th>\n",
135+
" <th>shots</th>\n",
136+
" </tr>\n",
137+
" </thead>\n",
138+
" <tbody>\n",
139+
" <tr>\n",
140+
" <th>0</th>\n",
141+
" <td>Mohamed Salah</td>\n",
142+
" <td>Liverpool</td>\n",
143+
" <td>29</td>\n",
144+
" <td>27</td>\n",
145+
" <td>17</td>\n",
146+
" <td>106</td>\n",
147+
" </tr>\n",
148+
" <tr>\n",
149+
" <th>1</th>\n",
150+
" <td>E. Haaland</td>\n",
151+
" <td>Man City</td>\n",
152+
" <td>28</td>\n",
153+
" <td>21</td>\n",
154+
" <td>3</td>\n",
155+
" <td>102</td>\n",
156+
" </tr>\n",
157+
" <tr>\n",
158+
" <th>2</th>\n",
159+
" <td>A. Isak</td>\n",
160+
" <td>Newcastle</td>\n",
161+
" <td>25</td>\n",
162+
" <td>19</td>\n",
163+
" <td>5</td>\n",
164+
" <td>69</td>\n",
165+
" </tr>\n",
166+
" <tr>\n",
167+
" <th>3</th>\n",
168+
" <td>C. Wood</td>\n",
169+
" <td>Nottm Forest</td>\n",
170+
" <td>29</td>\n",
171+
" <td>18</td>\n",
172+
" <td>3</td>\n",
173+
" <td>53</td>\n",
174+
" </tr>\n",
175+
" <tr>\n",
176+
" <th>4</th>\n",
177+
" <td>B. Mbeumo</td>\n",
178+
" <td>Brentford</td>\n",
179+
" <td>29</td>\n",
180+
" <td>15</td>\n",
181+
" <td>5</td>\n",
182+
" <td>62</td>\n",
183+
" </tr>\n",
184+
" </tbody>\n",
185+
"</table>\n",
186+
"</div>"
187+
],
188+
"text/plain": [
189+
" player team matches goals assists shots\n",
190+
"0 Mohamed Salah Liverpool 29 27 17 106\n",
191+
"1 E. Haaland Man City 28 21 3 102\n",
192+
"2 A. Isak Newcastle 25 19 5 69\n",
193+
"3 C. Wood Nottm Forest 29 18 3 53\n",
194+
"4 B. Mbeumo Brentford 29 15 5 62"
195+
]
196+
},
197+
"execution_count": 36,
198+
"metadata": {},
199+
"output_type": "execute_result"
200+
}
201+
],
202+
"source": [
"# Preview the first five rows of the scraped table.\n",
"df_players.head(n=5)"
204+
]
205+
},
206+
{
207+
"cell_type": "code",
208+
"execution_count": 37,
209+
"metadata": {},
210+
"outputs": [
211+
{
212+
"data": {
213+
"text/html": [
214+
"<div>\n",
215+
"<style scoped>\n",
216+
" .dataframe tbody tr th:only-of-type {\n",
217+
" vertical-align: middle;\n",
218+
" }\n",
219+
"\n",
220+
" .dataframe tbody tr th {\n",
221+
" vertical-align: top;\n",
222+
" }\n",
223+
"\n",
224+
" .dataframe thead th {\n",
225+
" text-align: right;\n",
226+
" }\n",
227+
"</style>\n",
228+
"<table border=\"1\" class=\"dataframe\">\n",
229+
" <thead>\n",
230+
" <tr style=\"text-align: right;\">\n",
231+
" <th></th>\n",
232+
" <th>matches</th>\n",
233+
" <th>assists</th>\n",
234+
" <th>shots</th>\n",
235+
" </tr>\n",
236+
" </thead>\n",
237+
" <tbody>\n",
238+
" <tr>\n",
239+
" <th>count</th>\n",
240+
" <td>39.000000</td>\n",
241+
" <td>39.000000</td>\n",
242+
" <td>39.000000</td>\n",
243+
" </tr>\n",
244+
" <tr>\n",
245+
" <th>mean</th>\n",
246+
" <td>26.358974</td>\n",
247+
" <td>4.128205</td>\n",
248+
" <td>58.512821</td>\n",
249+
" </tr>\n",
250+
" <tr>\n",
251+
" <th>std</th>\n",
252+
" <td>2.590338</td>\n",
253+
" <td>2.820783</td>\n",
254+
" <td>19.177662</td>\n",
255+
" </tr>\n",
256+
" <tr>\n",
257+
" <th>min</th>\n",
258+
" <td>20.000000</td>\n",
259+
" <td>0.000000</td>\n",
260+
" <td>31.000000</td>\n",
261+
" </tr>\n",
262+
" <tr>\n",
263+
" <th>25%</th>\n",
264+
" <td>25.500000</td>\n",
265+
" <td>3.000000</td>\n",
266+
" <td>44.500000</td>\n",
267+
" </tr>\n",
268+
" <tr>\n",
269+
" <th>50%</th>\n",
270+
" <td>27.000000</td>\n",
271+
" <td>4.000000</td>\n",
272+
" <td>53.000000</td>\n",
273+
" </tr>\n",
274+
" <tr>\n",
275+
" <th>75%</th>\n",
276+
" <td>28.000000</td>\n",
277+
" <td>5.000000</td>\n",
278+
" <td>64.500000</td>\n",
279+
" </tr>\n",
280+
" <tr>\n",
281+
" <th>max</th>\n",
282+
" <td>29.000000</td>\n",
283+
" <td>17.000000</td>\n",
284+
" <td>106.000000</td>\n",
285+
" </tr>\n",
286+
" </tbody>\n",
287+
"</table>\n",
288+
"</div>"
289+
],
290+
"text/plain": [
291+
" matches assists shots\n",
292+
"count 39.000000 39.000000 39.000000\n",
293+
"mean 26.358974 4.128205 58.512821\n",
294+
"std 2.590338 2.820783 19.177662\n",
295+
"min 20.000000 0.000000 31.000000\n",
296+
"25% 25.500000 3.000000 44.500000\n",
297+
"50% 27.000000 4.000000 53.000000\n",
298+
"75% 28.000000 5.000000 64.500000\n",
299+
"max 29.000000 17.000000 106.000000"
300+
]
301+
},
302+
"execution_count": 37,
303+
"metadata": {},
304+
"output_type": "execute_result"
305+
}
306+
],
307+
"source": [
308+
"# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.\n",
"df_players.describe()"
309+
]
310+
}
311+
],
312+
"metadata": {
313+
"kernelspec": {
314+
"display_name": "base",
315+
"language": "python",
316+
"name": "python3"
317+
},
318+
"language_info": {
319+
"codemirror_mode": {
320+
"name": "ipython",
321+
"version": 3
322+
},
323+
"file_extension": ".py",
324+
"mimetype": "text/x-python",
325+
"name": "python",
326+
"nbconvert_exporter": "python",
327+
"pygments_lexer": "ipython3",
328+
"version": "3.12.7"
329+
}
330+
},
331+
"nbformat": 4,
332+
"nbformat_minor": 2
333+
}

0 commit comments

Comments
 (0)