|
| 1 | +# use urllib to read the HTML from the data files below, and parse the data, |
| 2 | +# extracting numbers and compute the sum of the numbers in the file. |
| 3 | +# http://py4e-data.dr-chuck.net/comments_42.html is where we should get data from |
| 4 | +# This file is written so I'm able to refer back to remind myself |
| 5 | + |
| 6 | +import urllib.request, urllib.parse, urllib.error |
| 7 | +from bs4 import BeautifulSoup |
| 8 | +import ssl |
| 9 | + |
# Fetch an HTML page, pull the integer out of every <span> tag, and print
# the sum of those integers.

# NOTE(security): certificate and hostname verification are switched off so
# that https pages with invalid certificates can still be fetched. This also
# removes protection against impersonated servers, so only point the script
# at data you already trust.
ctx = ssl.create_default_context()
# check_hostname has to be disabled first: setting CERT_NONE while it is
# still enabled raises ValueError.
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# An empty answer at the prompt falls back to the sample data page.
url = input("Enter URL: ")
if not url:
    url = 'http://py4e-data.dr-chuck.net/comments_42.html'

# urlopen() returns a file-like response object; read() pulls the entire
# document into one bytes object. The with-block closes the connection as
# soon as the body has been read instead of leaving the handle open.
with urllib.request.urlopen(url, context=ctx) as response:
    html = response.read()

# html_object is the page as parsed by BeautifulSoup. Calling it with a tag
# name returns a list of every matching tag, each including its content.
html_object = BeautifulSoup(html, 'html.parser')
html_tags = html_object('span')

# contents[0] is the first child of each span (the text holding the number).
# A span that is empty, or whose first child is not an integer literal, makes
# this line raise rather than being silently skipped.
num = [int(tag.contents[0]) for tag in html_tags]
print(sum(num))
0 commit comments