-
Notifications
You must be signed in to change notification settings - Fork 0
/
ask.py
executable file
·54 lines (34 loc) · 1.33 KB
/
ask.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# Import ingredients to make soup
from bs4 import BeautifulSoup
# Generic URL parsing goodies
import urllib.request
import http.client
# Some arguments for the command line
import argparse
import re
# Command-line interface. The URL is mandatory: without it args.user_url
# would be None and the failure would only surface later inside urlopen,
# so let argparse report a clear usage error instead.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-u",
    "--url",
    dest="user_url",
    help="The URL for parsing",
    type=str,
    required=True,
)
args = parser.parse_args()
def fetch_page(url):
    """Open *url* with urllib.request.urlopen and return the response object.

    The response is handed back unread; the caller decides how to consume it.
    """
    return urllib.request.urlopen(url)
# URL from the arguments; assign to variable and fetch
web_url = args.user_url
# Parse the HTML with lxml
soup = BeautifulSoup(fetch_page(web_url), 'lxml')
# Locate the <a id="newItemsReady"> element. find() returns a single tag
# (or None when nothing matches) rather than a result set.
data_tag = soup.find('a', id="newItemsReady")
if data_tag is None:
    raise SystemExit('No <a id="newItemsReady"> element found at ' + str(web_url))
# .get() yields None instead of raising KeyError when the attribute is absent
data_poll_url = data_tag.get('data-poll-url')
if data_poll_url is None:
    raise SystemExit('The "newItemsReady" element has no data-poll-url attribute')
# The score is the text after the first "=" (up to the next "=", if any)
poll_parts = data_poll_url.split("=")
if len(poll_parts) < 2:
    raise SystemExit('No "=" found in data-poll-url: ' + data_poll_url)
poll_score = poll_parts[1]
# Setting up second page for fetches - testing (page 3 is hard-coded).
# A trailing "/" on the input URL is stripped so the joined path does not
# contain "//".
soup = BeautifulSoup(
    fetch_page(web_url.rstrip("/") + "/answers/more?page=3&score=" + poll_score),
    'lxml',
)
# soup.img is the FIRST <img> tag in the document (None if there is none),
# not a collection of all of them
tag = soup.img
# Look for <img> that has both src and onerror - other images in page
# that were extraneous did not have these attributes
def has_img_and_onerror(tag):
    """Return True when *tag* is an <img> carrying both 'src' and 'onerror'.

    Written as a filter predicate: it is passed to the soup's search method,
    which calls it once per tag. The tag name is checked explicitly so that
    other elements that happen to have both attributes are not matched.
    """
    return tag.name == 'img' and tag.has_attr('onerror') and tag.has_attr('src')
# Print just the src attribute of every tag accepted by the filter.
# find_all is the current BeautifulSoup 4 method name; findAll is only kept
# as a legacy alias.
for image in soup.find_all(has_img_and_onerror):
    print(image.get('src'))