-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape_page.py
More file actions
113 lines (94 loc) · 3.19 KB
/
scrape_page.py
File metadata and controls
113 lines (94 loc) · 3.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import json
import re
import time

import dateutil.parser
import requests
from bs4 import BeautifulSoup
# Craigslist personals title grammar: "<title> - <g4g>[ - <age>][ (<location>)]"
# groups: 1 = title text, 2 = orientation (e.g. "m4w"), 4 = age, 6 = location
re_title = re.compile(r"(.*) - ([a-z]{1,2}4[a-z]{1,2})( - ([0-9]{1,2}))?( \((.*)\))?")
# Posting URL path ending in "<numeric id>.html"; group 2 is the posting id.
# NOTE(review): not referenced in this file — presumably used by importers.
re_id = re.compile(r'(.*)/([0-9]+)\.html')
def parse_orientation_match(m):
    """Extract (orientation, gender, target) from a title regex match.

    `m` is a match from `re_title` (or None).  Group 2 holds the
    orientation string, e.g. "m4w"; gender/target are the parts on
    either side of the "4".  Returns (None, None, None) when there
    is no match or no orientation group.
    """
    if not (m and m.group(2)):
        return None, None, None
    orientation = m.group(2).strip()
    parts = orientation.split("4")
    return orientation, parts[0], parts[1]
def parse_city_to_slug(city):
    """Normalize a city name into a lowercase hyphenated slug.

    '/' and '.' become spaces, runs of whitespace collapse to a single
    '-', and the result is lowercased: "st. louis" -> "st-louis".

    Fix: the original used non-raw regex strings ('\\s+' written as
    '\s+'), which is an invalid escape sequence and a SyntaxWarning on
    modern Python; patterns are now raw strings.
    """
    # Treat '/' and '.' as word separators before slugifying.
    city = re.sub(r'[/.]', ' ', city).strip()
    return re.sub(r'\s+', '-', city).lower().strip()
def gen_redis_key(o):
    """Build the redis key '<city-slug>:<orientation>' for a parsed posting.

    `o` is the output dict produced by scrape_page; only its 'city'
    and 'orientation' entries are read.
    """
    city_slug = parse_city_to_slug(o['city'])
    return '{0}:{1}'.format(city_slug, o['orientation'])
def parse_title(soup):
    """Parse the posting's <h2 class="postingtitle"> heading.

    Matches the heading text against `re_title` and returns the tuple
    (raw_title, title, orientation, age, location, gender, target);
    every field after raw_title is None when its group did not match.
    """
    raw_title = soup.find("h2", {'class': 'postingtitle'}).text.strip()
    m = re_title.search(raw_title)
    if m:
        title = m.group(1).strip()
        age = int(m.group(4)) if m.group(4) else None
        location = m.group(6).strip() if m.group(6) else None
    else:
        title = age = location = None
    orientation, gender, target = parse_orientation_match(m)
    return raw_title, title, orientation, age, location, gender, target
def parse_email(soup):
    """Pull the reply address out of the dateReplyBar mailto link.

    The anchor's href looks like "mailto:addr@host?subject=...";
    drop the 7-char "mailto:" scheme and any query-string suffix.
    """
    href = soup.find("section", {"class": "dateReplyBar"}).a.attrs['href']
    return href[7:].partition('?')[0]
def parse_body(soup):
    """Return the posting body text with surrounding whitespace stripped."""
    body_section = soup.find("section", {"id": "postingbody"})
    return body_section.text.strip()
def parse_date(soup):
    """Parse the posting timestamp from the postinginfo paragraphs.

    Finds the first <date> element among <p class="postinginfo">,
    normalizes its text (commas removed, whitespace collapsed, only
    the first two tokens kept — date and time, dropping the timezone)
    and parses it with dateutil.  Raises IndexError when no <date>
    element is present (same as the original behavior).

    Fix: the original used a non-raw '\\s+' pattern ('\s+'), an
    invalid escape sequence and a SyntaxWarning on modern Python.
    """
    results = soup.findAll("p", {"class": "postinginfo"})
    date_string = [r.find('date').text for r in results if r.find('date')][0]
    # Reformat: strip commas, collapse whitespace, drop the time zone.
    date_string = re.sub(r',', '', date_string)
    date_string = re.sub(r'\s+', ' ', date_string).strip()
    date_string = ' '.join(date_string.split()[0:2])
    return dateutil.parser.parse(date_string)
def parse_images(soup):
    """Count the posting's thumbnail images and collect their hrefs.

    Returns (n_imgs, img_links); (0, []) when the thumbs div is
    missing or empty.
    """
    thumbs = soup.find("div", {"id": "thumbs"})
    # `not thumbs` preserves the original truthiness check on the tag.
    if not thumbs:
        return 0, []
    links = [anchor.attrs['href'] for anchor in thumbs.findAll('a')]
    return len(links), links
def scrape_page(url, city):
    """Fetch a craigslist posting and flatten it into a redis-ready record.

    Parameters:
        url  -- full URL of the posting page
        city -- city name the posting belongs to (used for the key slug)

    Returns (redis_key, epoch_ts, json_payload) on HTTP 200, else None.

    Fixes vs. original:
    - requests.get now has a timeout so a dead host cannot hang the scraper
      (may raise requests.Timeout, which previously would hang instead).
    - BeautifulSoup gets an explicit "html.parser" so parsing does not
      depend on which optional parser happens to be installed.
    - dt.strftime("%s") is a non-portable glibc extension (fails on
      Windows); time.mktime gives the same local-time epoch portably.
    """
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        return None
    soup = BeautifulSoup(r.content, "html.parser")
    # Parse date and title first.
    dt = parse_date(soup)
    ts = int(time.mktime(dt.timetuple()))
    raw_title, title, orientation, age, location, gender, target = \
        parse_title(soup)
    n_imgs, img_links = parse_images(soup)
    output = dict(
        url = url,
        city = city,
        city_slug = parse_city_to_slug(city),
        raw_title = raw_title,
        title = title,
        orientation = orientation,
        gender = gender,
        target = target,
        age = age,
        location = location,
        email = parse_email(soup),
        body = parse_body(soup),
        timestamp = ts,
        datetime = dt.strftime("%Y-%m-%d %H:%M:%S"),
        year = dt.year,
        month = dt.month,
        day = dt.day,
        hour = dt.hour,
        min = dt.minute,  # key kept as 'min' for downstream compatibility
        weekday = dt.weekday(),
        n_imgs = n_imgs,
        img_links = img_links
    )
    key = gen_redis_key(output)
    return key, ts, json.dumps(output)
if __name__ == '__main__':
    # Fix: the Python 2 `print x` statement is a SyntaxError under
    # Python 3; the call form works in both for a single argument.
    result = scrape_page('http://tijuana.es.craigslist.com.mx/mis/4130179087.html', 'tijuana')
    print(result)