Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
kakashitheaf committed Nov 6, 2016
1 parent 1656f10 commit 6f3e673
Showing 1 changed file with 59 additions and 24 deletions.
83 changes: 59 additions & 24 deletions dianping/dianping/spiders/spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,44 +12,73 @@

# URL template for the Baidu geocoder; `{}` is filled with the address to look up.
# NOTE(review): the `ak` API key is hard-coded in source -- consider loading it
# from settings or the environment instead.
BAIDU_GEO = u'http://api.map.baidu.com/geocoder/v2/?address={}&output=json&ak=gQsCAgCrWsuN99ggSIjGn5nO'

# Common prefix of every category listing URL; the per-category suffixes in
# `start_url_dict` are appended to it.
base_category_url = "http://www.dianping.com/search/category"

# Category label -> URL path suffix appended to `base_category_url`.
start_url_dict = {
    u"足疗按摩": "/2/30/g141r1471",
    u"中医养生": "/2/30/g2827r1471",
    u"健康体检": "/2/80/g612",
    u"妇幼保健": "/2/70/g258",
    u"美容Spa": "/2/50/g158",
    u"整形塑体": "/2/85/g183",
    u"运动健身": "/2/45/g147",
    u"口腔健康": "/2/85/g182",
    u"药店": "/2/85/g235",
}


def clean_string(string):
    """Return ``string`` with every whitespace character removed.

    Falsy input (``None`` or an empty string) yields ``''`` so callers can
    pass the result of a selector lookup that matched nothing.

    All whitespace is dropped -- not only spaces and newlines but also tabs,
    carriage returns and similar characters that show up in scraped markup.
    """
    if not string:
        return ''
    # split() with no argument breaks on any run of whitespace, so joining the
    # pieces back together removes all of it in one pass.
    return ''.join(string.split())


def address_to_geo(address):
    """Look up coordinates for ``address`` via the ``BAIDU_GEO`` endpoint.

    Returns a dict with ``'longitude'`` and ``'latitude'`` keys. When the
    response carries ``result.location``, its ``lng``/``lat`` values are
    used; otherwise the hard-coded fallback coordinates are returned.

    The request is a blocking ``requests.get`` call; network and JSON
    decoding errors propagate to the caller.
    """
    # Fallback coordinates used when the response has no usable location.
    default_longitude = 120.260569
    default_latitude = 30.242865

    data = requests.get(BAIDU_GEO.format(address)).json()

    # Walk result -> location defensively: a missing key or a non-dict value
    # at either level falls through to the defaults instead of raising.
    # NOTE(review): presumably a failed lookup omits `result` or returns it in
    # a non-dict shape -- confirm against the geocoder's documentation.
    result = data.get('result') if isinstance(data, dict) else None
    location = result.get('location') if isinstance(result, dict) else None
    if not isinstance(location, dict):
        location = {}

    return {
        'longitude': location.get('lng', default_longitude),
        'latitude': location.get('lat', default_latitude),
    }


class dianpingSpider(CommonSpider):
    """Crawl dianping.com category listings and scrape each linked shop page.

    ``start_requests`` issues listing requests for every category in
    ``start_url_dict``; ``parse`` follows the shop links found on a listing
    page; ``parse_shop`` builds one dict per shop.
    """

    name = "dianping"
    allowed_domains = ["dianping.com"]
    # NOTE(review): start_requests() below builds its own URLs from
    # start_url_dict and does not read this list; kept in case the base class
    # or other code refers to it -- confirm and remove if unused.
    start_urls = [
        "http://www.dianping.com/search/category/2/30/g141r1471",  # foot massage
        "http://www.dianping.com/search/category/2/30/g140r1471",  # bathhouse
        "http://www.dianping.com/search/category/2/30/g2827r1471",  # traditional Chinese medicine wellness
    ]

    def start_requests(self):
        """Yield listing-page requests: pages 1-19 of every category.

        The category label travels in ``meta['category']`` so that
        ``parse_shop`` can tag each scraped shop with it.
        """
        for category, path in start_url_dict.items():
            for page in range(1, 20):
                url = base_category_url + path + 'p{}'.format(page)
                yield Request(url, callback=self.parse,
                              meta={'category': category})

    def parse(self, response):
        """Yield one ``parse_shop`` request per ``/shop/`` link on a listing page."""
        hxs = Selector(response)
        for href in hxs.xpath('//div[@class="tit"]/a/@href').extract():
            if href.startswith('/shop/'):
                # Forward the listing request's meta so the category reaches
                # parse_shop, which reads meta['category'].
                yield Request("http://www.dianping.com{}".format(href),
                              callback=self.parse_shop,
                              meta=response.request.meta)

    def parse_shop(self, response):
        """Build and return a dict describing the shop on ``response``.

        Text fields go through ``clean_string``, so a selector that matches
        nothing produces ``''`` instead of raising. ``longitude`` and
        ``latitude`` are added only when an address was found.
        """
        hxs = Selector(response)
        shop = {}
        shop['name'] = clean_string(
            hxs.css('.shop-name::text').extract_first())
        shop['address'] = clean_string(
            hxs.css('.address span.item::text').extract_first())
        shop['phone_number'] = clean_string(
            hxs.css('.tel span.item::text').extract_first())

        path = u'//span[contains(text(), "营业时间:")]/following-sibling::span/text()'
        shop['opening_hours'] = clean_string(hxs.xpath(path).extract_first())

        # Geocode the cleaned address, and only when there is one: the raw
        # selector result may be None or carry layout whitespace.
        if shop['address']:
            shop.update(address_to_geo(shop['address']))

        store_images = hxs.xpath(
            "//div[@class='photos-container']//img/@src").extract()
        # Keep at most the first two image URLs, comma-separated.
        shop['store_images'] = ','.join(store_images[:2])
        shop['deals'] = hxs.xpath("//div[@id='sales']//a/@href").extract()
        shop['category'] = response.request.meta['category']
        return shop


Expand All @@ -62,7 +91,13 @@ class dianpingDealSpider(CommonSpider):
]

def parse(self, response):
    """Extract the deal's name, description and price from ``response``.

    Each field is looked up inside the ``.bd`` element and passed through
    ``clean_string``, so a selector that matches nothing yields ``''``
    rather than raising.

    NOTE(review): the assembled dict is only printed, not returned or
    yielded -- confirm whether it should be emitted as an item.
    """
    hxs = Selector(response)
    bd = hxs.css('.bd')

    deal = {}
    deal['name'] = clean_string(bd.css('.title::text').extract_first())
    deal['description'] = clean_string(
        bd.css('.sub-title span::text').extract_first())
    deal['price'] = clean_string(
        bd.css('.price-display::text').extract_first())

    # Parenthesised form prints the same output on Python 2 and is valid on 3.
    print(deal)

0 comments on commit 6f3e673

Please sign in to comment.