-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathflipkart.py
119 lines (86 loc) · 3.57 KB
/
flipkart.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import re
def flipkart(page_soup):
    '''
    Extract laptop listings from a BeautifulSoup-parsed Flipkart results page
    and write one CSV row per product to data/flipkart-laptops.csv.

    Parameters
    ----------
    page_soup : bs4.BeautifulSoup
        Parsed HTML of a Flipkart laptop listing page.
    '''
    # Compile once and reuse for every product container.
    # Price text looks like '₹37,990₹44,990 15% off': group(1) is the current
    # price, group(2) the strike-through "was" price, group(3) the percent.
    price_re = re.compile(r'₹(\d+,\d+)₹(\d+,\d+)(\d\d)%')
    # e.g. '(8 GB/512 GB SSD' or '(8 GB/1 TB HDD' -> RAM and storage size
    feat_re = re.compile(r'\((\d+) GB/(\d+)\s.B')
    # 'with' closes the file even if parsing raises mid-way; utf-8 is
    # explicit because the header (and prices) contain the '₹' character.
    with open('data/flipkart-laptops.csv', 'w', encoding='utf-8') as f:
        # header: column names
        f.write('Brand,Name,Price-was,Current-Price,Discount(₹),Discount(%),Rating(Out of 5),Number-of-ratings,RAM(GB),Storage(GB/TB),Refurbished(0/1),URL\n')
        # grab each item container; the first two and last two divs of this
        # class are page chrome, not products
        conts = page_soup.findAll('div', {'class': 'bhgxx2 col-12-12'})
        conts = conts[2:-2]
        for cont in conts:
            # Title usually sits in the 3rd div under the anchor; some card
            # layouts shift it to the 2nd. A missed selector makes
            # select_one return None, so .get_text raises AttributeError.
            try:
                title = cont.select_one('div > div > div > a > div:nth-of-type(3) > div > div').get_text(strip=True)
            except AttributeError:
                title = cont.select_one('div > div > div > a > div:nth-of-type(2) > div > div').get_text(strip=True)
            # flag refurbished/renewed products
            lowered = title.lower()
            refurb = '1' if ('refurbished' in lowered or 'renewed' in lowered) else '0'
            # product link (relative href on the card anchor)
            link = 'https://www.flipkart.com/' + cont.div.div.div.a['href']
            # brand is the first word of the title
            brand = title.split(' ')[0]
            # star rating and number of ratings, if available
            try:
                rating = cont.select_one('div > div > div > a > div:nth-of-type(3) > div > div:nth-of-type(2) > span').get_text(strip=True)
                num_rating = cont.select_one('div > div > div > a > div:nth-of-type(3) > div > div:nth-of-type(2) > span:nth-of-type(2) > span > span').get_text(strip=True)
                num_rating = num_rating.split(' ')[0]
                # NOTE(review): the review count was previously extracted here
                # too but never written to the CSV, and its selector failing
                # discarded rating/num_rating as well — dropped on purpose.
            except AttributeError:
                rating = 'NaN'
                num_rating = '0'
            # previous price, current price and discount, if available
            try:
                price_text = cont.select_one('div > div > div > a > div:nth-of-type(3) > div:nth-of-type(2) > div').get_text(strip=True)
                # a non-matching search returns None -> AttributeError below
                m = price_re.search(price_text)
                pw = m.group(2)   # "was" price
                pc = m.group(1)   # current price
                # discount in rupees = was - current; this column was
                # previously hard-coded to 'NaN' despite being in the header
                dc = str(int(pw.replace(',', '')) - int(pc.replace(',', '')))
                dcp = m.group(3)  # discount percent
            except AttributeError:
                pw = 'NaN'
                pc = 'NaN'
                dc = '0'
                dcp = '0'
            # RAM and storage parsed out of the title, if present
            try:
                features = feat_re.search(title)
                ram = features.group(1)
                hdd = features.group(2)
            except AttributeError:
                ram = 'NaN'
                hdd = 'NaN'
            # commas inside the title would break the CSV columns
            f.write(brand + ',' + title.replace(',', ' ') + ',' + pw + ',' + pc + ',' + dc + ',' + dcp + ',' + rating + ',' + num_rating + ',' + ram + ',' + hdd + ',' + refurb + ',' + link + '\n')
    print('Scraped Flipkart\'s page')
def kartUrls(page_soup, url):
    '''
    Build the listing URLs for pages 2 through the last page.

    The last page number is read from page_soup via kartLast; URLs are
    produced from Flipkart's laptop-store URL template.
    NOTE(review): the `url` argument is accepted but never used — kept only
    for caller compatibility; confirm whether any caller relies on it.
    '''
    template = ('https://www.flipkart.com/laptops/pr?sid=6bo%2Fb5g'
                '&p%5B%5D=facets.serviceability%5B%5D%3Dtrue'
                '&fm=neo%2Fmerchandising'
                '&iid=M_3580c4f9-a714-45e8-a54c-64fa60d4b35d_10.f37da198-ab3e-48f0-bc9a-12dfbbbc32c9_DesktopSite'
                '&ppt=clp&ppn=laptops-store&page=')
    last_page = kartLast(page_soup)
    return [template + str(page) for page in range(2, last_page + 1)]
def kartLast(page_soup):
    '''
    Return the last page number shown in the pagination footer.

    The footer div (class '_2zg3yZ') contains a span reading "Page X of N";
    the trailing word N is parsed as an int.
    '''
    pager = page_soup.find('div', {'class': '_2zg3yZ'})
    return int(pager.span.text.split(' ')[-1])