Skip to content

Commit 0e40d70

Browse files
authored
Add files via upload
1 parent 488b528 commit 0e40d70

File tree

1 file changed

+120
-0
lines changed

1 file changed

+120
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
#Custom
2+
import Functions.GetURLParams as gup
3+
import Functions.Export_to_Excel as ex
4+
5+
#Third Party
6+
import requests
7+
from bs4 import BeautifulSoup
8+
import xlwt
9+
10+
#Built-in
11+
from datetime import date,datetime
12+
from collections import defaultdict
13+
import os
14+
15+
#input Example
16+
'''
17+
1,0,0,0
18+
Web Development
19+
Tamilnadu,Bangalore
20+
2021-07-22
21+
3
22+
1
23+
'''
24+
# Interactive Internshala scraper.
#
# Each pass of the outer loop runs one search: it asks gup.get_URL_params()
# for the search path, asks how many result pages to fetch, visits every
# internship's detail page and hands the collected columns to the
# Export_to_Excel helpers (ex.write_header / ex.write_body / ex.save_and_export).


def _round_tab_labels(detail_soup, heading):
    """Return the 'round_tabs' labels that follow a 'heading_5_5' heading.

    detail_soup -- parsed internship detail page.
    heading     -- exact heading text to look for, e.g. 'Perks'.

    Every label keeps the trailing ' , ' separator that the list entries
    have always carried. An empty list is returned when the heading or the
    container after it is missing from the page.
    """
    try:
        section = detail_soup.find(class_='heading_5_5', string=heading)
        # find_next is the current name of the deprecated findNext alias.
        container = section.find_next(class_='round_tabs_container')
        return [tab.text.strip() + ' , '
                for tab in container.find_all(class_='round_tabs')]
    except (IndexError, AttributeError):
        return []


def _opening_count(detail_soup):
    """Return the integer in the last 'text-container' element.

    Falls back to the empty-list placeholder (the value already used for a
    missing element) when the element is absent or its text is not a number.
    """
    try:
        return int(detail_soup.find_all(class_='text-container')[-1].text.strip())
    except (IndexError, ValueError):
        return []


workbook = xlwt.Workbook()
count = 0  # number of searches started; used to give every sheet a unique name

while True:
    count += 1
    final_params = gup.get_URL_params()
    URL = 'https://internshala.com' + final_params.lower()
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    max_pages = int(soup.find(id='total_pages').text.strip())

    prompt = "How many pages you would like to get? Max Pages ({max_pages})\n".format(max_pages=max_pages)
    try:
        limit = int(input(prompt))
    except ValueError:
        # Non-numeric answer: treat it like an out-of-range value so the
        # clamp below falls back to a single page instead of crashing.
        limit = 0
    if limit > max_pages:
        limit = max_pages
        print("Pages Set to Maximum pages present")
    elif limit <= 0:
        limit = 1
        print("Pages set to 1")

    # flag == '1' -> all pages of this search share one sheet;
    # anything else -> one sheet per result page.
    if limit > 1:
        flag = input('Different pages on different sheets?(Default: Yes) | 1: No\n')
    else:
        flag = '1'
    if flag == '1':
        sheet = workbook.add_sheet("Sheet - {count}".format(count=count))
        ex.write_header(sheet)

    params = defaultdict(list)  # column name -> list of cell values

    for page_number in range(1, limit + 1):
        # Build every page URL from the unchanged search URL. Appending to
        # URL itself would request '.../page-1/page-2' on the second pass.
        page_url = '{base}/page-{n}'.format(base=URL, n=page_number)
        page = requests.get(page_url)
        soup = BeautifulSoup(page.content, 'html.parser')
        if flag != '1':
            sheet = workbook.add_sheet("Sheet - {count}|Page - {i}".format(count=count, i=page_number))
            ex.write_header(sheet)

        intern_titles = soup.find_all(class_='heading_4_5 profile')
        if not intern_titles:
            print('No Results Found....')
            # NOTE(review): this ends the whole program without saving the
            # workbook, exactly as the previous exit() call did.
            raise SystemExit

        print('--------------Scraping Page {i} -----------------'.format(i=page_number))
        for title in intern_titles:
            elem = title.find('a', href=True)
            sub_URL = 'https://internshala.com' + str(elem['href'])

            sub_page = requests.get(sub_URL)
            sub_soup = BeautifulSoup(sub_page.content, 'html.parser')

            params['internship_title'].append(sub_soup.find(class_='profile_on_detail_page').text.strip())
            params['company'].append(sub_soup.find(class_='heading_6 company_name').find('a').text.strip())
            params['location'].append(sub_soup.find(class_='location_link').text.strip())

            info = sub_soup.find(class_='internship_other_details_container')
            other_details = info.find_all(class_='item_body')

            # Items 1-3 of the details container are read as duration,
            # stipend and apply-by; item 0 is not used.
            params['duration'].append(other_details[1].text.strip())
            params['stipend'].append(other_details[2].text.strip())
            params['apply_by'].append(other_details[3].text.strip())
            params['applicants'].append(sub_soup.find(class_='applications_message').text.strip())

            params['skills'].append(_round_tab_labels(sub_soup, 'Skill(s) required'))
            params['perks'].append(_round_tab_labels(sub_soup, 'Perks'))
            params['openings'].append(_opening_count(sub_soup))
            params['link'].append(sub_URL)

        if flag != '1':
            # Per-page sheets: flush this page's rows and start a fresh batch.
            ex.write_body(params, sheet)
            params = defaultdict(list)

    if flag == '1':
        ex.write_body(params, sheet)  # Excel write

    # NOTE(review): indentation was lost in this copy of the file; the call is
    # kept inside the loop because the loop has no break and the call would be
    # unreachable otherwise. Presumably save_and_export decides whether another
    # search follows -- confirm against Functions/Export_to_Excel.
    ex.save_and_export(flag, workbook)  # Excel save and Export file
116+
117+
118+
119+
120+

0 commit comments

Comments
 (0)