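# Scrape Internshala internship search results into an Excel workbook.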
# Custom
import Functions.GetURLParams as gup
import Functions.Export_to_Excel as ex

# Third party
import requests
from bs4 import BeautifulSoup
import xlwt

# Built-in
from collections import defaultdict

# Input example:
'''
1,0,0,0
Web Development
Tamilnadu,Bangalore
2021-07-22
3
1
'''
workbook = xlwt.Workbook()
count = 0  # number of searches run so far; used in sheet names

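# Main loop: each pass runs one search. The script keeps prompting for new
# searches until it is interrupted or a page with no results calls exit().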
while True:
    count += 1
    final_params = gup.get_URL_params()
    URL = 'https://internshala.com' + final_params.lower()
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    max_pages = int(soup.find(id='total_pages').text.strip())

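    # Ask how many result pages to scrape, clamping to the valid range.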
    limit = int(input("How many pages would you like to scrape? (max: {max_pages})\n".format(max_pages=max_pages)))
    if limit > max_pages:
        limit = max_pages
        print("Limit set to the maximum number of pages available.")
    elif limit <= 0:
        limit = 1
        print("Limit set to 1.")

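    # Sheet layout: '1' puts every page on a single sheet; any other answer
    # gives each page its own sheet.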
    flag = '1'
    if limit > 1:
        flag = input('Different pages on different sheets? (Default: Yes) | 1: No\n')
    if flag == '1':
        sheet = workbook.add_sheet("Sheet - {count}".format(count=count))
        ex.write_header(sheet)

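    # Column buffers: params[column_name] collects one value per internship.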
    params = defaultdict(list)

    for i in range(limit):
        # Each results page lives at <base URL>/page-N; build it from the
        # unmodified base URL so page suffixes do not accumulate.
        page_URL = URL + '/page-{i}'.format(i=i + 1)
        page = requests.get(page_URL)
        soup = BeautifulSoup(page.content, 'html.parser')
        if flag != '1':
            sheet = workbook.add_sheet("Sheet - {count}|Page - {i}".format(count=count, i=i + 1))
            ex.write_header(sheet)
        intern_titles = soup.find_all(class_='heading_4_5 profile')
        if len(intern_titles) == 0:
            print('No results found....')
            exit()
        print('-------------- Scraping Page {i} -----------------'.format(i=i + 1))
        for title in intern_titles:
            elem = title.find('a', href=True)
            sub_URL = 'https://internshala.com' + str(elem['href'])

            sub_page = requests.get(sub_URL)
            sub_soup = BeautifulSoup(sub_page.content, 'html.parser')

            params['internship_title'].append(sub_soup.find(class_='profile_on_detail_page').text.strip())
            params['company'].append(sub_soup.find(class_='heading_6 company_name').find('a').text.strip())
            params['location'].append(sub_soup.find(class_='location_link').text.strip())

            info = sub_soup.find(class_='internship_other_details_container')

            # item_body entries 1-3 hold duration, stipend, and the apply-by date.
            other_details = info.find_all(class_='item_body')

            params['duration'].append(other_details[1].text.strip())
            params['stipend'].append(other_details[2].text.strip())
            params['apply_by'].append(other_details[3].text.strip())
            params['applicants'].append(sub_soup.find(class_='applications_message').text.strip())

            # Skills and perks are optional sections; store an empty list when absent.
            try:
                skills_raw = sub_soup.find(class_='heading_5_5', string='Skill(s) required')
                skills_raw = skills_raw.find_next(class_='round_tabs_container')
                params['skills'].append([tag.text.strip() + ' , ' for tag in skills_raw.find_all(class_='round_tabs')])
            except (IndexError, AttributeError):
                params['skills'].append([])

            try:
                perks_raw = sub_soup.find(class_='heading_5_5', string='Perks')
                perks_raw = perks_raw.find_next(class_='round_tabs_container')
                params['perks'].append([tag.text.strip() + ' , ' for tag in perks_raw.find_all(class_='round_tabs')])
            except (IndexError, AttributeError):
                params['perks'].append([])

            # The last text-container is taken to be the openings count.
            try:
                params['openings'].append(int(sub_soup.find_all(class_='text-container')[-1].text.strip()))
            except (IndexError, ValueError):
                params['openings'].append([])
            params['link'].append(sub_URL)

        # One-sheet-per-page mode: flush this page's rows and reset the buffers.
        if flag != '1':
            ex.write_body(params, sheet)
            params = defaultdict(list)

    # Single-sheet mode: write everything collected across all pages at once.
    if flag == '1':
        ex.write_body(params, sheet)  # Excel write

    ex.save_and_export(flag, workbook)  # Excel save and export file