"""scraping_functions.py

Forked from azflin/job-keyword-analysis-python: scrapes Indeed job search
results and analyzes keyword and bigram frequency across postings.
"""
import re
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords  # requires the corpus: nltk.download('stopwords')
from pandas import DataFrame

pd.options.display.width = 150

class JobPosting:
    """A single job posting scraped from an Indeed search results page."""
    def __init__(self, job_title, href, company, location):
        self.job_title = job_title
        self.href = href
        self.company = company
        self.location = location

# Words stripped from scraped postings: site chrome, navigation text, and
# generic job-ad vocabulary that would otherwise dominate the keyword counts.
removable_words = [
    "jobs", "apply", "email", "password", "new", "please", "required",
    "opportunities", "skills", "sign", "talent", "follow", "job", "work",
    "click", "home", "next", "us", "start", "take", "enter", "date",
    "people", "career", "hours", "week", "type", "attach", "resume",
    "search", "advanced", "privacy", "terms", "find", "years", "experience",
    "title", "keywords", "company", "indeed", "help", "centre", "cookies",
    "post", "(free)", "industry", "employment", "ago", "save", "forums",
    "browse", "employer", "city", "province", "api", "review",
    "application", "instructions", "applying", "full", "time",
    "opportunity", "team", "join", "resumes", "employers", "rights",
    "reserved", "create", "north", "america", "toronto", "interview",
    "cover", "letter", "30+", "days", "inc", "trends",
]

# Scrape a URL and return its visible text as a cleaned list of words.
def scrapeText(url):
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    # Indeed often embeds the job description in an iframe; recurse into it
    # and append its words to the result.
    iframe = soup.iframe
    if iframe:
        try:
            extra = scrapeText(iframe['src'])
        except Exception:
            extra = []
    else:
        extra = []
    # Remove non-content tags before extracting the page's strings.
    for garbage in soup(["head", "script", "style", "input"]):
        garbage.extract()
    text = " ".join(soup.strings)
    text = text.replace(u'\xa0', ' ')
    # Strip punctuation, lone digits, and lone symbols, then collapse whitespace.
    text = re.sub(r'(\.+ )|,|\||:|/|\'|\-|;|\*|(\s\d+\s)|(\s\W\s)', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    arr = text.lower().split()
    # Drop English stopwords and words beginning with a non-printable character.
    english_stopwords = set(stopwords.words("english"))
    arr = [word for word in arr if word not in english_stopwords
           and word.encode('unicode_escape')[:1] != b'\\']
    arr = [word for word in arr if word not in removable_words]
    return arr + extra

# Collect JobPosting objects from every results page of an Indeed search.
def getURLs(search_query):
    job_postings = []
    while True:
        html = requests.get('http://www.indeed.ca' + search_query).text
        soup = BeautifulSoup(html, 'html.parser')
        for job in soup.find_all("a", class_="turnstileLink"):
            if job.parent.name == 'h2':
                title = job.text.strip()
                href = job['href']
                try:
                    company = job.parent.parent.find_all('span', 'company')[0].text.strip()
                except IndexError:
                    company = "No Company"
                # Guard the location span the same way as the company span.
                try:
                    location = job.parent.parent.find_all('span', 'location')[0].text.strip()
                except IndexError:
                    location = "No Location"
                job_postings.append(JobPosting(title, href, company, location))
        # Follow the "Next" pagination link until there are no more pages.
        nav = soup.find_all("span", "np")
        if not nav:
            break
        if 'Next' not in nav[0].text:
            # On later pages the first nav span is "Previous"; the "Next"
            # link, if present, is the second one.
            try:
                search_query = nav[1].parent.parent['href']
            except IndexError:
                break
        else:
            search_query = nav[0].parent.parent['href']
    print("Found %d job URLs" % len(job_postings))
    return job_postings

# Build a DataFrame of job information by scraping the HTML off each URL.
def scrapeJobs(job_postings):
    print("Scraping text from URLs...")
    for num, job in enumerate(job_postings):
        try:
            job.words = scrapeText('http://www.indeed.ca' + job.href)
        except Exception:
            job.words = "null"
            print("Failed scraping job %d" % num)
    df = DataFrame({'Title': [job.job_title for job in job_postings],
                    'Company': [job.company for job in job_postings],
                    'Location': [job.location for job in job_postings],
                    'Description': [job.words for job in job_postings]})
    df = df[['Title', 'Company', 'Location', 'Description']]
    # Drop rows whose description failed to scrape.
    df = df[df.Description != "null"]
    print("Successfully scraped text from %d jobs." % len(df))
    return df

# Create plots for keyword frequency and bigram frequency.
def analyzeJobs(skills, df):
    # Flag each posting that mentions each skill, then report the
    # percentage of postings containing it.
    skill_percentages = {}
    for skill in skills:
        df[skill] = df.Description.apply(lambda x: 1 if skill in x else 0)
        skill_percentages[skill] = int(df[skill].sum() / len(df) * 100)
    skills_series = pd.Series(skill_percentages).sort_values(ascending=False)
    for skill, count in skills_series.items():
        print("%s in %d%% of jobs" % (skill, count))

    # Build each description's list of adjacent word pairs, e.g.
    # ["machine", "learning", "engineer"] -> ["machine learning", "learning engineer"].
    def find_bigrams(input_list):
        return [input_list[i] + " " + input_list[i + 1]
                for i in range(len(input_list) - 1)]

    df['bigrams'] = df['Description'].apply(find_bigrams)
    # Count each bigram at most once per posting.
    bigram_counter = Counter()
    for bigrams in df['bigrams']:
        bigram_counter.update(set(bigrams))
    bigram_series = pd.Series(bigram_counter).sort_values(ascending=False)
    print("Top 25 Most Frequent Bigrams, % of Postings with Bigram")
    for bigram, count in bigram_series[:25].items():
        print(bigram + ": " + str(int(count / len(df) * 100)) + "%")

    skills_plot = skills_series.plot(
        kind='bar',
        title='Percent of Jobs with Programming Language, Indeed Query: Developer Toronto')
    skills_plot.set_ylabel('% of Jobs')
    plt.tight_layout()
    plt.savefig('skills_plot.png')
    plt.close()
    # Convert raw counts to percentages so the plot matches its title.
    (bigram_series[:20] / len(df) * 100).plot(
        kind='barh',
        title='Percent of Jobs with Bigram, Indeed Query: Developer Toronto')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig('bigrams_plot.png')
    plt.close()
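

# Example usage: a minimal sketch of how the functions above chain together.
# The search query and skill list are illustrative assumptions, not part of
# the original module; any Indeed query path of the form "/jobs?q=...&l=..."
# should work the same way.
if __name__ == '__main__':
    postings = getURLs('/jobs?q=developer&l=Toronto%2C+ON')  # assumed query
    jobs_df = scrapeJobs(postings)
    analyzeJobs(['python', 'java', 'javascript', 'sql', 'c++'], jobs_df)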