Skip to content

Commit

Permalink
scawler ok
Browse files Browse the repository at this point in the history
  • Loading branch information
harbinzhang committed Jan 1, 2017
0 parents commit e407bf9
Show file tree
Hide file tree
Showing 13 changed files with 6,360 additions and 0 deletions.
11 changes: 11 additions & 0 deletions .idea/linkedin.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

477 changes: 477 additions & 0 deletions .idea/workspace.xml

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# linkedin
4,482 changes: 4,482 additions & 0 deletions scawler/ceshi.html

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions scawler/ceshi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@


import random
print 1+ random.randint(1,1000)/1000.0
Binary file added scawler/ghostdriver.log
Binary file not shown.
18 changes: 18 additions & 0 deletions scawler/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Entry point for the LinkedIn scraper: log in, then crawl queued profiles."""
# BUG FIX: 'from scawler/scawler import Scawler' is a SyntaxError ('/' is not
# valid in an import).  main.py lives in the same directory as scawler.py,
# so the sibling-module form is the intended one -- TODO confirm the script
# is always run from inside the scawler/ directory.
from scawler import Scawler


# NOTE(review): credentials are hard-coded in plain text and committed to the
# repository -- move them to environment variables or an untracked config file.
username = 'haibin610@yeah.net'
password = 'coolman'
url = 'https://www.linkedin.com/in/haibinzhang'


if __name__ == '__main__':
    scawler = Scawler(username, password)
    # deal(0) pops zero URLs off the 'relatedLinks' queue, so the crawl loop
    # exits immediately; pass a positive count to actually scrape.
    scawler.deal(0)
    # scawler.dealNew(url)

    # debug helpers -- "url_id" : "haibinzhang"
    # soup = scawler.preLoad(url)
    # print scawler.getSkills(soup)
1,094 changes: 1,094 additions & 0 deletions scawler/relatedLinks

Large diffs are not rendered by default.

241 changes: 241 additions & 0 deletions scawler/scawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
import cookielib
import os
import re
import string
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import pymongo
from time import sleep
import random

class Scawler(object):

def __init__(self, username, password):
self.username = username
self.password = password
self.cnt = 0
self.broken = False

client = pymongo.MongoClient("localhost", 27017)
self.db = client.mydb

phantomjsdriver = "/Users/Harbin/Desktop/mine/vagrant/temp/linkedin/phantomjs"
os.environ["webdriver.phantomjs.driver"] = phantomjsdriver
self.driver = webdriver.PhantomJS(phantomjsdriver)
self.login()


def login(self):
self.driver.set_page_load_timeout(7)
baseurl = "https://www.linkedin.com/"

self.driver.get(baseurl)
elem = self.driver.find_element_by_id("login-email")
elem.clear()
elem.send_keys(self.username)
elem = self.driver.find_element_by_id("login-password")
elem.clear()
elem.send_keys(self.password)
elem.send_keys(Keys.RETURN)


def getBackground(self, soup):
try:
backinfo = soup.find(id = "background-experience")
backgrounds = []
res = backinfo.find("div",{"class":"editable-item section-item current-position"})
background = []
for _ in res.findAll('a'):
if _.getText() == "":
continue
background.append(_.getText())

for item in backinfo.findAll("div",{"class":"editable-item section-item past-position"}):
background = []
for _ in item.findAll('a'):
if _.getText() == "":
continue
background.append(_.getText())
try:
background.append(item.find("p",{"class":[u"description", u"summary-field-show-more"]}).getText())
except:
# print ("No description on background")
pass
backgrounds.append(background + [])
return backgrounds
except:
return None

def getSkills(self, soup):
try:
res = soup.find(id="profile-skills")
num = res.findAll("span", {"class": "num-endorsements"})
skills = res.findAll("span", {"class": "endorse-item-name"})
topskills = {}
for i in range(len(skills)):
tmp = str(skills[i].getText()).split('.')[0]
try:
topskills[tmp] = int(num[i].getText())
except:
topskills[tmp] = 0
return topskills
except:
f = open('siton','ab')
f.write(self.driver.current_url)
f.write('\n')
f.close()
self.broken = True
print "Oh, That's bad.."


# I don't know why it cannot work
# try:
# res = soup.find(id="profile-skills").getText()
# num = res.findAll("span", {"class": "num-endorsements"})
# skills = res.findAll("span", {"class": "endorse-item-name"})
# topskills = {}
# for i in range(len(num)):
# tmp = str(skills[i].getText()).split('.')[0]
# # tmp = tmp.split('$')[0]
# print tmp
# topskills[tmp] = int(num[i].getText())
# return topskills
# except:
# return None

def saveProfile(self, soup):
# url_id
curtUrls = self.driver.current_url.split('/')[4]
url_id = curtUrls.split('?')[0]
# name
try:
name = soup.find(id="name").getText()
except:
name = None
# locality
try:
locality = soup.find("span",{"class":"locality"}).getText()
except:
locality = None
# title
try:
title = soup.find("p",{"class":"title"}).getText()
except:
title = None
# industry
try:
industry = soup.find("a",{"name":"industry"}).getText()
except:
industry = None
# curt_company
try:
curt_company = soup.find("a",{"name":"company"}).getText()
except:
curt_company = None
# edu
try:
edu = soup.find("a",{"title":"More details for this school"}).getText()
except:
edu = None

backgrounds = self.getBackground(soup)

topskills = self.getSkills(soup)

ans = {
"url_id":url_id,
"name":name,
"locality":locality,
"title":title,
"industry":industry,
"curt_company":curt_company,
"edu":edu,
"backgrouds":backgrounds,
"topskills":topskills
}

# print ans
# save it
try:
self.db.linkedin.insert(ans)
except:
f = open('siton','ab')
f.write(str(ans))
f.write('\n')
f.close()

if not self.broken:
self.db.linkedinDedual.insert({"url_id":url_id})


def loadPage(self):
# self.driver.get(url)

html = self.driver.page_source
soup = BeautifulSoup(html, "html.parser")

# save related users
res = soup.find_all("a", {"class": "browse-map-photo"})
links = open('relatedLinks','ab')
for item in res:
links.write(str(item['href']))
links.write('\n')
links.close()

# save Profile
self.saveProfile(soup)


def preLoad(self, url):
self.driver.get(url)

# for debug
# html = self.driver.page_source
# soup = BeautifulSoup(html)
# return soup

curtUrls = self.driver.current_url.split('/')[4]
url_id = curtUrls.split('?')[0]
self.cnt += 1
if self.db.linkedinDedual.find({"url_id":url_id}).count() != 0:
print "already dealed : "+str(self.cnt)+" / "+str(self.num)
return False
else:
print "new one, be going to do it : "+str(self.cnt)+" / "+str(self.num)
return True

def popUrl(self, num):
lines = open('relatedLinks').readlines()
if num >= len(lines):
num = len(lines) - 1
open('relatedLinks', 'w').writelines(lines[num: -1])
return lines[0:num]

def deal(self, num = 1):
self.num = num
urls = self.popUrl(num)
while len(urls) > 0:
url = urls.pop()
t = 1+ random.randint(1,1000)/1000.0
print "sleep : "+ str(t) + " s"
sleep(1+ random.randint(1,1000)/1000.0)
if self.preLoad(url):
self.loadPage()

self.driver.close()


def dealNew(self, url):
if self.preLoad(url):
self.loadPage()









Binary file added scawler/scawler.pyc
Binary file not shown.
10 changes: 10 additions & 0 deletions scawler/siton
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
https://www.linkedin.com/in/catherine-cheng-544a7b8a?authType=name&authToken=CsdG&trk=prof-sb-browse_map-photo
https://www.linkedin.com/in/catherine-cheng-544a7b8a?authType=name&authToken=CsdG&trk=prof-sb-browse_map-photo
https://www.linkedin.com/in/christopher-ryder-7026b218?authType=name&authToken=0N17&trk=prof-sb-browse_map-photo
https://www.linkedin.com/in/catherine-cheng-544a7b8a?authType=name&authToken=CsdG&trk=prof-sb-browse_map-photo
https://www.linkedin.com/in/catherine-cheng-544a7b8a?authType=name&authToken=CsdG&trk=prof-sb-browse_map-photo
https://www.linkedin.com/in/catherine-cheng-544a7b8a?authType=name&authToken=CsdG&trk=prof-sb-browse_map-photo
https://www.linkedin.com/in/catherine-cheng-544a7b8a?authType=name&authToken=CsdG&trk=prof-sb-browse_map-photo
https://www.linkedin.com/in/catherine-cheng-544a7b8a?authType=name&authToken=CsdG&trk=prof-sb-browse_map-photo
https://www.linkedin.com/in/catherine-cheng-544a7b8a?authType=name&authToken=CsdG&trk=prof-sb-browse_map-photo
https://www.linkedin.com/in/catherine-cheng-544a7b8a?authType=name&authToken=CsdG&trk=prof-sb-browse_map-photo

0 comments on commit e407bf9

Please sign in to comment.