fix: stop incorrect user_ids from blocking the crawling of the remaining correct user_ids
dataabc committed Mar 17, 2020
1 parent: 359a894 · commit: 700eab6
Showing 1 changed file with 44 additions and 39 deletions.
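The fix is small but changes behavior: when one user_id cannot be resolved, the crawler now reports the error and moves on instead of terminating the whole process, so the remaining user_ids in the configuration are still crawled. The sketch below is illustrative only and not code from this repository; crawl_user, crawl_all and the sample ids are placeholders standing in for the real per-user crawl.

import traceback


def crawl_user(user_id):
    """Pretend to crawl one user; fail for an id that cannot be resolved."""
    if not user_id.isdigit():
        raise KeyError('user info not found for %s' % user_id)
    print('crawled user %s' % user_id)


def crawl_all(user_ids):
    for user_id in user_ids:
        try:
            crawl_user(user_id)    # before this commit, one bad id ended the whole run
        except Exception as e:     # now the error is reported and the loop moves on
            print('Error: ', e)
            traceback.print_exc()


if __name__ == '__main__':
    crawl_all(['1000000001', 'not_an_id', '1000000002'])

In the actual commit the try/except sits inside Weibo.get_pages (second hunk of the diff below); the loop over the configured user_ids, which lives elsewhere in weibo.py and is not part of this diff, then simply proceeds to the next user.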
weibo.py (44 additions, 39 deletions)
@@ -648,16 +648,16 @@ def get_page_count(self):
page_count = int(math.ceil(weibo_count / 10.0))
return page_count
except KeyError:
-            sys.exit(u'An error occurred. The cause is probably one of the following:\n'
-                     u'1. the user_id is incorrect;\n'
-                     u'2. this user\'s weibo may require a cookie to be crawled.\n'
-                     u'Solutions:\n'
-                     u'please refer to\n'
-                     u'https://github.com/dataabc/weibo-crawler#如何获取user_id\n'
-                     u'to get a correct user_id;\n'
-                     u'or refer to\n'
-                     u'https://github.com/dataabc/weibo-crawler#3程序设置\n'
-                     u'and set the cookie as described in its "设置cookie" section')
+            print(u'An error occurred. The cause is probably one of the following:\n'
+                  u'1. the user_id is incorrect;\n'
+                  u'2. this user\'s weibo may require a cookie to be crawled.\n'
+                  u'Solutions:\n'
+                  u'please refer to\n'
+                  u'https://github.com/dataabc/weibo-crawler#如何获取user_id\n'
+                  u'to get a correct user_id;\n'
+                  u'or refer to\n'
+                  u'https://github.com/dataabc/weibo-crawler#3程序设置\n'
+                  u'and set the cookie as described in its "设置cookie" section')
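The hunk above swaps sys.exit for print inside the KeyError handler of get_page_count. The distinction matters because sys.exit raises SystemExit, which does not derive from Exception, so even the new try/except added in the next hunk could not contain it and the whole run would still stop. With print, get_page_count just reports the problem and falls through, returning None; the caller then fails with an ordinary exception (for example once page_count is used in range()), and that is what the new handler in get_pages catches. A minimal standalone sketch of the difference, using hypothetical function names:

import sys


def page_count_exit():
    sys.exit('the user_id may be wrong')    # raises SystemExit: the process would stop here


def page_count_print():
    print('the user_id may be wrong')       # only a message; the function returns None


if __name__ == '__main__':
    print(page_count_print())               # prints the message, then None
    try:
        page_count_exit()
    except SystemExit as e:                 # caught here only to keep the demo alive
        print('would have exited with:', e)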

    def get_write_info(self, wrote_count):
        """Get the weibo information to be written."""
@@ -976,36 +976,41 @@ def write_data(self, wrote_count):

    def get_pages(self):
        """Get all weibo."""
-        self.get_user_info()
-        self.print_user_info()
-        since_date = datetime.strptime(self.user_config['since_date'],
-                                       '%Y-%m-%d')
-        today = datetime.strptime(str(date.today()), '%Y-%m-%d')
-        if since_date <= today:
-            page_count = self.get_page_count()
-            wrote_count = 0
-            page1 = 0
-            random_pages = random.randint(1, 5)
-            self.start_date = datetime.now().strftime('%Y-%m-%d')
-            for page in tqdm(range(1, page_count + 1), desc='Progress'):
-                is_end = self.get_one_page(page)
-                if is_end:
-                    break
-
-                if page % 20 == 0:  # write to file every 20 pages crawled
-                    self.write_data(wrote_count)
-                    wrote_count = self.got_count
-
-                # Random waits help avoid being rate-limited: crawling too fast gets
-                # temporarily blocked, and random pauses mimic a human reader. By default
-                # it sleeps 6-10 s after every 1-5 pages; if still limited, raise the sleep time.
-                if (page - page1) % random_pages == 0 and page < page_count:
-                    sleep(random.randint(6, 10))
-                    page1 = page
-                    random_pages = random.randint(1, 5)
-
-            self.write_data(wrote_count)  # write the remaining (fewer than 20) pages to file
-        print(u'Crawling finished, %d weibo crawled in total' % self.got_count)
+        try:
+            self.get_user_info()
+            self.print_user_info()
+            since_date = datetime.strptime(self.user_config['since_date'],
+                                           '%Y-%m-%d')
+            today = datetime.strptime(str(date.today()), '%Y-%m-%d')
+            if since_date <= today:
+                page_count = self.get_page_count()
+                wrote_count = 0
+                page1 = 0
+                random_pages = random.randint(1, 5)
+                self.start_date = datetime.now().strftime('%Y-%m-%d')
+                for page in tqdm(range(1, page_count + 1), desc='Progress'):
+                    is_end = self.get_one_page(page)
+                    if is_end:
+                        break
+
+                    if page % 20 == 0:  # write to file every 20 pages crawled
+                        self.write_data(wrote_count)
+                        wrote_count = self.got_count
+
+                    # Random waits help avoid being rate-limited: crawling too fast gets
+                    # temporarily blocked, and random pauses mimic a human reader. By default
+                    # it sleeps 6-10 s after every 1-5 pages; if still limited, raise the sleep time.
+                    if (page -
+                            page1) % random_pages == 0 and page < page_count:
+                        sleep(random.randint(6, 10))
+                        page1 = page
+                        random_pages = random.randint(1, 5)
+
+                self.write_data(wrote_count)  # write the remaining (fewer than 20) pages to file
+            print(u'Crawling finished, %d weibo crawled in total' % self.got_count)
+        except Exception as e:
+            print("Error: ", e)
+            traceback.print_exc()

    def get_user_config_list(self, file_path):
        """Get the weibo id information from the file."""
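Unchanged by this commit, but useful context when reading the second hunk: the crawler already paces itself by sleeping 6 to 10 seconds after every 1 to 5 pages, both chosen at random, to mimic a human reader and reduce the chance of being rate-limited. Below is a standalone sketch of that pacing logic with a placeholder fetch_page(); it sleeps for real, so running it takes a little while.

import random
from time import sleep


def fetch_page(page):
    print('fetching page', page)    # placeholder for the real page request


def crawl(page_count):
    page1 = 0                            # page after which we last slept
    random_pages = random.randint(1, 5)  # pages to crawl before the next pause
    for page in range(1, page_count + 1):
        fetch_page(page)
        if (page - page1) % random_pages == 0 and page < page_count:
            sleep(random.randint(6, 10))  # random 6-10 second pause
            page1 = page
            random_pages = random.randint(1, 5)


if __name__ == '__main__':
    crawl(8)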
