Skip to content

Commit

Permalink
处理了粉丝数和朋友数中无法解析"万"、"亿"字符串的问题,现在仍然返回整数型,分析系统不用改变
Browse files Browse the repository at this point in the history
  • Loading branch information
Faker-lz committed Feb 28, 2023
1 parent 1013a38 commit bbededb
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 5 deletions.
10 changes: 6 additions & 4 deletions code/weibo_crawler/selector_parser/index_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from weibo_curl_error import CookieInvalidException, HTMLParseException
from .base_parser import BaseParser
import utils
import re

class IndexParser(BaseParser):
def __init__(self, user_id, response):
Expand All @@ -28,15 +29,16 @@ def get_user(self, user_info):
try:
user_info = self.selector.xpath("//div[@class='tip2']/*/text()")
self.user['id'] = self.user_id

# self.user['weibo_num'] = int(user_info[0][3:-1])
# self.user['following'] = int(user_info[1][3:-1])
# self.user['followers'] = int(user_info[2][3:-1])
row_data = re.findall("\\[(.*?)]", ''.join(user_info), re.I|re.M)
self.user['weibo_num'] = utils.str2value(row_data[0])
self.user['following'] = utils.str2value(row_data[1])
self.user['followers'] = utils.str2value(row_data[2])
return self.user
except Exception as e:
utils.report_log(e)
raise HTMLParseException


def get_page_num(self):
"""获取微博总页数"""
try:
Expand Down
22 changes: 21 additions & 1 deletion code/weibo_crawler/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import sys
from settings import LOGGING
import traceback
import re


def report_log(exception: Exception):
Expand Down Expand Up @@ -35,12 +36,31 @@ def extract_from_one_table_node(table_node):
user_id = user_id[user_id.rfind(r'/') + 1:]
fans_num = table_node.xpath('text()') # 关注者的粉丝数
if len(fans_num) != 0:
fans_num = str(fans_num[0])
row_data = re.findall("粉丝(.+?)人", fans_num[0], re.I|re.M)
fans_num = str2value(row_data[0])
else:
fans_num = None
return dict(user_id=user_id, user_name=user_name, fans_num=fans_num)


def str2value(valueStr):
    """
    Convert a Weibo count string using the units 万 (10**4) and 亿 (10**8) to an int.

    Accepted shapes (anything after the last recognised unit is ignored):
        "123"        -> 123
        "1.2万"      -> 12000
        "2.3亿"      -> 230000000
        "1亿2000万"  -> 120000000

    The arithmetic is done with ``decimal.Decimal`` rather than ``float``:
    ``int(float("2.3") * 1e8)`` evaluates to 229999999 because 2.3 has no exact
    binary representation and ``int()`` truncates, so displayed counts would
    come out one short.

    :param valueStr: the value to convert; it is passed through ``str()`` first,
        so plain integers are accepted as well.
    :return: the count as an ``int`` (fractional remainders are truncated).
    :raises ValueError: if a numeric part cannot be parsed (for example an
        empty number before a unit, or 万 appearing before 亿).
    """
    # Function-scope import keeps this block self-contained.
    from decimal import Decimal, InvalidOperation

    text = str(valueStr)
    remainder = text
    total = Decimal(0)
    found_unit = False

    try:
        # Larger unit first: the number before 亿, then the number between 亿 and 万.
        for unit, factor in (('亿', 10 ** 8), ('万', 10 ** 4)):
            head, sep, tail = remainder.partition(unit)
            if sep:
                total += Decimal(head) * factor
                remainder = tail
                found_unit = True
    except InvalidOperation as exc:
        # Keep the exception type callers saw from the float()-based version.
        raise ValueError("cannot parse count string: %r" % text) from exc

    if not found_unit:
        # No unit at all: the string must be a plain integer.
        return int(text)
    return int(total)



def standardize_date(created_at):
"""标准化微博发布时间"""
if "刚刚" in created_at:
Expand Down

0 comments on commit bbededb

Please sign in to comment.