Skip to content
This repository has been archived by the owner on Jul 21, 2022. It is now read-only.

Commit

Permalink
Can now fetch answers sorted by creation time
Browse files Browse the repository at this point in the history
  • Loading branch information
laike9m committed Jan 5, 2016
1 parent a60b4a7 commit 9500434
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 35 deletions.
44 changes: 44 additions & 0 deletions test/zhihu-test.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,50 @@ def test_question():
print(question.topics)
print(question.last_edit_time)

# with sort parameter, repeat above tests

url = 'https://www.zhihu.com/question/24825703?sort=created'
question = client.question(url)
print(question.title)
print(question.details)
print(question.answer_num)
print(question.follower_num)
for _, follower in zip(range(10), question.followers):
print(follower.name)
print(question.topics)
print(question.top_answer.upvote_num)
for answer in question.top_i_answers(10):
print(answer.author.name, answer.upvote_num, answer.author.motto)
ctime = question.creation_time
print(ctime)
assert ctime == datetime.strptime('2014-08-12 17:58:07', "%Y-%m-%d %H:%M:%S")
last_edit_time = question.last_edit_time
print(last_edit_time)
assert last_edit_time >= datetime.strptime('2015-04-01 00:39:21', "%Y-%m-%d %H:%M:%S")
assert question.author is None
question = client.question('https://www.zhihu.com/question/38531356')
assert question.author.name == '杨捷'
assert question.author.url == 'https://www.zhihu.com/people/yangjiePro/'

question.refresh()

# test again
print(question.title)
print(question.details)
print(question.answer_num)
print(question.follower_num)
for _, follower in zip(range(10), question.followers):
print(follower.name)
print(question.topics)
print(question.last_edit_time)

# test fetching all sorted answers
question = client.question('https://www.zhihu.com/question/27459050?sort=created')
count = 0
for answer in question.answers:
count += 1
print(answer.author.name, answer.upvote_num, answer.author.motto)
assert count >= 84

def test_answer():
url = 'http://www.zhihu.com/question/24825703/answer/30975949'
Expand Down
2 changes: 1 addition & 1 deletion zhihu/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
Get_Collection_Url = Zhihu_URL + '/node/AnswerFavlists'

re_question_url = re.compile(r'^https?://www\.zhihu\.com/question/\d+(\?sort=created|/?)$')
re_question_url_std = re.compile(r'^https?://www\.zhihu\.com/question/\d+/?$')
re_question_url_std = re.compile(r'^https?://www\.zhihu\.com/question/\d+/?')
re_ans_url = re.compile(
r'^https?://www\.zhihu\.com/question/\d+/answer/\d+/?$')
re_author_url = re.compile(r'^https?://www\.zhihu\.com/people/[^/]+/?$')
Expand Down
106 changes: 72 additions & 34 deletions zhihu/question.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,29 +204,29 @@ def answers(self):
from .answer import Answer

self._make_soup()
new_header = dict(Default_Header)
new_header['Referer'] = self.url
params = {"url_token": self.id,
'pagesize': '50',
'offset': 0}
data = {'_xsrf': self.xsrf,
'method': 'next',
'params': ''}
for i in range(0, (self.answer_num - 1) // 50 + 1):
if i == 0:
# 修正各种建议修改的回答……
error_answers = self.soup.find_all('div', id='answer-status')

# TODO: 统一逻辑. 完全可以都用 _parse_answer_html 的逻辑替换
if self._url.endswith('sort=created'):
pager = self.soup.find('div', class_='zm-invite-pager')
max_page = int(pager.find_all('span')[-2].a.text)
for page in range(1, max_page + 1):
if page == 1:
soup = self.soup
else:
url = self._url + '&page=%d' % page
soup = BeautifulSoup(self._session.get(url).content)
error_answers = soup.find_all('div', id='answer-status')
for each in error_answers:
each['class'] = 'zm-editable-content'
answers_wrap = self.soup.find('div', id='zh-question-answer-wrap')
answers_wrap = soup.find('div', id='zh-question-answer-wrap')
# 正式处理
authors = answers_wrap.find_all(
'div', class_='zm-item-answer-author-info')
'div', class_='zm-item-answer-author-info')
urls = answers_wrap.find_all('a', class_='answer-date-link')
upvote_nums = answers_wrap.find_all('div',
class_='zm-item-vote-info')
class_='zm-item-vote-info')
contents = answers_wrap.find_all(
'div', class_='zm-editable-content')
'div', class_='zm-editable-content')
assert len(authors) == len(urls) == len(upvote_nums) == len(contents)
for author, url, upvote_num, content in \
zip(authors, urls, upvote_nums, contents):
Expand All @@ -238,14 +238,49 @@ def answers(self):
content = answer_content_process(content)
yield Answer(url, self, author_obj, upvote_num, content,
session=self._session)
else:
params['offset'] = i * 50
data['params'] = json.dumps(params)
r = self._session.post(Question_Get_More_Answer_URL, data=data,
headers=new_header)
answer_list = r.json()['msg']
for answer_html in answer_list:
yield self._parse_answer_html(answer_html, Author, Answer)
else:
new_header = dict(Default_Header)
new_header['Referer'] = self.url
params = {"url_token": self.id,
'pagesize': '50',
'offset': 0}
data = {'_xsrf': self.xsrf,
'method': 'next',
'params': ''}
for i in range(0, (self.answer_num - 1) // 50 + 1):
if i == 0:
# 修正各种建议修改的回答……
error_answers = self.soup.find_all('div', id='answer-status')
for each in error_answers:
each['class'] = 'zm-editable-content'
answers_wrap = self.soup.find('div', id='zh-question-answer-wrap')
# 正式处理
authors = answers_wrap.find_all(
'div', class_='zm-item-answer-author-info')
urls = answers_wrap.find_all('a', class_='answer-date-link')
upvote_nums = answers_wrap.find_all('div',
class_='zm-item-vote-info')
contents = answers_wrap.find_all(
'div', class_='zm-editable-content')
assert len(authors) == len(urls) == len(upvote_nums) == len(contents)
for author, url, upvote_num, content in \
zip(authors, urls, upvote_nums, contents):
a_url, name, motto, photo = parser_author_from_tag(author)
author_obj = Author(a_url, name, motto, photo_url=photo,
session=self._session)
url = Zhihu_URL + url['href']
upvote_num = int(upvote_num['data-votecount'])
content = answer_content_process(content)
yield Answer(url, self, author_obj, upvote_num, content,
session=self._session)
else:
params['offset'] = i * 50
data['params'] = json.dumps(params)
r = self._session.post(Question_Get_More_Answer_URL, data=data,
headers=new_header)
answer_list = r.json()['msg']
for answer_html in answer_list:
yield self._parse_answer_html(answer_html, Author, Answer)

@property
def top_answer(self):
Expand Down Expand Up @@ -363,19 +398,22 @@ def _parse_answer_html(self, answer_html, Author, Answer):
soup = BeautifulSoup(answer_html)
# 修正各种建议修改的回答……
error_answers = soup.find_all('div', id='answer-status')

for each in error_answers:
each['class'] = 'zm-editable-content'
answer_url = \
self.url + 'answer/' + soup.div['data-atoken']
author = soup.find(
'div', class_='zm-item-answer-author-info')

answer_url = self.url + 'answer/' + soup.div['data-atoken']
author = soup.find('div', class_='zm-item-answer-author-info')
upvote_num = int(soup.find(
'div', class_='zm-item-vote-info')['data-votecount'])
content = soup.find(
'div', class_='zm-editable-content')
content = soup.find('div', class_='zm-editable-content')
content = answer_content_process(content)
a_url, name, motto, photo = parser_author_from_tag(author)
author_obj = Author(a_url, name, motto, photo_url=photo,
session=self._session)
return Answer(answer_url, self, author_obj,
upvote_num, content, session=self._session)
author = Author(a_url, name, motto, photo_url=photo,
session=self._session)
return Answer(answer_url, self, author, upvote_num, content,
session=self._session)

def _get_content(self):
# override base class's method cause we need self._url not self.url
return self._session.get(self._url).content

0 comments on commit 9500434

Please sign in to comment.