Skip to content

Commit

Permalink
Fetch Douban info by parsing the HTML DOM
Browse files (browse the repository at this point in the history)
  • Loading branch information
xwjdsh committed Jul 16, 2021
1 parent 9bdef89 commit 8047740
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 17 deletions.
41 changes: 30 additions & 11 deletions crawler_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@
import re
import time
from datetime import datetime
from datetime import date
from itertools import cycle
from bs4 import BeautifulSoup

from django.core.exceptions import ObjectDoesNotExist
from django.utils.timezone import make_aware

from douban_group_spy.const import USER_AGENT, DATETIME_FORMAT
from douban_group_spy.const import USER_AGENT, DATETIME_FORMAT, DATE_FORMAT

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'douban_group_spy.settings')
import django
Expand All @@ -29,7 +31,7 @@


def process_posts(posts, group, keywords, exclude):
for t in posts['topics']:
for t in posts:
# ignore title or content including exclude keywords
exclude_flag = False
for e in exclude:
Expand Down Expand Up @@ -79,14 +81,17 @@ def crawl(group_id, pages, keywords, exclude):
try:
group = Group.objects.get(id=group_id)
except ObjectDoesNotExist:
g_info = requests.get(GROUP_INFO_BASE_URL.format(DOUBAN_BASE_HOST, group_id), headers={'User-Agent': USER_AGENT}).json()
html = requests.get(GROUP_INFO_BASE_URL.format(DOUBAN_BASE_HOST, group_id), headers={'User-Agent': USER_AGENT}).text
g_info = BeautifulSoup(html,'lxml')
lg.info(f'Getting group: {group_id} successful')
member_count_text=g_info.select_one(f"a[href='https://www.douban.com/group/{group_id}/members']").get_text()
created_text=g_info.select_one('div[class="group-board"] p').get_text()
group = Group(
id=g_info['uid'],
name=g_info['name'],
alt=g_info['alt'],
member_count=g_info['member_count'],
created=make_aware(datetime.strptime(g_info['created'], DATETIME_FORMAT))
id=group_id,
name=g_info.select_one('h1').get_text().strip(),
alt=g_info.select_one("div[class='group-intro']").get_text(),
member_count=int(re.findall(r'[(](.*?)[)]', member_count_text)[0]),
created=make_aware(datetime.strptime(re.findall(r"创建于(.+?) ",created_text)[0], DATE_FORMAT))
)
group.save(force_insert=True)

Expand All @@ -95,7 +100,7 @@ def crawl(group_id, pages, keywords, exclude):
# host = next(douban_base_host)
kwargs = {
'url': GROUP_TOPICS_BASE_URL.format(DOUBAN_BASE_HOST, group_id),
'params': {'start': p},
'params': {'start': p*25},
'headers': {'User-Agent': USER_AGENT}
}
req = getattr(requests, 'get')(**kwargs)
Expand All @@ -111,9 +116,23 @@ def crawl(group_id, pages, keywords, exclude):
lg.warning(f'Fail to getting: {req.url}, status: {req.status_code}')
continue

posts = req.json()
soup = BeautifulSoup(req.text,'lxml')
posts=[]
for row in soup.select('table[class="olt"] tr[class=""]'):
result={}
link=row.select_one('td[class="title"] a')
result['id']=int(re.findall(r"https://www.douban.com/group/topic/(.+?)/",link["href"])[0])
result['title']=link["title"]
result['content']=''
result['alt']=''
author_link=row.select("td")[1].select_one('a')
result['author']={'name':author_link.get_text(),'alt':author_link["href"]}
result['photos']=[]
result['created']='1970-01-01 00:00:00'
result['updated']=f'{date.today().year}-{row.select("td")[3].get_text()}:00'
posts.append(result)
process_posts(posts, group, keywords, exclude)


@click.command(help='example: python crawler_main.py -g 10086 -g 12345 -k xx花园 -k xx地铁 -e 求租')
@click.option('--groups', '-g', help='group id', required=True, multiple=True, type=str)
Expand Down
1 change: 1 addition & 0 deletions douban_group_spy/const.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
DATE_FORMAT = '%Y-%m-%d'

HREF_FORMAT = "<a href='{url}'>{url}</a>"
IMG_FORMAT = '<img src="{url}" height="400" width="400" referrerpolicy ="never"/><br/>'
Expand Down
6 changes: 3 additions & 3 deletions douban_group_spy/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@

STATIC_URL = '/static/'

DOUBAN_BASE_HOST = 'https://api.douban.com'
DOUBAN_BASE_HOST = 'https://www.douban.com'

GROUP_TOPICS_BASE_URL = '{}/v2/group/{}/topics'
GROUP_INFO_BASE_URL = '{}/v2/group/{}/'
GROUP_TOPICS_BASE_URL = '{}/group/{}/discussion'
GROUP_INFO_BASE_URL = '{}/group/{}/'
15 changes: 12 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
django==2.2.20
beautifulsoup4==4.9.3
certifi==2021.5.30
charset-normalizer==2.0.2
Click==7.0
Django==2.2.20
idna==3.2
jsonfield==2.0.2
click==7.0.0
requests
lxml==4.6.3
pytz==2021.1
requests==2.26.0
soupsieve==2.2.1
sqlparse==0.4.1
urllib3==1.26.6

0 comments on commit 8047740

Please sign in to comment.