Node.js #13

itboos · 2018-03-24T12:28:25Z

一个简单的简书爬虫

主要技术栈:

cheerio https://github.com/cheeriojs/cheerio 页面数据解析
superagent http://visionmedia.github.io/superagent/ 页面数据下载

superagent 是 Node.js 里一个非常方便的客户端请求模块：它是一个轻量级、渐进式的 ajax API，
可读性好，学习曲线低，内部依赖 Node.js 原生的请求 API，适用于 Node.js 环境。
cheerio :
 cheerio 是为服务器端特别定制的、快速、灵活、精简的 jQuery 核心实现，大部分语法和 jQuery 相同

目标

爬下简书首页前 20 篇文章的列表信息
我们看到简书的首页的每篇文章像下面那样

  // Shape of the record we want to collect for each article.
  // The values below are placeholders that only illustrate each field.
  // Key spellings (`summery`, `seeCouts`, `commits`) are kept as-is because
  // the parser further down emits exactly these keys.
  const dataStruct = {
    title: '文章标题',
    author: '作者',
    avatar: '', // avatar image
    time: '昨天',
    summery: '文章概览........',

    seeCouts: 100,
    loves: 999,
    commits: 18,
    money: 2,
    href: '文章地址', // NOTE(review): parseArticle emits this link as `aiticleHref`
    commitHref: '评论地址',
    photo: '封面',
  };

项目初始化

  npm init -y 
 //   安装依赖:
 npm install superagent cheerio --save

 const superagent = require('superagent');
 const cheerio = require('cheerio');
 const fs = require('fs');
  
 // Page that gets downloaded and parsed.
 const getUrl = 'https://www.jianshu.com/';
 // Base URL that relative links found on the page are prefixed with.
 const homeUrl = 'https://www.jianshu.com/';
 // Accumulates one parsed record per article; only ever appended to, never reassigned.
 const articlesArr = [];
 
// Download the home page, parse every entry of the article list and dump the
// result to data/aiticle_list.json next to this script.
superagent.get(getUrl, (err, res) => {
  if (err) {
    // Re-throw the original error object so its message and stack are kept
    // (wrapping it with Error(err) would stringify it).
    throw err;
  }

  const $ = cheerio.load(res.text);
  const items = $('#list-container .note-list').children();
  items.each((index, ele) => {
    // `ele` is a raw DOM element; wrap it with $() to get the cheerio API.
    articlesArr.push(parseArticle($(ele)));
  });

  // writeFileSync does not create missing directories, so make sure the
  // target directory exists before writing.
  const dataDir = __dirname + '/data';
  if (!fs.existsSync(dataDir)) {
    fs.mkdirSync(dataDir);
  }

  // Persist the scraped data to a local JSON file.
  fs.writeFileSync(
    dataDir + '/aiticle_list.json',
    JSON.stringify({
      status: 0,
      data: articlesArr,
    }),
    { encoding: 'utf8' }
  );
});


/**
 * Extract the fields of a single article from its cheerio node.
 *
 * Text fields have all whitespace (including line breaks) removed. Link
 * fields are joined onto the module-level `homeUrl`; image fields are turned
 * into absolute http URLs. A link or image whose attribute is missing yields
 * an empty string instead of a URL containing the text "undefined".
 *
 * @param {Object} $ele - cheerio-wrapped element of one article entry.
 * @returns {Object} Record with the keys title, author, home, avatar, time,
 *   summery, seeCouts, loves, commits, money, aiticleHref, commitHref, photo.
 *   `time` is the raw `data-shared-at` attribute (undefined when absent);
 *   `money` falls back to the number 0 when the element has no text.
 */
function parseArticle($ele) {
  // Text content of a selection with every whitespace character stripped.
  const textOf = (selection) => stripWhitespace(selection.text());

  const firstMetaLink = $ele.find('.meta a:first-child');
  const secondMetaLink = $ele.find('.meta a').eq(1);

  return {
    title: textOf($ele.find('.title')),
    author: textOf($ele.find('.nickname')),
    home: joinUrl(homeUrl, $ele.find('.avatar').attr('href')),
    avatar: toHttpUrl($ele.find('.avatar > img').attr('src')), // avatar image
    time: $ele.find('.info .time').attr('data-shared-at'),
    summery: textOf($ele.find('.abstract')),

    seeCouts: textOf(firstMetaLink),
    loves: textOf($ele.find('.meta span').eq(0)),
    commits: textOf(secondMetaLink),
    // find() always returns a (possibly empty) selection, so only the text
    // needs a fallback.
    money: textOf($ele.find('.meta span:last-child')) || 0,
    aiticleHref: joinUrl(homeUrl, firstMetaLink.attr('href')),
    commitHref: joinUrl(homeUrl, secondMetaLink.attr('href')),
    photo: toHttpUrl($ele.find('.img-blur').attr('src')),
  };
}

/** Remove every whitespace character; `\s` already covers line breaks. */
function stripWhitespace(text) {
  return String(text == null ? '' : text).replace(/\s/g, '');
}

/** Join a base URL and a path with exactly one slash; '' when the path is missing. */
function joinUrl(base, path) {
  if (!path) {
    return '';
  }
  return `${base.replace(/\/+$/, '')}/${path.replace(/^\/+/, '')}`;
}

/**
 * Turn an image `src` into an absolute http(s) URL; '' when it is missing.
 * Handles absolute URLs, protocol-relative (`//host/x`) and bare (`host/x`) forms.
 */
function toHttpUrl(src) {
  if (!src) {
    return '';
  }
  if (/^https?:\/\//i.test(src)) {
    return src;
  }
  if (src.startsWith('//')) {
    return `http:${src}`;
  }
  return `http://${src}`;
}

写一个获取数据的接口

 index.js:
 // 安装express npm i express -S
 // 引入依赖
const express = require('express');

// Create the express application instance.
const app = express();
// Load the scraped data. require() reads and caches the JSON file once at
// startup, so every request is answered with the same snapshot.
const list = require('./data/aiticle_list.json');

// GET /list responds with the whole scraped payload as JSON.
// The endpoint takes no parameters; query-string values would be available on
// req.query if filtering is added later.
app.get('/list', function (req, res) {
  // res.json serialises the object and sets Content-Type: application/json.
  res.json(list);
});

// The listen callback receives no request/response arguments.
app.listen(3000, function () {
  console.log('app is running at port 3000');
});

浏览器获取数据:
在浏览器中访问 localhost:3000/list 就能拿到数据了
浏览器请求的展示如下：

这里每次请求到的都是一样的数据。可以写个定时器，例如每隔 10 分钟去爬一次数据，然后保存到文件中，
这样接口被调用时就能获取到更新后的数据了

参考链接:
nodejs网络爬虫技术介绍: https://cnodejs.org/topic/56b807c626d02fc6626bb4ec
Node.js 编写爬虫的基本思路及抓取百度图片的实现: https://juejin.im/entry/56e7ad03f3609a0054398f1f

An NPM installer for PhantomJS, headless webkit with JS API:
https://github.com/Medium/phantomjs

The text was updated successfully, but these errors were encountered:

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Node.js #13

Node.js #13

itboos commented Mar 24, 2018 •

edited

Loading

Node.js #13

Node.js #13

Comments

itboos commented Mar 24, 2018 • edited Loading

一个简单的简书爬虫

主要技术栈:

目标

项目初始化

写一个获取数据的接口

itboos commented Mar 24, 2018 •

edited

Loading