node+jsdom爬虫入门

2019-12-22

前言

本文分享如何使用 node+jsdom 爬取网络小说《凌霄之上》的内容。

关于jsdom

在开始之前，我们先来了解一下 jsdom，官方介绍如下：
jsdom 是一系列 Web 标准（特别是 WHATWG 组织制定的 DOM 和 HTML 标准）的纯 JavaScript 实现，用于在 Node.js 中使用。大体上来说，该项目的目标是模拟足够多的 Web 浏览器子集，以便用于测试和抓取真实世界的 Web 应用程序。
总的来说，就是通过jsdom，我们可以在node中使用js操作dom。

用到的 API——fromURL()

只需要知道传进去一个url地址，会虚拟出一个浏览器，返回一个可供操作的dom，然后我们可以使用js或者jq来操作它就可以了。

下面正式开始爬虫之旅

第一步先分析我们要爬取的页面

url：https://www.12zw.com/6/6942/
我们来分析一下，要爬取的文章页面，如下图

我们想要获取文章标题（在id为info的标签下的h1标签里）以及每一章的章节名称和链接地址（在id为list的标签下的a标签中），接着我们使用jsdom来解析这个地址，新建1getList.js
使用命令安装jsdom

npm install jsdom jquery

文章代码如下

/* 
* 通过文章的地址 'https://www.12zw.com/6/6942/'，获取章节列表
*/

const jsdom = require('jsdom');
const { JSDOM } = jsdom;

let book = {};

//获取章节列表
async function getChaptersList(link) {
    //jsdom模拟出虚拟浏览器解析 html
    const dom = await JSDOM.fromURL(link);
    const document = dom.window;
    const $ = require('jquery')(document);

    const urls = $('#list a');
    book.title = $('#main-info h1').text();
    book.chapters = [];

    for(let i = 0; i< urls.length; i++){
        let url = urls[i];
        let _url = $(url).attr('href')+"";
        let title = $(url).text();
        book.chapters.push({
          title: title,
          url: _url
        })
    }

    console.log(book);
}

// Start the crawl; report a failed download/parse instead of leaving the rejection unhandled.
getChaptersList('https://www.12zw.com/6/6942/').catch((err) => {
    console.error('error', err);
    process.exitCode = 1;
});

运行node 1getList.js,可以看到控制台打印输出如下

通过第一步获取的章节列表信息，来具体获取某一章节的内容

新建2getContent.js文件，我们来尝试获取第一章节的内容

/* 
*通过第一步获取的章节列表信息，来具体获取某一章节的内容
*/

const jsdom = require('jsdom');
const { JSDOM } = jsdom;


// 获取章节内容
async function getCharacter(chapter) {
    let url = 'https://www.12zw.com/6/6942/'+chapter.url;

    const dom = await JSDOM.fromURL(url);
    const document = dom.window;
    const $ = require('jquery')(document);

    let content = $('#content').html();
    console.log(chapter.title,content)
}

// Sample entry in the { title, url } shape produced by the chapter-list script.
const chapter = { "title":"第一章 我请天眼射天狼","url":"4966237.html" };

// Report a failed download instead of leaving the promise rejection unhandled.
getCharacter(chapter).catch((err) => {
    console.error('error', err);
    process.exitCode = 1;
});

运行node 2getContent.js,查看控制台如图

成功获取到了！

接着把上两步合在一起，获取所有的章节内容

新建3getAllContent.js

 /* 
*根据第一步获取的章节列表来获取所有章节内容
*/

const jsdom = require('jsdom');
const { JSDOM } = jsdom;

// Shared crawl state, filled in by getChaptersList() and getChapters().
const book = {};

/**
 * Pause helper used to space out requests, so the crawler is not blocked
 * for hitting the site too often in a short time.
 * Declared with `const` so it no longer relies on creating an implicit global.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves after the delay.
 */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

//获取章节列表
async function getChaptersList(link) {
    const dom = await JSDOM.fromURL(link);
    const document = dom.window;
    const $ = require('jquery')(document);

    const urls = $('#list a');
    book.title = $('#maininfo h1').text();
    book.chapters = [];

    for(let i = 0; i< urls.length; i++){
        let url = urls[i];
        let _url = $(url).attr('href')+"";
        let title = $(url).text();
        book.chapters.push({
          title: title,
          url: _url
        })
    }
    
    await getChapters(book.chapters)
}

// 获取章节内容
async function getChapters(chapters) {
    
    for (let i = 0; i < chapters.length; i++) {
        try {
            let url = 'https://www.12zw.com/6/6942/'+chapters[i].url;
            const dom = await JSDOM.fromURL(url);
            const document = dom.window;
            const $ = require('jquery')(document);

            let content = $('#content').html();
            book.chapterInfo = [];

            book.chapterInfo.push({
                title: chapters[i].title,
                content: content
            })

            console.log(book.chapterInfo);
            
        } catch (e) {
            console.log('error', e);
        }
        await sleep(2000);
    }
}

// Start the crawl; report a failed chapter-list download instead of leaving the rejection unhandled.
getChaptersList('https://www.12zw.com/6/6942/').catch((err) => {
    console.error('error', err);
    process.exitCode = 1;
});

这里为了避免频繁爬虫被封ip，所以我们每爬取一章节的内容，就休眠一次,运行，查看控制台

每隔2s会爬取一章节内容

是时候引入cli-progress了

一直在控制台打印也不是个办法，我们无法获知具体的进度信息和执行情况。别急，我们来引入 cli-progress：先用 npm install cli-progress 安装一下，接着新建 4useProgress.js 文件

/* 
*为了获取爬取进度，使用cli-progress来进行监控 
*/

const jsdom = require('jsdom');
const { JSDOM } = jsdom;

const Progress = require('cli-progress');

// Shared crawl state: `chapters` is filled by getChaptersList(), `chapterInfo` by getChapters().
const book = {};
book.chapterInfo = [];

// Entry point: fetch the chapter list, then download the chapters one by one
// while a cli-progress bar shows how far the crawl has got.
(async function() {
    console.log('正在获取章节列表...');
    await getChaptersList('https://www.12zw.com/6/6942/');
    console.log(`共获取到${book.chapters.length}个章节`);

    // Progress bar: one tick per chapter attempted.
    const progress = new Progress.Bar({
        format: '进度 [{bar}] {percentage}% | {value}/{total} | 预计还需: {eta_formatted}',
    });
    progress.start(book.chapters.length, 0);

    // Chapters whose download threw. The summary line reports how many there were;
    // previously `fails` was never declared, so that line raised a ReferenceError.
    const fails = [];

    for (let i = 0; i < book.chapters.length; i++) {
        try {
            await getChapters(book.chapters[i]);
        } catch (e) {
            fails.push(book.chapters[i]);
            console.log('error', e);
        }
        progress.update(i + 1);
        // Pause between requests; after the last chapter there is nothing left to wait for.
        if (i < book.chapters.length - 1) {
            await sleep(2000);
        }
    }
    progress.stop();
    console.log(`导入完成，共获取${book.chapters.length}章节，失败${fails.length}章。`);
})().catch((err) => {
    // Reached when the chapter list itself cannot be fetched.
    console.error('error', err);
    process.exitCode = 1;
});

/**
 * Pause helper used between chapter requests.
 * Written as a function declaration: it is hoisted, so it is already defined
 * when the start-up code earlier in the script runs, and it no longer relies
 * on creating an implicit global.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves after the delay.
 */
function sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
}

//获取章节列表
async function getChaptersList(link) {

    const dom = await JSDOM.fromURL(link);
    const document = dom.window;
    const $ = require('jquery')(document);

    const urls = $('#list a');
    book.title = $('#maininfo h1').text();
    book.chapters = [];

    for(let i = 0; i< urls.length; i++){
        let url = urls[i];
        let _url = $(url).attr('href')+"";
        let title = $(url).text();
        book.chapters.push({
          title: title,
          url: _url
        })
    }
}

// 获取章节内容
async function getChapters(chapter) {
    
    let url = 'https://www.12zw.com/6/6942/'+chapter.url;
    const dom = await JSDOM.fromURL(url);
    const document = dom.window;
    const $ = require('jquery')(document);

    let content = $('#content').html();

    book.chapterInfo.push({
        title: chapter.title,
        content: content
    })

    console.log('=========获取'+chapter.title);
}

运行，查看控制台

我们已经可以成功进行进度监控了

最后，写入文件，便于阅读

新建5write.js

/* 
将获取到的章节内容写入文件
*/

const jsdom = require('jsdom');
const { JSDOM } = jsdom;
const Progress = require('cli-progress');
const fs = require('fs');

// Shared crawl state, filled in by getChaptersList() and getChapters().
const book = {};


// Entry point: fetch the chapter list, then download and save the chapters one by one
// while a cli-progress bar shows how far the crawl has got.
(async function() {
    console.log('正在获取章节列表...');
    await getChaptersList('https://www.12zw.com/6/6942/');
    console.log(`共获取到${book.chapters.length}个章节`);

    // Progress bar: one tick per chapter attempted.
    const progress = new Progress.Bar({
        format: '进度 [{bar}] {percentage}% | {value}/{total} | 预计还需: {eta_formatted}',
    });
    progress.start(book.chapters.length, 0);

    for (let i = 0; i < book.chapters.length; i++) {
        try {
            await getChapters(book.chapters[i]);
        } catch (e) {
            // A failed chapter is logged and skipped so the rest can still be fetched.
            console.log('error', e);
        }
        progress.update(i + 1);
        // Pause between requests; after the last chapter there is nothing left to wait for.
        if (i < book.chapters.length - 1) {
            await sleep(2000);
        }
    }
    progress.stop();
    console.log(`导入完成，共获取${book.chapters.length}章节`);
})().catch((err) => {
    // Reached when the chapter list itself cannot be fetched.
    console.error('error', err);
    process.exitCode = 1;
});

//获取章节列表
async function getChaptersList(link) {
    const dom = await JSDOM.fromURL(link);
    const document = dom.window;
    const $ = require('jquery')(document);

    const urls = $('#list a');
    book.title = $('#maininfo h1').text();
    book.chapters = [];

    for(let i = 0; i< urls.length; i++){
        let url = urls[i];
        let _url = $(url).attr('href')+"";
        let title = $(url).text();
        book.chapters.push({
          title: title,
          url: _url
        })
    }

}

/**
 * Pause helper used between chapter requests.
 * Written as a function declaration: it is hoisted, so it is already defined
 * when the start-up code earlier in the script runs, and it no longer relies
 * on creating an implicit global.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves after the delay.
 */
function sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
}

//获取具体章节内容
async function getChapters(chapter) {
    let url = 'https://www.12zw.com/6/6942/'+chapter.url;
    const dom = await JSDOM.fromURL(url);
    const document = dom.window;
    const $ = require('jquery')(document);
    let content = $('#content').html();
    book.chapterInfo = [];
    book.chapterInfo.push({
        title: chapter.title,
        content: content
    })
    write_chapter(chapter.title,content)
}

/**
 * Save one chapter as `dist/book/<title>.html`, creating the directory if needed.
 *
 * @param {string} chapter - Chapter title; used (after sanitising) as the file name.
 * @param {string} content - HTML to write into the file.
 * @returns {Promise<void>} Resolves once the file is written; rejects if the
 *   directory cannot be created or the file cannot be written.
 */
async function write_chapter(chapter, content) {
    const dir = 'dist/book/';
    // Titles come from the scraped page: replace path separators and the characters
    // Windows forbids in file names so a title can never escape or break the path.
    const fileName = String(chapter).replace(/[\\/:*?"<>|]/g, '_') + '.html';

    // writeFile does not create missing directories, so make sure the target exists.
    await fs.promises.mkdir(dir, { recursive: true });
    await fs.promises.writeFile(dir + fileName, content);
    console.log(`=======${chapter}saved!`);
}

这里我们使用了 fs 模块的 writeFile 方法，把每一章分别写入一个 html 文件中，运行结果如图

成功get！