编码随笔

50行代码搞定百度贴吧小说抓取

//直接上代码
var prefix = require('superagent-prefix')('/static');
var request = require('superagent-charset');
var cheerio = require('cheerio');
var fs = require('fs');

var startUrl = 'http://tieba.baidu.com/p/3875062243';// starting URL of the crawl
var fileName = 'E:\\novel.txt';// path of the output file the scraped text is appended to
/**
 * Scrapes one chapter page of a Baidu Tieba novel thread, appends it to
 * `fileName`, and then follows the page's "next chapter" button. Given a
 * single starting URL this walks the chain of chapters until no next-chapter
 * button data is found.
 *
 * The chapter title is read from `.novel-post-title h3` and the body from
 * `.novel-post-content`; `<br>` tags in the body are turned into CRLF line
 * breaks before writing.
 *
 * @param {string} url - Address of the chapter page to fetch.
 */
function doUrl(url){
    request.get(url)
        .use(prefix) // Prefixes *only* this request
        .charset('utf-8')
        .end(function(err, res){
            if(err){
                // No usable response: log and stop instead of dereferencing `res`.
                console.log(err);
                return;
            }
            var $ = cheerio.load(res.text,{decodeEntities: false});

            // .html() yields null when the selector matches nothing, so fall
            // back to an empty string rather than writing "null" or throwing.
            var title = $('.novel-post-title h3').html();
            var content = $('.novel-post-content').html();
            if(title == null){
                title = '';
            }
            if(content == null){
                content = '';
            }

            // NOTE(review): the pattern in the original listing was lost (a raw
            // line break sat inside the string literal, which does not parse);
            // <br> tags are the presumed intended target — confirm.
            var body = content.replace(/<br\s*\/?>/gi,'\r\n');

            // Single write so the title always precedes its chapter body.
            appendToFile(fileName,title + '\r\n' + body);

            var nextBtn = $('.novel-next-chapter-btn');
            var pid = nextBtn.attr('data-pid');
            var forumId = nextBtn.attr('data-forumid');
            if(pid && forumId){
                var nextUrl = 'http://tieba.baidu.com/novel/getNextChapterThread?forum_id='+forumId+'&thread_id='+pid;
                getNextUrl(nextUrl);
            }else{
                console.log('end');
            }
        });
}
/**
 * Asks the Tieba "next chapter" endpoint which thread follows the current one
 * and hands that thread's page URL to `doUrl`.
 *
 * The response body is parsed as JSON and the thread id is read from
 * `data.thread_id`.
 *
 * @param {string} url - Fully built getNextChapterThread request URL.
 */
function getNextUrl(url){
    request.get(url)
        .use(prefix) // Prefixes *only* this request
        .charset('utf-8')
        .end(function(err, res){
            if(err){
                // No usable response: log and stop instead of dereferencing `res`.
                console.log(err);
                return;
            }

            var payload;
            try{
                payload = JSON.parse(res.text);
            }catch(parseErr){
                // A non-JSON body (e.g. an error page) must not crash the crawl.
                console.log(parseErr);
                return;
            }

            var threadId = payload && payload.data && payload.data.thread_id;
            if(!threadId){
                // Nothing to follow; avoid requesting ".../p/undefined".
                console.log('end');
                return;
            }

            var nextUrl = 'http://tieba.baidu.com/p/' + threadId;
            console.log(nextUrl);
            doUrl(nextUrl);
        });
}
/**
 * Appends `line` to the file at `fileName`.
 *
 * The write is synchronous so that consecutive calls reach the file in call
 * order; independent asynchronous appends give no such guarantee.
 *
 * @param {string} fileName - Path of the file to append to.
 * @param {string} line - Text to append.
 * @throws {Error} When the underlying file-system write fails.
 */
function appendToFile(fileName,line){
    fs.appendFileSync(fileName,line);
}

doUrl(startUrl);// program entry point: start crawling from startUrl

PS:纯技术交流,请勿用于商业目的

by will,2015

签名: