Node.js 使用 axios 抓取 GB2312 编码的网页时中文乱码

解决方法：让 axios 以流（stream）的方式返回原始字节，再使用 iconv-lite 按 gb2312 解码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
const fs = require('fs')
const cheerio = require('cheerio')
const axios = require('axios')
const iconv = require('iconv-lite')

/**
 * Download one listing page and decode it from GB2312 into a JavaScript string.
 *
 * The body is requested as a raw stream and the collected bytes are decoded
 * with iconv-lite, so the GB2312 bytes are never interpreted as another
 * encoding before decoding.
 *
 * @param {number} [i=0] Zero-based page index; the page requested is `i + 1`.
 * @returns {Promise<string>} The decoded HTML of the page.
 * @throws Rejects if the request fails, the response stream errors, or
 *   decoding throws.
 */
async function getHtml(i = 0) {
  const url = `https://www.2345.com/jzw/${i + 1}.htm`
  const res = await axios({
    url,
    responseType: 'stream'
  })
  return new Promise((resolve, reject) => {
    const chunks = []
    res.data.on('data', chunk => {
      chunks.push(chunk)
    })
    res.data.on('end', () => {
      try {
        const buffer = Buffer.concat(chunks)
        resolve(iconv.decode(buffer, 'gb2312'))
      } catch (err) {
        // Surface decode failures to the caller instead of throwing inside
        // the event handler, where they could not be caught by `await`.
        reject(err)
      }
    })
    // Without an 'error' listener a connection dropped mid-body would leave
    // this promise pending forever (or crash on an unhandled 'error' event).
    res.data.on('error', reject)
  })
}

// Accumulates the extracted question/answer text across all pages.
let str = ''

/**
 * Crawl pages 0..72 sequentially, extract a question/answer pair from every
 * `#J_listTable li` item, append them to the module-level `str`, and finally
 * write the accumulated text to `data.txt`.
 *
 * The question is the text of `.table_left`; the answer is the first
 * single-quoted argument found in the `onclick` attribute of
 * `.table_right a`. Items without a usable `onclick` are skipped with a
 * warning rather than aborting the whole crawl.
 *
 * @returns {Promise<void>} Resolves once `data.txt` has been written.
 * @throws Rejects if a page cannot be fetched or the file cannot be written.
 */
async function fetchData() {
  // Captures the text between the first and last single quote, provided it
  // contains no '.' character.
  const answerPattern = /\'([^.]*)\'/
  for (let i = 0; i < 73; i++) {
    console.log(`当前是第${i}页`)
    const html = await getHtml(i)
    const $ = cheerio.load(html, { decodeEntities: false })
    const $list = $('#J_listTable li')
    $list.each((index, item) => {
      const $item = $(item)
      const question = $item.find('.table_left').text()
      const onclick = $item.find('.table_right a').attr('onclick')
      // `attr` returns undefined when the anchor or attribute is missing and
      // `match` returns null when the pattern is absent; either case used to
      // throw a TypeError and discard everything collected so far.
      const matched =
        typeof onclick === 'string' ? onclick.match(answerPattern) : null
      if (!matched) {
        console.warn(`skipping item ${index} on page index ${i}: no answer found`)
        return
      }
      str += `${question}\n${matched[1]}\n\n\n`
    })
  }
  fs.writeFileSync('data.txt', str)
}

// Entry point: run the crawl and report any failure instead of leaving the
// returned promise floating with no rejection handler.
fetchData().catch(err => {
  console.error(err)
  process.exitCode = 1
})