2022-08-16 14:15:24 +08:00
|
|
|
|
const request = require('request');
|
|
|
|
|
const fs = require('fs');
|
2022-08-16 14:47:29 +08:00
|
|
|
|
const wkhtmltopdf = require('wkhtmltopdf');
|
2022-08-16 14:15:24 +08:00
|
|
|
|
|
|
|
|
|
/**
 * Issue an HTTP GET request and resolve with the raw response body.
 *
 * The returned promise never rejects: on a transport error, or on any status
 * code other than 200, the failure is logged and the promise resolves with
 * the sentinel string "error". Callers must check for that sentinel before
 * trying to parse the result.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<*>} The body passed to the `request` callback, or "error".
 */
async function getApiResult(url) {
    const body = await new Promise((resolve) => {
        request({ method: 'GET', url }, (error, response, result) => {
            if (!error && response.statusCode === 200) {
                // Request succeeded.
                resolve(result);
                return;
            }
            // Request failed. `error` is null when the server answered with a
            // non-200 status, so append the status code to keep the log useful.
            const status = response ? ` (HTTP ${response.statusCode})` : '';
            console.log(`error is ${error}${status}`);
            resolve("error");
        });
    });
    return body;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// https://www.nowcoder.com/issue/tutorial?tutorialId=94
// Catalog: https://www.nowcoder.com/content/tutorial/catalog/94
// Article: https://www.nowcoder.com/content/tutorial/detail/94/ea1986fcff294f6292385703e94689e8
/**
 * Crawl the tutorial: fetch the catalog, then hand every section's uuid to
 * `getDetail`, one section at a time.
 *
 * The catalog response is read as `data.catalog`, an array of chapters where
 * each chapter is an array of section entries carrying a `uuid`.
 *
 * @returns {Promise<void>} Resolves once every section has been processed.
 * @throws {Error} When the catalog request fails (the "error" sentinel from
 *   `getApiResult`), or when the response is not the expected JSON.
 */
async function main() {
    const urlContent = "https://www.nowcoder.com/content/tutorial/catalog/94";
    // Log the URL being fetched; the previous log referenced a variable that
    // does not exist in this scope and threw before any work was done.
    console.log(`获取目录 ${urlContent}`);

    const result = await getApiResult(urlContent);
    if (result === "error") {
        // Fail with a clear message instead of a JSON.parse SyntaxError.
        throw new Error(`Failed to fetch catalog from ${urlContent}`);
    }

    const data = JSON.parse(result);
    const catalog = data.data.catalog;

    for (const sectionList of catalog) {
        for (const sectionInfo of sectionList) {
            // Awaited inside the loop, so sections are handled sequentially.
            await getDetail(sectionInfo.uuid);
        }
    }

    console.log("完成");
}
|
|
|
|
|
|
|
|
|
|
/**
 * Download one tutorial section, save its HTML content under ./output/html
 * and convert that file to a PDF under ./output/pdf.
 *
 * The response is read as `data.section` with the fields `chapterId`,
 * `sectionId`, `title` and `content`. Both output files are named
 * "<chapterId>.<sectionId> <title>" plus the matching extension.
 *
 * @param {string} uuid - Section identifier appended to the detail URL.
 * @returns {Promise<void>} Resolves when `transferToPDF` resolves.
 * @throws {Error} When the detail request fails (the "error" sentinel from
 *   `getApiResult`), when the response is not the expected JSON, or when
 *   writing the HTML file fails.
 */
async function getDetail(uuid) {
    const urlDetail = `https://www.nowcoder.com/content/tutorial/detail/94/${uuid}`;
    // Log the URL being fetched; the previous log referenced a variable that
    // does not exist in this scope and threw before any work was done.
    console.log(`获取文章 ${urlDetail}`);

    const result = await getApiResult(urlDetail);
    if (result === "error") {
        // Fail with a clear message instead of a JSON.parse SyntaxError.
        throw new Error(`Failed to fetch section ${uuid} from ${urlDetail}`);
    }

    const section = JSON.parse(result).data.section;

    // A path separator inside the title would be read as a directory
    // boundary and make the write fail, so swap it for an underscore.
    const safeTitle = String(section.title).replace(/[\\/]/g, "_");
    const baseName = `${section.chapterId}.${section.sectionId} ${safeTitle}`;
    const htmlFilePath = `./output/html/${baseName}.html`;
    const pdfFilePath = `./output/pdf/${baseName}.pdf`;

    // Create the output folders on first use; a no-op when they exist.
    fs.mkdirSync("./output/html", { recursive: true });
    fs.mkdirSync("./output/pdf", { recursive: true });

    fs.writeFileSync(htmlFilePath, section.content);
    await transferToPDF(htmlFilePath, pdfFilePath);
}
|
|
|
|
|
|
2022-08-16 14:47:29 +08:00
|
|
|
|
// https://wkhtmltopdf.org/
/**
 * Wrap a saved HTML fragment in a minimal UTF-8 page and render it to a PDF
 * file with wkhtmltopdf.
 *
 * The returned promise settles only after the output file has been fully
 * written, so `await transferToPDF(...)` really waits for the conversion.
 * Previously the function returned as soon as the pipe was set up.
 *
 * @param {string} htmlFilePath - File whose content becomes the page body.
 * @param {string} pdfFilePath - Destination path of the generated PDF.
 * @returns {Promise<void>} Resolves when the PDF write stream has finished.
 * @throws {Error} When the HTML file cannot be read, or when the renderer
 *   stream or the output stream emits an error.
 */
async function transferToPDF(htmlFilePath, pdfFilePath) {
    console.log(`开始转换 ${pdfFilePath}`);

    const body = fs.readFileSync(htmlFilePath, "utf8");
    const html = `
<html>
<head>
<meta charset="utf-8">
<style>
* {
/*
font-size: 30px;
font-size: large !important;
*/
}

img {
max-width: 100%;
}
</style>
</head>
<body>
${body}
</body>
</html>`;

    // Debugging aid: dump the assembled page to inspect what gets rendered.
    // fs.writeFileSync(`./output/test.html`, html);

    return new Promise((resolve, reject) => {
        const pdfStream = wkhtmltopdf(html, { pageSize: "A4", minimumFontSize: 10, disableSmartShrinking: true });
        const output = fs.createWriteStream(pdfFilePath);

        pdfStream.on("error", (error) => {
            // pipe() does not close the destination when the source fails.
            output.destroy();
            reject(error);
        });
        output.on("error", reject);
        output.on("finish", () => resolve());

        pdfStream.pipe(output);
    });
}
|
|
|
|
|
|
2022-08-17 00:32:51 +08:00
|
|
|
|
// ========================================================
|
|
|
|
|
|
|
|
|
|
/**
 * Convert every .html file found in ./output/html into a PDF with the same
 * base name under ./output/pdf, one file at a time.
 *
 * @returns {Promise<void>} Resolves after the last `transferToPDF` call.
 * @throws {Error} When ./output/html cannot be listed, or when a conversion
 *   rejects.
 */
async function transferHTMLToPDF() {
    const htmlSuffix = ".html";
    const files = fs.readdirSync("./output/html");

    // Create the destination folder on first use; a no-op when it exists.
    fs.mkdirSync("./output/pdf", { recursive: true });

    for (const fileName of files) {
        // Skip anything that is not an HTML file.
        if (!fileName.endsWith(htmlSuffix)) continue;

        // Swap only the trailing extension; String.replace would rewrite the
        // first ".html" it finds, which may sit in the middle of the name.
        const pdfFileName = `${fileName.slice(0, -htmlSuffix.length)}.pdf`;
        await transferToPDF(`./output/html/${fileName}`, `./output/pdf/${pdfFileName}`);
    }

    console.log("完成");
}
|
|
|
|
|
|
|
|
|
|
// Crawl the tutorial and convert every section to PDF.
// main() returns a promise: report a failure and signal it through the exit
// code instead of leaving the rejection unhandled.
main().catch((error) => {
    console.error(error);
    process.exitCode = 1;
});
|
2022-08-17 00:32:51 +08:00
|
|
|
|
|
|
|
|
|
// 已经爬取过,只需要转pdf
|
2022-08-17 13:42:07 +08:00
|
|
|
|
// transferHTMLToPDF();
|
2022-08-17 00:32:51 +08:00
|
|
|
|
|
|
|
|
|
// 测试调试用
|
|
|
|
|
// transferToPDF(`./output/html/8.3 操作系统(三).html`, `./output/test.pdf`)
|