1
0
Code Issues Pull Requests Projects Releases Wiki Activity GitHub Gitee

抽离出公共方法;添加B站热搜爬取代码

This commit is contained in:
程序员小墨 2022-07-28 22:35:07 +08:00
parent a76c39b030
commit cf28f475e3
5 changed files with 184 additions and 55 deletions

View File

@ -227,7 +227,7 @@
function getData() { function getData() {
var xhr = new XMLHttpRequest(); var xhr = new XMLHttpRequest();
xhr.open("GET", "../data/latest.json?t=" + Date.now(), true); xhr.open("GET", "../data/weibo/latest.json?t=" + Date.now(), true);
xhr.send(); xhr.send();
xhr.onreadystatechange = function () { xhr.onreadystatechange = function () {
if (xhr.readyState !== 4) return; if (xhr.readyState !== 4) return;

View File

@ -37,6 +37,7 @@ if (DEBUG_MODE) {
* 引入模块 * 引入模块
*/ */
const get_weibo_hotband = require('./src/get_weibo_hotband'); const get_weibo_hotband = require('./src/get_weibo_hotband');
const get_bilibili_hotband = require('./src/get_bilibili_hotband');
const execute_command = require('./src/execute_command'); const execute_command = require('./src/execute_command');
@ -52,6 +53,7 @@ console.log("Start running ...");
async function start() { async function start() {
// 爬取热搜数据 // 爬取热搜数据
await get_weibo_hotband.main(); await get_weibo_hotband.main();
await get_bilibili_hotband.main();
// 调试模式下 // 调试模式下
if (DEBUG_MODE) { if (DEBUG_MODE) {

114
src/get_bilibili_hotband.js Normal file
View File

@ -0,0 +1,114 @@
'use strict';
const request = require('request');
const fs = require('fs');
const path = require('path');
const utils = require('./utils/utils');
// Output directory for this scraper:
// <parent of this file's directory>/<DATA_FOLDER env var, or "data" when unset>/bilibili
const PROJECT_ROOT = path.dirname(__dirname);
const DATA_ROOT_NAME = process.env.DATA_FOLDER ?? 'data';
const DATA_FOLDER = path.join(PROJECT_ROOT, DATA_ROOT_NAME, 'bilibili');
console.log("DATA_FOLDER", DATA_FOLDER);
// Make sure the data directory exists as soon as this module is loaded.
utils.createFolder(DATA_FOLDER);
/**
 * Send a GET request to a JSON API (the bilibili trending ranking in this
 * module) and resolve with the response body.
 *
 * The returned promise never rejects: on a transport error or a non-200
 * status it logs the reason and resolves with the string "error", so callers
 * must inspect the resolved value (for example its `code` field) before use.
 *
 * @param {string} url - Address of the API endpoint to query.
 * @returns {Promise<*>} The body handed to the `request` callback on HTTP 200
 *     (requested with `json: true`), otherwise the string "error".
 */
async function getApiResult(url) {
    return new Promise((resolve) => {
        request({ method: 'GET', url, json: true }, (error, response, result) => {
            if (!error && response?.statusCode === 200) {
                // Request succeeded.
                resolve(result);
                return;
            }
            // Request failed. `error` is null when the server merely answered
            // with a non-200 status, so the status code is logged as well.
            console.log(`error is ${error}, statusCode is ${response?.statusCode}`);
            resolve("error");
        });
    });
}
/**
 * Fetch the bilibili trending-search ranking (retrying once on failure) and
 * persist the snapshots under DATA_FOLDER.
 *
 * On success it writes, through utils.saveJSON, an `origin` snapshot (the
 * whole response with `data.trackid` removed) and a `final` snapshot (the
 * ranking list), then overwrites `latest.json`. When both attempts fail, the
 * last response is stored as `origin-error` and nothing else is written.
 *
 * A response counts as successful only when its `code` is 0 and `data.list`
 * is an array; the "error" sentinel returned by getApiResult therefore counts
 * as a failure.
 *
 * @returns {Promise<void>}
 */
async function main() {
    const API_URL = "https://app.bilibili.com/x/v2/search/trending/ranking";
    const EIGHT_HOURS_MS = 8 * 60 * 60 * 1000;
    // toISOString() prints UTC, so shifting the instant by 8 hours yields UTC+8 wall-clock text.
    const logTime = () => new Date(Date.now() + EIGHT_HOURS_MS).toISOString();
    const isUsable = (res) => res?.code === 0 && Array.isArray(res.data?.list);

    const requestTimestamp = Date.now();
    const now = new Date(requestTimestamp + EIGHT_HOURS_MS).toISOString();

    let result = await getApiResult(API_URL);
    if (!isUsable(result)) {
        console.log(logTime(), "bilibili", "请求成功,但服务器处理失败,正在重试。");
        result = await getApiResult(API_URL);
        // The retry must be judged by the same `code` criterion as the first
        // attempt; this payload has no `ok` field, so testing `ok` here would
        // reject every retry, including successful ones.
        if (!isUsable(result)) {
            console.log(logTime(), "bilibili", "请求成功,但服务器处理失败,保存失败信息。");
            // Still failing: store the raw response for later analysis and stop here.
            utils.saveJSON({
                saveFolder: DATA_FOLDER,
                now: now,
                fileNameSuffix: `origin-error`,
                object: result,
                compress: true,
                uncompress: false
            });
            return;
        }
    }
    console.log(logTime(), "bilibili", "请求成功");

    const data = result.data;
    // Remove trackid before anything is saved. `data` is the same object as
    // `result.data`, so the `origin` snapshot below is stored without it too.
    // NOTE(review): presumably a per-request value that would make every snapshot differ — confirm.
    delete data["trackid"];

    /**
     * Save the original response.
     */
    utils.saveJSON({
        saveFolder: DATA_FOLDER,
        now: now,
        fileNameSuffix: `origin`,
        object: result,
        compress: true,
        uncompress: false
    });

    /**
     * Extract the data we need. Items are currently kept unchanged; each one looks like:
     * {
     *     "position": 1,
     *     "keyword": "keyword",
     *     "show_name": "display name of the trending entry",
     *     "word_type": 8,
     *     "icon": "icon of the trending entry, may be absent",
     *     "hot_id": 7399 // id of the trending entry
     * }
     */
    const convert = [...data.list];
    utils.saveJSON({
        saveFolder: DATA_FOLDER,
        now: now,
        fileNameSuffix: `final`,
        object: convert,
        compress: true,
        uncompress: false,
    });

    /**
     * Refresh the "latest" file.
     */
    fs.writeFileSync(`${DATA_FOLDER}/latest.json`, JSON.stringify({
        update_time: requestTimestamp,
        update_time_friendly: now.substring(0, 19).replace(/T/g, " "),
        data: data
    }));
}
exports.main = main;

View File

@ -4,11 +4,11 @@ const request = require('request');
const fs = require('fs'); const fs = require('fs');
const path = require('path'); const path = require('path');
const DATA_FOLDER = path.join(path.dirname(__dirname), process.env.DATA_FOLDER ?? 'data'); const utils = require('./utils/utils');
console.log("DATA_FOLDER", DATA_FOLDER);
createFolder(DATA_FOLDER); // 程序运行就保证 data 目录存在
const LATEST_DATA_ONLY = process.env.LATEST_DATA_ONLY == true; const DATA_FOLDER = path.join(path.dirname(__dirname), process.env.DATA_FOLDER ?? 'data', 'weibo');
console.log("DATA_FOLDER", DATA_FOLDER);
utils.createFolder(DATA_FOLDER); // 程序运行就保证 data 目录存在
// 请求微博热搜 APi 接口 // 请求微博热搜 APi 接口
async function getApiResult(url) { async function getApiResult(url) {
@ -32,59 +32,19 @@ async function getApiResult(url) {
return return_data; return return_data;
} }
/**
 * Ensure that a directory exists, creating any missing parent directories.
 *
 * The work is done synchronously before the returned promise is produced, so
 * callers that do not await the result can still rely on the directory being
 * present once the call returns without error.
 *
 * @param {string} folderToCreate - Directory path; backslashes are treated as separators.
 * @returns {Promise<void>} Resolves once the directory exists; rejects if it cannot be created.
 */
async function createFolder(folderToCreate) {
    const targetFolder = folderToCreate.replace(/\\/g, '/');
    if (fs.existsSync(targetFolder)) {
        // Already present: nothing to do.
        return;
    }
    // Let Node create the whole chain. Walking up the path by hand never
    // terminates for a relative path with no existing ancestor, because the
    // parent of "name" is computed as "".
    fs.mkdirSync(targetFolder, { recursive: true });
}
/**
 * Write `object` as JSON into a date-partitioned file under DATA_FOLDER.
 *
 * Layout: <DATA_FOLDER>/<fileNameSuffix>/<YYYY>/<MM>/<DD>/<YYYYMMDD>_<HHmm>.min.json
 * (compact) and/or the same stem with `.json` (tab-indented). Does nothing
 * when LATEST_DATA_ONLY is set.
 *
 * @param {object} options
 * @param {string} options.now - Timestamp text such as '2022-07-23T10:11:38.650Z'; fields are cut out by position.
 * @param {string} options.fileNameSuffix - Name of the sub-folder for this kind of snapshot.
 * @param {*} options.object - Value to serialize.
 * @param {boolean} [options.compress=true] - Write the compact `.min.json` file.
 * @param {boolean} [options.uncompress=true] - Write the tab-indented `.json` file.
 */
function saveJSON({ now, fileNameSuffix, object, compress = true, uncompress = true }) {
    if (LATEST_DATA_ONLY) {
        return;
    }

    // '2022-07-23T10:11:38.650Z' => year 2022, month 07, day 23, hour 10, minute 11
    const [year, month, day, hour, minute] = [[0, 4], [5, 7], [8, 10], [11, 13], [14, 16]]
        .map(([from, to]) => now.substring(from, to));

    // Make sure the folder for this day exists.
    const folder = `${DATA_FOLDER}/${fileNameSuffix}/${year}/${month}/${day}`;
    createFolder(folder);

    // File stem, e.g. '20220723_1011'.
    const fileName = `${folder}/${year}${month}${day}_${hour}${minute}`;

    if (compress) {
        fs.writeFileSync(`${fileName}.min.json`, JSON.stringify(object));
    }
    if (uncompress) {
        fs.writeFileSync(`${fileName}.json`, JSON.stringify(object, "", "\t"));
    }
}
async function main() { async function main() {
let requestTimestamp = Date.now(); let requestTimestamp = Date.now();
let now = new Date(requestTimestamp + 8 * 3600 * 1000).toISOString(); let now = new Date(requestTimestamp + 8 * 3600 * 1000).toISOString();
let result = await getApiResult("https://weibo.com/ajax/statuses/hot_band"); let result = await getApiResult("https://weibo.com/ajax/statuses/hot_band");
if (result.ok != 1) { if (result.ok != 1) {
console.log(new Date(Date.now() + 8 * 60 * 60 * 1000).toISOString(), "请求成功,但服务器处理失败,正在重试。"); console.log(new Date(Date.now() + 8 * 60 * 60 * 1000).toISOString(), "weibo", "请求成功,但服务器处理失败,正在重试。");
result = await getApiResult("https://weibo.com/ajax/statuses/hot_band"); result = await getApiResult("https://weibo.com/ajax/statuses/hot_band");
if (result.ok != 1) { if (result.ok != 1) {
console.log(new Date(Date.now() + 8 * 60 * 60 * 1000).toISOString(), "请求成功,但服务器处理失败,保存失败信息。"); console.log(new Date(Date.now() + 8 * 60 * 60 * 1000).toISOString(), "weibo", "请求成功,但服务器处理失败,保存失败信息。");
// ok 不为 1那么久直接保存便于后续分析不进行后续处理 // ok 不为 1那么久直接保存便于后续分析不进行后续处理
saveJSON({ utils.saveJSON({
saveFolder: DATA_FOLDER,
now: now, now: now,
fileNameSuffix: `origin-error`, fileNameSuffix: `origin-error`,
object: result, object: result,
@ -95,13 +55,14 @@ async function main() {
} }
} }
console.log(new Date(Date.now() + 8 * 60 * 60 * 1000).toISOString(), "请求成功"); console.log(new Date(Date.now() + 8 * 60 * 60 * 1000).toISOString(), "weibo", "请求成功");
// console.log("result", result); // console.log("result", result);
/** /**
* 保存原始数据 * 保存原始数据
*/ */
saveJSON({ utils.saveJSON({
saveFolder: DATA_FOLDER,
now: now, now: now,
fileNameSuffix: `origin`, fileNameSuffix: `origin`,
object: result, object: result,
@ -198,7 +159,8 @@ async function main() {
}, },
}); });
}); });
saveJSON({ utils.saveJSON({
saveFolder: DATA_FOLDER,
now: now, now: now,
fileNameSuffix: `final`, fileNameSuffix: `final`,
object: convert, object: convert,
@ -222,7 +184,8 @@ async function main() {
`原始:${item.raw_hot} 显示:${item.num} 调控: ${item.num - item.raw_hot}` `原始:${item.raw_hot} 显示:${item.num} 调控: ${item.num - item.raw_hot}`
]); ]);
}); });
saveJSON({ utils.saveJSON({
saveFolder: DATA_FOLDER,
now: now, now: now,
fileNameSuffix: `regulation`, fileNameSuffix: `regulation`,
object: { object: {
@ -242,7 +205,8 @@ async function main() {
data.band_list.forEach(function (item) { data.band_list.forEach(function (item) {
delete item["mblog"]; delete item["mblog"];
}); });
saveJSON({ utils.saveJSON({
saveFolder: DATA_FOLDER,
now: now, now: now,
fileNameSuffix: `simplify`, fileNameSuffix: `simplify`,
object: data, object: data,

49
src/utils/utils.js Normal file
View File

@ -0,0 +1,49 @@
const fs = require('fs');
/**
 * Interpret an environment-variable value as an on/off flag.
 *
 * Environment variables are strings, so comparing one to the boolean `true`
 * with `==` only matches text that converts to the number 1; the natural
 * spelling "true" was silently treated as off. This accepts every value the
 * old comparison accepted, plus "true" in any letter case.
 *
 * @param {string|undefined} value - Raw value read from process.env.
 * @returns {boolean} true when the flag is switched on.
 */
function parseBooleanEnv(value) {
    if (typeof value !== 'string') {
        return false;
    }
    const text = value.trim();
    return Number(text) === 1 || text.toLowerCase() === 'true';
}
// When switched on, saveJSON returns without writing the per-timestamp files.
const LATEST_DATA_ONLY = parseBooleanEnv(process.env.LATEST_DATA_ONLY);
/**
 * Ensure that a directory exists, creating any missing parent directories.
 *
 * The work is done synchronously before the returned promise is produced, so
 * callers that do not await the result can still rely on the directory being
 * present once the call returns without error.
 *
 * @param {string} folderToCreate - Directory path; backslashes are treated as separators.
 * @returns {Promise<void>} Resolves once the directory exists; rejects if it cannot be created.
 */
async function createFolder(folderToCreate) {
    const targetFolder = folderToCreate.replace(/\\/g, '/');
    if (fs.existsSync(targetFolder)) {
        // Already present: nothing to do.
        return;
    }
    // Let Node create the whole chain. Walking up the path by hand never
    // terminates for a relative path with no existing ancestor, because the
    // parent of "name" is computed as "".
    fs.mkdirSync(targetFolder, { recursive: true });
}
/**
 * Write `object` as JSON into a date-partitioned file under `saveFolder`.
 *
 * Layout: <saveFolder>/<fileNameSuffix>/<YYYY>/<MM>/<DD>/<YYYYMMDD>_<HHmm>.min.json
 * (compact) and/or the same stem with `.json` (tab-indented). Does nothing
 * when LATEST_DATA_ONLY is set.
 *
 * @param {object} options
 * @param {string} options.saveFolder - Root folder for the snapshots.
 * @param {string} options.now - Timestamp text such as '2022-07-23T10:11:38.650Z'; fields are cut out by position.
 * @param {string} options.fileNameSuffix - Name of the sub-folder for this kind of snapshot.
 * @param {*} options.object - Value to serialize.
 * @param {boolean} [options.compress=true] - Write the compact `.min.json` file.
 * @param {boolean} [options.uncompress=true] - Write the tab-indented `.json` file.
 */
function saveJSON({ saveFolder, now, fileNameSuffix, object, compress = true, uncompress = true }) {
    if (LATEST_DATA_ONLY) {
        return;
    }

    // '2022-07-23T10:11:38.650Z' => year 2022, month 07, day 23, hour 10, minute 11
    const [year, month, day, hour, minute] = [[0, 4], [5, 7], [8, 10], [11, 13], [14, 16]]
        .map(([from, to]) => now.substring(from, to));

    // Make sure the folder for this day exists.
    const folder = `${saveFolder}/${fileNameSuffix}/${year}/${month}/${day}`;
    createFolder(folder);

    // File stem, e.g. '20220723_1011'.
    const fileName = `${folder}/${year}${month}${day}_${hour}${minute}`;

    if (compress) {
        fs.writeFileSync(`${fileName}.min.json`, JSON.stringify(object));
    }
    if (uncompress) {
        fs.writeFileSync(`${fileName}.json`, JSON.stringify(object, "", "\t"));
    }
}
// Public API of this utility module.
exports.createFolder = createFolder;
exports.saveJSON = saveJSON;