1
0
Code Issues Pull Requests Projects Releases Wiki Activity GitHub Gitee
tools/index.js
2022-07-23 21:13:16 +08:00

247 lines
8.2 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

'use strict';
const request = require('request');
const fs = require('fs');
console.log("Start running ...");
/**
 * Send a GET request to the Weibo hot-search API and resolve with the parsed JSON body.
 *
 * The returned promise never rejects: on a transport error or any non-200 HTTP
 * status the reason is logged and the sentinel string "error" is resolved instead,
 * so callers must compare the result against "error".
 *
 * @param {string} url - Endpoint to request.
 * @returns {Promise<*>} The response body as delivered by `request` with `json: true`, or "error".
 */
async function getApiResult(url) {
  return new Promise((resolve) => {
    request({
      method: 'GET',
      url: url,
      json: true,
    }, (error, response, result) => {
      if (!error && response && response.statusCode === 200) {
        // Request succeeded.
        resolve(result);
        return;
      }
      // Request failed. When the transport succeeded but the server answered with a
      // non-200 status, `error` is null, so report the status code instead.
      if (error) {
        console.log(`error is ${error}`);
      } else {
        console.log(`error is HTTP status ${response ? response.statusCode : 'unknown'}`);
      }
      resolve("error");
    });
  });
}
/**
 * Create a directory together with any missing parent directories.
 *
 * Backslashes in the path are converted to forward slashes first. If the path is
 * empty or something already exists at that path, nothing is done.
 *
 * The work itself is synchronous; the function stays `async` so existing callers
 * that treat the result as a Promise keep working.
 *
 * @param {string} folderToCreate - Directory path to create.
 * @returns {Promise<void>}
 */
async function createFolder(folderToCreate) {
  const targetFolder = folderToCreate.replace(/\\/g, '/');
  // An empty path has nothing to create; an existing path needs no work.
  if (targetFolder === '' || fs.existsSync(targetFolder)) {
    return;
  }
  // `recursive: true` creates all missing ancestors and tolerates a trailing
  // slash, both of which a manual parent-by-parent walk gets wrong at the edges.
  fs.mkdirSync(targetFolder, { recursive: true });
}
/**
 * Serialize `object` to JSON file(s) under ./data/<year>/<month>/<day>/.
 *
 * The file name is built from the date parts of `now`, e.g.
 * '2022-07-23T10:11:38.650Z' with suffix 'final' gives
 * ./data/2022/07/23/20220723_1011_final(.min).json
 *
 * @param {Object} options
 * @param {string} options.now - Timestamp string laid out as "YYYY-MM-DDTHH:mm..."; it is sliced by fixed positions.
 * @param {string} options.fileNameSuffix - Suffix appended to the generated file name.
 * @param {*} options.object - Value to serialize.
 * @param {boolean} [options.compress=true] - Write the minified `<name>.min.json`.
 * @param {boolean} [options.uncompress=true] - Write the tab-indented `<name>.json`.
 */
function saveJSON({ now, fileNameSuffix, object, compress = true, uncompress = true }) {
  const year = now.substring(0, 4);
  const month = now.substring(5, 7);
  const day = now.substring(8, 10);
  const hour = now.substring(11, 13);
  const minute = now.substring(14, 16);

  // Create the day folder synchronously so that a failure surfaces right here
  // instead of as a detached promise rejection followed by an ENOENT on write.
  const folder = `./data/${year}/${month}/${day}`;
  fs.mkdirSync(folder, { recursive: true });

  const fileName = `${folder}/${year}${month}${day}_${hour}${minute}_${fileNameSuffix}`;
  if (compress) {
    fs.writeFileSync(`${fileName}.min.json`, JSON.stringify(object));
  }
  if (uncompress) {
    fs.writeFileSync(`${fileName}.json`, JSON.stringify(object, null, "\t"));
  }
}
/**
 * Fetch the Weibo hot-search list once and write the derived JSON snapshots.
 *
 * Files written through saveJSON (suffix -> content):
 *   origin     - the raw API response (minified only)
 *   final      - one flattened record per non-ad hot-search entry
 *   regulation - entries whose displayed heat (num) differs from raw_hot, plus the summed delta
 *   simplify   - the cleaned API data with ads, duplicated fields and mblog removed
 * Afterwards ./data/latest.json is overwritten with the newest `final` and
 * `regulation` data.
 *
 * Nothing is written when the request fails or the response's `ok` field is not 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const requestTimestamp = Date.now();
  // Shift by 8 hours so the digits of the ISO string read as UTC+8 wall-clock
  // time; saveJSON slices this string by position to build folder/file names.
  const now = new Date(requestTimestamp + 8 * 3600 * 1000).toISOString();
  const result = await getApiResult("https://weibo.com/ajax/statuses/hot_band");

  // getApiResult resolves the sentinel "error" when the request itself failed;
  // report that separately from a response the server flagged as not ok.
  if (result === "error") {
    console.log("请求失败。");
    return;
  }
  if (!result || Number(result.ok) !== 1) {
    console.log("请求成功,但服务器处理失败。");
    return;
  }
  console.log("请求成功。");

  /**
   * Save the raw response.
   */
  saveJSON({
    now: now,
    fileNameSuffix: `origin`,
    object: result,
    compress: true,
    uncompress: false
  });

  // Deep copy so the clean-up below never touches the raw response object.
  const data = JSON.parse(JSON.stringify(result.data));

  /**
   * Strip data that is not needed.
   */
  // hotgov is not guaranteed to be present in the response.
  if (data.hotgov) {
    delete data.hotgov["mblog"];
    // Keep only one of each group of duplicated fields.
    delete data.hotgov["note"]; // duplicates: note / word
    delete data.hotgov["small_icon_desc"]; // duplicates: icon_desc / small_icon_desc
    delete data.hotgov["small_icon_desc_color"]; // duplicates: icon_desc_color / small_icon_desc_color
  }

  // band_list: drop advertisements first, then clean every remaining entry.
  data.band_list = data.band_list.filter((item) => !item.is_ad);
  data.band_list.forEach((item) => {
    // Empty field.
    delete item["ad_info"];
    // Keep only one of each group of duplicated fields.
    delete item["note"]; // duplicates: note / word
    delete item["icon_desc"]; // duplicates: label_name / icon_desc / small_icon_desc
    delete item["small_icon_desc"];
    delete item["small_icon_desc_color"]; // duplicates: icon_desc_color / small_icon_desc_color
    // flag_desc and subject_label are equal when set; when unset the former is
    // undefined and the latter is "".
    delete item["flag_desc"];
  });

  /**
   * Pick the fields of interest and convert them.
   */
  const htmlTagPattern = /(<([^>]+)>)/ig;
  const convert = data.band_list.map((item) => {
    // Some entries have no mblog (or no text); their detail stays empty.
    let detail = "";
    if (item.mblog && typeof item.mblog.text === "string") {
      detail = item.mblog.text.replace(htmlTagPattern, "");
    }
    return {
      // Ranking position
      rank: item.rank,
      realpos: item.realpos,
      // Entry information
      word: item.word, // title
      word_scheme: item.word_scheme, // topic, "#title#"
      emoticon: item.emoticon, // small emoticon, e.g. "[泪]"
      label_name: item.label_name, // label, e.g. "爆" "热" "新" ""
      onboard_time: item.onboard_time, // time the entry went on the board, epoch seconds, e.g. 1658565575
      /**
       * Heat figures (observations recorded by the original author).
       *
       * For most entries num and raw_hot are equal. The page displays num, so a
       * difference may indicate a manually regulated entry.
       *
       * The largest observed difference is about 1250000. For one "爆" entry the
       * delta first kept growing up to 1250000, then shrank to about 1040000 once
       * the heat reached roughly 12600000.
       * The sum of all (signed) deltas is usually between 100000 and 230000.
       */
      num: item.num,
      raw_hot: item.raw_hot,
      detla: item.num - item.raw_hot, // computed; key spelling kept for compatibility with existing output
      url: `https://s.weibo.com/weibo?q=${encodeURIComponent(item.word_scheme)}`, // link to the topic
      // Classification
      category: item.category ? item.category.split(',') : "",
      subject_label: item.subject_label,
      // Miscellaneous
      more: {
        is_new: item.is_new,
        subject_querys: item.subject_querys,
        mid: item.mid,
        icon_desc_color: item.icon_desc_color,
        detail: detail,
      },
    };
  });
  saveJSON({
    now: now,
    fileNameSuffix: `final`,
    object: convert,
    compress: true,
    uncompress: true
  });

  /**
   * Collect only the regulation information (displayed heat vs. raw heat).
   */
  const convert2 = [];
  let total = 0;
  data.band_list.forEach((item) => {
    const delta = item.num - item.raw_hot;
    total += item.num;
    total -= item.raw_hot;
    if (delta === 0) return;
    convert2.push([
      `[${item.realpos}] ${item.word}${item.label_name}`,
      `原始:${item.raw_hot} 显示:${item.num} 调控: ${delta}`
    ]);
  });
  saveJSON({
    now: now,
    fileNameSuffix: `regulation`,
    object: {
      total_delta: total, // sum of all regulation deltas
      data: convert2
    },
    compress: false,
    uncompress: true
  });

  /**
   * Save the pre-processed data, without the bulky mblog payload.
   */
  data.band_list.forEach((item) => {
    delete item["mblog"];
  });
  saveJSON({
    now: now,
    fileNameSuffix: `simplify`,
    object: data,
    compress: true,
    uncompress: true
  });

  /**
   * Refresh the "latest" snapshot.
   */
  fs.writeFileSync(`./data/latest.json`, JSON.stringify({
    update_time: requestTimestamp,
    update_time_friendly: now.substring(0, 19).replace(/T/g, " "),
    regulation: convert2,
    data: convert
  }));
}
// Script entry point: run once. Report any failure and exit with a non-zero
// status instead of leaving the returned promise unhandled.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});