1
0
Code Issues Pull Requests Projects Releases Wiki Activity GitHub Gitee
Files
tools/netease_music/src/getInfo/commentUtils.js
2022-10-04 14:12:33 +08:00

240 lines
9.6 KiB
JavaScript

const fs = require('fs');
const path = require('path');
const requestUtils = require('../../../utils/requestUtils');
const sleepUtils = require('../../../utils/sleepUtils');
const dbUtils = global.dbUtils;
// refer:
// https://neteasecloudmusicapi-docs.4everland.app/
// https://github.com/Binaryify/NeteaseCloudMusicApi
const { comment_music } = require('NeteaseCloudMusicApi');
/**
 * Crawl comments for every song whose comment crawl is not finished yet.
 *
 * Steps:
 *  1. Insert a comment_progress row for each song_id in `song` that has none.
 *  2. Select up to 1000 unfinished rows (current_status != 2) in the song_id
 *     range hard-coded in the query below.
 *  3. Crawl the songs one after another, sleeping `global.sleepTime` between
 *     them. A failure on one song is logged and does not stop the batch.
 *
 * @returns {Promise<void>}
 */
async function fetchAll() {
  console.log("start fetching comment ...");

  // First register every song_id that still needs crawling in comment_progress.
  const seedProgressSql = `
INSERT INTO comment_progress ( song_id )
SELECT DISTINCT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM comment_progress )
`;
  await dbUtils.query(seedProgressSql, []);

  // Then pick the songs that are waiting or were interrupted mid-crawl.
  // The SQL comments inside the string keep the alternative queries
  // ("本机" = local machine, "服务器" = server) that split the id range.
  const pendingSongsSql = `
-- 本机
SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id <= 30000000 LIMIT 1000
-- SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id < 30000000 ORDER BY current_status DESC
-- 服务器
-- SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id > 30000000 ORDER BY current_status DESC
`;
  const pendingRows = await dbUtils.query(pendingSongsSql, []);
  const pendingSongIds = pendingRows.map(({ song_id }) => song_id);
  const pendingCount = pendingSongIds.length;

  for (const [position, songId] of pendingSongIds.entries()) {
    // Cooperative shutdown hook supplied by the host program.
    await global.checkIsExit();
    console.log(`${position + 1}/${pendingCount} | comment: ${songId}`);
    try {
      await fetch({ songId });
    } catch (err) {
      // Best effort: report the failure and move on to the next song.
      console.error(err);
    }
    await sleepUtils.sleep(global.sleepTime);
  }
}
// Fetch the comments of a single song and store them (progress is tracked in comment_progress)
/**
 * Crawl the comments of one song page by page and persist them together with
 * their authors. The crawl position is written back to comment_progress after
 * every page, so an interrupted run resumes from the stored `current_time`.
 *
 * current_status values: 0 = waiting / incremental crawl, 1 = in progress,
 * 2 = finished. `min_time == max_time` marks that no round is under way.
 *
 * An earlier raw-HTTP approach (POST to /api/v1/resource/comments/R_SO_4_<songId>
 * with a weapi-encrypted form) was replaced by `comment_music` from
 * NeteaseCloudMusicApi:
 * https://neteasecloudmusicapi-docs.4everland.app/#/?id=%e6%ad%8c%e6%9b%b2%e8%af%84%e8%ae%ba
 *
 * @param {object} options
 * @param {number} options.songId - song_id looked up in comment_progress and sent to the API as `id`.
 * @param {boolean} [options.debug=false] - Currently unused; kept for interface compatibility.
 * @returns {Promise<void>} Resolves after the last page was stored, or right
 *   away when the song has no unfinished comment_progress row.
 * @throws Rejects when a database query fails, so the caller can log the
 *   error and continue. API request failures are NOT thrown: they are logged
 *   and the same page is retried after one second.
 */
async function fetch({ songId, debug = false }) {
  // Load the resumable crawl state; finished rows (status 2) are skipped.
  const progressRows = await dbUtils.query(`
SELECT * FROM comment_progress WHERE song_id = ? and current_status != 2 LIMIT 1
`, [songId]);
  if (progressRows.length === 0) {
    console.log('No commentProgress found, song_id:', songId);
    return;
  }

  const [progressRow] = progressRows;
  const progress = {
    maxTime: progressRow.max_time,
    minTime: progressRow.min_time,
    currentTime: progressRow.current_time,
    currentStatus: progressRow.current_status,
    total: progressRow.total,
  };

  const queryParams = {
    id: songId,
    limit: 20,
  };
  // Loose comparison kept on purpose: the stored value may not be a JS number.
  // NOTE(review): `before` is presumably the "older than this timestamp" cursor — confirm against the API docs.
  if (progress.currentTime != 0) {
    queryParams.before = progress.currentTime;
  }

  let isFinish = false;
  let pageCount = 0;
  while (!isFinish) {
    await global.checkIsExit();
    console.log(`comment: ${songId}, page: ${++pageCount}`);

    let commentResult;
    try {
      commentResult = await comment_music(queryParams);
    } catch (errors) {
      // Request failed: log, wait a second and retry the same page.
      console.error(errors);
      await sleepUtils.sleep(1000);
      continue;
    }

    const topComments = commentResult.body.topComments || [];
    const hotComments = commentResult.body.hotComments || [];
    const comments = commentResult.body.comments || [];

    // comment_type: 0 = comments, 1 = hotComments, 2 = topComments
    const commentInfoList = [
      ...topComments.map((comment) => getCommitInfoForInsert(songId, comment, 2)),
      ...hotComments.map((comment) => getCommitInfoForInsert(songId, comment, 1)),
      ...comments.map((comment) => getCommitInfoForInsert(songId, comment, 0)),
    ];
    const userInfoList = [...topComments, ...hotComments, ...comments]
      .map((comment) => comment.user)
      .filter((user) => !!user)
      .map((user) => getUserInfoForInsert(user));

    // Run all upserts of this page concurrently. Awaiting the query promises
    // directly (instead of wrapping them in `new Promise(async ...)`) makes a
    // failed query reject this function rather than hang forever.
    await Promise.all([
      ...commentInfoList.map((commentInfo) => upsertCommentRow(commentInfo)),
      ...userInfoList.map((userInfo) => upsertUserRow(userInfo)),
    ]);

    // Decide whether there is another page.
    if (commentResult.body.more && comments.length > 0) {
      progress.currentTime = comments[comments.length - 1].time;
      if (progress.maxTime == progress.minTime) {
        // minTime == maxTime means this is the first page of the current round.
        progress.maxTime = comments[0].time;
      }
      progress.currentStatus = 1; // in progress
      // Continue from the last comment of this page.
      queryParams.before = progress.currentTime;
      progress.total = commentResult.body.total;
    } else {
      isFinish = true;
      console.log(`comment: ${songId} 结束了`);
      progress.currentStatus = 2; // finished
      if (progress.maxTime == 0) {
        // First crawl ever and everything fitted on a single page.
        progress.maxTime = comments[0]?.time || 0;
      }
      progress.minTime = progress.maxTime; // minTime == maxTime: this round is complete
      progress.currentTime = progress.maxTime; // optional, keeps the row consistent
    }

    // Persist the progress after every page.
    await dbUtils.query('UPDATE comment_progress SET ? WHERE song_id = ? LIMIT 1', [{
      max_time: progress.maxTime,
      min_time: progress.minTime,
      current_time: progress.currentTime,
      current_status: progress.currentStatus,
      total: progress.total,
    }, songId]);
  }
}

/**
 * Insert one comment row, or refresh content / like_count / comment_type when
 * the comment already exists (comment_type only ever increases via GREATEST).
 *
 * @param {object} commentInfo - Row object as built by getCommitInfoForInsert.
 * @returns {Promise<*>} The promise returned by dbUtils.query.
 */
function upsertCommentRow(commentInfo) {
  return dbUtils.query(`
INSERT INTO comment ( comment_id, parent_comment_id, user_id, song_id, content, time, like_count, comment_type ) VALUES ?
ON DUPLICATE KEY UPDATE content = ?, like_count = ?, comment_type = GREATEST(comment_type, ?), modify_time = CURRENT_TIMESTAMP
`, [
    [[
      commentInfo.comment_id,
      commentInfo.parent_comment_id,
      commentInfo.user_id,
      commentInfo.song_id,
      commentInfo.content,
      commentInfo.time,
      commentInfo.like_count,
      commentInfo.comment_type,
    ]],
    commentInfo.content,
    commentInfo.like_count,
    commentInfo.comment_type,
  ]);
}

/**
 * Insert one user row, or refresh user_type / nickname / avatar_url when the
 * user already exists.
 *
 * @param {object} userInfo - Row object as built by getUserInfoForInsert.
 * @returns {Promise<*>} The promise returned by dbUtils.query.
 */
function upsertUserRow(userInfo) {
  return dbUtils.query(`
INSERT INTO user ( user_id, user_type, nickname, avatar_url ) VALUES ?
ON DUPLICATE KEY UPDATE user_type = ?, nickname = ?, avatar_url = ?, modify_time = CURRENT_TIMESTAMP
`, [
    [[
      userInfo.user_id,
      userInfo.user_type,
      userInfo.nickname,
      userInfo.avatar_url,
    ]],
    userInfo.user_type,
    userInfo.nickname,
    userInfo.avatar_url,
  ]);
}
/**
 * Flatten a raw API comment into the column layout of the `comment` table.
 *
 * @param {number} songId - Song the comment belongs to.
 * @param {object} comment - Raw comment object from the API response.
 * @param {number} commentType - 0 = comments, 1 = hotComments, 2 = topComments.
 * @returns {object} Row object keyed by column name; user_id is undefined
 *   when the comment carries no user.
 */
function getCommitInfoForInsert(songId, comment, commentType) {
  const {
    commentId,
    parentCommentId,
    user: author,
    content,
    time,
    likedCount,
  } = comment;

  const row = {};
  row.comment_id = commentId;
  row.parent_comment_id = parentCommentId;
  row.user_id = author?.userId;
  row.song_id = songId;
  row.content = content;
  row.time = time;
  row.like_count = likedCount;
  row.comment_type = commentType;
  return row;
}
/**
 * Flatten a raw API user into the column layout of the `user` table.
 *
 * Avatar URLs of the form `http://p<N>.music.126.net/<path>` are shortened to
 * `<path>`; any other URL (or one whose path is empty) is stored unchanged.
 * A missing avatar (null / undefined) is stored as null instead of throwing,
 * so one incomplete user record cannot abort the whole page.
 *
 * @param {object} user - Raw user object from the API response.
 * @returns {{user_id: *, user_type: *, nickname: *, avatar_url: (string|null)}}
 *   Row object keyed by column name.
 */
function getUserInfoForInsert(user) {
  const { avatarUrl } = user;

  let storedAvatarUrl = avatarUrl ?? null;
  if (typeof avatarUrl === 'string') {
    const cdnMatch = avatarUrl.match(/^http:\/\/p\d+\.music\.126\.net\/(.*?)$/);
    // Fall back to the full URL when there is no match or the path is empty.
    storedAvatarUrl = (cdnMatch && cdnMatch[1]) || avatarUrl;
  }

  return {
    user_id: user.userId,
    user_type: user.userType,
    nickname: user.nickname,
    avatar_url: storedAvatarUrl,
  };
}
module.exports = {
fetchAll: fetchAll,
fetch: fetch,
}