// Crawler module: fetches NetEase Cloud Music song comments and stores them, together with their authors, through dbUtils.
const fs = require('fs');
const path = require('path');

const requestUtils = require('../../../utils/requestUtils');
const sleepUtils = require('../../../utils/sleepUtils');

// Database helper. It is read from `global` once, when this module is loaded,
// so `global.dbUtils` must already be set at that point.
const dbUtils = global.dbUtils;

// References:
// https://neteasecloudmusicapi-docs.4everland.app/
// https://github.com/Binaryify/NeteaseCloudMusicApi
const { comment_music } = require('NeteaseCloudMusicApi');

/**
 * Crawl the comments of every song that is not finished yet.
 *
 * Steps:
 *  1. Add every `song.song_id` that is missing from `comment_progress`.
 *  2. Load a batch of song ids whose `current_status` is not 2 (2 = finished).
 *  3. Call `fetch` for each id in turn, sleeping `global.sleepTime` between songs.
 *
 * An error thrown while crawling one song is logged and the loop moves on to the
 * next song; errors from the two set-up queries propagate to the caller.
 *
 * @returns {Promise<void>}
 */
async function fetchAll() {
    console.log("start fetching comment ...");

    // Register the songs that still have no comment_progress row.
    await dbUtils.query(`
        INSERT INTO comment_progress ( song_id )
        SELECT DISTINCT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM comment_progress )
    `, []);

    // Load the unfinished songs. The SQL comments inside the statement keep the
    // alternative selections used on other machines.
    const pendingRows = await dbUtils.query(`
        -- 本机
        SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id <= 30000000 LIMIT 1000
        -- SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id < 30000000 ORDER BY current_status DESC
        -- 服务器
        -- SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id > 30000000 ORDER BY current_status DESC
    `, []);
    const songIds = pendingRows.map((row) => row.song_id);

    for (const [index, songId] of songIds.entries()) {
        // Cooperative shutdown hook provided by the host application.
        await global.checkIsExit();
        console.log(`${index + 1}/${songIds.length} | comment: ${songId}`);
        try {
            await fetch({ songId });
        } catch (err) {
            // Best effort: one failing song must not stop the whole batch.
            console.error(err);
        }
        await sleepUtils.sleep(global.sleepTime);
    }
}

// Fetch and store the comments of a single song.
/**
 * Crawl the comments of one song page by page and persist them.
 *
 * The crawl state is read from, and written back to, the song's `comment_progress`
 * row after every page, so an interrupted crawl resumes from `current_time`.
 * `current_status` values: 0 - waiting / incremental crawl, 1 - in progress, 2 - finished.
 *
 * API reference:
 * https://neteasecloudmusicapi-docs.4everland.app/#/?id=%e6%ad%8c%e6%9b%b2%e8%af%84%e8%ae%ba
 *
 * @param {object} options
 * @param {number|string} options.songId - Id of the song whose comments are crawled.
 * @param {boolean} [options.debug=false] - Accepted for compatibility; currently unused.
 * @param {number} [options.maxRetries=Infinity] - How many consecutive failed API calls
 *   are retried before giving up. The default retries forever.
 * @returns {Promise<void>} Resolves when the last page was stored, or immediately when
 *   the song has no unfinished `comment_progress` row.
 * @throws {Error} When a database query fails, or when the API failed more than
 *   `maxRetries` times in a row.
 */
async function fetch({ songId, debug = false, maxRetries = Infinity }) {
    // Only songs that are not finished (status != 2) are crawled.
    const progressRows = await dbUtils.query(`
        SELECT * FROM comment_progress WHERE song_id = ? and current_status != 2 LIMIT 1
    `, [songId]);
    if (progressRows.length === 0) {
        console.log('No commentProgress found, song_id:', songId);
        return;
    }

    const [progressRow] = progressRows;
    const progress = {
        maxTime: progressRow.max_time,
        minTime: progressRow.min_time,
        currentTime: progressRow.current_time,
        currentStatus: progressRow.current_status,
        total: progressRow.total,
    };

    const queryParams = {
        id: songId,
        limit: 20,
    };
    // Loose comparison on purpose: the database driver may return the column as a string.
    if (progress.currentTime != 0) {
        queryParams.before = progress.currentTime;
    }

    let isFinish = false;
    let pageCount = 0;
    let failedAttempts = 0;
    while (!isFinish) {
        await global.checkIsExit();
        // A retried request keeps its page number; the counter advances on success only.
        console.log(`comment: ${songId}, page: ${pageCount + 1}`);

        let commentResult;
        try {
            commentResult = await comment_music(queryParams);
        } catch (errors) {
            console.error(errors);
            failedAttempts += 1;
            if (failedAttempts > maxRetries) {
                throw new Error(
                    `comment: ${songId} giving up after ${failedAttempts} failed attempts`,
                    { cause: errors },
                );
            }
            await sleepUtils.sleep(1000);
            continue;
        }
        failedAttempts = 0;
        pageCount += 1;

        const topComments = commentResult.body.topComments || [];
        const hotComments = commentResult.body.hotComments || [];
        const comments = commentResult.body.comments || [];

        // A comment can be listed in several of the arrays. It is stored once, with the
        // highest comment type (2 > 1 > 0), which is the value the upsert's GREATEST()
        // would end up with anyway. Upserting each key once also avoids concurrent
        // writes to the same row.
        const commentInfoById = new Map();
        const collectComments = (list, commentType) => {
            for (const comment of list) {
                if (!commentInfoById.has(comment.commentId)) {
                    commentInfoById.set(
                        comment.commentId,
                        getCommitInfoForInsert(songId, comment, commentType),
                    );
                }
            }
        };
        collectComments(topComments, 2);
        collectComments(hotComments, 1);
        collectComments(comments, 0);

        // The same author usually appears several times on a page; upsert each user once.
        const userById = new Map();
        for (const comment of [...topComments, ...hotComments, ...comments]) {
            if (comment.user) {
                userById.set(comment.user.userId, comment.user);
            }
        }

        // The query promises are awaited directly, so a failing query rejects this
        // function instead of leaving a promise that never settles.
        await Promise.all([
            ...[...commentInfoById.values()].map((commentInfo) => upsertComment(commentInfo)),
            ...[...userById.values()].map((user) => upsertUser(getUserInfoForInsert(user))),
        ]);

        // Record the total on every page, including the last (or only) one.
        progress.total = commentResult.body.total ?? progress.total;

        if (commentResult.body.more && comments.length > 0) {
            // More pages follow: continue before the oldest comment of this page.
            progress.currentTime = comments[comments.length - 1].time;
            if (progress.maxTime == progress.minTime) {
                // minTime == maxTime marks the first page of the current crawl round.
                progress.maxTime = comments[0].time;
            }
            progress.currentStatus = 1; // 0 - waiting / incremental, 1 - in progress, 2 - finished
            queryParams.before = progress.currentTime;
        } else {
            isFinish = true;
            console.log(`comment: ${songId} 结束了`);
            progress.currentStatus = 2; // 0 - waiting / incremental, 1 - in progress, 2 - finished
            if (progress.maxTime == 0) {
                // First crawl of a song whose comments fit on a single page.
                progress.maxTime = comments[0]?.time || 0;
            }
            progress.minTime = progress.maxTime; // minTime == maxTime marks the round as finished
            progress.currentTime = progress.maxTime; // optional, kept for consistency
        }

        await saveCommentProgress(songId, progress);
    }
}

/**
 * Insert one comment row; if it already exists, refresh its content and like count
 * and raise its comment type to the larger of the stored and the new value.
 *
 * NOTE(review): the nested array bound to `VALUES ?` assumes a mysqljs-style driver
 * behind `dbUtils.query` - confirm.
 *
 * @param {object} commentInfo - Row object as built by `getCommitInfoForInsert`.
 * @returns {Promise<*>} The promise returned by `dbUtils.query`.
 */
function upsertComment(commentInfo) {
    return dbUtils.query(`
        INSERT INTO comment ( comment_id, parent_comment_id, user_id, song_id, content, time, like_count, comment_type ) VALUES ?
        ON DUPLICATE KEY UPDATE content = ?, like_count = ?, comment_type = GREATEST(comment_type, ?), modify_time = CURRENT_TIMESTAMP
    `, [
        [[
            commentInfo.comment_id,
            commentInfo.parent_comment_id,
            commentInfo.user_id,
            commentInfo.song_id,
            commentInfo.content,
            commentInfo.time,
            commentInfo.like_count,
            commentInfo.comment_type,
        ]],
        commentInfo.content,
        commentInfo.like_count,
        commentInfo.comment_type,
    ]);
}

/**
 * Insert one user row; if it already exists, refresh its type, nickname and avatar.
 *
 * @param {object} userInfo - Row object as built by `getUserInfoForInsert`.
 * @returns {Promise<*>} The promise returned by `dbUtils.query`.
 */
function upsertUser(userInfo) {
    return dbUtils.query(`
        INSERT INTO user ( user_id, user_type, nickname, avatar_url ) VALUES ?
        ON DUPLICATE KEY UPDATE user_type = ?, nickname = ?, avatar_url = ?, modify_time = CURRENT_TIMESTAMP
    `, [
        [[
            userInfo.user_id,
            userInfo.user_type,
            userInfo.nickname,
            userInfo.avatar_url,
        ]],
        userInfo.user_type,
        userInfo.nickname,
        userInfo.avatar_url,
    ]);
}

/**
 * Write the in-memory crawl state back to the song's `comment_progress` row.
 *
 * @param {number|string} songId - Song whose progress row is updated.
 * @param {{maxTime: *, minTime: *, currentTime: *, currentStatus: *, total: *}} progress
 * @returns {Promise<*>} The promise returned by `dbUtils.query`.
 */
function saveCommentProgress(songId, progress) {
    return dbUtils.query('UPDATE comment_progress SET ? WHERE song_id = ? LIMIT 1', [{
        max_time: progress.maxTime,
        min_time: progress.minTime,
        current_time: progress.currentTime,
        current_status: progress.currentStatus,
        total: progress.total,
    }, songId]);
}

/**
 * Build the `comment` table row for one comment object returned by the API.
 *
 * @param {number|string} songId - Song the comment belongs to.
 * @param {object} comment - Comment object; the fields `commentId`, `parentCommentId`,
 *   `user.userId`, `content`, `time` and `likedCount` are read.
 * @param {number} commentType - 0 - comments, 1 - hotComments, 2 - topComments.
 * @returns {object} Row with snake_case keys; `user_id` is `undefined` when the
 *   comment carries no `user`.
 */
function getCommitInfoForInsert(songId, comment, commentType) {
    const { commentId, parentCommentId, user, content, time, likedCount } = comment;
    return {
        comment_id: commentId,
        parent_comment_id: parentCommentId,
        user_id: user?.userId,
        song_id: songId,
        content,
        time,
        like_count: likedCount,
        comment_type: commentType,
    };
}

/**
 * Build the `user` table row for one user object returned by the API.
 *
 * Avatar URLs of the form `http://p<digits>.music.126.net/<rest>` are shortened to
 * `<rest>`; any other value (including a missing avatar) is stored unchanged.
 *
 * @param {object} user - User object; `userId`, `userType`, `nickname` and
 *   `avatarUrl` are read.
 * @returns {{user_id: *, user_type: *, nickname: *, avatar_url: *}} Row with snake_case keys.
 */
function getUserInfoForInsert(user) {
    const { avatarUrl } = user;
    // Guard against users without an avatar: calling .match on undefined/null throws.
    const hostMatch = typeof avatarUrl === 'string'
        ? avatarUrl.match(/^http:\/\/p\d+\.music\.126\.net\/(.*?)$/)
        : null;
    return {
        user_id: user.userId,
        user_type: user.userType,
        nickname: user.nickname,
        // An empty captured path falls back to the full URL as well.
        avatar_url: hostMatch?.[1] || avatarUrl,
    };
}

module.exports = {
|
|
fetchAll: fetchAll,
|
|
fetch: fetch,
|
|
} |