const fs = require('fs'); const path = require('path'); const requestUtils = require('../../../utils/requestUtils'); const sleepUtils = require('../../../utils/sleepUtils'); const dbUtils = global.dbUtils; // refer: // https://neteasecloudmusicapi-docs.4everland.app/ // https://github.com/Binaryify/NeteaseCloudMusicApi const { comment_music } = require('NeteaseCloudMusicApi'); async function fetchAll() { console.log("start fetching comment ...") // 首先将需要爬取的song_id导入comment_progress表 await dbUtils.query(` INSERT INTO comment_progress ( song_id ) SELECT DISTINCT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM comment_progress ) `, []); // 首先查询有无正在爬取中的记录 var songIds = await dbUtils.query(` -- 本机 SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id <= 30000000 LIMIT 1000 -- SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id < 30000000 ORDER BY current_status DESC -- 服务器 -- SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id > 30000000 ORDER BY current_status DESC `, []); songIds = songIds.map(item => item.song_id); for (let i = 0; i < songIds.length; i++) { await global.checkIsExit(); const songId = songIds[i]; console.log(`${i + 1}/${songIds.length} | comment: ${songId}`); try { await fetch({ songId: songId }); } catch (err) { console.error(err); } await sleepUtils.sleep(global.sleepTime); } } // 获取歌词详情 async function fetch({ songId, debug = false }) { // // var url = `https://music.163.com/weapi/comment/resource/comments/get?csrf_token=`; // var opts = { // method: "POST", // url: `https://music.163.com/api/v1/resource/comments/R_SO_4_${songId}`, // headers: { // 'content-type': 'application/x-www-form-urlencoded', // 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.53', // }, // form: encrypt.weapi({ // rid: songId, // limit: 20, // offset: 20, // offset的取值为:(评论页数-1)*20 // before: 1664655762881 // }) // }; // 首先查询有无正在爬取中的记录 var commentProgress = await dbUtils.query(` SELECT * FROM comment_progress WHERE song_id = ? and current_status != 2 LIMIT 1 `, [songId]); if (commentProgress.length == 0) { console.log('No commentProgress found, song_id:', songId); return; } var item = commentProgress[0]; var progress = { maxTime: item.max_time, minTime: item.min_time, currentTime: item.current_time, currentStatus: item.current_status, total: item.total, }; // https://neteasecloudmusicapi-docs.4everland.app/#/?id=%e6%ad%8c%e6%9b%b2%e8%af%84%e8%ae%ba var queryParams = { id: songId, limit: 20, // before: undefined, }; if (progress.currentTime != 0) queryParams.before = progress.currentTime; let isFinish = false; let pageCount = 0; while (!isFinish) { await global.checkIsExit(); console.log(`comment: ${songId}, page: ${++pageCount}`); // 是否是第一页 let isFirstPage = progress.currentStatus === 0; try { // console.log(progress, queryParams); var commentResult = await comment_music(queryParams); // fs.writeFileSync(path.join(__dirname, "../../temp", `comment-${songId}-${pageCount}.json`), JSON.stringify(commentResult)); } catch (errors) { console.error(errors); await sleepUtils.sleep(1000); continue; } var topComments = commentResult.body.topComments || []; var hotComments = commentResult.body.hotComments || []; var comments = commentResult.body.comments || []; var commentInfoList = [ ...topComments.map(comment => getCommitInfoForInsert(songId, comment, 2)), ...hotComments.map(comment => getCommitInfoForInsert(songId, comment, 1)), ...comments.map(comment => getCommitInfoForInsert(songId, comment, 0)) ]; var userInfoList = [...topComments, ...hotComments, ...comments] .map(comment => comment.user).filter(user => !!user).map(getUserInfoForInsert); // console.log(commentInfoList); // console.log(userInfoList); let promiseList = []; for (let commentInfo of commentInfoList) { let promise = new Promise(async function (resolve, reject) { let result = await dbUtils.query(` INSERT INTO comment ( comment_id, parent_comment_id, user_id, song_id, content, time, like_count, comment_type ) VALUES ? ON DUPLICATE KEY UPDATE content = ?, like_count = ?, comment_type = GREATEST(comment_type, ?), modify_time = CURRENT_TIMESTAMP `, [ [[ commentInfo.comment_id, commentInfo.parent_comment_id, commentInfo.user_id, commentInfo.song_id, commentInfo.content, commentInfo.time, commentInfo.like_count, commentInfo.comment_type ]], commentInfo.content, commentInfo.like_count, commentInfo.comment_type ]); // console.log(result); // console.log("INSERT comment"); resolve(); }); promiseList.push(promise); } for (let userInfo of userInfoList) { let promise = new Promise(async function (resolve, reject) { let result = await dbUtils.query(` INSERT INTO user ( user_id, user_type, nickname, avatar_url ) VALUES ? ON DUPLICATE KEY UPDATE user_type = ?, nickname = ?, avatar_url = ?, modify_time = CURRENT_TIMESTAMP `, [ [[ userInfo.user_id, userInfo.user_type, userInfo.nickname, userInfo.avatar_url, ]], userInfo.user_type, userInfo.nickname, userInfo.avatar_url ]); // console.log(result); // console.log("INSERT user"); resolve(); }); promiseList.push(promise); } await Promise.all(promiseList); // console.log("INSERT finished comment and user finished"); // console.log(commentResult.body.more, comments.length, commentInfoList.length); // 判断是否还有下一页 if (commentResult.body.more && comments.length > 0) { // 更新 progress progress.currentTime = comments[comments.length - 1].time; if (progress.maxTime == progress.minTime) { // minTime = maxTime 代表这是本轮爬取的第一次 progress.maxTime = comments[0].time; } progress.currentStatus = 1; // 0-等待爬取/增量爬取 1-爬取中 2-完成 // 更新 queryParams queryParams.before = progress.currentTime; progress.total = commentResult.body.total; } else { isFinish = true; console.log(`comment: ${songId} 结束了`); progress.currentStatus = 2; // 0-等待爬取/增量爬取 1-爬取中 2-完成 if (progress.maxTime == 0) { // 第一次爬取 且 没有分页的情况 progress.maxTime = comments[0]?.time || 0; } progress.minTime = progress.maxTime; // minTime = maxTime 代表这一轮爬取完成了 progress.currentTime = progress.maxTime; // 可有可无 } // progress更新到数据库中 await dbUtils.query('UPDATE comment_progress SET ? WHERE song_id = ? LIMIT 1', [{ max_time: progress.maxTime, min_time: progress.minTime, current_time: progress.currentTime, current_status: progress.currentStatus, total: progress.total, }, songId]); // console.log("UPDATE comment_progress"); // await sleepUtils.sleep(global.sleepTime); } // return commentInfo; } function getCommitInfoForInsert(songId, comment, commentType) { return { comment_id: comment.commentId, parent_comment_id: comment.parentCommentId, user_id: comment.user?.userId, song_id: songId, content: comment.content, time: comment.time, like_count: comment.likedCount, comment_type: commentType, // 评论类型 0-comments 1-hotComments 2-topComments } } function getUserInfoForInsert(user) { var shortAvatarUrlUrl = user.avatarUrl.match(/^http:\/\/p\d+\.music\.126\.net\/(.*?)$/); shortAvatarUrlUrl = shortAvatarUrlUrl ? shortAvatarUrlUrl[1] : user.avatarUrl; return { user_id: user.userId, user_type: user.userType, nickname: user.nickname, avatar_url: shortAvatarUrlUrl || user.avatarUrl, } } module.exports = { fetchAll: fetchAll, fetch: fetch, }