diff --git a/netease_music/index.js b/netease_music/index.js index a9e3172..361797e 100644 --- a/netease_music/index.js +++ b/netease_music/index.js @@ -56,6 +56,7 @@ async function main() { await albumInfoUtils.fetchAll({}); await artistInfoUtils.fetchAll(); await lyricInfoUtils.fetchAll(); + await commentUtils.fetchAll(); await sleepUtils.sleep(2000); } } diff --git a/netease_music/sql/structure.sql b/netease_music/sql/structure.sql index 4049362..bde2c04 100644 --- a/netease_music/sql/structure.sql +++ b/netease_music/sql/structure.sql @@ -61,8 +61,8 @@ CREATE TABLE `lyric` ( ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci; CREATE TABLE `user` ( - `user_id` int(10) unsigned NOT NULL COMMENT '用户id', - `user_type` tinyint(4) unsigned NOT NULL COMMENT '用户类型', + `user_id` bigint(20) unsigned NOT NULL COMMENT '用户id', + `user_type` varchar(50) NOT NULL COMMENT '用户类型', `nickname` varchar(200) NOT NULL COMMENT '用户昵称', `avatar_url` varchar(200) NOT NULL COMMENT '用户头像 http://p1.music.126.net/ 后面的部分', `create_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '爬取时间', @@ -72,8 +72,8 @@ CREATE TABLE `user` ( CREATE TABLE `comment` ( `comment_id` bigint(20) unsigned NOT NULL COMMENT '评论id', - `parent_comment_id` int(10) unsigned NOT NULL COMMENT '父评论id', - `user_id` int(10) unsigned NOT NULL COMMENT '用户id', + `parent_comment_id` bigint(20) unsigned NOT NULL COMMENT '父评论id', + `user_id` bigint(20) unsigned NOT NULL COMMENT '用户id', `song_id` int(10) unsigned NOT NULL COMMENT '歌曲id', `content` text NOT NULL COMMENT '评论内容', `time` varchar(50) NOT NULL DEFAULT '' COMMENT '评论时间', @@ -86,9 +86,9 @@ CREATE TABLE `comment` ( CREATE TABLE `comment_progress` ( `song_id` int(10) unsigned NOT NULL COMMENT '歌曲id', - `max_time` int(10) NOT NULL DEFAULT 0 COMMENT '开始爬取/开始增量爬取的时候 最新一条评论的时间', - `min_time` int(10) NOT NULL DEFAULT 0 COMMENT '上一次爬取时最后一条评论的时间 第一次爬取时为0', - `current_time` int(10) NOT NULL DEFAULT 0 COMMENT '本次爬取/增量时,最早的一条评论时间', + `max_time` bigint(20) NOT NULL DEFAULT 0 COMMENT '开始爬取/开始增量爬取的时候 最新一条评论的时间', + `min_time` bigint(20) NOT NULL DEFAULT 0 COMMENT '上一次爬取时最后一条评论的时间 第一次爬取时为0', + `current_time` bigint(20) NOT NULL DEFAULT 0 COMMENT '本次爬取/增量时,最早的一条评论时间', `current_status` tinyint(4) unsigned NOT NULL DEFAULT 0 COMMENT '爬取进度 0-等待爬取/增量爬取 1-爬取中 2-完成', `total` int(10) NOT NULL DEFAULT 0 COMMENT '评论总数', `create_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '爬取时间', diff --git a/netease_music/src/getInfo/commentUtils.js b/netease_music/src/getInfo/commentUtils.js index af9d86f..0319063 100644 --- a/netease_music/src/getInfo/commentUtils.js +++ b/netease_music/src/getInfo/commentUtils.js @@ -18,7 +18,7 @@ async function fetchAll() { // 首先查询有无正在爬取中的记录 var songIds = await dbUtils.query(` - SELECT song_id FROM comment_progress WHERE current_status != 2 LIMIT 1 + SELECT song_id FROM comment_progress WHERE current_status != 2 `, []); songIds = songIds.map(item => item.song_id); @@ -74,108 +74,142 @@ async function fetch({ songId, debug = false }) { limit: 20, // before: undefined, }; - console.log(progress); + if (progress.currentTime != 0) + queryParams.before = progress.currentTime; - let isFinish = false; + let isFinish = false; let pageCount = 0; while (!isFinish) { + await global.checkIsExit(); + console.log(`comment: ${songId}, 页数: ${++pageCount}`); + // 是否是第一页 let isFirstPage = progress.currentStatus === 0; try { + // console.log(progress, queryParams); var commentResult = await comment_music(queryParams); - fs.writeFileSync(path.join(__dirname, "../../temp", `comment-${songId}.json`), JSON.stringify(commentResult)); + // fs.writeFileSync(path.join(__dirname, "../../temp", `comment-${songId}-${pageCount}.json`), JSON.stringify(commentResult)); } catch (errors) { console.error(errors); await sleepUtils.sleep(1000); continue; } - var topComments = commentResult.body.hotComments || []; + var topComments = commentResult.body.topComments || []; var hotComments = commentResult.body.hotComments || []; - var comments = commentResult.body.hotComments || []; + var comments = commentResult.body.comments || []; - function getCommitInfoForInsert(comment, commentType) { - return { - comment_id: comment.commentId, - parent_comment_id: comment.parentCommentId, - user_id: comment.user?.userId, - song_id: songId, - content: comment.content, - time: comment.time, - like_count: comment.likedCount, - comment_type: commentType, // 评论类型 0-comments 1-hotComments 2-topComments - } - } - function getUserInfoForInsert(comment) { - const user = comment.user; - var shortAvatarUrlUrl = user.avatarUrl.match(/^http:\/\/p\d+\.music\.126\.net\/(.*?)$/); - shortAvatarUrlUrl = shortAvatarUrlUrl ? shortAvatarUrlUrl[1] : user.avatarUrl; - return { - user_id: user.userId, - user_type: user.userType, - nickname: user.nickname, - avatar_url: shortAvatarUrlUrl || user.avatarUrl, - } - } var commentInfoList = [ - ...topComments.map(comment => getCommitInfoForInsert(comment, 2)), - ...hotComments.map(comment => getCommitInfoForInsert(comment, 1)), - ...comments.map(comment => getCommitInfoForInsert(comment, 0)) + ...topComments.map(comment => getCommitInfoForInsert(songId, comment, 2)), + ...hotComments.map(comment => getCommitInfoForInsert(songId, comment, 1)), + ...comments.map(comment => getCommitInfoForInsert(songId, comment, 0)) ]; var userInfoList = [...topComments, ...hotComments, ...comments] - .filter(comment => comment.user).map(getUserInfoForInsert); + .map(comment => comment.user).filter(user => !!user).map(getUserInfoForInsert); - console.log(commentInfoList); + // console.log(commentInfoList); // console.log(userInfoList); - commentInfoList.forEach(async function (commentInfo) { + for (let commentInfo of commentInfoList) { let result = await dbUtils.query(` INSERT INTO comment ( comment_id, parent_comment_id, user_id, song_id, content, time, like_count, comment_type ) VALUES ? - ON DUPLICATE KEY UPDATE content = ? , like_count = ? , comment_type = GREATEST(comment_type, ? ), modify_time = CURRENT_TIMESTAMP + ON DUPLICATE KEY UPDATE content = ?, like_count = ?, comment_type = GREATEST(comment_type, ?), modify_time = CURRENT_TIMESTAMP `, [ - [ - [ - commentInfo.comment_id, - commentInfo.parent_comment_id, - commentInfo.user_id, - commentInfo.song_id, - commentInfo.content, - commentInfo.time, - commentInfo.like_count, - commentInfo.comment_type - ] - ], + [[ + commentInfo.comment_id, + commentInfo.parent_comment_id, + commentInfo.user_id, + commentInfo.song_id, + commentInfo.content, + commentInfo.time, + commentInfo.like_count, + commentInfo.comment_type + ]], commentInfo.content, commentInfo.like_count, commentInfo.comment_type ]); - console.log(result); - }); + // console.log(result); + } - // process.exit(0); + for (let userInfo of userInfoList) { + let result = await dbUtils.query(` + INSERT INTO user ( user_id, user_type, nickname, avatar_url ) VALUES ? + ON DUPLICATE KEY UPDATE user_type = ?, nickname = ?, avatar_url = ?, modify_time = CURRENT_TIMESTAMP + `, [ + [[ + userInfo.user_id, + userInfo.user_type, + userInfo.nickname, + userInfo.avatar_url, + ]], + userInfo.user_type, + userInfo.nickname, + userInfo.avatar_url + ]); + // console.log(result); + } - // 判断是否完成 - // if(){ - isFinish = true; - // } - // 更新 queryParams - queryParams.before = 1111; - // 更新 progress - progress.maxTime = 1000; - progress.currentTime = 1; + // console.log(commentResult.body.more, comments.length, commentInfoList.length); + + // 判断是否还有下一页 + if (commentResult.body.more && comments.length > 0) { + // console.log("还没结束"); + // 更新 progress + progress.currentTime = comments[comments.length - 1].time; + if (progress.maxTime == progress.minTime) { // minTime = maxTime 代表这是本轮爬取的第一次 + progress.maxTime = comments[0].time; + } + progress.currentStatus = 1; // 0-等待爬取/增量爬取 1-爬取中 2-完成 + // 更新 queryParams + queryParams.before = progress.currentTime; + progress.total = commentResult.body.total; + } else { + isFinish = true; + console.log(`comment: ${songId} 结束了`); + progress.currentStatus = 2; // 0-等待爬取/增量爬取 1-爬取中 2-完成 + progress.minTime = progress.maxTime; // minTime = maxTime 代表这一轮爬取完成了 + progress.currentTime = progress.maxTime; // 可有可无 + } // progress更新到数据库中 - // // console.log("commentInfo", commentInfo); - // dbUtils.query('INSERT IGNORE INTO comment SET ?', { - // comment_id: commentInfo.commentId, - // comment: commentInfo.comment, - // version: commentInfo.version, - // }); + await dbUtils.query('UPDATE comment_progress SET ? WHERE song_id = ? LIMIT 1',[ { + max_time: progress.maxTime, + min_time: progress.minTime, + current_time: progress.currentTime, + current_status: progress.currentStatus, + total: progress.total, + }, songId]); + await sleepUtils.sleep(global.sleepTime); } // return commentInfo; } +function getCommitInfoForInsert(songId, comment, commentType) { + return { + comment_id: comment.commentId, + parent_comment_id: comment.parentCommentId, + user_id: comment.user?.userId, + song_id: songId, + content: comment.content, + time: comment.time, + like_count: comment.likedCount, + comment_type: commentType, // 评论类型 0-comments 1-hotComments 2-topComments + } +} + +function getUserInfoForInsert(user) { + var shortAvatarUrlUrl = user.avatarUrl.match(/^http:\/\/p\d+\.music\.126\.net\/(.*?)$/); + shortAvatarUrlUrl = shortAvatarUrlUrl ? shortAvatarUrlUrl[1] : user.avatarUrl; + return { + user_id: user.userId, + user_type: user.userType, + nickname: user.nickname, + avatar_url: shortAvatarUrlUrl || user.avatarUrl, + } +} + module.exports = { fetchAll: fetchAll, fetch: fetch,