1
0
Code Issues Pull Requests Projects Releases Wiki Activity GitHub Gitee
This commit is contained in:
2022-10-02 19:16:41 +08:00
parent 1939398579
commit 326201fb2f
3 changed files with 108 additions and 73 deletions

View File

@@ -56,6 +56,7 @@ async function main() {
await albumInfoUtils.fetchAll({}); await albumInfoUtils.fetchAll({});
await artistInfoUtils.fetchAll(); await artistInfoUtils.fetchAll();
await lyricInfoUtils.fetchAll(); await lyricInfoUtils.fetchAll();
await commentUtils.fetchAll();
await sleepUtils.sleep(2000); await sleepUtils.sleep(2000);
} }
} }

View File

@@ -61,8 +61,8 @@ CREATE TABLE `lyric` (
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci; ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
CREATE TABLE `user` ( CREATE TABLE `user` (
`user_id` int(10) unsigned NOT NULL COMMENT '用户id', `user_id` bigint(20) unsigned NOT NULL COMMENT '用户id',
`user_type` tinyint(4) unsigned NOT NULL COMMENT '用户类型', `user_type` varchar(50) NOT NULL COMMENT '用户类型',
`nickname` varchar(200) NOT NULL COMMENT '用户昵称', `nickname` varchar(200) NOT NULL COMMENT '用户昵称',
`avatar_url` varchar(200) NOT NULL COMMENT '用户头像 http://p1.music.126.net/ 后面的部分', `avatar_url` varchar(200) NOT NULL COMMENT '用户头像 http://p1.music.126.net/ 后面的部分',
`create_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '爬取时间', `create_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '爬取时间',
@@ -72,8 +72,8 @@ CREATE TABLE `user` (
CREATE TABLE `comment` ( CREATE TABLE `comment` (
`comment_id` bigint(20) unsigned NOT NULL COMMENT '评论id', `comment_id` bigint(20) unsigned NOT NULL COMMENT '评论id',
`parent_comment_id` int(10) unsigned NOT NULL COMMENT '父评论id', `parent_comment_id` bigint(20) unsigned NOT NULL COMMENT '父评论id',
`user_id` int(10) unsigned NOT NULL COMMENT '用户id', `user_id` bigint(20) unsigned NOT NULL COMMENT '用户id',
`song_id` int(10) unsigned NOT NULL COMMENT '歌曲id', `song_id` int(10) unsigned NOT NULL COMMENT '歌曲id',
`content` text NOT NULL COMMENT '评论内容', `content` text NOT NULL COMMENT '评论内容',
`time` varchar(50) NOT NULL DEFAULT '' COMMENT '评论时间', `time` varchar(50) NOT NULL DEFAULT '' COMMENT '评论时间',
@@ -86,9 +86,9 @@ CREATE TABLE `comment` (
CREATE TABLE `comment_progress` ( CREATE TABLE `comment_progress` (
`song_id` int(10) unsigned NOT NULL COMMENT '歌曲id', `song_id` int(10) unsigned NOT NULL COMMENT '歌曲id',
`max_time` int(10) NOT NULL DEFAULT 0 COMMENT '开始爬取/开始增量爬取的时候 最新一条评论的时间', `max_time` bigint(20) NOT NULL DEFAULT 0 COMMENT '开始爬取/开始增量爬取的时候 最新一条评论的时间',
`min_time` int(10) NOT NULL DEFAULT 0 COMMENT '上一次爬取时最后一条评论的时间 第一次爬取时为0', `min_time` bigint(20) NOT NULL DEFAULT 0 COMMENT '上一次爬取时最后一条评论的时间 第一次爬取时为0',
`current_time` int(10) NOT NULL DEFAULT 0 COMMENT '本次爬取/增量时,最早的一条评论时间', `current_time` bigint(20) NOT NULL DEFAULT 0 COMMENT '本次爬取/增量时,最早的一条评论时间',
`current_status` tinyint(4) unsigned NOT NULL DEFAULT 0 COMMENT '爬取进度 0-等待爬取/增量爬取 1-爬取中 2-完成', `current_status` tinyint(4) unsigned NOT NULL DEFAULT 0 COMMENT '爬取进度 0-等待爬取/增量爬取 1-爬取中 2-完成',
`total` int(10) NOT NULL DEFAULT 0 COMMENT '评论总数', `total` int(10) NOT NULL DEFAULT 0 COMMENT '评论总数',
`create_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '爬取时间', `create_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '爬取时间',

View File

@@ -18,7 +18,7 @@ async function fetchAll() {
// 首先查询有无正在爬取中的记录 // 首先查询有无正在爬取中的记录
var songIds = await dbUtils.query(` var songIds = await dbUtils.query(`
SELECT song_id FROM comment_progress WHERE current_status != 2 LIMIT 1 SELECT song_id FROM comment_progress WHERE current_status != 2
`, []); `, []);
songIds = songIds.map(item => item.song_id); songIds = songIds.map(item => item.song_id);
@@ -74,108 +74,142 @@ async function fetch({ songId, debug = false }) {
limit: 20, limit: 20,
// before: undefined, // before: undefined,
}; };
console.log(progress); if (progress.currentTime != 0)
queryParams.before = progress.currentTime;
let isFinish = false; let isFinish = false; let pageCount = 0;
while (!isFinish) { while (!isFinish) {
await global.checkIsExit();
console.log(`comment: ${songId}, 页数: ${++pageCount}`);
// 是否是第一页 // 是否是第一页
let isFirstPage = progress.currentStatus === 0; let isFirstPage = progress.currentStatus === 0;
try { try {
// console.log(progress, queryParams);
var commentResult = await comment_music(queryParams); var commentResult = await comment_music(queryParams);
fs.writeFileSync(path.join(__dirname, "../../temp", `comment-${songId}.json`), JSON.stringify(commentResult)); // fs.writeFileSync(path.join(__dirname, "../../temp", `comment-${songId}-${pageCount}.json`), JSON.stringify(commentResult));
} catch (errors) { } catch (errors) {
console.error(errors); console.error(errors);
await sleepUtils.sleep(1000); await sleepUtils.sleep(1000);
continue; continue;
} }
var topComments = commentResult.body.hotComments || []; var topComments = commentResult.body.topComments || [];
var hotComments = commentResult.body.hotComments || []; var hotComments = commentResult.body.hotComments || [];
var comments = commentResult.body.hotComments || []; var comments = commentResult.body.comments || [];
function getCommitInfoForInsert(comment, commentType) {
return {
comment_id: comment.commentId,
parent_comment_id: comment.parentCommentId,
user_id: comment.user?.userId,
song_id: songId,
content: comment.content,
time: comment.time,
like_count: comment.likedCount,
comment_type: commentType, // 评论类型 0-comments 1-hotComments 2-topComments
}
}
function getUserInfoForInsert(comment) {
const user = comment.user;
var shortAvatarUrlUrl = user.avatarUrl.match(/^http:\/\/p\d+\.music\.126\.net\/(.*?)$/);
shortAvatarUrlUrl = shortAvatarUrlUrl ? shortAvatarUrlUrl[1] : user.avatarUrl;
return {
user_id: user.userId,
user_type: user.userType,
nickname: user.nickname,
avatar_url: shortAvatarUrlUrl || user.avatarUrl,
}
}
var commentInfoList = [ var commentInfoList = [
...topComments.map(comment => getCommitInfoForInsert(comment, 2)), ...topComments.map(comment => getCommitInfoForInsert(songId, comment, 2)),
...hotComments.map(comment => getCommitInfoForInsert(comment, 1)), ...hotComments.map(comment => getCommitInfoForInsert(songId, comment, 1)),
...comments.map(comment => getCommitInfoForInsert(comment, 0)) ...comments.map(comment => getCommitInfoForInsert(songId, comment, 0))
]; ];
var userInfoList = [...topComments, ...hotComments, ...comments] var userInfoList = [...topComments, ...hotComments, ...comments]
.filter(comment => comment.user).map(getUserInfoForInsert); .map(comment => comment.user).filter(user => !!user).map(getUserInfoForInsert);
console.log(commentInfoList); // console.log(commentInfoList);
// console.log(userInfoList); // console.log(userInfoList);
commentInfoList.forEach(async function (commentInfo) { for (let commentInfo of commentInfoList) {
let result = await dbUtils.query(` let result = await dbUtils.query(`
INSERT INTO comment ( comment_id, parent_comment_id, user_id, song_id, content, time, like_count, comment_type ) VALUES ? INSERT INTO comment ( comment_id, parent_comment_id, user_id, song_id, content, time, like_count, comment_type ) VALUES ?
ON DUPLICATE KEY UPDATE content = ? , like_count = ? , comment_type = GREATEST(comment_type, ? ), modify_time = CURRENT_TIMESTAMP ON DUPLICATE KEY UPDATE content = ?, like_count = ?, comment_type = GREATEST(comment_type, ?), modify_time = CURRENT_TIMESTAMP
`, [ `, [
[ [[
[ commentInfo.comment_id,
commentInfo.comment_id, commentInfo.parent_comment_id,
commentInfo.parent_comment_id, commentInfo.user_id,
commentInfo.user_id, commentInfo.song_id,
commentInfo.song_id, commentInfo.content,
commentInfo.content, commentInfo.time,
commentInfo.time, commentInfo.like_count,
commentInfo.like_count, commentInfo.comment_type
commentInfo.comment_type ]],
]
],
commentInfo.content, commentInfo.content,
commentInfo.like_count, commentInfo.like_count,
commentInfo.comment_type commentInfo.comment_type
]); ]);
console.log(result); // console.log(result);
}); }
// process.exit(0); for (let userInfo of userInfoList) {
let result = await dbUtils.query(`
INSERT INTO user ( user_id, user_type, nickname, avatar_url ) VALUES ?
ON DUPLICATE KEY UPDATE user_type = ?, nickname = ?, avatar_url = ?, modify_time = CURRENT_TIMESTAMP
`, [
[[
userInfo.user_id,
userInfo.user_type,
userInfo.nickname,
userInfo.avatar_url,
]],
userInfo.user_type,
userInfo.nickname,
userInfo.avatar_url
]);
// console.log(result);
}
// 判断是否完成 // console.log(commentResult.body.more, comments.length, commentInfoList.length);
// if(){
isFinish = true; // 判断是否还有下一页
// } if (commentResult.body.more && comments.length > 0) {
// 更新 queryParams // console.log("还没结束");
queryParams.before = 1111; // 更新 progress
// 更新 progress progress.currentTime = comments[comments.length - 1].time;
progress.maxTime = 1000; if (progress.maxTime == progress.minTime) { // minTime = maxTime 代表这是本轮爬取的第一次
progress.currentTime = 1; progress.maxTime = comments[0].time;
}
progress.currentStatus = 1; // 0-等待爬取/增量爬取 1-爬取中 2-完成
// 更新 queryParams
queryParams.before = progress.currentTime;
progress.total = commentResult.body.total;
} else {
isFinish = true;
console.log(`comment: ${songId} 结束了`);
progress.currentStatus = 2; // 0-等待爬取/增量爬取 1-爬取中 2-完成
progress.minTime = progress.maxTime; // minTime = maxTime 代表这一轮爬取完成了
progress.currentTime = progress.maxTime; // 可有可无
}
// progress更新到数据库中 // progress更新到数据库中
// // console.log("commentInfo", commentInfo); await dbUtils.query('UPDATE comment_progress SET ? WHERE song_id = ? LIMIT 1',[ {
// dbUtils.query('INSERT IGNORE INTO comment SET ?', { max_time: progress.maxTime,
// comment_id: commentInfo.commentId, min_time: progress.minTime,
// comment: commentInfo.comment, current_time: progress.currentTime,
// version: commentInfo.version, current_status: progress.currentStatus,
// }); total: progress.total,
}, songId]);
await sleepUtils.sleep(global.sleepTime);
} }
// return commentInfo; // return commentInfo;
} }
/**
 * Maps one comment object from the API response onto the column names
 * used by the `comment` table insert.
 *
 * @param {number} songId - Id of the song the comment belongs to.
 * @param {object} comment - Raw comment object (commentId, parentCommentId, user, content, time, likedCount).
 * @param {number} commentType - Comment type: 0-comments, 1-hotComments, 2-topComments.
 * @returns {object} Row object keyed by `comment` table column names.
 */
function getCommitInfoForInsert(songId, comment, commentType) {
    const { commentId, parentCommentId, user, content, time, likedCount } = comment;
    const row = {
        comment_id: commentId,
        parent_comment_id: parentCommentId,
        // The user object may be absent; user_id is then left undefined.
        user_id: user?.userId,
        song_id: songId,
        content: content,
        time: time,
        like_count: likedCount,
        comment_type: commentType,
    };
    return row;
}
/**
 * Maps a user object from the API response onto the column names used by
 * the `user` table insert.
 *
 * The avatar URL is shortened to the part after the
 * `http(s)://p<N>.music.126.net/` prefix when it has that prefix; any other
 * value is kept unchanged.
 *
 * @param {object} user - Raw user object (userId, userType, nickname, avatarUrl).
 * @returns {{user_id: *, user_type: *, nickname: *, avatar_url: string}}
 *   Row object keyed by `user` table column names.
 */
function getUserInfoForInsert(user) {
    // A missing avatar becomes '' instead of throwing a TypeError on `.match`.
    const avatarUrl = String(user.avatarUrl ?? '');
    // Accept both http and https so either scheme is stored in the short form.
    const matched = /^https?:\/\/p\d+\.music\.126\.net\/(.*?)$/.exec(avatarUrl);
    return {
        user_id: user.userId,
        user_type: user.userType,
        nickname: user.nickname,
        // An empty remainder (URL is only the prefix) falls back to the full URL.
        avatar_url: matched?.[1] || avatarUrl,
    };
}
module.exports = { module.exports = {
fetchAll: fetchAll, fetchAll: fetchAll,
fetch: fetch, fetch: fetch,