1
0
Code Issues Pull Requests Projects Releases Wiki Activity GitHub Gitee
This commit is contained in:
程序员小墨 2022-10-02 19:16:41 +08:00
parent 1939398579
commit 326201fb2f
3 changed files with 108 additions and 73 deletions

View File

@ -56,6 +56,7 @@ async function main() {
await albumInfoUtils.fetchAll({});
await artistInfoUtils.fetchAll();
await lyricInfoUtils.fetchAll();
await commentUtils.fetchAll();
await sleepUtils.sleep(2000);
}
}

View File

@ -61,8 +61,8 @@ CREATE TABLE `lyric` (
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
CREATE TABLE `user` (
`user_id` int(10) unsigned NOT NULL COMMENT '用户id',
`user_type` tinyint(4) unsigned NOT NULL COMMENT '用户类型',
`user_id` bigint(20) unsigned NOT NULL COMMENT '用户id',
`user_type` varchar(50) NOT NULL COMMENT '用户类型',
`nickname` varchar(200) NOT NULL COMMENT '用户昵称',
`avatar_url` varchar(200) NOT NULL COMMENT '用户头像 http://p1.music.126.net/ 后面的部分',
`create_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '爬取时间',
@ -72,8 +72,8 @@ CREATE TABLE `user` (
CREATE TABLE `comment` (
`comment_id` bigint(20) unsigned NOT NULL COMMENT '评论id',
`parent_comment_id` int(10) unsigned NOT NULL COMMENT '父评论id',
`user_id` int(10) unsigned NOT NULL COMMENT '用户id',
`parent_comment_id` bigint(20) unsigned NOT NULL COMMENT '父评论id',
`user_id` bigint(20) unsigned NOT NULL COMMENT '用户id',
`song_id` int(10) unsigned NOT NULL COMMENT '歌曲id',
`content` text NOT NULL COMMENT '评论内容',
`time` varchar(50) NOT NULL DEFAULT '' COMMENT '评论时间',
@ -86,9 +86,9 @@ CREATE TABLE `comment` (
CREATE TABLE `comment_progress` (
`song_id` int(10) unsigned NOT NULL COMMENT '歌曲id',
`max_time` int(10) NOT NULL DEFAULT 0 COMMENT '开始爬取/开始增量爬取的时候 最新一条评论的时间',
`min_time` int(10) NOT NULL DEFAULT 0 COMMENT '上一次爬取时最后一条评论的时间 第一次爬取时为0',
`current_time` int(10) NOT NULL DEFAULT 0 COMMENT '本次爬取/增量时,最早的一条评论时间',
`max_time` bigint(20) NOT NULL DEFAULT 0 COMMENT '开始爬取/开始增量爬取的时候 最新一条评论的时间',
`min_time` bigint(20) NOT NULL DEFAULT 0 COMMENT '上一次爬取时最后一条评论的时间 第一次爬取时为0',
`current_time` bigint(20) NOT NULL DEFAULT 0 COMMENT '本次爬取/增量时,最早的一条评论时间',
`current_status` tinyint(4) unsigned NOT NULL DEFAULT 0 COMMENT '爬取进度 0-等待爬取/增量爬取 1-爬取中 2-完成',
`total` int(10) NOT NULL DEFAULT 0 COMMENT '评论总数',
`create_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '爬取时间',

View File

@ -18,7 +18,7 @@ async function fetchAll() {
// 首先查询有无正在爬取中的记录
var songIds = await dbUtils.query(`
SELECT song_id FROM comment_progress WHERE current_status != 2 LIMIT 1
SELECT song_id FROM comment_progress WHERE current_status != 2
`, []);
songIds = songIds.map(item => item.song_id);
@ -74,108 +74,142 @@ async function fetch({ songId, debug = false }) {
limit: 20,
// before: undefined,
};
console.log(progress);
if (progress.currentTime != 0)
queryParams.before = progress.currentTime;
let isFinish = false;
let isFinish = false; let pageCount = 0;
while (!isFinish) {
await global.checkIsExit();
console.log(`comment: ${songId}, 页数: ${++pageCount}`);
// 是否是第一页
let isFirstPage = progress.currentStatus === 0;
try {
// console.log(progress, queryParams);
var commentResult = await comment_music(queryParams);
fs.writeFileSync(path.join(__dirname, "../../temp", `comment-${songId}.json`), JSON.stringify(commentResult));
// fs.writeFileSync(path.join(__dirname, "../../temp", `comment-${songId}-${pageCount}.json`), JSON.stringify(commentResult));
} catch (errors) {
console.error(errors);
await sleepUtils.sleep(1000);
continue;
}
var topComments = commentResult.body.hotComments || [];
var topComments = commentResult.body.topComments || [];
var hotComments = commentResult.body.hotComments || [];
var comments = commentResult.body.hotComments || [];
var comments = commentResult.body.comments || [];
function getCommitInfoForInsert(comment, commentType) {
return {
comment_id: comment.commentId,
parent_comment_id: comment.parentCommentId,
user_id: comment.user?.userId,
song_id: songId,
content: comment.content,
time: comment.time,
like_count: comment.likedCount,
comment_type: commentType, // 评论类型 0-comments 1-hotComments 2-topComments
}
}
function getUserInfoForInsert(comment) {
const user = comment.user;
var shortAvatarUrlUrl = user.avatarUrl.match(/^http:\/\/p\d+\.music\.126\.net\/(.*?)$/);
shortAvatarUrlUrl = shortAvatarUrlUrl ? shortAvatarUrlUrl[1] : user.avatarUrl;
return {
user_id: user.userId,
user_type: user.userType,
nickname: user.nickname,
avatar_url: shortAvatarUrlUrl || user.avatarUrl,
}
}
var commentInfoList = [
...topComments.map(comment => getCommitInfoForInsert(comment, 2)),
...hotComments.map(comment => getCommitInfoForInsert(comment, 1)),
...comments.map(comment => getCommitInfoForInsert(comment, 0))
...topComments.map(comment => getCommitInfoForInsert(songId, comment, 2)),
...hotComments.map(comment => getCommitInfoForInsert(songId, comment, 1)),
...comments.map(comment => getCommitInfoForInsert(songId, comment, 0))
];
var userInfoList = [...topComments, ...hotComments, ...comments]
.filter(comment => comment.user).map(getUserInfoForInsert);
.map(comment => comment.user).filter(user => !!user).map(getUserInfoForInsert);
console.log(commentInfoList);
// console.log(commentInfoList);
// console.log(userInfoList);
commentInfoList.forEach(async function (commentInfo) {
for (let commentInfo of commentInfoList) {
let result = await dbUtils.query(`
INSERT INTO comment ( comment_id, parent_comment_id, user_id, song_id, content, time, like_count, comment_type ) VALUES ?
ON DUPLICATE KEY UPDATE content = ? , like_count = ? , comment_type = GREATEST(comment_type, ? ), modify_time = CURRENT_TIMESTAMP
ON DUPLICATE KEY UPDATE content = ?, like_count = ?, comment_type = GREATEST(comment_type, ?), modify_time = CURRENT_TIMESTAMP
`, [
[
[
commentInfo.comment_id,
commentInfo.parent_comment_id,
commentInfo.user_id,
commentInfo.song_id,
commentInfo.content,
commentInfo.time,
commentInfo.like_count,
commentInfo.comment_type
]
],
[[
commentInfo.comment_id,
commentInfo.parent_comment_id,
commentInfo.user_id,
commentInfo.song_id,
commentInfo.content,
commentInfo.time,
commentInfo.like_count,
commentInfo.comment_type
]],
commentInfo.content,
commentInfo.like_count,
commentInfo.comment_type
]);
console.log(result);
});
// console.log(result);
}
// process.exit(0);
for (let userInfo of userInfoList) {
let result = await dbUtils.query(`
INSERT INTO user ( user_id, user_type, nickname, avatar_url ) VALUES ?
ON DUPLICATE KEY UPDATE user_type = ?, nickname = ?, avatar_url = ?, modify_time = CURRENT_TIMESTAMP
`, [
[[
userInfo.user_id,
userInfo.user_type,
userInfo.nickname,
userInfo.avatar_url,
]],
userInfo.user_type,
userInfo.nickname,
userInfo.avatar_url
]);
// console.log(result);
}
// 判断是否完成
// if(){
isFinish = true;
// }
// 更新 queryParams
queryParams.before = 1111;
// 更新 progress
progress.maxTime = 1000;
progress.currentTime = 1;
// console.log(commentResult.body.more, comments.length, commentInfoList.length);
// 判断是否还有下一页
if (commentResult.body.more && comments.length > 0) {
// console.log("还没结束");
// 更新 progress
progress.currentTime = comments[comments.length - 1].time;
if (progress.maxTime == progress.minTime) { // minTime = maxTime 代表这是本轮爬取的第一次
progress.maxTime = comments[0].time;
}
progress.currentStatus = 1; // 0-等待爬取/增量爬取 1-爬取中 2-完成
// 更新 queryParams
queryParams.before = progress.currentTime;
progress.total = commentResult.body.total;
} else {
isFinish = true;
console.log(`comment: ${songId} 结束了`);
progress.currentStatus = 2; // 0-等待爬取/增量爬取 1-爬取中 2-完成
progress.minTime = progress.maxTime; // minTime = maxTime 代表这一轮爬取完成了
progress.currentTime = progress.maxTime; // 可有可无
}
// progress更新到数据库中
// // console.log("commentInfo", commentInfo);
// dbUtils.query('INSERT IGNORE INTO comment SET ?', {
// comment_id: commentInfo.commentId,
// comment: commentInfo.comment,
// version: commentInfo.version,
// });
await dbUtils.query('UPDATE comment_progress SET ? WHERE song_id = ? LIMIT 1',[ {
max_time: progress.maxTime,
min_time: progress.minTime,
current_time: progress.currentTime,
current_status: progress.currentStatus,
total: progress.total,
}, songId]);
await sleepUtils.sleep(global.sleepTime);
}
// return commentInfo;
}
/**
 * Map one comment object from the comment API response to the column layout
 * used when inserting into the `comment` table.
 *
 * @param {*} songId - id of the song the comment belongs to; stored as `song_id`.
 * @param {object} comment - raw comment; reads commentId, parentCommentId,
 *   user, content, time and likedCount.
 * @param {number} commentType - 0 = comments, 1 = hotComments, 2 = topComments.
 * @returns {object} row object keyed by `comment` table column names.
 */
function getCommitInfoForInsert(songId, comment, commentType) {
    const { commentId, parentCommentId, user, content, time, likedCount } = comment;
    // A comment may arrive without a user object; user_id is then undefined.
    const authorId = user?.userId;

    const row = {};
    row.comment_id = commentId;
    row.parent_comment_id = parentCommentId;
    row.user_id = authorId;
    row.song_id = songId;
    row.content = content;
    row.time = time;
    row.like_count = likedCount;
    row.comment_type = commentType;
    return row;
}
/**
 * Map a user object from the comment API response to the column layout used
 * when inserting into the `user` table.
 *
 * The avatar URL is stored without its `http://p<digits>.music.126.net/`
 * prefix when it has one; any other URL is stored unchanged. A missing
 * (null/undefined) avatar URL is stored as an empty string instead of
 * throwing.
 *
 * @param {object} user - raw user; reads userId, userType, nickname, avatarUrl.
 * @returns {object} row object with user_id, user_type, nickname, avatar_url.
 */
function getUserInfoForInsert(user) {
    // Guard against a missing avatarUrl: calling .match on null/undefined
    // would throw and abort the caller.
    const avatarUrl = user.avatarUrl ?? '';
    const matched = avatarUrl.match(/^http:\/\/p\d+\.music\.126\.net\/(.*?)$/);
    const shortAvatarUrl = matched ? matched[1] : avatarUrl;
    return {
        user_id: user.userId,
        user_type: user.userType,
        nickname: user.nickname,
        // An empty capture (the URL is only the host prefix) falls back to the full URL.
        avatar_url: shortAvatarUrl || avatarUrl,
    };
}
module.exports = {
fetchAll: fetchAll,
fetch: fetch,