1
0
Code Issues Pull Requests Projects Releases Wiki Activity GitHub Gitee

插入关联表时同时插入 wait_check 表;将查询待爬取 id 的代码统一移到 dataManager.js

This commit is contained in:
程序员小墨 2022-10-25 19:36:05 +08:00
parent 4753fd55ae
commit 3660fefda4
9 changed files with 192 additions and 119 deletions

View File

@ -3,18 +3,31 @@ if (process.argv.length <= 2) {
"参数不够",
"node index --utils [song|album|artist|lyric|comment] --min [number] --max [number] --order [false|ASC|DESC] --limit [number]",
// "",
// "node index --utils song --min xxx --max xxx --order ASC --limit 2000",
// "node index --utils album --min xxx --max xxx --order ASC --limit 2000",
// "node index --utils artist --min xxx --max xxx --order ASC --limit 2000",
// "node index --utils lyric --min xxx --max xxx --order ASC --limit 2000",
// "node index --utils comment --min xxx --max xxx --order ASC --limit 2000",
// "node index --utils xxx --min xxx --max xxx --order ASC --limit 2000",
].join('\n');
console.log(output);
return;
}
// Entry script: parse the CLI flags with minimist, normalise them into a plain
// options object, publish two process-wide settings, then start ./src/index.
var args = require('minimist')(process.argv.slice(2));
// Rebuild `args` so only the known options are forwarded. For the numeric ones,
// `Number(x) || undefined` turns a missing, non-numeric or 0 value into undefined.
args = {
// sub-module to run
utils: args.utils,
// id range
min: Number(args.min) || undefined,
max: Number(args.max) || undefined,
// sort order (passed through unchanged)
order: args.order,
// maximum number of ids
limit: Number(args.limit) || undefined,
// partition
partition: Number(args.partition) || undefined,
// falls back to 100 when the flag is missing, non-numeric or 0
sleepTime: Number(args.sleepTime) || 100,
}
console.log("args:", args);
global.sleepTime = args.sleepTime; // pause between two requests
global.useMysqlPool = true;
// Required only after the globals above have been assigned.
const neteaseMusic = require('./src/index');
neteaseMusic.main(args);

View File

@ -212,26 +212,49 @@ CREATE TABLE `analysis` (
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
CREATE TABLE `wait_song` (
`id` int(10) unsigned NOT NULL COMMENT 'id',
`partition` tinyint(4) unsigned NOT NULL COMMENT '分区 0-4',
CREATE TABLE `wait_check_song` (
`id` bigint(20) unsigned NOT NULL COMMENT 'id',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
CREATE TABLE `wait_artist` (
`id` int(10) unsigned NOT NULL COMMENT 'id',
`partition` tinyint(4) unsigned NOT NULL COMMENT '分区 0-4',
CREATE TABLE `wait_check_artist` (
`id` bigint(20) unsigned NOT NULL COMMENT 'id',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
CREATE TABLE `wait_album` (
`id` int(10) unsigned NOT NULL COMMENT 'id',
`partition` tinyint(4) unsigned NOT NULL COMMENT '分区 0-4',
CREATE TABLE `wait_check_album` (
`id` bigint(20) unsigned NOT NULL COMMENT 'id',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
CREATE TABLE `wait_lyric` (
`id` int(10) unsigned NOT NULL COMMENT 'id',
`partition` tinyint(4) unsigned NOT NULL COMMENT '分区 0-4',
CREATE TABLE `wait_check_lyric` (
`id` bigint(20) unsigned NOT NULL COMMENT 'id',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
CREATE TABLE `wait_fetch_song` (
`id` bigint(20) unsigned NOT NULL COMMENT 'id',
`partition` tinyint(4) unsigned DEFAULT NULL COMMENT '分区 0-4',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
CREATE TABLE `wait_fetch_artist` (
`id` bigint(20) unsigned NOT NULL COMMENT 'id',
`partition` tinyint(4) unsigned DEFAULT NULL COMMENT '分区 0-4',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
CREATE TABLE `wait_fetch_album` (
`id` bigint(20) unsigned NOT NULL COMMENT 'id',
`partition` tinyint(4) unsigned DEFAULT NULL COMMENT '分区 0-4',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
CREATE TABLE `wait_fetch_lyric` (
`id` bigint(20) unsigned NOT NULL COMMENT 'id',
`partition` tinyint(4) unsigned DEFAULT NULL COMMENT '分区 0-4',
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;

View File

@ -20,7 +20,26 @@ module.exports = {
songInfo.noCopyrightRcmd, songInfo.mv, songInfo.single, songInfo.version, 2
])]);
},
getIdsToFetch: async (args) => {
let whereClause = [
args.min ? `song_id > ${args.min}` : '1=1',
args.max ? `song_id <= ${args.max}` : '1=1',
].join(' AND ');
let sql = `
SELECT song_id FROM wait_fetch_song WHERE ${whereClause}
${args.order ? `ORDER BY song_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
// // 更新现有数据
// sql = `SELECT song_id FROM song WHERE data_version = 1`;
console.log(sql);
let songIds = await dbUtils.query(sql, []);
songIds = songIds.map(item => item.song_id);
return songIds;
},
},
album: {
insert: async (albumInfo) => {
@ -29,20 +48,60 @@ module.exports = {
update: async (albumId, albumInfo) => {
return await dbUtils.query(`UPDATE album SET ? WHERE album_id = ${albumId}`, albumInfo);
}
},
getIdsToFetch: async (args, isUpdate) => {
let sql = "";
if (isUpdate) {
sql = `SELECT album_id FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:[:space:]*?。,更多.*$'`;
} else {
let whereClause = [
args.min ? `album_id > ${args.min}` : '1=1',
args.max ? `album_id <= ${args.max}` : '1=1',
].join(' AND ');
sql = `
SELECT album_id FROM wait_fetch_album WHERE ${whereClause}
${args.order ? `ORDER BY album_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
}
console.log(sql);
let albumIds = await dbUtils.query(sql, []);
albumIds = albumIds.map(item => item.album_id);
return albumIds;
},
},
artist: {
insert: async (artistInfo) => {
return await dbUtils.query('INSERT IGNORE INTO artist SET ?', artistInfo);
},
/**
 * Load the ids of the artists queued in `wait_fetch_artist`.
 *
 * @param {object} args - query options
 * @param {number} [args.min] - exclusive lower bound for the id; no bound when falsy
 * @param {number} [args.max] - inclusive upper bound for the id; no bound when falsy
 * @param {string} [args.order] - "ASC" or "DESC" (case-insensitive); any other value means no ordering
 * @param {number} [args.limit] - maximum number of rows; no limit when falsy
 * @returns {Promise<Array>} the queued ids, as returned by the database driver
 */
getIdsToFetch: async (args) => {
    // wait_fetch_artist is declared with an `id` column (there is no `artist_id`),
    // so both the filter and the projection must use `id`.
    const whereClause = [
        args.min ? `id > ${args.min}` : '1=1',
        args.max ? `id <= ${args.max}` : '1=1',
    ].join(' AND ');
    // Only a known direction may reach the SQL text; the CLI usage also allows
    // "false", which would otherwise produce an invalid ORDER BY clause.
    const order = ['ASC', 'DESC'].find(dir => dir === String(args.order).toUpperCase());
    const sql = `
        SELECT id FROM wait_fetch_artist WHERE ${whereClause}
        ${order ? `ORDER BY id ${order}` : ''}
        ${args.limit ? `LIMIT ${args.limit}` : ''}
    `;
    console.log(sql);
    const rows = await dbUtils.query(sql, []);
    return rows.map(row => row.id);
}
},
lyric: {
insert: async (lyricInfo) => {
return await dbUtils.query('INSERT IGNORE INTO lyric SET ?', lyricInfo);
}
},
},
comment: {
insertCollection: async (commentInfoList) => {
@ -51,8 +110,9 @@ module.exports = {
INSERT INTO comment ( comment_id, parent_comment_id, user_id, song_id, content, time, like_count, comment_type ) VALUES ?
ON DUPLICATE KEY UPDATE content = VALUES(content), like_count = VALUES(like_count), comment_type = GREATEST(comment_type, VALUES(comment_type)), modify_time = CURRENT_TIMESTAMP
`, [commentInfoList]);
}
},
},
comment_progress: {
update: async (commentProgressInfo, songId) => {
@ -60,6 +120,7 @@ module.exports = {
},
},
playlist: {
insertCollection: async (playlistInfo) => {
if (playlistInfo.length == 0) return;
@ -67,8 +128,9 @@ module.exports = {
INSERT INTO playlist ( ${Object.keys(playlistInfo).map(field => `\`${field}\``).join(",")} ) VALUES ?
ON DUPLICATE KEY UPDATE ${Object.keys(playlistInfo).map(field => `${field}=VALUES(${field})`).join(", ")}
`, [[Object.values(playlistInfo)]]);
}
},
},
user: {
insertCollection: async (userInfoList) => {
@ -77,27 +139,44 @@ module.exports = {
INSERT INTO user ( user_id, user_type, nickname, avatar_url ) VALUES ?
ON DUPLICATE KEY UPDATE user_type = VALUES(user_type), nickname = VALUES(nickname), avatar_url = VALUES(avatar_url), modify_time = CURRENT_TIMESTAMP
`, [userInfoList]);
}
},
},
song_album: {
insertCollection: async (songAlbumRel) => {
if (songAlbumRel.length == 0) return;
return await dbUtils.query('INSERT IGNORE INTO song_album_relation (song_id, album_id) VALUES ?', [songAlbumRel]);
}
},
},
song_artist: {
insertCollection: async (songArtistRel) => {
if (songArtistRel.length == 0) return;
return await dbUtils.query('INSERT IGNORE INTO song_artist_relation (song_id, artist_id) VALUES ?', [songArtistRel]);
}
},
},
song_playlist: {
insertCollection: async (trackIds) => {
if (trackIds.length == 0) return;
return await dbUtils.query('INSERT IGNORE INTO song_playlist_relation (song_id, playlist_id, alg, rcmd_reason) VALUES ?', [trackIds]);
}
},
},
/* ##################################################### */
// 将 id 插入待检查表
wait_check: {
insert: async (type, ids) => {
// 过滤掉 id 为 0 的
ids = ids.filter(id => id < 0);
return await dbUtils.query(`INSERT IGNORE INTO wait_check_${type} (id) VALUES ?`, [ids]);
},
},
};

View File

@ -29,28 +29,7 @@ SELECT * FROM album WHERE (full_description = '' or full_description is null) an
async function fetchAll({ args = {}, isUpdate = false }) {
console.log("start fetching albums ...");
if (isUpdate) {
var sql = `
SELECT album_id FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:[:space:]*?。,更多.*$'
`;
} else {
let whereClause = [
args.min ? `album_id > ${args.min}` : '1=1',
args.max ? `album_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql = `
-- 查出来通过代码去重提高速度
SELECT album_id FROM song_album_relation WHERE ${whereClause} AND album_id NOT IN ( SELECT album_id FROM album )
${args.order ? `ORDER BY album_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
console.log(sql);
}
var albumIds = await dbUtils.query(sql, []);
albumIds = albumIds.map(item => item.album_id);
albumIds = Array.from(new Set(albumIds));
let albumIds = await dataManager.album.getIdsToFetch(args, isUpdate);
for (let i = 0; i < albumIds.length; i++) {
await global.checkIsExit();
const albumId = albumIds[i];
@ -147,7 +126,7 @@ async function fetch({ albumId, debug = false, update = false }) {
let image = /<meta property="og:image" content="http:\/\/p.\.music\.126\.net\/(.*?)" \/>/.exec(html)[1];
let songListJSONString = /<textarea id="song-list-pre-data" style="display:none;">(.*?)<\/textarea>/.exec(html)[1];
let songList = JSON.parse(songListJSONString);
let songIds = songList.map(song => song.id);
let songIds = songList.map(song => Number(song.id));
let albumInfo = {
album_id: albumId,
@ -160,8 +139,10 @@ async function fetch({ albumId, debug = false, update = false }) {
version: 1
};
// console.log("albumInfo", albumInfo);
await dataManager.wait_check.insert("song", songIds);
if (albumId > 0) {
let songAlbumRel = songIds.map(songId => [Number(songId), albumId]);
let songAlbumRel = songIds.map(songId => [songId, albumId]);
await dataManager.song_album.insertCollection(songAlbumRel);
}

View File

@ -25,22 +25,7 @@ async function getFromDatabase({ artistId }) {
// 从数据库中查出还缺少的歌手,并进行爬取
async function fetchAll({ args = {} }) {
console.log("start fetching artists ...");
let whereClause = [
args.min ? `artist_id > ${args.min}` : '1=1',
args.max ? `artist_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql = `
-- 查出来通过代码去重提高速度
-- SELECT DISTINCT artist_id FROM song_artist_relation WHERE ${whereClause} AND artist_id NOT IN ( SELECT artist_id FROM artist )
SELECT artist_id FROM song_artist_relation WHERE ${whereClause} AND artist_id NOT IN ( SELECT artist_id FROM artist )
${args.order ? `ORDER BY artist_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
console.log(sql);
var artistIds = await dbUtils.query(sql, []);
artistIds = artistIds.map(item => item.artist_id);
artistIds = Array.from(new Set(artistIds));
let artistIds = await dataManager.artist.getIdsToFetch(args);
for (let i = 0; i < artistIds.length; i++) {
await global.checkIsExit();
const artistId = artistIds[i];
@ -97,7 +82,7 @@ async function fetch({ artistId, debug = false }) {
try {
let songListJSONString = /<textarea id="song-list-pre-data" style="display:none;">(.*?)<\/textarea>/.exec(html)[1];
let songList = JSON.parse(songListJSONString);
songIds = songList.map(song => song.id);
songIds = songList.map(song => Number(song.id));
} catch (error) {
// 可能是歌手下面没有音乐 例如https://music.163.com/#/artist?id=30032762
}
@ -111,8 +96,9 @@ async function fetch({ artistId, debug = false }) {
};
// console.log("artistInfo", artistInfo);
await dataManager.wait_check.insert("song", songIds);
if (artistId > 0) {
let songArtistRel = songIds.map(songId => [Number(songId), artistId]);
let songArtistRel = songIds.map(songId => [songId, artistId]);
await dataManager.song_artist.insertCollection(songArtistRel);
}

View File

@ -173,6 +173,7 @@ async function fetch({ playlistId, debug = false }) {
process.exit(0);
}
await dataManager.wait_check.insert("song", playlist.trackIds.map(track => track.id));
let trackIds = playlist.trackIds.map(track => [track.id, playlist.id, track.alg, track.rcmdReason]);
await dataManager.song_playlist.insertCollection(trackIds);
await dataManager.playlist.insertCollection(playlistInfo);

View File

@ -1,6 +1,7 @@
const fs = require('fs');
const path = require('path');
const requestUtils = require('../../../utils/requestUtils');
const sleepUtils = require('../../../utils/sleepUtils');
const dataManager = require('../dataManager');
@ -11,35 +12,7 @@ const { song_detail } = require('NeteaseCloudMusicApi');
// 从数据库中查出还缺少的歌曲,并进行爬取
async function fetchAll({ args = {} }) {
console.log("start fetching songs ...");
let whereClause = [
args.min ? `song_id > ${args.min}` : '1=1',
args.max ? `song_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql1 = `
SELECT song_id FROM song_artist_relation WHERE ${whereClause} AND song_id NOT IN ( SELECT song_id FROM song )
${args.order ? `ORDER BY song_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
var sql2 = `
SELECT song_id FROM song_album_relation WHERE ${whereClause} AND song_id NOT IN ( SELECT song_id FROM song )
${args.order ? `ORDER BY song_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
// // 更新现有数据
// sql = `SELECT song_id FROM song WHERE data_version = 1`;
// 测试用
// sql = `SELECT song_id FROM song_artist_relation group by song_id limit 10`;
console.log(sql1);
var songIds1 = await dbUtils.query(sql1, []);
songIds1 = songIds1.map(item => item.song_id);
console.log(sql2);
var songIds2 = await dbUtils.query(sql2, []);
songIds2 = songIds2.map(item => item.song_id);
var songIds = songIds1.concat(songIds2);
songIds = Array.from(new Set(songIds)); // 去重
let songIds = await dataManager.song.getIdsToFetch(args);
// 0 - 100, 200 - 399, 400 - ..., ... - songIds.length-1
// 0 1 2 count-1
var step = 1000;
@ -71,9 +44,14 @@ async function fetch({ songIdArray, debug = false }) {
}
// console.log(songResult.body.songs.map(item => JSON.stringify(item)));
let albumIds = [], artistIds = [];
let songAlbumRel = [], songArtistRel = [];
let songInfoList = songResult.body.songs.map(song => {
song.ar.forEach(item => songArtistRel.push([song.id, item.id]));
song.ar.forEach(item => {
artistIds.push(item.id);
songArtistRel.push([song.id, item.id])
});
albumIds.push(song.al.id || 0);
songAlbumRel.push([song.id, song.al.id || 0])
return {
title: song.name, // 歌曲标题
@ -107,6 +85,8 @@ async function fetch({ songIdArray, debug = false }) {
if (songInfoList.length == 0) return;
console.log("插入数据库");
await dataManager.wait_check.insert("album", albumIds);
await dataManager.wait_check.insert("artist", artistIds);
await dataManager.song_album.insertCollection(songAlbumRel);
await dataManager.song_artist.insertCollection(songArtistRel);
await dataManager.song.insertCollection(songInfoList); // image 因为接口没有返回,所以不更新

View File

@ -12,9 +12,6 @@ dbUtils.create({
global.dbUtils = dbUtils;
console.log("global.useMysqlPool:", !!global.useMysqlPool);
// 两次请求之间停顿时间
global.sleepTime = 10;
// 引入utils
const songInfoUtils = require('./getInfo/songInfoUtils');
const artistInfoUtils = require('./getInfo/artistInfoUtils');
@ -57,19 +54,26 @@ async function main(args) {
// var affectedRows2 = await dbUtils.query(`DELETE FROM song_album_relation WHERE song_id = 0 OR album_id = 0`, []);
// console.log(`删除脏数据 affectedRows:`, affectedRows1.affectedRows, affectedRows2.affectedRows);
if (args.utils == "song")
switch (args.utils) {
case 'song':
await songInfoUtils.fetchAll({ args: args });
else if (args.utils == "album")
break;
case 'album':
await albumInfoUtils.fetchAll({ args: args });
else if (args.utils == "artist")
break;
case 'artist':
await artistInfoUtils.fetchAll({ args: args });
else if (args.utils == "lyric")
break;
case 'lyric':
await lyricInfoUtils.fetchAll({ args: args });
else if (args.utils == "comment")
break;
case 'comment':
await commentUtils.fetchAll({ args: args });
else if (args.utils == "playlist")
break;
case 'playlist':
await playlistUtils.fetchAll({ args: args });
else {
break;
default:
console.log("utils参数不匹配退出");
return;
}

View File

@ -22,15 +22,21 @@ node index --utils lyric --min 0 --max 400000000
node index --utils playlist #
后期:
思路:
通过一首歌查出对应的artist和album然后顺藤摸瓜查出网易云的其他song, album, artist, lyric, comment等
批量查库修改为一条SQL搞定
插入rel表的时候同时插入 wait_check_xx 表,然后后续检查这个表,如果不存在,那么就插入对应的 wait_fetch_xxx 表
之后查出 wait_fetch_xxx 表,进行数据拉取,形成闭环
后期:
歌单定时更新rel表中添加一个del字段先将歌单下面的全部置为删除状态再插入的时候把已有歌曲的标记重新修改为正常状态
评论的更新
爬取歌单playlist
爬取歌单playlist功能需要更新
被删除的 artist 和 album,回头再通过其他表中的数据反查回来