1
0
Code Issues Pull Requests Projects Releases Wiki Activity GitHub Gitee

通过命令行指定爬取参数,不用再修改代码了

This commit is contained in:
2022-10-06 21:06:09 +08:00
parent 9db9383934
commit be2658375c
11 changed files with 202 additions and 48 deletions

View File

@@ -1,3 +1,20 @@
// CLI entry guard. process.argv[0] is the node executable and process.argv[1] is
// the script path, so a length of 2 or less means no options were passed.
if (process.argv.length <= 2) {
// Usage text: a "not enough arguments" notice followed by the accepted options
// (--utils, --min, --max, --order, --limit), joined with newlines.
let output = [
"参数不够",
"node index --utils [song|album|artist|lyric|comment] --min [number] --max [number] --order [false|ASC|DESC] --limit [number]",
// "",
// "node index --utils song --min xxx --max xxx --order ASC --limit 2000",
// "node index --utils album --min xxx --max xxx --order ASC --limit 2000",
// "node index --utils artist --min xxx --max xxx --order ASC --limit 2000",
// "node index --utils lyric --min xxx --max xxx --order ASC --limit 2000",
// "node index --utils comment --min xxx --max xxx --order ASC --limit 2000",
].join('\n');
console.log(output);
// Top-level return ends the script before anything else is loaded.
// NOTE(review): this is only legal when the file runs as a CommonJS module
// (it uses require below); it would be a syntax error in an ES module.
return;
}
// Parse everything after the script path into an options object with minimist.
var args = require('minimist')(process.argv.slice(2));
// Echo the parsed options so each run records which parameters it was started with.
console.log("args:", args);
global.useMysqlPool = true; global.useMysqlPool = true;
const neteaseMusic = require('./netease_music/index'); const neteaseMusic = require('./netease_music/index');
neteaseMusic.main(); neteaseMusic.main(args);

View File

@@ -13,7 +13,7 @@ global.dbUtils = dbUtils;
console.log("global.useMysqlPool:", !!global.useMysqlPool); console.log("global.useMysqlPool:", !!global.useMysqlPool);
// 两次请求之间停顿时间 // 两次请求之间停顿时间
global.sleepTime = 300; global.sleepTime = 10;
// 引入utils // 引入utils
const songInfoUtils = require('./src/getInfo/songInfoUtils'); const songInfoUtils = require('./src/getInfo/songInfoUtils');
@@ -46,7 +46,7 @@ async function test() {
/** /**
* 主函数 * 主函数
*/ */
async function main() { async function main(args) {
console.log("neteaseMusic Start fetch ..."); console.log("neteaseMusic Start fetch ...");
while (true) { while (true) {
// // 删除脏数据 // // 删除脏数据
@@ -54,11 +54,20 @@ async function main() {
// var affectedRows2 = await dbUtils.query(`DELETE FROM song_album_relation WHERE song_id = 0 OR album_id = 0`, []); // var affectedRows2 = await dbUtils.query(`DELETE FROM song_album_relation WHERE song_id = 0 OR album_id = 0`, []);
// console.log(`删除脏数据 affectedRows:`, affectedRows1.affectedRows, affectedRows2.affectedRows); // console.log(`删除脏数据 affectedRows:`, affectedRows1.affectedRows, affectedRows2.affectedRows);
await songInfoUtils.fetchAll(); if (args.utils == "song")
await albumInfoUtils.fetchAll({}); await songInfoUtils.fetchAll({ args: args });
await artistInfoUtils.fetchAll(); else if (args.utils == "album")
await lyricInfoUtils.fetchAll(); await albumInfoUtils.fetchAll({ args: args });
await commentUtils.fetchAll(); else if (args.utils == "artist")
await artistInfoUtils.fetchAll({ args: args });
else if (args.utils == "lyric")
await lyricInfoUtils.fetchAll({ args: args });
else if (args.utils == "comment")
await commentUtils.fetchAll({ args: args });
else {
console.log("utils参数不匹配退出");
return;
}
await sleepUtils.sleep(2000); await sleepUtils.sleep(2000);
} }
} }

View File

@@ -1,4 +1,4 @@
-- 查看需要爬取的音乐的分布 -- 查看需要爬取的 song 的分布
SELECT cast( format( t_tmp.song_id / 10000000, 0) * 10000000 as UNSIGNED ) as s, count(*) as count SELECT cast( format( t_tmp.song_id / 10000000, 0) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM ( FROM (
SELECT DISTINCT song_id FROM song_album_relation SELECT DISTINCT song_id FROM song_album_relation
@@ -9,6 +9,35 @@
GROUP BY s GROUP BY s
ORDER BY s DESC ORDER BY s DESC
-- 查看需要爬取的 album 的分布
SELECT cast( format( album_id / 1000000, 0) * 1000000 as UNSIGNED ) as s, count(*) as count
FROM song_album_relation
WHERE album_id NOT IN ( SELECT album_id FROM album )
GROUP BY s
ORDER BY s DESC
-- 查看需要爬取的 artist 的分布
SELECT cast( format( artist_id / 2000000, 0) * 2000000 as UNSIGNED ) as s, count(*) as count
FROM song_artist_relation
WHERE artist_id NOT IN ( SELECT artist_id FROM artist )
GROUP BY s
ORDER BY s DESC
-- 查看需要爬取的 comment 的分布
SELECT cast( format( song_id / 10000000, 0) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM comment_progress
WHERE current_status != 2
GROUP BY s
ORDER BY s DESC
-- 查看需要爬取的 lyric 的分布
SELECT cast( format( song_id / 10000000, 0) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM song
WHERE song_id NOT IN ( SELECT song_id FROM lyric )
GROUP BY s
ORDER BY s DESC
-- optimize table -- optimize table
optimize table album; optimize table album;
optimize table artist; optimize table artist;

View File

@@ -23,19 +23,33 @@ async function getFromDatabase({ albumId }) {
// 正常应该查不出记录才对 // 正常应该查不出记录才对
/* /*
SELECT * FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:[:space:]*?。,更多.*$' SELECT * FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:([:space:]*?|[ ]*?)。,更多.*$'
*/ */
async function fetchAll({ isUpdate = false }) { async function fetchAll({ args = {}, isUpdate = false }) {
console.log("start fetching albums ...") console.log("start fetching albums ...");
var albumIds = await dbUtils.query(isUpdate
? `SELECT DISTINCT album_id FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:[:space:]*?。,更多.*$'` if (isUpdate) {
: `SELECT DISTINCT album_id FROM song_album_relation WHERE album_id NOT IN ( SELECT album_id FROM album )`, []); var sql = `SELECT DISTINCT album_id FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:[:space:]*?。,更多.*$'`;
} else {
let whereClause = [
args.min ? `album_id > ${args.min}` : '1=1',
args.max ? `album_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql = `
SELECT DISTINCT album_id FROM song_album_relation WHERE ${whereClause} AND album_id NOT IN ( SELECT album_id FROM album )
${args.order ? `ORDER BY album_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
console.log(sql);
}
var albumIds = await dbUtils.query(sql, []);
albumIds = albumIds.map(item => item.album_id); albumIds = albumIds.map(item => item.album_id);
for (let i = 0; i < albumIds.length; i++) { for (let i = 0; i < albumIds.length; i++) {
await global.checkIsExit(); await global.checkIsExit();
const albumId = albumIds[i]; const albumId = albumIds[i];
console.log(`${i + 1}/${albumIds.length} | album: ${albumId}`); console.log(`${i + 1}/${albumIds.length} | album: ${albumId} | ${args.min ?? "?"}-${args.max ?? "?"}`);
try { try {
await fetch({ albumId: albumId, update: isUpdate }); await fetch({ albumId: albumId, update: isUpdate });
} catch (err) { } catch (err) {

View File

@@ -22,16 +22,25 @@ async function getFromDatabase({ artistId }) {
} }
// 从数据库中查出还缺少的歌手,并进行爬取 // 从数据库中查出还缺少的歌手,并进行爬取
async function fetchAll() { async function fetchAll({ args = {} }) {
console.log("start fetching artists ...") console.log("start fetching artists ...");
var artistIds = await dbUtils.query(` let whereClause = [
SELECT DISTINCT artist_id FROM song_artist_relation WHERE artist_id NOT IN ( SELECT artist_id FROM artist ) args.min ? `artist_id > ${args.min}` : '1=1',
`, []); args.max ? `artist_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql = `
SELECT DISTINCT artist_id FROM song_artist_relation WHERE ${whereClause} AND artist_id NOT IN ( SELECT artist_id FROM artist )
${args.order ? `ORDER BY artist_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
console.log(sql);
var artistIds = await dbUtils.query(sql, []);
artistIds = artistIds.map(item => item.artist_id); artistIds = artistIds.map(item => item.artist_id);
for (let i = 0; i < artistIds.length; i++) { for (let i = 0; i < artistIds.length; i++) {
await global.checkIsExit(); await global.checkIsExit();
const artistId = artistIds[i]; const artistId = artistIds[i];
console.log(`${i + 1}/${artistIds.length} | artist: ${artistId}`); console.log(`${i + 1}/${artistIds.length} | artist: ${artistId} | ${args.min ?? "?"}-${args.max ?? "?"}`);
try { try {
await fetch({ artistId: artistId }); await fetch({ artistId: artistId });
} catch (err) { } catch (err) {

View File

@@ -11,28 +11,33 @@ const dbUtils = global.dbUtils;
// https://github.com/Binaryify/NeteaseCloudMusicApi // https://github.com/Binaryify/NeteaseCloudMusicApi
const { comment_music } = require('NeteaseCloudMusicApi'); const { comment_music } = require('NeteaseCloudMusicApi');
async function fetchAll() { async function fetchAll({ args = {} }) {
console.log("start fetching comment ...") console.log("start fetching comment ...");
// 首先将需要爬取的song_id导入comment_progress表 // 首先将需要爬取的song_id导入comment_progress表
await dbUtils.query(` await dbUtils.query(`
INSERT INTO comment_progress ( song_id ) INSERT IGNORE INTO comment_progress ( song_id )
SELECT DISTINCT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM comment_progress ) SELECT DISTINCT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM comment_progress )
`, []); `, []);
let whereClause = [
args.min ? `song_id > ${args.min}` : '1=1',
args.max ? `song_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql = `
SELECT song_id FROM comment_progress WHERE ${whereClause} AND current_status != 2
ORDER BY current_status DESC${args.order ? `, song_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
console.log(sql);
// 首先查询有无正在爬取中的记录 // 首先查询有无正在爬取中的记录
var songIds = await dbUtils.query(` var songIds = await dbUtils.query(sql, []);
-- 本机
SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id <= 30000000 LIMIT 1000
-- SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id < 30000000 ORDER BY current_status DESC
-- 服务器
-- SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id > 30000000 ORDER BY current_status DESC
`, []);
songIds = songIds.map(item => item.song_id); songIds = songIds.map(item => item.song_id);
for (let i = 0; i < songIds.length; i++) { for (let i = 0; i < songIds.length; i++) {
await global.checkIsExit(); await global.checkIsExit();
const songId = songIds[i]; const songId = songIds[i];
console.log(`${i + 1}/${songIds.length} | comment: ${songId}`); console.log(`${i + 1}/${songIds.length} | comment: ${songId} | ${args.min ?? "?"}-${args.max ?? "?"}`);
try { try {
await fetch({ songId: songId }); await fetch({ songId: songId });
} catch (err) { } catch (err) {

View File

@@ -7,11 +7,20 @@ const sleepUtils = require('../../../utils/sleepUtils');
const dbUtils = global.dbUtils; const dbUtils = global.dbUtils;
// 从数据库中查出还缺少的歌词,并进行爬取 // 从数据库中查出还缺少的歌词,并进行爬取
async function fetchAll() { async function fetchAll({ args = {} }) {
console.log("start fetching lyrics ..."); console.log("start fetching lyrics ...");
var songIds = await dbUtils.query(` let whereClause = [
SELECT DISTINCT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM lyric ) args.min ? `song_id > ${args.min}` : '1=1',
`, []); args.max ? `song_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql = `
SELECT DISTINCT song_id FROM song WHERE ${whereClause} AND song_id NOT IN ( SELECT song_id FROM lyric )
${args.order ? `ORDER BY song_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
console.log(sql);
var songIds = await dbUtils.query(sql, []);
songIds = songIds.map(song => song.song_id); songIds = songIds.map(song => song.song_id);
for (let i = 0; i < songIds.length; i++) { for (let i = 0; i < songIds.length; i++) {
await global.checkIsExit(); await global.checkIsExit();

View File

@@ -24,18 +24,27 @@ async function getFromDatabase({ songId }) {
} }
// 从数据库中查出还缺少的歌曲,并进行爬取 // 从数据库中查出还缺少的歌曲,并进行爬取
async function fetchAll() { async function fetchAll({ args = {} }) {
console.log("start fetching songs ..."); console.log("start fetching songs ...");
var songIds = await dbUtils.query(` let whereClause = [
SELECT DISTINCT song_id FROM song_artist_relation WHERE song_id NOT IN ( SELECT song_id FROM song ) args.min ? `song_id > ${args.min}` : '1=1',
args.max ? `song_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql = `
SELECT DISTINCT song_id FROM song_artist_relation WHERE ${whereClause} AND song_id NOT IN ( SELECT song_id FROM song )
UNION UNION
SELECT DISTINCT song_id FROM song_album_relation WHERE song_id NOT IN ( SELECT song_id FROM song ) SELECT DISTINCT song_id FROM song_album_relation WHERE ${whereClause} AND song_id NOT IN ( SELECT song_id FROM song )
`, []); ${args.order ? `ORDER BY song_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
console.log(sql);
var songIds = await dbUtils.query(sql, []);
songIds = songIds.map(item => item.song_id); songIds = songIds.map(item => item.song_id);
for (let i = 0; i < songIds.length; i++) { for (let i = 0; i < songIds.length; i++) {
await global.checkIsExit(); await global.checkIsExit();
const songId = songIds[i]; const songId = songIds[i];
console.log(`${i + 1}/${songIds.length} | song: ${songId}`); console.log(`${i + 1}/${songIds.length} | song: ${songId} | ${args.min ?? "?"}-${args.max ?? "?"}`);
try { try {
await fetch({ songId: songId }); await fetch({ songId: songId });
} catch (err) { } catch (err) {

11
package-lock.json generated
View File

@@ -12,6 +12,7 @@
"cheerio": "^1.0.0-rc.12", "cheerio": "^1.0.0-rc.12",
"crypto": "^1.0.1", "crypto": "^1.0.1",
"fs": "^0.0.1-security", "fs": "^0.0.1-security",
"minimist": "^1.2.6",
"mysql": "^2.18.1", "mysql": "^2.18.1",
"NeteaseCloudMusicApi": "^4.8.2", "NeteaseCloudMusicApi": "^4.8.2",
"node-schedule": "^2.1.0", "node-schedule": "^2.1.0",
@@ -1371,6 +1372,11 @@
"node": ">= 0.6" "node": ">= 0.6"
} }
}, },
"node_modules/minimist": {
"version": "1.2.6",
"resolved": "https://registry.npmmirror.com/minimist/-/minimist-1.2.6.tgz",
"integrity": "sha512-Jsjnk4bw3YJqYzbdyBiNsPWHPfO++UGG749Cxs6peCu5Xg4nrena6OVxOYxrQTqww0Jmwt+Ref8rggumkTLz9Q=="
},
"node_modules/ms": { "node_modules/ms": {
"version": "2.0.0", "version": "2.0.0",
"resolved": "https://registry.npmmirror.com/ms/-/ms-2.0.0.tgz", "resolved": "https://registry.npmmirror.com/ms/-/ms-2.0.0.tgz",
@@ -3501,6 +3507,11 @@
"mime-db": "1.52.0" "mime-db": "1.52.0"
} }
}, },
"minimist": {
"version": "1.2.6",
"resolved": "https://registry.npmmirror.com/minimist/-/minimist-1.2.6.tgz",
"integrity": "sha512-Jsjnk4bw3YJqYzbdyBiNsPWHPfO++UGG749Cxs6peCu5Xg4nrena6OVxOYxrQTqww0Jmwt+Ref8rggumkTLz9Q=="
},
"ms": { "ms": {
"version": "2.0.0", "version": "2.0.0",
"resolved": "https://registry.npmmirror.com/ms/-/ms-2.0.0.tgz", "resolved": "https://registry.npmmirror.com/ms/-/ms-2.0.0.tgz",

View File

@@ -12,6 +12,7 @@
"cheerio": "^1.0.0-rc.12", "cheerio": "^1.0.0-rc.12",
"crypto": "^1.0.1", "crypto": "^1.0.1",
"fs": "^0.0.1-security", "fs": "^0.0.1-security",
"minimist": "^1.2.6",
"mysql": "^2.18.1", "mysql": "^2.18.1",
"NeteaseCloudMusicApi": "^4.8.2", "NeteaseCloudMusicApi": "^4.8.2",
"node-schedule": "^2.1.0", "node-schedule": "^2.1.0",

View File

@@ -1,3 +1,44 @@
-- 本地
node index --utils song --min 1900000000 --max 2000000000 --order DESC --limit 2000
node index --utils song --min 1800000000 --max 1900000000 --order DESC --limit 2000
-- Linux服务器
node index --utils song --min 1290000000 --max 1500000000 --order DESC --limit 2000
-- Windows服务器
node index --utils song --min 400000000 --max 1000000000 --order ASC --limit 2000
node index --utils song --min 0 --max 400000000 --order ASC --limit 2000
-- Windows服务器
node index --utils album --min 134000000 --max 160000000 --order DESC --limit 2000
-- 本机
node index --utils album --min 0 --max 134000000 --order DESC --limit 2000
-- Windows服务器
node index --utils artist --min 0 --max 12000000 --order DESC --limit 2000
-- Linux服务器
node index --utils artist --min 12000000 --max 38000000 --order DESC --limit 2000
-- 本机
node index --utils artist --min 38000000 --max 55000000 --order DESC --limit 2000
-- 本机
node index --utils comment --min 1800000000 --max 2000000000 --order DESC --limit 2000
-- Windows服务器
node index --utils comment --min 1290000000 --max 1500000000 --order DESC --limit 2000
node index --utils comment --min 400000000 --max 1000000000 --order ASC --limit 2000
-- Linux服务器
node index --utils comment --min 0 --max 400000000 --order ASC --limit 2000
-- Windows服务器
node index --utils lyric --min 1800000000 --max 2000000000 --order DESC --limit 2000
node index --utils lyric --min 0 --max 400000000 --order ASC --limit 2000
-- 本机
node index --utils lyric --min 400000000 --max 1000000000 --order ASC --limit 2000
后期: 后期:
考虑歌曲别名 例如https://music.163.com/#/song?id=26830207 考虑歌曲别名 例如https://music.163.com/#/song?id=26830207