1
0
Code Issues Pull Requests Projects Releases Wiki Activity GitHub Gitee

通过命令行指定爬取参数,不用再修改代码了

This commit is contained in:
2022-10-06 21:06:09 +08:00
parent 9db9383934
commit be2658375c
11 changed files with 202 additions and 48 deletions

View File

@@ -1,3 +1,20 @@
// Command-line entry point: checks that at least one argument was passed,
// parses the command line with minimist, and hands the parsed options to the
// netease_music module.

// process.argv always holds the node binary and the script path, so a length
// of 2 or less means the user supplied no options: print the usage and stop.
if (process.argv.length <= 2) {
let output = [
"参数不够",
"node index --utils [song|album|artist|lyric|comment] --min [number] --max [number] --order [false|ASC|DESC] --limit [number]",
// "",
// "node index --utils song --min xxx --max xxx --order ASC --limit 2000",
// "node index --utils album --min xxx --max xxx --order ASC --limit 2000",
// "node index --utils artist --min xxx --max xxx --order ASC --limit 2000",
// "node index --utils lyric --min xxx --max xxx --order ASC --limit 2000",
// "node index --utils comment --min xxx --max xxx --order ASC --limit 2000",
].join('\n');
console.log(output);
return;
}
// Parse everything after `node index` into an options object (utils, min, max, order, limit).
var args = require('minimist')(process.argv.slice(2));
console.log("args:", args);
// Set before the require below: the netease_music module logs
// global.useMysqlPool while it is being loaded.
global.useMysqlPool = true;
const neteaseMusic = require('./netease_music/index');
// NOTE(review): main is invoked twice here, once without arguments and once
// with the parsed args. The argument-less call looks like the pre-change line
// kept alongside its replacement - confirm that only main(args) is intended.
neteaseMusic.main();
neteaseMusic.main(args);

View File

@@ -13,7 +13,7 @@ global.dbUtils = dbUtils;
console.log("global.useMysqlPool:", !!global.useMysqlPool);
// 两次请求之间停顿时间
global.sleepTime = 300;
global.sleepTime = 10;
// 引入utils
const songInfoUtils = require('./src/getInfo/songInfoUtils');
@@ -46,7 +46,7 @@ async function test() {
/**
* 主函数
*/
async function main() {
async function main(args) {
console.log("neteaseMusic Start fetch ...");
while (true) {
// // 删除脏数据
@@ -54,11 +54,20 @@ async function main() {
// var affectedRows2 = await dbUtils.query(`DELETE FROM song_album_relation WHERE song_id = 0 OR album_id = 0`, []);
// console.log(`删除脏数据 affectedRows:`, affectedRows1.affectedRows, affectedRows2.affectedRows);
await songInfoUtils.fetchAll();
await albumInfoUtils.fetchAll({});
await artistInfoUtils.fetchAll();
await lyricInfoUtils.fetchAll();
await commentUtils.fetchAll();
if (args.utils == "song")
await songInfoUtils.fetchAll({ args: args });
else if (args.utils == "album")
await albumInfoUtils.fetchAll({ args: args });
else if (args.utils == "artist")
await artistInfoUtils.fetchAll({ args: args });
else if (args.utils == "lyric")
await lyricInfoUtils.fetchAll({ args: args });
else if (args.utils == "comment")
await commentUtils.fetchAll({ args: args });
else {
console.log("utils参数不匹配退出");
return;
}
await sleepUtils.sleep(2000);
}
}

View File

@@ -1,13 +1,42 @@
-- Distribution of the music that still needs to be crawled
SELECT cast( format( t_tmp.song_id / 10000000, 0) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM (
-- Distribution of the songs that still need to be crawled
-- NOTE(review): the three lines above and the three below are near-duplicates
-- (only the header comment differs); they look like the old and new versions
-- of the same lines - confirm which set should remain.
-- Each id is scaled down by 10000000, rounded with format(..., 0) and scaled
-- back up, so `s` is the bucket label and `count` the number of ids in it.
SELECT cast( format( t_tmp.song_id / 10000000, 0) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM (
SELECT DISTINCT song_id FROM song_album_relation
UNION
SELECT DISTINCT song_id FROM song_artist_relation
) as t_tmp
-- Keep only song ids referenced by a relation table but absent from `song`.
WHERE song_id NOT IN ( SELECT song_id FROM song )
GROUP BY s
ORDER BY s DESC
WHERE song_id NOT IN ( SELECT song_id FROM song )
GROUP BY s
ORDER BY s DESC
-- Distribution of the albums that still need to be crawled (buckets of 1000000)
SELECT cast( format( album_id / 1000000, 0) * 1000000 as UNSIGNED ) as s, count(*) as count
FROM song_album_relation
WHERE album_id NOT IN ( SELECT album_id FROM album )
GROUP BY s
ORDER BY s DESC
-- Distribution of the artists that still need to be crawled (buckets of 2000000)
SELECT cast( format( artist_id / 2000000, 0) * 2000000 as UNSIGNED ) as s, count(*) as count
FROM song_artist_relation
WHERE artist_id NOT IN ( SELECT artist_id FROM artist )
GROUP BY s
ORDER BY s DESC
-- Distribution of the comments that still need to be crawled
-- (rows of comment_progress whose current_status is not 2)
SELECT cast( format( song_id / 10000000, 0) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM comment_progress
WHERE current_status != 2
GROUP BY s
ORDER BY s DESC
-- Distribution of the lyrics that still need to be crawled
-- (songs that have no row in `lyric`)
SELECT cast( format( song_id / 10000000, 0) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM song
WHERE song_id NOT IN ( SELECT song_id FROM lyric )
GROUP BY s
ORDER BY s DESC
-- optimize table
optimize table album;

View File

@@ -23,19 +23,33 @@ async function getFromDatabase({ albumId }) {
// 正常应该查不出记录才对
/*
SELECT * FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:[:space:]*?。,更多.*$'
SELECT * FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:([:space:]*?|[ ]*?)。,更多.*$'
*/
async function fetchAll({ isUpdate = false }) {
console.log("start fetching albums ...")
var albumIds = await dbUtils.query(isUpdate
? `SELECT DISTINCT album_id FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:[:space:]*?。,更多.*$'`
: `SELECT DISTINCT album_id FROM song_album_relation WHERE album_id NOT IN ( SELECT album_id FROM album )`, []);
async function fetchAll({ args = {}, isUpdate = false }) {
console.log("start fetching albums ...");
if (isUpdate) {
var sql = `SELECT DISTINCT album_id FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:[:space:]*?。,更多.*$'`;
} else {
let whereClause = [
args.min ? `album_id > ${args.min}` : '1=1',
args.max ? `album_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql = `
SELECT DISTINCT album_id FROM song_album_relation WHERE ${whereClause} AND album_id NOT IN ( SELECT album_id FROM album )
${args.order ? `ORDER BY album_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
console.log(sql);
}
var albumIds = await dbUtils.query(sql, []);
albumIds = albumIds.map(item => item.album_id);
for (let i = 0; i < albumIds.length; i++) {
await global.checkIsExit();
const albumId = albumIds[i];
console.log(`${i + 1}/${albumIds.length} | album: ${albumId}`);
console.log(`${i + 1}/${albumIds.length} | album: ${albumId} | ${args.min ?? "?"}-${args.max ?? "?"}`);
try {
await fetch({ albumId: albumId, update: isUpdate });
} catch (err) {

View File

@@ -22,16 +22,25 @@ async function getFromDatabase({ artistId }) {
}
// 从数据库中查出还缺少的歌手,并进行爬取
async function fetchAll() {
console.log("start fetching artists ...")
var artistIds = await dbUtils.query(`
SELECT DISTINCT artist_id FROM song_artist_relation WHERE artist_id NOT IN ( SELECT artist_id FROM artist )
`, []);
async function fetchAll({ args = {} }) {
console.log("start fetching artists ...");
let whereClause = [
args.min ? `artist_id > ${args.min}` : '1=1',
args.max ? `artist_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql = `
SELECT DISTINCT artist_id FROM song_artist_relation WHERE ${whereClause} AND artist_id NOT IN ( SELECT artist_id FROM artist )
${args.order ? `ORDER BY artist_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
console.log(sql);
var artistIds = await dbUtils.query(sql, []);
artistIds = artistIds.map(item => item.artist_id);
for (let i = 0; i < artistIds.length; i++) {
await global.checkIsExit();
const artistId = artistIds[i];
console.log(`${i + 1}/${artistIds.length} | artist: ${artistId}`);
console.log(`${i + 1}/${artistIds.length} | artist: ${artistId} | ${args.min ?? "?"}-${args.max ?? "?"}`);
try {
await fetch({ artistId: artistId });
} catch (err) {

View File

@@ -11,28 +11,33 @@ const dbUtils = global.dbUtils;
// https://github.com/Binaryify/NeteaseCloudMusicApi
const { comment_music } = require('NeteaseCloudMusicApi');
async function fetchAll() {
console.log("start fetching comment ...")
async function fetchAll({ args = {} }) {
console.log("start fetching comment ...");
// 首先将需要爬取的song_id导入comment_progress表
await dbUtils.query(`
INSERT INTO comment_progress ( song_id )
INSERT IGNORE INTO comment_progress ( song_id )
SELECT DISTINCT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM comment_progress )
`, []);
let whereClause = [
args.min ? `song_id > ${args.min}` : '1=1',
args.max ? `song_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql = `
SELECT song_id FROM comment_progress WHERE ${whereClause} AND current_status != 2
ORDER BY current_status DESC${args.order ? `, song_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
console.log(sql);
// 首先查询有无正在爬取中的记录
var songIds = await dbUtils.query(`
-- 本机
SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id <= 30000000 LIMIT 1000
-- SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id < 30000000 ORDER BY current_status DESC
-- 服务器
-- SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id > 30000000 ORDER BY current_status DESC
`, []);
var songIds = await dbUtils.query(sql, []);
songIds = songIds.map(item => item.song_id);
for (let i = 0; i < songIds.length; i++) {
await global.checkIsExit();
const songId = songIds[i];
console.log(`${i + 1}/${songIds.length} | comment: ${songId}`);
console.log(`${i + 1}/${songIds.length} | comment: ${songId} | ${args.min ?? "?"}-${args.max ?? "?"}`);
try {
await fetch({ songId: songId });
} catch (err) {

View File

@@ -7,11 +7,20 @@ const sleepUtils = require('../../../utils/sleepUtils');
const dbUtils = global.dbUtils;
// 从数据库中查出还缺少的歌词,并进行爬取
async function fetchAll() {
async function fetchAll({ args = {} }) {
console.log("start fetching lyrics ...");
var songIds = await dbUtils.query(`
SELECT DISTINCT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM lyric )
`, []);
let whereClause = [
args.min ? `song_id > ${args.min}` : '1=1',
args.max ? `song_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql = `
SELECT DISTINCT song_id FROM song WHERE ${whereClause} AND song_id NOT IN ( SELECT song_id FROM lyric )
${args.order ? `ORDER BY song_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
console.log(sql);
var songIds = await dbUtils.query(sql, []);
songIds = songIds.map(song => song.song_id);
for (let i = 0; i < songIds.length; i++) {
await global.checkIsExit();

View File

@@ -24,18 +24,27 @@ async function getFromDatabase({ songId }) {
}
// 从数据库中查出还缺少的歌曲,并进行爬取
async function fetchAll() {
async function fetchAll({ args = {} }) {
console.log("start fetching songs ...");
var songIds = await dbUtils.query(`
SELECT DISTINCT song_id FROM song_artist_relation WHERE song_id NOT IN ( SELECT song_id FROM song )
let whereClause = [
args.min ? `song_id > ${args.min}` : '1=1',
args.max ? `song_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql = `
SELECT DISTINCT song_id FROM song_artist_relation WHERE ${whereClause} AND song_id NOT IN ( SELECT song_id FROM song )
UNION
SELECT DISTINCT song_id FROM song_album_relation WHERE song_id NOT IN ( SELECT song_id FROM song )
`, []);
SELECT DISTINCT song_id FROM song_album_relation WHERE ${whereClause} AND song_id NOT IN ( SELECT song_id FROM song )
${args.order ? `ORDER BY song_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
console.log(sql);
var songIds = await dbUtils.query(sql, []);
songIds = songIds.map(item => item.song_id);
for (let i = 0; i < songIds.length; i++) {
await global.checkIsExit();
const songId = songIds[i];
console.log(`${i + 1}/${songIds.length} | song: ${songId}`);
console.log(`${i + 1}/${songIds.length} | song: ${songId} | ${args.min ?? "?"}-${args.max ?? "?"}`);
try {
await fetch({ songId: songId });
} catch (err) {

11
package-lock.json generated
View File

@@ -12,6 +12,7 @@
"cheerio": "^1.0.0-rc.12",
"crypto": "^1.0.1",
"fs": "^0.0.1-security",
"minimist": "^1.2.6",
"mysql": "^2.18.1",
"NeteaseCloudMusicApi": "^4.8.2",
"node-schedule": "^2.1.0",
@@ -1371,6 +1372,11 @@
"node": ">= 0.6"
}
},
"node_modules/minimist": {
"version": "1.2.6",
"resolved": "https://registry.npmmirror.com/minimist/-/minimist-1.2.6.tgz",
"integrity": "sha512-Jsjnk4bw3YJqYzbdyBiNsPWHPfO++UGG749Cxs6peCu5Xg4nrena6OVxOYxrQTqww0Jmwt+Ref8rggumkTLz9Q=="
},
"node_modules/ms": {
"version": "2.0.0",
"resolved": "https://registry.npmmirror.com/ms/-/ms-2.0.0.tgz",
@@ -3501,6 +3507,11 @@
"mime-db": "1.52.0"
}
},
"minimist": {
"version": "1.2.6",
"resolved": "https://registry.npmmirror.com/minimist/-/minimist-1.2.6.tgz",
"integrity": "sha512-Jsjnk4bw3YJqYzbdyBiNsPWHPfO++UGG749Cxs6peCu5Xg4nrena6OVxOYxrQTqww0Jmwt+Ref8rggumkTLz9Q=="
},
"ms": {
"version": "2.0.0",
"resolved": "https://registry.npmmirror.com/ms/-/ms-2.0.0.tgz",

View File

@@ -12,6 +12,7 @@
"cheerio": "^1.0.0-rc.12",
"crypto": "^1.0.1",
"fs": "^0.0.1-security",
"minimist": "^1.2.6",
"mysql": "^2.18.1",
"NeteaseCloudMusicApi": "^4.8.2",
"node-schedule": "^2.1.0",

View File

@@ -1,3 +1,44 @@
-- 本地
node index --utils song --min 1900000000 --max 2000000000 --order DESC --limit 2000
node index --utils song --min 1800000000 --max 1900000000 --order DESC --limit 2000
-- Linux服务器
node index --utils song --min 1290000000 --max 1500000000 --order DESC --limit 2000
-- Windows服务器
node index --utils song --min 400000000 --max 1000000000 --order ASC --limit 2000
node index --utils song --min 0 --max 400000000 --order ASC --limit 2000
-- Windows服务器
node index --utils album --min 134000000 --max 160000000 --order DESC --limit 2000
-- 本机
node index --utils album --min 0 --max 134000000 --order DESC --limit 2000
-- Windows服务器
node index --utils artist --min 0 --max 12000000 --order DESC --limit 2000
-- Linux服务器
node index --utils artist --min 12000000 --max 38000000 --order DESC --limit 2000
-- 本机
node index --utils artist --min 38000000 --max 55000000 --order DESC --limit 2000
-- 本机
node index --utils comment --min 1800000000 --max 2000000000 --order DESC --limit 2000
-- Windows服务器
node index --utils comment --min 1290000000 --max 1500000000 --order DESC --limit 2000
node index --utils comment --min 400000000 --max 1000000000 --order ASC --limit 2000
-- Linux服务器
node index --utils comment --min 0 --max 400000000 --order ASC --limit 2000
-- Windows服务器
node index --utils lyric --min 1800000000 --max 2000000000 --order DESC --limit 2000
node index --utils lyric --min 0 --max 400000000 --order ASC --limit 2000
-- 本机
node index --utils lyric --min 400000000 --max 1000000000 --order ASC --limit 2000
后期:
考虑歌曲别名,例如 https://music.163.com/#/song?id=26830207