通过命令行指定爬取参数,不用再修改代码了
This commit is contained in:
19
index.js
19
index.js
@@ -1,3 +1,20 @@
|
||||
// Entry script. When no command-line arguments follow "node index", print a
// usage message and stop; otherwise parse the arguments with minimist and hand
// the resulting object to the netease_music module's main().
if (process.argv.length <= 2) {
|
||||
// Usage text, one array element per output line (joined with '\n' below).
let output = [
|
||||
"参数不够",
|
||||
"node index --utils [song|album|artist|lyric|comment] --min [number] --max [number] --order [false|ASC|DESC] --limit [number]",
|
||||
// "",
|
||||
// "node index --utils song --min xxx --max xxx --order ASC --limit 2000",
|
||||
// "node index --utils album --min xxx --max xxx --order ASC --limit 2000",
|
||||
// "node index --utils artist --min xxx --max xxx --order ASC --limit 2000",
|
||||
// "node index --utils lyric --min xxx --max xxx --order ASC --limit 2000",
|
||||
// "node index --utils comment --min xxx --max xxx --order ASC --limit 2000",
|
||||
].join('\n');
|
||||
console.log(output);
|
||||
// Nothing to crawl without arguments, so end the script here.
return;
|
||||
}
|
||||
// Parse everything after the script name (argv[2..]) into an options object.
var args = require('minimist')(process.argv.slice(2));
|
||||
console.log("args:", args);
|
||||
|
||||
// Global flag set before the crawler module is loaded.
// NOTE(review): presumably tells the DB helper to use a MySQL connection pool — confirm in netease_music.
global.useMysqlPool = true;
|
||||
const neteaseMusic = require('./netease_music/index');
|
||||
// NOTE(review): main() is invoked twice below, once without and once with the
// parsed args. This looks like the before/after pair of a diff view; confirm
// that only the call passing `args` is meant to remain.
neteaseMusic.main();
|
||||
neteaseMusic.main(args);
|
@@ -13,7 +13,7 @@ global.dbUtils = dbUtils;
|
||||
console.log("global.useMysqlPool:", !!global.useMysqlPool);
|
||||
|
||||
// 两次请求之间停顿时间
|
||||
global.sleepTime = 300;
|
||||
global.sleepTime = 10;
|
||||
|
||||
// 引入utils
|
||||
const songInfoUtils = require('./src/getInfo/songInfoUtils');
|
||||
@@ -46,7 +46,7 @@ async function test() {
|
||||
/**
 * Main function: crawl loop of the netease_music module.
 *
 * Logs a start message, then loops forever: each pass runs the crawler
 * selected by `args.utils` ("song", "album", "artist", "lyric" or "comment"),
 * forwarding the parsed options as `{ args }`, and then sleeps via
 * sleepUtils.sleep(2000). Any other `args.utils` value logs a message and
 * returns, which ends the loop.
 *
 * @param {object} args - Parsed command-line options; `args.utils` is read here,
 *   the whole object is passed on to the selected fetchAll().
 */
|
||||
async function main() {
|
||||
async function main(args) {
|
||||
console.log("neteaseMusic Start fetch ...");
|
||||
while (true) {
|
||||
// // Remove dirty data (disabled)
|
||||
@@ -54,11 +54,20 @@ async function main() {
|
||||
// var affectedRows2 = await dbUtils.query(`DELETE FROM song_album_relation WHERE song_id = 0 OR album_id = 0`, []);
|
||||
// console.log(`删除脏数据 affectedRows:`, affectedRows1.affectedRows, affectedRows2.affectedRows);
|
||||
|
||||
// NOTE(review): the five unconditional fetchAll calls below sit next to the
// args.utils dispatch that follows. They look like the pre-change lines of a
// diff view; confirm which set is intended to run.
await songInfoUtils.fetchAll();
|
||||
await albumInfoUtils.fetchAll({});
|
||||
await artistInfoUtils.fetchAll();
|
||||
await lyricInfoUtils.fetchAll();
|
||||
await commentUtils.fetchAll();
|
||||
// Run only the crawler named by --utils, forwarding the parsed options.
if (args.utils == "song")
|
||||
await songInfoUtils.fetchAll({ args: args });
|
||||
else if (args.utils == "album")
|
||||
await albumInfoUtils.fetchAll({ args: args });
|
||||
else if (args.utils == "artist")
|
||||
await artistInfoUtils.fetchAll({ args: args });
|
||||
else if (args.utils == "lyric")
|
||||
await lyricInfoUtils.fetchAll({ args: args });
|
||||
else if (args.utils == "comment")
|
||||
await commentUtils.fetchAll({ args: args });
|
||||
// Missing or unrecognised --utils value: log it and leave main().
else {
|
||||
console.log("utils参数不匹配,退出");
|
||||
return;
|
||||
}
|
||||
// Pause before the next pass (2000 is presumably milliseconds — confirm in sleepUtils).
await sleepUtils.sleep(2000);
|
||||
}
|
||||
}
|
||||
|
@@ -1,13 +1,42 @@
|
||||
-- Show the distribution of music (songs) that still needs to be crawled
-- NOTE(review): this statement appears twice, line by line (old and new copies
-- of a diff view interleaved); confirm which copy is meant to remain.
|
||||
SELECT cast( format( t_tmp.song_id / 10000000, 0) * 10000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM (
|
||||
-- Show the distribution of songs that still need to be crawled:
-- song_ids referenced by song_album_relation or song_artist_relation that have
-- no row in song yet, counted per song_id bucket (multiples of 10,000,000),
-- highest bucket first.
|
||||
SELECT cast( format( t_tmp.song_id / 10000000, 0) * 10000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM (
|
||||
SELECT DISTINCT song_id FROM song_album_relation
|
||||
UNION
|
||||
SELECT DISTINCT song_id FROM song_artist_relation
|
||||
) as t_tmp
|
||||
WHERE song_id NOT IN ( SELECT song_id FROM song )
|
||||
GROUP BY s
|
||||
ORDER BY s DESC
|
||||
WHERE song_id NOT IN ( SELECT song_id FROM song )
|
||||
GROUP BY s
|
||||
ORDER BY s DESC
|
||||
|
||||
-- Show the distribution of albums that still need to be crawled:
-- rows of song_album_relation whose album_id has no row in album yet, counted
-- per album_id bucket (multiples of 1,000,000), highest bucket first.
-- NOTE(review): FORMAT(x, 0) presumably rounds to the nearest integer, so a
-- bucket would be centred on its label rather than starting at it — confirm.
|
||||
SELECT cast( format( album_id / 1000000, 0) * 1000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM song_album_relation
|
||||
WHERE album_id NOT IN ( SELECT album_id FROM album )
|
||||
GROUP BY s
|
||||
ORDER BY s DESC
|
||||
|
||||
-- Show the distribution of artists that still need to be crawled:
-- rows of song_artist_relation whose artist_id has no row in artist yet,
-- counted per artist_id bucket (multiples of 2,000,000), highest bucket first.
|
||||
SELECT cast( format( artist_id / 2000000, 0) * 2000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM song_artist_relation
|
||||
WHERE artist_id NOT IN ( SELECT artist_id FROM artist )
|
||||
GROUP BY s
|
||||
ORDER BY s DESC
|
||||
|
||||
-- Show the distribution of comments that still need to be crawled:
-- rows of comment_progress with current_status != 2, counted per song_id
-- bucket (multiples of 10,000,000), highest bucket first.
-- NOTE(review): status 2 presumably marks a song whose comments are fully crawled — confirm.
|
||||
SELECT cast( format( song_id / 10000000, 0) * 10000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM comment_progress
|
||||
WHERE current_status != 2
|
||||
GROUP BY s
|
||||
ORDER BY s DESC
|
||||
|
||||
-- Show the distribution of lyrics that still need to be crawled:
-- rows of song whose song_id has no row in lyric yet, counted per song_id
-- bucket (multiples of 10,000,000), highest bucket first.
|
||||
SELECT cast( format( song_id / 10000000, 0) * 10000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM song
|
||||
WHERE song_id NOT IN ( SELECT song_id FROM lyric )
|
||||
GROUP BY s
|
||||
ORDER BY s DESC
|
||||
|
||||
|
||||
-- Maintenance: run OPTIMIZE TABLE on the album table.
|
||||
optimize table album;
|
||||
|
@@ -23,19 +23,33 @@ async function getFromDatabase({ albumId }) {
|
||||
|
||||
// 正常应该查不出记录才对
|
||||
/*
|
||||
SELECT * FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:[:space:]*?。,更多.*$'
|
||||
SELECT * FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:([:space:]*?|[ ]*?)。,更多.*$'
|
||||
*/
|
||||
|
||||
async function fetchAll({ isUpdate = false }) {
|
||||
console.log("start fetching albums ...")
|
||||
var albumIds = await dbUtils.query(isUpdate
|
||||
? `SELECT DISTINCT album_id FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:[:space:]*?。,更多.*$'`
|
||||
: `SELECT DISTINCT album_id FROM song_album_relation WHERE album_id NOT IN ( SELECT album_id FROM album )`, []);
|
||||
async function fetchAll({ args = {}, isUpdate = false }) {
|
||||
console.log("start fetching albums ...");
|
||||
|
||||
if (isUpdate) {
|
||||
var sql = `SELECT DISTINCT album_id FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:[:space:]*?。,更多.*$'`;
|
||||
} else {
|
||||
let whereClause = [
|
||||
args.min ? `album_id > ${args.min}` : '1=1',
|
||||
args.max ? `album_id <= ${args.max}` : '1=1',
|
||||
].join(' AND ');
|
||||
var sql = `
|
||||
SELECT DISTINCT album_id FROM song_album_relation WHERE ${whereClause} AND album_id NOT IN ( SELECT album_id FROM album )
|
||||
${args.order ? `ORDER BY album_id ${args.order}` : ''}
|
||||
${args.limit ? `LIMIT ${args.limit}` : ''}
|
||||
`;
|
||||
console.log(sql);
|
||||
}
|
||||
|
||||
var albumIds = await dbUtils.query(sql, []);
|
||||
albumIds = albumIds.map(item => item.album_id);
|
||||
for (let i = 0; i < albumIds.length; i++) {
|
||||
await global.checkIsExit();
|
||||
const albumId = albumIds[i];
|
||||
console.log(`${i + 1}/${albumIds.length} | album: ${albumId}`);
|
||||
console.log(`${i + 1}/${albumIds.length} | album: ${albumId} | ${args.min ?? "?"}-${args.max ?? "?"}`);
|
||||
try {
|
||||
await fetch({ albumId: albumId, update: isUpdate });
|
||||
} catch (err) {
|
||||
|
@@ -22,16 +22,25 @@ async function getFromDatabase({ artistId }) {
|
||||
}
|
||||
|
||||
// 从数据库中查出还缺少的歌手,并进行爬取
|
||||
async function fetchAll() {
|
||||
console.log("start fetching artists ...")
|
||||
var artistIds = await dbUtils.query(`
|
||||
SELECT DISTINCT artist_id FROM song_artist_relation WHERE artist_id NOT IN ( SELECT artist_id FROM artist )
|
||||
`, []);
|
||||
async function fetchAll({ args = {} }) {
|
||||
console.log("start fetching artists ...");
|
||||
let whereClause = [
|
||||
args.min ? `artist_id > ${args.min}` : '1=1',
|
||||
args.max ? `artist_id <= ${args.max}` : '1=1',
|
||||
].join(' AND ');
|
||||
var sql = `
|
||||
SELECT DISTINCT artist_id FROM song_artist_relation WHERE ${whereClause} AND artist_id NOT IN ( SELECT artist_id FROM artist )
|
||||
${args.order ? `ORDER BY artist_id ${args.order}` : ''}
|
||||
${args.limit ? `LIMIT ${args.limit}` : ''}
|
||||
`;
|
||||
console.log(sql);
|
||||
|
||||
var artistIds = await dbUtils.query(sql, []);
|
||||
artistIds = artistIds.map(item => item.artist_id);
|
||||
for (let i = 0; i < artistIds.length; i++) {
|
||||
await global.checkIsExit();
|
||||
const artistId = artistIds[i];
|
||||
console.log(`${i + 1}/${artistIds.length} | artist: ${artistId}`);
|
||||
console.log(`${i + 1}/${artistIds.length} | artist: ${artistId} | ${args.min ?? "?"}-${args.max ?? "?"}`);
|
||||
try {
|
||||
await fetch({ artistId: artistId });
|
||||
} catch (err) {
|
||||
|
@@ -11,28 +11,33 @@ const dbUtils = global.dbUtils;
|
||||
// https://github.com/Binaryify/NeteaseCloudMusicApi
|
||||
const { comment_music } = require('NeteaseCloudMusicApi');
|
||||
|
||||
async function fetchAll() {
|
||||
console.log("start fetching comment ...")
|
||||
async function fetchAll({ args = {} }) {
|
||||
console.log("start fetching comment ...");
|
||||
// 首先将需要爬取的song_id导入comment_progress表
|
||||
await dbUtils.query(`
|
||||
INSERT INTO comment_progress ( song_id )
|
||||
INSERT IGNORE INTO comment_progress ( song_id )
|
||||
SELECT DISTINCT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM comment_progress )
|
||||
`, []);
|
||||
|
||||
let whereClause = [
|
||||
args.min ? `song_id > ${args.min}` : '1=1',
|
||||
args.max ? `song_id <= ${args.max}` : '1=1',
|
||||
].join(' AND ');
|
||||
var sql = `
|
||||
SELECT song_id FROM comment_progress WHERE ${whereClause} AND current_status != 2
|
||||
ORDER BY current_status DESC${args.order ? `, song_id ${args.order}` : ''}
|
||||
${args.limit ? `LIMIT ${args.limit}` : ''}
|
||||
`;
|
||||
console.log(sql);
|
||||
|
||||
// 首先查询有无正在爬取中的记录
|
||||
var songIds = await dbUtils.query(`
|
||||
-- 本机
|
||||
SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id <= 30000000 LIMIT 1000
|
||||
-- SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id < 30000000 ORDER BY current_status DESC
|
||||
-- 服务器
|
||||
-- SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id > 30000000 ORDER BY current_status DESC
|
||||
`, []);
|
||||
var songIds = await dbUtils.query(sql, []);
|
||||
songIds = songIds.map(item => item.song_id);
|
||||
|
||||
for (let i = 0; i < songIds.length; i++) {
|
||||
await global.checkIsExit();
|
||||
const songId = songIds[i];
|
||||
console.log(`${i + 1}/${songIds.length} | comment: ${songId}`);
|
||||
console.log(`${i + 1}/${songIds.length} | comment: ${songId} | ${args.min ?? "?"}-${args.max ?? "?"}`);
|
||||
try {
|
||||
await fetch({ songId: songId });
|
||||
} catch (err) {
|
||||
|
@@ -7,11 +7,20 @@ const sleepUtils = require('../../../utils/sleepUtils');
|
||||
const dbUtils = global.dbUtils;
|
||||
|
||||
// 从数据库中查出还缺少的歌词,并进行爬取
|
||||
async function fetchAll() {
|
||||
async function fetchAll({ args = {} }) {
|
||||
console.log("start fetching lyrics ...");
|
||||
var songIds = await dbUtils.query(`
|
||||
SELECT DISTINCT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM lyric )
|
||||
`, []);
|
||||
let whereClause = [
|
||||
args.min ? `song_id > ${args.min}` : '1=1',
|
||||
args.max ? `song_id <= ${args.max}` : '1=1',
|
||||
].join(' AND ');
|
||||
var sql = `
|
||||
SELECT DISTINCT song_id FROM song WHERE ${whereClause} AND song_id NOT IN ( SELECT song_id FROM lyric )
|
||||
${args.order ? `ORDER BY song_id ${args.order}` : ''}
|
||||
${args.limit ? `LIMIT ${args.limit}` : ''}
|
||||
`;
|
||||
console.log(sql);
|
||||
|
||||
var songIds = await dbUtils.query(sql, []);
|
||||
songIds = songIds.map(song => song.song_id);
|
||||
for (let i = 0; i < songIds.length; i++) {
|
||||
await global.checkIsExit();
|
||||
|
@@ -24,18 +24,27 @@ async function getFromDatabase({ songId }) {
|
||||
}
|
||||
|
||||
// 从数据库中查出还缺少的歌曲,并进行爬取
|
||||
async function fetchAll() {
|
||||
async function fetchAll({ args = {} }) {
|
||||
console.log("start fetching songs ...");
|
||||
var songIds = await dbUtils.query(`
|
||||
SELECT DISTINCT song_id FROM song_artist_relation WHERE song_id NOT IN ( SELECT song_id FROM song )
|
||||
let whereClause = [
|
||||
args.min ? `song_id > ${args.min}` : '1=1',
|
||||
args.max ? `song_id <= ${args.max}` : '1=1',
|
||||
].join(' AND ');
|
||||
var sql = `
|
||||
SELECT DISTINCT song_id FROM song_artist_relation WHERE ${whereClause} AND song_id NOT IN ( SELECT song_id FROM song )
|
||||
UNION
|
||||
SELECT DISTINCT song_id FROM song_album_relation WHERE song_id NOT IN ( SELECT song_id FROM song )
|
||||
`, []);
|
||||
SELECT DISTINCT song_id FROM song_album_relation WHERE ${whereClause} AND song_id NOT IN ( SELECT song_id FROM song )
|
||||
${args.order ? `ORDER BY song_id ${args.order}` : ''}
|
||||
${args.limit ? `LIMIT ${args.limit}` : ''}
|
||||
`;
|
||||
console.log(sql);
|
||||
|
||||
var songIds = await dbUtils.query(sql, []);
|
||||
songIds = songIds.map(item => item.song_id);
|
||||
for (let i = 0; i < songIds.length; i++) {
|
||||
await global.checkIsExit();
|
||||
const songId = songIds[i];
|
||||
console.log(`${i + 1}/${songIds.length} | song: ${songId}`);
|
||||
console.log(`${i + 1}/${songIds.length} | song: ${songId} | ${args.min ?? "?"}-${args.max ?? "?"}`);
|
||||
try {
|
||||
await fetch({ songId: songId });
|
||||
} catch (err) {
|
||||
|
11
package-lock.json
generated
11
package-lock.json
generated
@@ -12,6 +12,7 @@
|
||||
"cheerio": "^1.0.0-rc.12",
|
||||
"crypto": "^1.0.1",
|
||||
"fs": "^0.0.1-security",
|
||||
"minimist": "^1.2.6",
|
||||
"mysql": "^2.18.1",
|
||||
"NeteaseCloudMusicApi": "^4.8.2",
|
||||
"node-schedule": "^2.1.0",
|
||||
@@ -1371,6 +1372,11 @@
|
||||
"node": ">= 0.6"
|
||||
}
|
||||
},
|
||||
"node_modules/minimist": {
|
||||
"version": "1.2.6",
|
||||
"resolved": "https://registry.npmmirror.com/minimist/-/minimist-1.2.6.tgz",
|
||||
"integrity": "sha512-Jsjnk4bw3YJqYzbdyBiNsPWHPfO++UGG749Cxs6peCu5Xg4nrena6OVxOYxrQTqww0Jmwt+Ref8rggumkTLz9Q=="
|
||||
},
|
||||
"node_modules/ms": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmmirror.com/ms/-/ms-2.0.0.tgz",
|
||||
@@ -3501,6 +3507,11 @@
|
||||
"mime-db": "1.52.0"
|
||||
}
|
||||
},
|
||||
"minimist": {
|
||||
"version": "1.2.6",
|
||||
"resolved": "https://registry.npmmirror.com/minimist/-/minimist-1.2.6.tgz",
|
||||
"integrity": "sha512-Jsjnk4bw3YJqYzbdyBiNsPWHPfO++UGG749Cxs6peCu5Xg4nrena6OVxOYxrQTqww0Jmwt+Ref8rggumkTLz9Q=="
|
||||
},
|
||||
"ms": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmmirror.com/ms/-/ms-2.0.0.tgz",
|
||||
|
@@ -12,6 +12,7 @@
|
||||
"cheerio": "^1.0.0-rc.12",
|
||||
"crypto": "^1.0.1",
|
||||
"fs": "^0.0.1-security",
|
||||
"minimist": "^1.2.6",
|
||||
"mysql": "^2.18.1",
|
||||
"NeteaseCloudMusicApi": "^4.8.2",
|
||||
"node-schedule": "^2.1.0",
|
||||
|
41
todo.txt
41
todo.txt
@@ -1,3 +1,44 @@
|
||||
-- 本地
|
||||
node index --utils song --min 1900000000 --max 2000000000 --order DESC --limit 2000
|
||||
node index --utils song --min 1800000000 --max 1900000000 --order DESC --limit 2000
|
||||
-- Linux服务器
|
||||
node index --utils song --min 1290000000 --max 1500000000 --order DESC --limit 2000
|
||||
-- Windows服务器
|
||||
node index --utils song --min 400000000 --max 1000000000 --order ASC --limit 2000
|
||||
node index --utils song --min 0 --max 400000000 --order ASC --limit 2000
|
||||
|
||||
|
||||
|
||||
-- Windows服务器
|
||||
node index --utils album --min 134000000 --max 160000000 --order DESC --limit 2000
|
||||
-- 本机
|
||||
node index --utils album --min 0 --max 134000000 --order DESC --limit 2000
|
||||
|
||||
|
||||
-- Windows服务器
|
||||
node index --utils artist --min 0 --max 12000000 --order DESC --limit 2000
|
||||
-- Linux服务器
|
||||
node index --utils artist --min 12000000 --max 38000000 --order DESC --limit 2000
|
||||
-- 本机
|
||||
node index --utils artist --min 38000000 --max 55000000 --order DESC --limit 2000
|
||||
|
||||
|
||||
-- 本机
|
||||
node index --utils comment --min 1800000000 --max 2000000000 --order DESC --limit 2000
|
||||
-- Windows服务器
|
||||
node index --utils comment --min 1290000000 --max 1500000000 --order DESC --limit 2000
|
||||
node index --utils comment --min 400000000 --max 1000000000 --order ASC --limit 2000
|
||||
-- Linux服务器
|
||||
node index --utils comment --min 0 --max 400000000 --order ASC --limit 2000
|
||||
|
||||
|
||||
-- Windows服务器
|
||||
node index --utils lyric --min 1800000000 --max 2000000000 --order DESC --limit 2000
|
||||
node index --utils lyric --min 0 --max 400000000 --order ASC --limit 2000
|
||||
-- 本机
|
||||
node index --utils lyric --min 400000000 --max 1000000000 --order ASC --limit 2000
|
||||
|
||||
|
||||
后期:
|
||||
|
||||
考虑歌曲别名 例如:https://music.163.com/#/song?id=26830207
|
||||
|
Reference in New Issue
Block a user