From be2658375c832531ccebc6d1182339ef691f8c5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A8=8B=E5=BA=8F=E5=91=98=E5=B0=8F=E5=A2=A8?= <2291200076@qq.com> Date: Thu, 6 Oct 2022 21:06:09 +0800 Subject: [PATCH] =?UTF-8?q?=E9=80=9A=E8=BF=87=E5=91=BD=E4=BB=A4=E8=A1=8C?= =?UTF-8?q?=E6=8C=87=E5=AE=9A=E7=88=AC=E5=8F=96=E5=8F=82=E6=95=B0=EF=BC=8C?= =?UTF-8?q?=E4=B8=8D=E7=94=A8=E5=86=8D=E4=BF=AE=E6=94=B9=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=E4=BA=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- index.js | 19 ++++++++- netease_music/index.js | 23 +++++++---- netease_music/sql/statistic.sql | 41 +++++++++++++++++--- netease_music/src/getInfo/albumInfoUtils.js | 28 +++++++++---- netease_music/src/getInfo/artistInfoUtils.js | 21 +++++++--- netease_music/src/getInfo/commentUtils.js | 27 +++++++------ netease_music/src/getInfo/lyricInfoUtils.js | 17 ++++++-- netease_music/src/getInfo/songInfoUtils.js | 21 +++++++--- package-lock.json | 11 ++++++ package.json | 1 + todo.txt | 41 ++++++++++++++++++++ 11 files changed, 202 insertions(+), 48 deletions(-) diff --git a/index.js b/index.js index f176bbf..172c909 100644 --- a/index.js +++ b/index.js @@ -1,3 +1,20 @@ +if (process.argv.length <= 2) { + let output = [ + "参数不够", + "node index --utils [song|album|artist|lyric|comment] --min [number] --max [number] --order [false|ASC|DESC] --limit [number]", + // "", + // "node index --utils song --min xxx --max xxx --order ASC --limit 2000", + // "node index --utils album --min xxx --max xxx --order ASC --limit 2000", + // "node index --utils artist --min xxx --max xxx --order ASC --limit 2000", + // "node index --utils lyric --min xxx --max xxx --order ASC --limit 2000", + // "node index --utils comment --min xxx --max xxx --order ASC --limit 2000", + ].join('\n'); + console.log(output); + return; +} +var args = require('minimist')(process.argv.slice(2)); +console.log("args:", args); + global.useMysqlPool = true; const neteaseMusic = require('./netease_music/index'); -neteaseMusic.main(); \ No newline at end of file +neteaseMusic.main(args); \ No newline at end of file diff --git a/netease_music/index.js b/netease_music/index.js index 59ea9ae..0ad7cf0 100644 --- a/netease_music/index.js +++ b/netease_music/index.js @@ -13,7 +13,7 @@ global.dbUtils = dbUtils; console.log("global.useMysqlPool:", !!global.useMysqlPool); // 两次请求之间停顿时间 -global.sleepTime = 300; +global.sleepTime = 10; // 引入utils const songInfoUtils = require('./src/getInfo/songInfoUtils'); @@ -46,7 +46,7 @@ async function test() { /** * 主函数 */ -async function main() { +async function main(args) { console.log("neteaseMusic Start fetch ..."); while (true) { // // 删除脏数据 @@ -54,11 +54,20 @@ async function main() { // var affectedRows2 = await dbUtils.query(`DELETE FROM song_album_relation WHERE song_id = 0 OR album_id = 0`, []); // console.log(`删除脏数据 affectedRows:`, affectedRows1.affectedRows, affectedRows2.affectedRows); - await songInfoUtils.fetchAll(); - await albumInfoUtils.fetchAll({}); - await artistInfoUtils.fetchAll(); - await lyricInfoUtils.fetchAll(); - await commentUtils.fetchAll(); + if (args.utils == "song") + await songInfoUtils.fetchAll({ args: args }); + else if (args.utils == "album") + await albumInfoUtils.fetchAll({ args: args }); + else if (args.utils == "artist") + await artistInfoUtils.fetchAll({ args: args }); + else if (args.utils == "lyric") + await lyricInfoUtils.fetchAll({ args: args }); + else if (args.utils == "comment") + await commentUtils.fetchAll({ args: args }); + else { + console.log("utils参数不匹配,退出"); + return; + } await sleepUtils.sleep(2000); } } diff --git a/netease_music/sql/statistic.sql b/netease_music/sql/statistic.sql index 60ec9d2..a9bd575 100644 --- a/netease_music/sql/statistic.sql +++ b/netease_music/sql/statistic.sql @@ -1,13 +1,42 @@ - -- 查看需要爬取的音乐的分布 - SELECT cast( format( t_tmp.song_id / 10000000, 0) * 10000000 as UNSIGNED ) as s, count(*) as count - FROM ( +-- 查看需要爬取的 song 的分布 +SELECT cast( format( t_tmp.song_id / 10000000, 0) * 10000000 as UNSIGNED ) as s, count(*) as count +FROM ( SELECT DISTINCT song_id FROM song_album_relation UNION SELECT DISTINCT song_id FROM song_artist_relation ) as t_tmp - WHERE song_id NOT IN ( SELECT song_id FROM song ) - GROUP BY s - ORDER BY s DESC +WHERE song_id NOT IN ( SELECT song_id FROM song ) +GROUP BY s +ORDER BY s DESC + +-- 查看需要爬取的 album 的分布 +SELECT cast( format( album_id / 1000000, 0) * 1000000 as UNSIGNED ) as s, count(*) as count +FROM song_album_relation +WHERE album_id NOT IN ( SELECT album_id FROM album ) +GROUP BY s +ORDER BY s DESC + +-- 查看需要爬取的 artist 的分布 +SELECT cast( format( artist_id / 2000000, 0) * 2000000 as UNSIGNED ) as s, count(*) as count +FROM song_artist_relation +WHERE artist_id NOT IN ( SELECT artist_id FROM artist ) +GROUP BY s +ORDER BY s DESC + +-- 查看需要爬取的 comment 的分布 +SELECT cast( format( song_id / 10000000, 0) * 10000000 as UNSIGNED ) as s, count(*) as count +FROM comment_progress +WHERE current_status != 2 +GROUP BY s +ORDER BY s DESC + +-- 查看需要爬取的 lyric 的分布 +SELECT cast( format( song_id / 10000000, 0) * 10000000 as UNSIGNED ) as s, count(*) as count +FROM song +WHERE song_id NOT IN ( SELECT song_id FROM lyric ) +GROUP BY s +ORDER BY s DESC + -- optimize table optimize table album; diff --git a/netease_music/src/getInfo/albumInfoUtils.js b/netease_music/src/getInfo/albumInfoUtils.js index 5974094..7154390 100644 --- a/netease_music/src/getInfo/albumInfoUtils.js +++ b/netease_music/src/getInfo/albumInfoUtils.js @@ -23,19 +23,33 @@ async function getFromDatabase({ albumId }) { // 正常应该查不出记录才对 /* -SELECT * FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:[:space:]*?。,更多.*$' +SELECT * FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:([:space:]*?|[ ]*?)。,更多.*$' */ -async function fetchAll({ isUpdate = false }) { - console.log("start fetching albums ...") - var albumIds = await dbUtils.query(isUpdate - ? `SELECT DISTINCT album_id FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:[:space:]*?。,更多.*$'` - : `SELECT DISTINCT album_id FROM song_album_relation WHERE album_id NOT IN ( SELECT album_id FROM album )`, []); +async function fetchAll({ args = {}, isUpdate = false }) { + console.log("start fetching albums ..."); + + if (isUpdate) { + var sql = `SELECT DISTINCT album_id FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:[:space:]*?。,更多.*$'`; + } else { + let whereClause = [ + args.min ? `album_id > ${args.min}` : '1=1', + args.max ? `album_id <= ${args.max}` : '1=1', + ].join(' AND '); + var sql = ` + SELECT DISTINCT album_id FROM song_album_relation WHERE ${whereClause} AND album_id NOT IN ( SELECT album_id FROM album ) + ${args.order ? `ORDER BY album_id ${args.order}` : ''} + ${args.limit ? `LIMIT ${args.limit}` : ''} + `; + console.log(sql); + } + + var albumIds = await dbUtils.query(sql, []); albumIds = albumIds.map(item => item.album_id); for (let i = 0; i < albumIds.length; i++) { await global.checkIsExit(); const albumId = albumIds[i]; - console.log(`${i + 1}/${albumIds.length} | album: ${albumId}`); + console.log(`${i + 1}/${albumIds.length} | album: ${albumId} | ${args.min ?? "?"}-${args.max ?? "?"}`); try { await fetch({ albumId: albumId, update: isUpdate }); } catch (err) { diff --git a/netease_music/src/getInfo/artistInfoUtils.js b/netease_music/src/getInfo/artistInfoUtils.js index 43ba0ff..48f4825 100644 --- a/netease_music/src/getInfo/artistInfoUtils.js +++ b/netease_music/src/getInfo/artistInfoUtils.js @@ -22,16 +22,25 @@ async function getFromDatabase({ artistId }) { } // 从数据库中查出还缺少的歌手,并进行爬取 -async function fetchAll() { - console.log("start fetching artists ...") - var artistIds = await dbUtils.query(` - SELECT DISTINCT artist_id FROM song_artist_relation WHERE artist_id NOT IN ( SELECT artist_id FROM artist ) - `, []); +async function fetchAll({ args = {} }) { + console.log("start fetching artists ..."); + let whereClause = [ + args.min ? `artist_id > ${args.min}` : '1=1', + args.max ? `artist_id <= ${args.max}` : '1=1', + ].join(' AND '); + var sql = ` + SELECT DISTINCT artist_id FROM song_artist_relation WHERE ${whereClause} AND artist_id NOT IN ( SELECT artist_id FROM artist ) + ${args.order ? `ORDER BY artist_id ${args.order}` : ''} + ${args.limit ? `LIMIT ${args.limit}` : ''} + `; + console.log(sql); + + var artistIds = await dbUtils.query(sql, []); artistIds = artistIds.map(item => item.artist_id); for (let i = 0; i < artistIds.length; i++) { await global.checkIsExit(); const artistId = artistIds[i]; - console.log(`${i + 1}/${artistIds.length} | artist: ${artistId}`); + console.log(`${i + 1}/${artistIds.length} | artist: ${artistId} | ${args.min ?? "?"}-${args.max ?? "?"}`); try { await fetch({ artistId: artistId }); } catch (err) { diff --git a/netease_music/src/getInfo/commentUtils.js b/netease_music/src/getInfo/commentUtils.js index ce8d3b5..4717b27 100644 --- a/netease_music/src/getInfo/commentUtils.js +++ b/netease_music/src/getInfo/commentUtils.js @@ -11,28 +11,33 @@ const dbUtils = global.dbUtils; // https://github.com/Binaryify/NeteaseCloudMusicApi const { comment_music } = require('NeteaseCloudMusicApi'); -async function fetchAll() { - console.log("start fetching comment ...") +async function fetchAll({ args = {} }) { + console.log("start fetching comment ..."); // 首先将需要爬取的song_id导入comment_progress表 await dbUtils.query(` - INSERT INTO comment_progress ( song_id ) + INSERT IGNORE INTO comment_progress ( song_id ) SELECT DISTINCT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM comment_progress ) `, []); + let whereClause = [ + args.min ? `song_id > ${args.min}` : '1=1', + args.max ? `song_id <= ${args.max}` : '1=1', + ].join(' AND '); + var sql = ` + SELECT song_id FROM comment_progress WHERE ${whereClause} AND current_status != 2 + ORDER BY current_status DESC${args.order ? `, song_id ${args.order}` : ''} + ${args.limit ? `LIMIT ${args.limit}` : ''} + `; + console.log(sql); + // 首先查询有无正在爬取中的记录 - var songIds = await dbUtils.query(` - -- 本机 - SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id <= 30000000 LIMIT 1000 - -- SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id < 30000000 ORDER BY current_status DESC - -- 服务器 - -- SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id > 30000000 ORDER BY current_status DESC - `, []); + var songIds = await dbUtils.query(sql, []); songIds = songIds.map(item => item.song_id); for (let i = 0; i < songIds.length; i++) { await global.checkIsExit(); const songId = songIds[i]; - console.log(`${i + 1}/${songIds.length} | comment: ${songId}`); + console.log(`${i + 1}/${songIds.length} | comment: ${songId} | ${args.min ?? "?"}-${args.max ?? "?"}`); try { await fetch({ songId: songId }); } catch (err) { diff --git a/netease_music/src/getInfo/lyricInfoUtils.js b/netease_music/src/getInfo/lyricInfoUtils.js index 7c815d6..77b5017 100644 --- a/netease_music/src/getInfo/lyricInfoUtils.js +++ b/netease_music/src/getInfo/lyricInfoUtils.js @@ -7,11 +7,20 @@ const sleepUtils = require('../../../utils/sleepUtils'); const dbUtils = global.dbUtils; // 从数据库中查出还缺少的歌词,并进行爬取 -async function fetchAll() { +async function fetchAll({ args = {} }) { console.log("start fetching lyrics ..."); - var songIds = await dbUtils.query(` - SELECT DISTINCT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM lyric ) - `, []); + let whereClause = [ + args.min ? `song_id > ${args.min}` : '1=1', + args.max ? `song_id <= ${args.max}` : '1=1', + ].join(' AND '); + var sql = ` + SELECT DISTINCT song_id FROM song WHERE ${whereClause} AND song_id NOT IN ( SELECT song_id FROM lyric ) + ${args.order ? `ORDER BY song_id ${args.order}` : ''} + ${args.limit ? `LIMIT ${args.limit}` : ''} + `; + console.log(sql); + + var songIds = await dbUtils.query(sql, []); songIds = songIds.map(song => song.song_id); for (let i = 0; i < songIds.length; i++) { await global.checkIsExit(); diff --git a/netease_music/src/getInfo/songInfoUtils.js b/netease_music/src/getInfo/songInfoUtils.js index ebab707..4096618 100644 --- a/netease_music/src/getInfo/songInfoUtils.js +++ b/netease_music/src/getInfo/songInfoUtils.js @@ -24,18 +24,27 @@ async function getFromDatabase({ songId }) { } // 从数据库中查出还缺少的歌曲,并进行爬取 -async function fetchAll() { +async function fetchAll({ args = {} }) { console.log("start fetching songs ..."); - var songIds = await dbUtils.query(` - SELECT DISTINCT song_id FROM song_artist_relation WHERE song_id NOT IN ( SELECT song_id FROM song ) + let whereClause = [ + args.min ? `song_id > ${args.min}` : '1=1', + args.max ? `song_id <= ${args.max}` : '1=1', + ].join(' AND '); + var sql = ` + SELECT DISTINCT song_id FROM song_artist_relation WHERE ${whereClause} AND song_id NOT IN ( SELECT song_id FROM song ) UNION - SELECT DISTINCT song_id FROM song_album_relation WHERE song_id NOT IN ( SELECT song_id FROM song ) - `, []); + SELECT DISTINCT song_id FROM song_album_relation WHERE ${whereClause} AND song_id NOT IN ( SELECT song_id FROM song ) + ${args.order ? `ORDER BY song_id ${args.order}` : ''} + ${args.limit ? `LIMIT ${args.limit}` : ''} + `; + console.log(sql); + + var songIds = await dbUtils.query(sql, []); songIds = songIds.map(item => item.song_id); for (let i = 0; i < songIds.length; i++) { await global.checkIsExit(); const songId = songIds[i]; - console.log(`${i + 1}/${songIds.length} | song: ${songId}`); + console.log(`${i + 1}/${songIds.length} | song: ${songId} | ${args.min ?? "?"}-${args.max ?? "?"}`); try { await fetch({ songId: songId }); } catch (err) { diff --git a/package-lock.json b/package-lock.json index 2760cab..fcd911e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -12,6 +12,7 @@ "cheerio": "^1.0.0-rc.12", "crypto": "^1.0.1", "fs": "^0.0.1-security", + "minimist": "^1.2.6", "mysql": "^2.18.1", "NeteaseCloudMusicApi": "^4.8.2", "node-schedule": "^2.1.0", @@ -1371,6 +1372,11 @@ "node": ">= 0.6" } }, + "node_modules/minimist": { + "version": "1.2.6", + "resolved": "https://registry.npmmirror.com/minimist/-/minimist-1.2.6.tgz", + "integrity": "sha512-Jsjnk4bw3YJqYzbdyBiNsPWHPfO++UGG749Cxs6peCu5Xg4nrena6OVxOYxrQTqww0Jmwt+Ref8rggumkTLz9Q==" + }, "node_modules/ms": { "version": "2.0.0", "resolved": "https://registry.npmmirror.com/ms/-/ms-2.0.0.tgz", @@ -3501,6 +3507,11 @@ "mime-db": "1.52.0" } }, + "minimist": { + "version": "1.2.6", + "resolved": "https://registry.npmmirror.com/minimist/-/minimist-1.2.6.tgz", + "integrity": "sha512-Jsjnk4bw3YJqYzbdyBiNsPWHPfO++UGG749Cxs6peCu5Xg4nrena6OVxOYxrQTqww0Jmwt+Ref8rggumkTLz9Q==" + }, "ms": { "version": "2.0.0", "resolved": "https://registry.npmmirror.com/ms/-/ms-2.0.0.tgz", diff --git a/package.json b/package.json index 4d0d122..87fad19 100644 --- a/package.json +++ b/package.json @@ -12,6 +12,7 @@ "cheerio": "^1.0.0-rc.12", "crypto": "^1.0.1", "fs": "^0.0.1-security", + "minimist": "^1.2.6", "mysql": "^2.18.1", "NeteaseCloudMusicApi": "^4.8.2", "node-schedule": "^2.1.0", diff --git a/todo.txt b/todo.txt index 6eb7d77..7f29cd5 100644 --- a/todo.txt +++ b/todo.txt @@ -1,3 +1,44 @@ +-- 本地 +node index --utils song --min 1900000000 --max 2000000000 --order DESC --limit 2000 +node index --utils song --min 1800000000 --max 1900000000 --order DESC --limit 2000 +-- Linux服务器 +node index --utils song --min 1290000000 --max 1500000000 --order DESC --limit 2000 +-- Windows服务器 +node index --utils song --min 400000000 --max 1000000000 --order ASC --limit 2000 +node index --utils song --min 0 --max 400000000 --order ASC --limit 2000 + + + +-- Windows 服务器 +node index --utils album --min 134000000 --max 160000000 --order DESC --limit 2000 +-- 本机 +node index --utils album --min 0 --max 134000000 --order DESC --limit 2000 + + +-- Windows服务器 +node index --utils artist --min 0 --max 12000000 --order DESC --limit 2000 +-- Linux服务器 +node index --utils artist --min 12000000 --max 38000000 --order DESC --limit 2000 +-- 本机 +node index --utils artist --min 38000000 --max 55000000 --order DESC --limit 2000 + + +-- 本机 +node index --utils comment --min 1800000000 --max 2000000000 --order DESC --limit 2000 +-- Windows服务器 +node index --utils comment --min 1290000000 --max 1500000000 --order DESC --limit 2000 +node index --utils comment --min 400000000 --max 1000000000 --order ASC --limit 2000 +-- Linux服务器 +node index --utils comment --min 0 --max 400000000 --order ASC --limit 2000 + + +-- Windows服务器 +node index --utils lyric --min 1800000000 --max 2000000000 --order DESC --limit 2000 +node index --utils lyric --min 0 --max 400000000 --order ASC --limit 2000 +-- 本机 +node index --utils lyric --min 400000000 --max 1000000000 --order ASC --limit 2000 + + 后期: 考虑歌曲别名 例如:https://music.163.com/#/song?id=26830207