diff --git a/auto.bat b/auto.bat new file mode 100644 index 0000000..b8a7b14 --- /dev/null +++ b/auto.bat @@ -0,0 +1,5 @@ +cd ./netease_music +start auto.bat +cd ../hifini_music +start auto.bat +exit \ No newline at end of file diff --git a/hifini_music/auto.bat b/hifini_music/auto.bat new file mode 100644 index 0000000..9ceca01 --- /dev/null +++ b/hifini_music/auto.bat @@ -0,0 +1,2 @@ +start cmd /k "node index" +exit \ No newline at end of file diff --git a/netease_music/auto.bat b/netease_music/auto.bat new file mode 100644 index 0000000..261610b --- /dev/null +++ b/netease_music/auto.bat @@ -0,0 +1,10 @@ +start cmd /k "node index --utils assistant" + +start cmd /k "node index --utils song" +start cmd /k "node index --utils album" +start cmd /k "node index --utils artist" +start cmd /k "node index --utils comment --limit 10000" +start cmd /k "node index --utils lyric" + +@REM start cmd /k "node index --utils playlist" +exit \ No newline at end of file diff --git a/netease_music/src/assistantUtils.js b/netease_music/src/assistantUtils.js index cd99cf7..68c303f 100644 --- a/netease_music/src/assistantUtils.js +++ b/netease_music/src/assistantUtils.js @@ -12,13 +12,14 @@ function getDiffSet(a, b) { async function migrateIdsFromCheckToFetch(tableName, fieldName, insertSql = null) { console.log(`更新待爬取列表: ${tableName}`); - let stepLength = 1000; + let stepLength = 5000; while (true) { // 从 check 表中分块查出待处理数据 let idsResult = await dbUtils.query(`SELECT id FROM wait_check_${tableName} LIMIT ${stepLength}`, []); let ids = idsResult.map(row => row.id); // console.log("ids", ids); if (ids.length == 0) { + console.log(`${tableName} done.`); break; }; @@ -39,19 +40,25 @@ async function migrateIdsFromCheckToFetch(tableName, fieldName, insertSql = null // 从待检查表中删除 if (ids.length > 0) await dbUtils.query(`DELETE FROM wait_check_${tableName} WHERE id IN ?`, [[ids]]); - console.log(`table: ${tableName} | ${ids[0]} - ${ids.slice(-1)[0]}`) + console.log(`table: ${tableName} | ${ids[0]} - ${ids.slice(-1)[0]}`); } } +function getPromise(tableName, fieldName, insertSql) { + return new Promise(async function (resolve) { + await migrateIdsFromCheckToFetch(tableName, fieldName, insertSql); + resolve(); + }); +} async function updateWaitTable() { - await migrateIdsFromCheckToFetch("song", "song_id"); - await migrateIdsFromCheckToFetch("lyric", "song_id"); - await migrateIdsFromCheckToFetch("comment", "song_id", `INSERT IGNORE INTO comment_progress (song_id) VALUES ?`); - await migrateIdsFromCheckToFetch("album", "album_id"); - await migrateIdsFromCheckToFetch("artist", "artist_id"); - - // comment 搬到 comment_progress - console.log("done.\n"); + await Promise.all([ + getPromise("song", "song_id"), + getPromise("lyric", "song_id"), + getPromise("comment", "song_id", `INSERT IGNORE INTO comment_progress (song_id) VALUES ?`), + getPromise("album", "album_id"), + getPromise("artist", "artist_id") + ]); + console.log("All done.\n"); } module.exports = { diff --git a/netease_music/src/dataManager.js b/netease_music/src/dataManager.js index 8aa3df3..6554d09 100644 --- a/netease_music/src/dataManager.js +++ b/netease_music/src/dataManager.js @@ -60,18 +60,18 @@ module.exports = { sql = `SELECT album_id FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:[:space:]*?。,更多.*$'`; } else { let whereClause = [ - args.min ? `album_id > ${args.min}` : '1=1', - args.max ? `album_id <= ${args.max}` : '1=1', + args.min ? `id > ${args.min}` : '1=1', + args.max ? `id <= ${args.max}` : '1=1', ].join(' AND '); sql = ` - SELECT album_id FROM wait_fetch_album WHERE ${whereClause} - ${args.order ? `ORDER BY album_id ${args.order}` : ''} + SELECT id FROM wait_fetch_album WHERE ${whereClause} + ${args.order ? `ORDER BY id ${args.order}` : ''} ${args.limit ? `LIMIT ${args.limit}` : ''} `; } console.log(sql); let albumIds = await dbUtils.query(sql, []); - albumIds = albumIds.map(item => item.album_id); + albumIds = albumIds.map(item => item.id); return albumIds; }, }, @@ -84,17 +84,17 @@ module.exports = { getIdsToFetch: async (args) => { let whereClause = [ - args.min ? `artist_id > ${args.min}` : '1=1', - args.max ? `artist_id <= ${args.max}` : '1=1', + args.min ? `id > ${args.min}` : '1=1', + args.max ? `id <= ${args.max}` : '1=1', ].join(' AND '); let sql = ` - SELECT artist_id FROM wait_fetch_artist WHERE ${whereClause} - ${args.order ? `ORDER BY artist_id ${args.order}` : ''} + SELECT id FROM wait_fetch_artist WHERE ${whereClause} + ${args.order ? `ORDER BY id ${args.order}` : ''} ${args.limit ? `LIMIT ${args.limit}` : ''} `; console.log(sql); let artistIds = await dbUtils.query(sql, []); - artistIds = artistIds.map(item => item.artist_id); + artistIds = artistIds.map(item => item.id); return artistIds; }, }, @@ -107,17 +107,17 @@ module.exports = { getIdsToFetch: async (args) => { let whereClause = [ - args.min ? `song_id > ${args.min}` : '1=1', - args.max ? `song_id <= ${args.max}` : '1=1', + args.min ? `id > ${args.min}` : '1=1', + args.max ? `id <= ${args.max}` : '1=1', ].join(' AND '); var sql = ` - SELECT song_id FROM wait_fetch_lyric WHERE ${whereClause} - ${args.order ? `ORDER BY song_id ${args.order}` : ''} + SELECT id FROM wait_fetch_lyric WHERE ${whereClause} + ${args.order ? `ORDER BY id ${args.order}` : ''} ${args.limit ? `LIMIT ${args.limit}` : ''} `; console.log(sql); let songIds = await dbUtils.query(sql, []); - songIds = songIds.map(song => song.song_id); + songIds = songIds.map(item => item.id); return songIds; }, }, @@ -218,6 +218,7 @@ module.exports = { wait_fetch: { deleteCollection: async function (type, ids) { + // console.log("wait_fetch.deleteCollection", type, ids); if (ids.length > 0) return await dbUtils.query(`DELETE FROM wait_fetch_${type} WHERE id IN ?`, [[ids]]); } diff --git a/netease_music/src/getInfo/albumInfoUtils.js b/netease_music/src/getInfo/albumInfoUtils.js index 26aacc6..5016f57 100644 --- a/netease_music/src/getInfo/albumInfoUtils.js +++ b/netease_music/src/getInfo/albumInfoUtils.js @@ -48,6 +48,8 @@ async function fetch({ albumId, debug = false, update = false }) { let result = await dbUtils.query('SELECT count(*) as count FROM album WHERE album_id = ?', [albumId]); if (!debug && !update && result[0].count > 0) { console.log(`数据库中已有数据,跳过 albumId: ${albumId}`); + // 从待爬取表中删除记录 + await dataManager.wait_fetch.deleteCollection("album", [albumId]); return; } else if (update && result[0].count == 0) { console.log(`数据库中沒有数据,跳过 albumId: ${albumId}`); diff --git a/netease_music/src/getInfo/artistInfoUtils.js b/netease_music/src/getInfo/artistInfoUtils.js index 10ed3c0..a4c0da0 100644 --- a/netease_music/src/getInfo/artistInfoUtils.js +++ b/netease_music/src/getInfo/artistInfoUtils.js @@ -44,6 +44,8 @@ async function fetch({ artistId, debug = false }) { let result = await dbUtils.query('SELECT count(*) as count FROM artist WHERE artist_id = ?', [artistId]); if (result[0].count > 0 && !debug) { console.log(`数据库中已有数据,跳过 artistId: ${artistId}`); + // 从待爬取表中删除记录 + await dataManager.wait_fetch.deleteCollection("artist", [artistId]); return; } diff --git a/netease_music/src/getInfo/lyricInfoUtils.js b/netease_music/src/getInfo/lyricInfoUtils.js index e8dd1d3..3fb09e7 100644 --- a/netease_music/src/getInfo/lyricInfoUtils.js +++ b/netease_music/src/getInfo/lyricInfoUtils.js @@ -30,6 +30,8 @@ async function fetch({ songId, debug = false }) { if (result[0].count > 0 && !debug) { // 这里暂时跳过,后期可能要考虑歌词version更新的问题 console.log(`数据库中已有数据,跳过 songId: ${songId}`); + // 从待爬取表中删除记录 + await dataManager.wait_fetch.deleteCollection("lyric", [songId]); return; } diff --git a/netease_music/src/getInfo/songInfoUtils.js b/netease_music/src/getInfo/songInfoUtils.js index 4efef36..db373f3 100644 --- a/netease_music/src/getInfo/songInfoUtils.js +++ b/netease_music/src/getInfo/songInfoUtils.js @@ -97,7 +97,7 @@ async function fetch({ songIdArray, debug = false }) { await dataManager.song.insertCollection(songInfoList); // image 因为接口没有返回,所以不更新 // 从待爬取表中删除记录 - await dataManager.wait_fetch.deleteCollection("song", [songId]); + await dataManager.wait_fetch.deleteCollection("song", songIdArray); } // 获取音乐详情 diff --git a/netease_music/todo.txt b/netease_music/todo.txt index 6dd38e9..53d7016 100644 --- a/netease_music/todo.txt +++ b/netease_music/todo.txt @@ -21,6 +21,25 @@ node index --utils lyric --min 0 --max 400000000 ############################################################################################# node index --utils playlist # +正式库 +node index --utils song +node index --utils album --min 10000000 +node index --utils album --order desc +node index --utils artist +node index --utils playlist +node index --utils comment --limit 10000 +node index --utils lyric +node index --utils assistant + +本地库测试 +node index --database neteasemusic_develop --utils song +node index --database neteasemusic_develop --utils album --min 10000000 +node index --database neteasemusic_develop --utils album --order desc +node index --database neteasemusic_develop --utils artist +node index --database neteasemusic_develop --utils playlist +node index --database neteasemusic_develop --utils comment --limit 10000 +node index --database neteasemusic_develop --utils lyric +node index --database neteasemusic_develop --utils assistant 思路: 通过一首歌,查出对应的artist和album,然后顺藤摸瓜查出网易云的其他song, album, artist, lyric, comment等 diff --git a/start_cmd.bat b/start_cmd.bat new file mode 100644 index 0000000..001ba9a --- /dev/null +++ b/start_cmd.bat @@ -0,0 +1 @@ +start cmd \ No newline at end of file