diff --git a/netease_music/sql/structure.sql b/netease_music/sql/structure.sql index 1a24824..a84a7df 100644 --- a/netease_music/sql/structure.sql +++ b/netease_music/sql/structure.sql @@ -56,7 +56,7 @@ CREATE TABLE `album` ( ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci; CREATE TABLE `playlist` ( - `playlist_id` int(10) unsigned NOT NULL COMMENT '歌单id', + `playlist_id` bigint(20) unsigned NOT NULL COMMENT '歌单id', `title` varchar(200) NOT NULL COMMENT '歌单名', `english_title` varchar(200) DEFAULT NULL COMMENT '歌单名(英文)', `description` varchar(1500) NOT NULL COMMENT '歌单简介', @@ -81,8 +81,8 @@ CREATE TABLE `playlist` ( `status` tinyint(4) DEFAULT NULL COMMENT '保留状态码', `privacy` tinyint(4) DEFAULT NULL COMMENT '保留状态码', `ad_type` tinyint(4) DEFAULT NULL COMMENT '保留状态码', - `special_type` tinyint(4) DEFAULT NULL COMMENT '保留状态码', - `official_playlist_type` tinyint(4) DEFAULT NULL COMMENT '保留状态码', + `special_type` int(11) DEFAULT NULL COMMENT '保留状态码', + `official_playlist_type` varchar(20) DEFAULT NULL COMMENT '保留状态码', `op_recommend` tinyint(4) DEFAULT NULL COMMENT '保留状态码 0-false 1-true', `high_quality` tinyint(4) DEFAULT NULL COMMENT '保留状态码 0-false 1-true', `new_imported` tinyint(4) DEFAULT NULL COMMENT '保留状态码 0-false 1-true', @@ -106,7 +106,7 @@ CREATE TABLE `song_playlist_relation` ( `song_id` int(10) unsigned NOT NULL COMMENT '歌曲id', `playlist_id` int(10) unsigned NOT NULL COMMENT '歌单id', `alg` varchar(20) DEFAULT NULL COMMENT '保留字段', - `rcmdReason` varchar(20) DEFAULT NULL COMMENT '保留字段', + `rcmd_reason` varchar(20) DEFAULT NULL COMMENT '保留字段', `create_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '爬取时间', `modify_time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '最后更新时间', PRIMARY KEY (`song_id`, `playlist_id`), @@ -185,11 +185,13 @@ CREATE TABLE `comment_progress` ( CREATE TABLE `category` ( `id` int NOT NULL AUTO_INCREMENT COMMENT '分类id', - `qianqian_id` int DEFAULT NULL COMMENT '千千音乐id', - `netease_id` int DEFAULT NULL COMMENT '网易音乐id', `title` varchar(255) NOT NULL COMMENT '分类名称', - `qianqian_group` varchar(255) DEFAULT NULL COMMENT '分类所属分组', - `qianqian_group_chinese` varchar(255) DEFAULT NULL COMMENT '分类所属分组(中文)', + `netease_id` int DEFAULT NULL COMMENT '网易音乐id', + `qianqian_id` int DEFAULT NULL COMMENT '千千音乐id', + `alias` varchar(255) DEFAULT NULL COMMENT '分类别名', + `qianqian_group` varchar(255) DEFAULT NULL COMMENT '千千音乐 分类所属分组', + `qianqian_group_chinese` varchar(255) DEFAULT NULL COMMENT '千千音乐 分类所属分组(中文)', + `netease_group_chinese` varchar(255) DEFAULT NULL COMMENT '网易音乐 分类所属分组(中文)', PRIMARY KEY (`id`), UNIQUE KEY `title` (`title`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci; diff --git a/netease_music/src/getInfo/playlistUtils.js b/netease_music/src/getInfo/playlistUtils.js index 54bc035..cc1a913 100644 --- a/netease_music/src/getInfo/playlistUtils.js +++ b/netease_music/src/getInfo/playlistUtils.js @@ -9,70 +9,164 @@ const dbUtils = global.dbUtils; // refer: // https://neteasecloudmusicapi-docs.4everland.app/ // https://github.com/Binaryify/NeteaseCloudMusicApi -const { playlist_catlist, playlist_hot, playlist_detail, playlist_track_all, song_detail } = require('NeteaseCloudMusicApi'); +const { playlist_catlist, playlist_hot, playlist_detail } = require('NeteaseCloudMusicApi'); -// // 从数据库中查出还缺少的歌词,并进行爬取 -// async function fetchAll() { -// console.log("start fetching lyrics ..."); -// var playlistIds = await dbUtils.query(` -// SELECT DISTINCT playlist_id FROM playlist WHERE playlist_id NOT IN ( SELECT playlist_id FROM lyric ) -// `, []); -// playlistIds = playlistIds.map(playlist => playlist.playlist_id); -// for (let i = 0; i < playlistIds.length; i++) { -// await global.checkIsExit(); -// const playlistId = playlistIds[i]; -// console.log(`${i + 1}/${playlistIds.length} | lyric: ${playlistId}`); -// try { -// await fetch({ playlistId: playlistId }); -// } catch (err) { -// console.error(err); -// } -// await sleepUtils.sleep(global.sleepTime); -// } -// } +async function fetchAll({ args }) { + console.log("start fetching playlists ..."); + console.log("playlist 需要一口气爬完,中途不能停止,否则下次又要重头爬(歌单不会重复爬取,但是分页列表会)") + + // 从数据库中查出所有的网易云分类 + let result = await dbUtils.query(`SELECT title FROM category WHERE netease_group_chinese IS NOT NULL`); + cate = result.map(cate => cate.title); + cate.unshift('全部'); // 插入第一个 + console.log(cate); + for (let i = 0; i < cate.length; i++) { + const categoryName = cate[i]; + try { + await fetchCategory({ categoryName: categoryName, progress: `${i + 1}/${cate.length}` }); + } catch (err) { + console.error(err); + } + } +} + +async function fetchCategory({ categoryName, progress }) { + // 首先去网易云音乐首页获得歌单 (每一首音乐右侧都会有几个包含该音乐的歌单) + let haveNext = true; + let perPage = 35; + let offset = 0; + while (haveNext) { + let url = `https://music.163.com/discover/playlist?cat=${encodeURIComponent(categoryName)}&limit=${perPage}&offset=${offset}`; + try { + // var html = fs.readFileSync(path.join(__dirname, "../../temp", `discover-playlist.html`), 'utf8'); + var html = await requestUtils.getApiResult(url); + // fs.writeFileSync(path.join(__dirname, "../../temp", `discover-playlist.html`), html); + + var matcher = html.matchAll(/"\/playlist\?id=(\d{1,20})"/g); + var m = matcher.next(); + var a = new Set(); // 因为每个歌单id会出现两次,所以使用Set去重 + while (!m.done) { + a.add(Number(m.value[1])); + m = matcher.next(); + } + var playlistIds = Array.from(a).sort(); + } catch (errors) { + console.error(errors); + return; + } + + // 从数据库查出已爬取的歌单ids,并从 playlistIds 中排除这部分歌单 + var exceptPlaylistIds = await dbUtils.query(` + SELECT playlist_id FROM playlist WHERE playlist_id IN ? + `, [[playlistIds]]); + exceptPlaylistIds = exceptPlaylistIds.map(playlist => playlist.playlist_id); + + var finalPlaylistIds = playlistIds.filter(playlistId => exceptPlaylistIds.indexOf(playlistId) == -1); + + // console.log("playlistIds", playlistIds); + // console.log("exceptPlaylistIds", exceptPlaylistIds); + // console.log("finalPlaylistIds", finalPlaylistIds); + console.log("finalPlaylistIds.length", finalPlaylistIds.length); + + for (let i = 0; i < finalPlaylistIds.length; i++) { + await global.checkIsExit(); + const playlistId = finalPlaylistIds[i]; + // console.log(offset, i, finalPlaylistIds.length); + console.log(`分类: ${progress} | 歌单: ${offset + i + 1}/${offset + finalPlaylistIds.length} | playlist: ${playlistId}`); + try { + await fetch({ playlistId: playlistId }); + } catch (err) { + console.error(err); + } + await sleepUtils.sleep(global.sleepTime); + } + + // 最有一页判断标识 + if (html.indexOf(`class="zbtn znxt js-disabled">下一页`) > -1) haveNext = false; + offset += perPage; + } +} // 获取歌词详情 async function fetch({ playlistId, debug = false }) { + let result = await dbUtils.query('SELECT count(*) as count FROM playlist WHERE playlist_id = ?', [playlistId]); + if (result[0].count > 0 && !debug) { + console.log(`数据库中已有数据,跳过 playlistId: ${playlistId}`); + return; + } + // https://neteasecloudmusicapi-docs.4everland.app/#/?id=%e6%ad%8c%e5%8d%95%e5%88%86%e7%b1%bb - var queryParams = {}; try { // 获取歌单分类 - // var playlistResult = await playlist_catlist(queryParams); - // var playlistResult = await playlist_hot(queryParams); - // var playlistResult = await playlist_detail({ - // id: playlistId, - // }); - var playlistResult = await song_detail({ - // ids: ["536623501", "536623501"].join(','), + // var playlistResult = await playlist_catlist({}); + // var playlistResult = await playlist_hot({}); + var playlistResult = await playlist_detail({ + id: playlistId, }); - // var playlistResult = await playlist_track_all({ - // id: playlistId, - // limit: 10, - // offset: 0, - // }); - fs.writeFileSync(path.join(__dirname, "../../temp", `playlist-${playlistId}.json`), JSON.stringify(playlistResult)); + // fs.writeFileSync(path.join(__dirname, "../../temp", `playlist-${playlistId}.json`), JSON.stringify(playlistResult)); } catch (errors) { console.error(errors); return; } - console.log(playlistResult); + let playlist = playlistResult.body.playlist; + // console.log("playlist", playlist); - // let lyricInfo = { - // playlistId: playlistId, - // lyric: lyric.lyric, - // version: lyric.version, - // }; - // // console.log("lyricInfo", lyricInfo); - // dbUtils.query('INSERT IGNORE INTO lyric SET ?', { - // playlist_id: lyricInfo.playlistId, - // lyric: lyricInfo.lyric, - // version: lyricInfo.version, - // }); - // return lyricInfo; + let playlistInfo = { + playlist_id: playlist.id, + title: playlist.name, + english_title: playlist.englishTitle, + description: playlist.description, + user_id: playlist.userId, + tags: JSON.stringify(playlist.tags), + alg_tags: JSON.stringify(playlist.algTags), + playlist_create_time: playlist.createTime, + playlist_update_time: playlist.updateTime, + track_count: playlist.trackCount, + play_count: playlist.playCount, + subscribed_count: playlist.subscribedCount, + share_count: playlist.shareCount, + comment_count: playlist.commentCount, + cover_image: playlist.coverImgUrl ? /^https?:\/\/p.\.music\.126\.net\/(.*?)$/.exec(playlist.coverImgUrl)[1] : '', + title_image: playlist.titleImageUrl ? /^https?:\/\/p.\.music\.126\.net\/(.*?)$/.exec(playlist.titleImageUrl)[1] : '', + background_cover: playlist.backgroundCoverUrl ? /^https?:\/\/p.\.music\.126\.net\/(.*?)$/.exec(playlist.backgroundCoverUrl)[1] : '', + ordered: playlist.ordered, + copied: playlist.copied, + status: playlist.status, + privacy: playlist.privacy, + ad_type: playlist.adType, + special_type: playlist.specialType, + official_playlist_type: playlist.officialPlaylistType, + op_recommend: playlist.opRecommend, + high_quality: playlist.highQuality, + new_imported: playlist.newImported, + update_frequency: playlist.updateFrequency, + grade_status: playlist.gradeStatus, + score: playlist.score, + creator: JSON.stringify(playlist.creator), + video_ids: JSON.stringify(playlist.videoIds), + videos: JSON.stringify(playlist.videos), + banned_track_ids: JSON.stringify(playlist.bannedTrackIds), + remix_video: JSON.stringify(playlist.remixVideo), + }; + // console.log("playlistInfo", playlistInfo); + + if (playlist.bannedTrackIds) { + console.log("bannedTrackIds", playlist.bannedTrackIds); + process.exit(0); + + } + let trackIds = playlist.trackIds.map(track => [track.id, playlist.id, track.alg, track.rcmdReason]); + if (trackIds.length > 0) + await dbUtils.query('INSERT IGNORE INTO song_playlist_relation (song_id, playlist_id, alg, rcmd_reason) VALUES ?', [trackIds]); + await dbUtils.query(` + INSERT INTO playlist ( ${Object.keys(playlistInfo).map(field => `\`${field}\``).join(",")} ) VALUES ? + ON DUPLICATE KEY UPDATE ${Object.keys(playlistInfo).map(field => `${field}=VALUES(${field})`).join(", ")} + `, [[Object.values(playlistInfo)]]); + return playlistInfo; } module.exports = { fetch: fetch, - // fetchAll: fetchAll, + fetchAll: fetchAll, } \ No newline at end of file diff --git a/netease_music/src/getInfo/songInfoUtils.js b/netease_music/src/getInfo/songInfoUtils.js index 8808228..85e72b0 100644 --- a/netease_music/src/getInfo/songInfoUtils.js +++ b/netease_music/src/getInfo/songInfoUtils.js @@ -22,7 +22,7 @@ async function fetchAll({ args = {} }) { ${args.limit ? `LIMIT ${args.limit}` : ''} `; // // 更新现有数据 - // sql = `SELECT song_id FROM song WHERE ${whereClause} AND data_version = 1`; + // sql = `SELECT song_id FROM song WHERE data_version = 1`; // 测试用 // sql = `SELECT song_id FROM song_artist_relation group by song_id limit 10`; console.log(sql); diff --git a/netease_music/src/index.js b/netease_music/src/index.js index 382886d..85fd3dc 100644 --- a/netease_music/src/index.js +++ b/netease_music/src/index.js @@ -32,6 +32,7 @@ async function test() { // 不是所有歌手都有个人主页 例如 https://music.163.com/#/artist?id=1079075 // let res = await songInfoUtils.fetchAll({ args: {} }); + // let res = await playlistUtils.fetchAll(); // let res = await albumInfoUtils.fetch({ albumId: "9156", debug: true }); // let res = await artistInfoUtils.fetch({ artistId: "12023508" }); @@ -66,6 +67,8 @@ async function main(args) { await lyricInfoUtils.fetchAll({ args: args }); else if (args.utils == "comment") await commentUtils.fetchAll({ args: args }); + else if (args.utils == "playlist") + await playlistUtils.fetchAll({ args: args }); else { console.log("utils参数不匹配,退出"); return; diff --git a/netease_music/src/one_time_code/get_cate.js b/netease_music/src/one_time_code/get_cate.js new file mode 100644 index 0000000..d3b90e3 --- /dev/null +++ b/netease_music/src/one_time_code/get_cate.js @@ -0,0 +1,28 @@ +const fs = require('fs'); +const path = require('path'); + +var html = fs.readFileSync(path.join(__dirname, 'get_cate_html.html'), 'utf8'); + +var htmlGroup = html.split('
'); + +var rows = []; +htmlGroup.forEach(function (group) { + let title = group.match(/<\/i>(.*?)<\/dt>/); + if (!title) return; // 排除第一个 全部 + title = title[1]; + + var matcher = group.matchAll(/data-cat="(.*?)"/g); + var m = matcher.next(); + var cate = []; + while (!m.done) { + let category = m.value[1].replace(/&/g, "&"); + cate.push(category); + rows.push(`('${category}', '${title}')`); + m = matcher.next(); + } + console.log({ title, cate }); +}); +console.log(` +INSERT INTO category (title, netease_group_chinese) VALUES ${rows.join(',')} ON DUPLICATE KEY UPDATE netease_group_chinese=VALUES(netease_group_chinese) +`); +return; diff --git a/netease_music/src/one_time_code/get_cate_html.html b/netease_music/src/one_time_code/get_cate_html.html new file mode 100644 index 0000000..5bb7f7d --- /dev/null +++ b/netease_music/src/one_time_code/get_cate_html.html @@ -0,0 +1,96 @@ +

全部风格

+
+
语种
+
+华语| +欧美| +日语| +韩语| +粤语| +
+
+
+
风格
+
+流行| +摇滚| +民谣| +电子| +舞曲| +说唱| +轻音乐| +爵士| +乡村| +R&B/Soul| +古典| +民族| +英伦| +金属| +朋克| +蓝调| +雷鬼| +世界音乐| +拉丁| +New Age| +古风| +后摇| +Bossa Nova| +
+
+
+
场景
+
+清晨| +夜晚| +学习| +工作| +午休| +下午茶| +地铁| +驾车| +运动| +旅行| +散步| +酒吧| +
+
+
+
情感
+
+怀旧| +清新| +浪漫| +伤感| +治愈| +放松| +孤独| +感动| +兴奋| +快乐| +安静| +思念| +
+
+
+
主题
+
+综艺| +影视原声| +ACG| +儿童| +校园| +游戏| +70后| +80后| +90后| +网络歌曲| +KTV| +经典| +翻唱| +吉他| +钢琴| +器乐| +榜单| +00后| +
+
\ No newline at end of file diff --git a/netease_music/todo.txt b/netease_music/todo.txt index 4ad5252..03ca364 100644 --- a/netease_music/todo.txt +++ b/netease_music/todo.txt @@ -18,6 +18,8 @@ node index --utils comment --min 0 --max 400000000 --order ASC node index --utils lyric --min 1800000000 --max 2000000000 # node index --utils lyric --min 400000000 --max 1000000000 # node index --utils lyric --min 0 --max 400000000 # +############################################################################################# +node index --utils playlist # 后期: @@ -29,6 +31,11 @@ node index --utils lyric --min 0 --max 400000000 被删除的aritst和album回头再通过其他表中的数据反查回来 +说明: +song表中data_version=1的音乐是第一次爬取的时候存在,但是后面再爬取时不存在的音乐 + + + 后续分区(不能在现有表上修改,只能重新查出数据到新表) alter table song add partition ( PARTITION p1 VALUES LESS THAN ( 50000000),