diff --git a/netease_music/auto.bat b/netease_music/auto.bat deleted file mode 100644 index d507d20..0000000 --- a/netease_music/auto.bat +++ /dev/null @@ -1,10 +0,0 @@ -start cmd /k "node index --utils assistant" - -start cmd /k "node index --utils song" -start cmd /k "node index --utils album --limit 10000" -start cmd /k "node index --utils artist --limit 10000" -start cmd /k "node index --utils comment --limit 10000" -start cmd /k "node index --utils lyric --limit 10000" - -@REM start cmd /k "node index --utils playlist" -exit \ No newline at end of file diff --git a/netease_music/manual-script/.gitignore b/netease_music/manual-script/.gitignore new file mode 100644 index 0000000..5950476 --- /dev/null +++ b/netease_music/manual-script/.gitignore @@ -0,0 +1 @@ +comment id segment.txt \ No newline at end of file diff --git a/netease_music/start_cmd.bat b/netease_music/manual-script/@deprecated/start_cmd.bat similarity index 100% rename from netease_music/start_cmd.bat rename to netease_music/manual-script/@deprecated/start_cmd.bat diff --git a/netease_music/表合并.sql b/netease_music/manual-script/@deprecated/表合并.sql similarity index 92% rename from netease_music/表合并.sql rename to netease_music/manual-script/@deprecated/表合并.sql index b91557d..9852eb8 100644 --- a/netease_music/表合并.sql +++ b/netease_music/manual-script/@deprecated/表合并.sql @@ -1,9 +1,9 @@ -REPLACE INTO `comment_origin_` SELECT * FROM `comment`; -- 80.1G -REPLACE INTO `user_origin_` SELECT * FROM `user`; -- 5.20G - - -DELETE `comment` -FROM `comment` INNER JOIN `comment_origin_` ON `comment`.comment_id = `comment_origin_`.comment_id; - -DELETE `user` +REPLACE INTO `comment_origin_` SELECT * FROM `comment`; -- 80.1G +REPLACE INTO `user_origin_` SELECT * FROM `user`; -- 5.20G + + +DELETE `comment` +FROM `comment` INNER JOIN `comment_origin_` ON `comment`.comment_id = `comment_origin_`.comment_id; + +DELETE `user` FROM `user` INNER JOIN `user_origin_` ON `user`.user_id = `user_origin_`.user_id; \ No newline at end of file diff --git a/netease_music/auto - 0 aliyun shell.sh b/netease_music/manual-script/auto - 0 aliyun shell.sh similarity index 100% rename from netease_music/auto - 0 aliyun shell.sh rename to netease_music/manual-script/auto - 0 aliyun shell.sh diff --git a/netease_music/auto - 0 local.sh b/netease_music/manual-script/auto - 0 local.sh similarity index 77% rename from netease_music/auto - 0 local.sh rename to netease_music/manual-script/auto - 0 local.sh index 6e5dd20..c6a8069 100644 --- a/netease_music/auto - 0 local.sh +++ b/netease_music/manual-script/auto - 0 local.sh @@ -2,20 +2,16 @@ # cd ./netease_music cd tools/netease_music/ +# 【ING】 start cmd /k "node index --utils assistant" start cmd /k "node index --utils song" start cmd /k "node index --utils artist --limit 50000" start cmd /k "node index --utils album --limit 50000" -start cmd /k "node index --utils lyric" +start cmd /k "node index --utils lyric --limit 10000" +# start cmd /k "node index --utils comment --limit 10000" start cmd /k "node index --utils playlist" -exit - -# 把增量数据带上来 - -# node index --utils lyric --limit 500 --order desc -# node index --utils lyric --limit 500 # lyric_5 # 【ING】 @@ -99,39 +95,54 @@ node index --utils comment --min 2000000 --max 2500000 --limit 10000 node index --utils comment --min 2500000 --max 3000000 --limit 10000 node index --utils comment --min 3000000 --max 3500000 --limit 10000 -# comment_3 +# comment_3 配置待更新 # 【阿里云ing】 node index --utils comment --min 3500000 --max 4000000 --limit 10000 & node index --utils comment --min 4000000 --max 4500000 --limit 10000 & node index --utils comment --min 4500000 --max 5000000 --limit 10000 & node index --utils comment --min 5000000 --max 5500000 --limit 10000 & -node index --utils comment --min 5500000 --max 6000000 --limit 10000 & +# node index --utils comment --min 5500000 --max 6000000 --limit 10000 -# comment_4 -# 【阿里云ing】 -node index --utils comment --min 6000000 --max 6500000 --limit 10000 & -node index --utils comment --min 6500000 --max 7000000 --limit 10000 & -node index --utils comment --min 7000000 --max 7500000 --limit 10000 & -node index --utils comment --min 7500000 --max 8000000 --limit 10000 & +# comment_4 配置待更新 -# comment_5 -node index --utils comment --min 8000000 --max 8500000 --limit 10000 -node index --utils comment --min 8500000 --max 9000000 --limit 10000 +# comment_5 配置待更新 +# 【公司电脑ing】 +# node index --utils comment --min 6000000 --max 9000000 --limit 10000 node index --utils comment --min 9000000 --max 9500000 --limit 10000 -node index --utils comment --min 9500000 --max 10000000 --limit 10000 +# node index --utils comment --min 9500000 --max 10000000 --limit 10000 # comment_n +# 【公司电脑ing】 node index --utils comment --min 10000000 --max 20000000 --limit 10000 node index --utils comment --min 20000000 --max 30000000 --limit 10000 node index --utils comment --min 30000000 --max 40000000 --limit 10000 # node index --utils comment --min 40000000 --max 50000000 --limit 10000 node index --utils comment --min 50000000 --max 500000000 --limit 10000 -# comment_2n -node index --utils comment --min 1000000000 --max 1500000000 --limit 10000 -node index --utils comment --min 1500000000 --max 2000000000 --limit 10000 -node index --utils comment --min 2000000000 --max 2500000000 --limit 10000 -node index --utils comment --min 2500000000 --limit 10000 +# comment_2n_1 配置待更新 +# 【公司电脑ing】 +# node index --utils comment --min 1000000000 --max 1100000000 --limit 10000 +# node index --utils comment --min 1100000000 --max 1200000000 --limit 10000 +node index --utils comment --min 1200000000 --max 1300000000 --limit 10000 +node index --utils comment --min 1300000000 --max 1400000000 --limit 10000 +node index --utils comment --min 1400000000 --max 1500000000 --limit 10000 + +# comment_2n_2 配置待更新 +# 【手机ing】 +node index --utils comment --min 1500000000 --max 1600000000 --limit 10000 & +node index --utils comment --min 1600000000 --max 1700000000 --limit 10000 & +node index --utils comment --min 1700000000 --max 1800000000 --limit 10000 & +node index --utils comment --min 1800000000 --max 1900000000 --limit 10000 & +node index --utils comment --min 1900000000 --max 2000000000 --limit 10000 & + +# comment_2n_3 配置待更新 +# 【阿里云ing】 +node index --utils comment --min 2000000000 --max 2100000000 --limit 10000 & +node index --utils comment --min 2100000000 --max 2200000000 --limit 10000 & +# node index --utils comment --min 2200000000 --max 2300000000 --limit 10000 +# node index --utils comment --min 2300000000 --max 2400000000 --limit 10000 +# node index --utils comment --min 2400000000 --max 2500000000 --limit 10000 +# node index --utils comment --min 2500000000 --limit 10000 # # 待整理 2000000 - 1999000000 # start cmd /k "node index --utils comment --limit 10000 --min --max " # \ No newline at end of file diff --git a/netease_music/auto - comment id expand.js b/netease_music/manual-script/comment id segment generator.js similarity index 60% rename from netease_music/auto - comment id expand.js rename to netease_music/manual-script/comment id segment generator.js index 1390755..9189254 100644 --- a/netease_music/auto - comment id expand.js +++ b/netease_music/manual-script/comment id segment generator.js @@ -1,80 +1,103 @@ -let a = `1990000000 -1980000000 -1970000000 -1960000000 -1950000000 -1940000000 -1930000000 -1920000000 -1910000000 -1900000000 -1890000000 -1880000000 -1870000000 -1860000000 -1850000000 -1840000000 -1830000000 -1820000000 -1810000000 -1800000000 -1500000000 -1490000000 -1480000000 -1470000000 -1460000000 -1450000000 -1440000000 -1430000000 -1420000000 -1410000000 -1400000000 -1390000000 -1380000000 -1370000000 -1360000000 -1350000000 -1340000000 -1330000000 -1320000000 -1310000000 -1300000000 -1290000000 -860000000 -570000000 -560000000 -550000000 -540000000 -530000000 -520000000 -510000000 -500000000 -490000000 -480000000 -470000000 -460000000 -450000000 -440000000 -430000000 -420000000 -410000000 -400000000 -390000000 -40000000 -30000000 -20000000 -10000000 -0` - -const splitCount = 1000 -const step = 10000000 / splitCount - -let b = [] -a.split('\n') - .map(i => Number(i)) - .forEach(n => { - for (let i = splitCount; i > 0; i--) { - b.push(Number(n) + (i - 1) * step) - } - }); -console.log(b.join('\n')) \ No newline at end of file +// -- 查看需要爬取的 comment 的分布 +// SELECT cast( FLOOR( song_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count +// FROM comment_progress +// WHERE current_status != 2 +// GROUP BY s +// ORDER BY s DESC; + +// 变量 a 为通过执行以上SQL获取的分段 +let a = `2110000000 +2100000000 +2090000000 +2080000000 +2070000000 +2060000000 +2050000000 +2040000000 +2030000000 +2020000000 +2010000000 +2000000000 +1990000000 +1980000000 +1970000000 +1960000000 +1950000000 +1940000000 +1930000000 +1920000000 +1910000000 +1900000000 +1890000000 +1880000000 +1870000000 +1860000000 +1850000000 +1840000000 +1830000000 +1820000000 +1810000000 +1800000000 +1500000000 +1490000000 +1480000000 +1470000000 +1460000000 +1450000000 +1440000000 +1430000000 +1420000000 +1410000000 +1400000000 +1390000000 +1380000000 +1370000000 +1360000000 +1350000000 +1340000000 +1330000000 +1320000000 +1310000000 +1300000000 +1290000000 +860000000 +570000000 +560000000 +550000000 +540000000 +530000000 +520000000 +510000000 +500000000 +490000000 +480000000 +470000000 +460000000 +450000000 +440000000 +430000000 +420000000 +410000000 +400000000 +390000000 +30000000 +20000000 +10000000 +0` + +const splitCount = 1 +const step = 10000000 / splitCount + +let b = [] +a.split('\n') + .map(i => Number(i)) + .forEach(n => { + for (let i = splitCount; i > 0; i--) { + b.push(Number(n) + (i - 1) * step) + } + }); +let content = b.join('\n') +// console.log(content) + +const fs = require('fs') +fs.writeFileSync('comment id segment.txt', content, 'utf-8') \ No newline at end of file diff --git a/netease_music/sql/statistic.sql b/netease_music/manual-script/statistic.sql similarity index 95% rename from netease_music/sql/statistic.sql rename to netease_music/manual-script/statistic.sql index 82dfaec..5c1bab0 100644 --- a/netease_music/sql/statistic.sql +++ b/netease_music/manual-script/statistic.sql @@ -1,136 +1,147 @@ --- 更新统计数据 --- songCount 容易超时,有几张表查询时容易发生死锁,所以请在没有爬取时进行统计 --- 4G: 4294967296 (4 * 1024 * 1024 * 1024) 64M: 67108864 --- my.ini 配置文件中设置 innodb_buffer_pool_size=4G -show variables like "%innodb_buffer_pool_size%"; -DELETE FROM analysis WHERE `key` LIKE '%_old'; -UPDATE analysis SET `key`=concat(`key`,'_old'), modify_time=modify_time WHERE `key` NOT LIKE '%_old'; -INSERT INTO analysis (`key`, `value`) VALUES ('songCount', (SELECT count(*) as count FROM song) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); -INSERT INTO analysis (`key`, `value`) VALUES ('songWaiting', (SELECT count(*) as count FROM wait_fetch_song) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); -INSERT INTO analysis (`key`, `value`) VALUES ('playlistCount', (SELECT count(*) AS count FROM playlist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); -INSERT INTO analysis (`key`, `value`) VALUES ('albumCount', (SELECT count(*) as count FROM album) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); -INSERT INTO analysis (`key`, `value`) VALUES ('albumWaiting', (SELECT count(*) as count FROM wait_fetch_album) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); -INSERT INTO analysis (`key`, `value`) VALUES ('artistCount', (SELECT count(*) AS count FROM artist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); -INSERT INTO analysis (`key`, `value`) VALUES ('artistWaiting', (SELECT count(*) as count FROM wait_fetch_artist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); -INSERT INTO analysis (`key`, `value`) VALUES ('lyricCount', (SELECT count(*) AS count FROM lyric) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); -INSERT INTO analysis (`key`, `value`) VALUES ('commentCount', (SELECT count( DISTINCT song_id ) AS count FROM comment) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); -INSERT INTO analysis (`key`, `value`) VALUES ('commentTotalCount', (SELECT count(*) AS count FROM comment) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); -INSERT INTO analysis (`key`, `value`) VALUES ('userCount', (SELECT count(*) AS count FROM user) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); -INSERT INTO analysis (`key`, `value`) VALUES ('songPlaylistCount', (SELECT count(*) AS count FROM song_playlist_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); -INSERT INTO analysis (`key`, `value`) VALUES ('songAlbumCount', (SELECT count(*) AS count FROM song_album_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); -INSERT INTO analysis (`key`, `value`) VALUES ('songArtistCount', (SELECT count(*) AS count FROM song_artist_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); - - - --- 更新后初次全表扫描 -INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00'; -INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00'; -INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00'; - -INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00'; -INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00'; -INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00'; -INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song WHERE create_time > '2022-10-28 00:00:00'; - -INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00'; -INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00'; -INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00'; -INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song WHERE create_time > '2022-10-28 00:00:00'; - -INSERT IGNORE INTO wait_check_artist (id) SELECT artist_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00'; - -INSERT IGNORE INTO wait_check_album (id) SELECT album_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00'; - - - --- 全量更新 -INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_artist_relation WHERE song_id NOT IN ( SELECT song_id FROM song ); -INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_album_relation WHERE song_id NOT IN ( SELECT song_id FROM song ); -INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_playlist_relation WHERE song_id NOT IN ( SELECT song_id FROM song ); -INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM lyric ); -INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM comment_progress ); -INSERT IGNORE INTO wait_check_artist (id) SELECT artist_id FROM song_artist_relation WHERE artist_id NOT IN ( SELECT artist_id FROM artist ); -INSERT IGNORE INTO wait_check_album (id) SELECT album_id FROM song_album_relation WHERE album_id NOT IN ( SELECT album_id FROM album ); - - - --- 查看需要爬取的 song 的分布 -SELECT cast( FLOOR( id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count -FROM wait_fetch_song -GROUP BY s -ORDER BY s DESC; - --- 查看需要爬取的 album 的分布 -SELECT cast( FLOOR( id / 1000000 ) * 1000000 as UNSIGNED ) as s, count(*) as count -FROM wait_fetch_album -GROUP BY s -ORDER BY s DESC; - --- 查看需要爬取的 artist 的分布 -SELECT cast( FLOOR(id / 100000 ) * 100000 as UNSIGNED ) as s, count(*) as count -FROM wait_fetch_artist -GROUP BY s -ORDER BY s DESC; - --- 查看需要爬取的 comment 的分布 -SELECT cast( FLOOR( song_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count -FROM comment_progress -WHERE current_status != 2 -GROUP BY s -ORDER BY s DESC; - --- 查看需要爬取的 lyric 的分布 -SELECT cast( FLOOR( id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count -FROM wait_fetch_lyric -GROUP BY s -ORDER BY s DESC; - - - - --- 查看本地已有 song 的分布 -SELECT cast( FLOOR( song_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count -FROM song -GROUP BY s -ORDER BY s DESC; - --- 查看本地已有 user 的分布 -SELECT cast( FLOOR( user_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count -FROM user -GROUP BY s -ORDER BY s DESC; - --- 查看本地已有 album 的分布 -SELECT cast( FLOOR( album_id / 1000000 ) * 1000000 as UNSIGNED ) as s, count(*) as count -FROM album -GROUP BY s -ORDER BY s DESC; - --- 查看本地已有 artist 的分布 -SELECT cast( FLOOR( artist_id / 2000000 ) * 2000000 as UNSIGNED ) as s, count(*) as count -FROM artist -GROUP BY s -ORDER BY s DESC; - --- 查看本地已有 playlist 的分布 -SELECT cast( FLOOR( playlist_id / 2000000 ) * 2000000 as UNSIGNED ) as s, count(*) as count -FROM playlist -GROUP BY s -ORDER BY s DESC; - - - --- 查询单个数据库里面各个表所占磁盘空间大小包括其索引的大小 -SELECT - table_schema AS '数据库', - table_name AS '表名', - table_rows AS '记录数', - TRUNCATE (data_length / 1024 / 1024, 2) AS '数据容量(MB)', - TRUNCATE (index_length / 1024 / 1024, 2) AS '索引容量(MB)', - TRUNCATE ((data_length + index_length) / 1024 / 1024 / 1024, 2) AS '总容量(GB)' -FROM - information_schema.TABLES -WHERE - table_schema = 'neteasemusic' -ORDER BY - table_rows DESC; +-- 更新统计数据 +-- songCount 容易超时,有几张表查询时容易发生死锁,所以请在没有爬取时进行统计 +-- 4G: 4294967296 (4 * 1024 * 1024 * 1024) 64M: 67108864 +-- my.ini 配置文件中设置 innodb_buffer_pool_size=4G +show variables like "%innodb_buffer_pool_size%"; +DELETE FROM analysis WHERE `key` LIKE '%_old'; +UPDATE analysis SET `key`=concat(`key`,'_old'), modify_time=modify_time WHERE `key` NOT LIKE '%_old'; +INSERT INTO analysis (`key`, `value`) VALUES ('songCount', (SELECT count(*) as count FROM song) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); +INSERT INTO analysis (`key`, `value`) VALUES ('songWaiting', (SELECT count(*) as count FROM wait_fetch_song) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); +INSERT INTO analysis (`key`, `value`) VALUES ('playlistCount', (SELECT count(*) AS count FROM playlist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); +INSERT INTO analysis (`key`, `value`) VALUES ('albumCount', (SELECT count(*) as count FROM album) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); +INSERT INTO analysis (`key`, `value`) VALUES ('albumWaiting', (SELECT count(*) as count FROM wait_fetch_album) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); +INSERT INTO analysis (`key`, `value`) VALUES ('artistCount', (SELECT count(*) AS count FROM artist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); +INSERT INTO analysis (`key`, `value`) VALUES ('artistWaiting', (SELECT count(*) as count FROM wait_fetch_artist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); +INSERT INTO analysis (`key`, `value`) VALUES ('lyricCount', (SELECT count(*) AS count FROM lyric) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); +INSERT INTO analysis (`key`, `value`) VALUES ('commentCount', (SELECT count( DISTINCT song_id ) AS count FROM comment) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); +INSERT INTO analysis (`key`, `value`) VALUES ('commentTotalCount', (SELECT count(*) AS count FROM comment) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); +INSERT INTO analysis (`key`, `value`) VALUES ('userCount', (SELECT count(*) AS count FROM user) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); +INSERT INTO analysis (`key`, `value`) VALUES ('songPlaylistCount', (SELECT count(*) AS count FROM song_playlist_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); +INSERT INTO analysis (`key`, `value`) VALUES ('songAlbumCount', (SELECT count(*) AS count FROM song_album_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); +INSERT INTO analysis (`key`, `value`) VALUES ('songArtistCount', (SELECT count(*) AS count FROM song_artist_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`); + + + +-- 更新后初次全表扫描 +INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00'; +INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00'; +INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00'; + +INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00'; +INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00'; +INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00'; +INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song WHERE create_time > '2022-10-28 00:00:00'; + +INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00'; +INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00'; +INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00'; +INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song WHERE create_time > '2022-10-28 00:00:00'; + +INSERT IGNORE INTO wait_check_artist (id) SELECT artist_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00'; + +INSERT IGNORE INTO wait_check_album (id) SELECT album_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00'; + + + +-- 全量更新 +INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_artist_relation WHERE song_id NOT IN ( SELECT song_id FROM song ); +INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_album_relation WHERE song_id NOT IN ( SELECT song_id FROM song ); +INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_playlist_relation WHERE song_id NOT IN ( SELECT song_id FROM song ); +INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM lyric ); +INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM comment_progress ); +INSERT IGNORE INTO wait_check_artist (id) SELECT artist_id FROM song_artist_relation WHERE artist_id NOT IN ( SELECT artist_id FROM artist ); +INSERT IGNORE INTO wait_check_album (id) SELECT album_id FROM song_album_relation WHERE album_id NOT IN ( SELECT album_id FROM album ); + + + +-- 查看需要爬取的 song 的分布 +SELECT cast( FLOOR( id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count +FROM wait_fetch_song +GROUP BY s +ORDER BY s DESC; + +-- 查看需要爬取的 album 的分布 +SELECT cast( FLOOR( id / 1000000 ) * 1000000 as UNSIGNED ) as s, count(*) as count +FROM wait_fetch_album +GROUP BY s +ORDER BY s DESC; + +-- 查看需要爬取的 artist 的分布 +SELECT cast( FLOOR(id / 100000 ) * 100000 as UNSIGNED ) as s, count(*) as count +FROM wait_fetch_artist +GROUP BY s +ORDER BY s DESC; + +-- 查看需要爬取的 comment 的分布 +SELECT cast( FLOOR( song_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count +FROM comment_progress +WHERE current_status != 2 +GROUP BY s +ORDER BY s DESC; + +-- 查看需要爬取的 lyric 的分布 +SELECT cast( FLOOR( id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count +FROM wait_fetch_lyric +GROUP BY s +ORDER BY s DESC; + + + + +-- 查看本地已有 song 的分布 +SELECT cast( FLOOR( song_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count +FROM song +GROUP BY s +ORDER BY s DESC; + +-- 查看本地已有 user 的分布 +SELECT cast( FLOOR( user_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count +FROM user +GROUP BY s +ORDER BY s DESC; + +-- 查看本地已有 album 的分布 +SELECT cast( FLOOR( album_id / 1000000 ) * 1000000 as UNSIGNED ) as s, count(*) as count +FROM album +GROUP BY s +ORDER BY s DESC; + +-- 查看本地已有 artist 的分布 +SELECT cast( FLOOR( artist_id / 2000000 ) * 2000000 as UNSIGNED ) as s, count(*) as count +FROM artist +GROUP BY s +ORDER BY s DESC; + +-- 查看本地已有 playlist 的分布 +SELECT cast( FLOOR( playlist_id / 2000000 ) * 2000000 as UNSIGNED ) as s, count(*) as count +FROM playlist +GROUP BY s +ORDER BY s DESC; + + + +-- 查询单个数据库里面各个表所占磁盘空间大小包括其索引的大小 +SELECT + table_schema AS '数据库', + table_name AS '表名', + table_rows AS '记录数', + TRUNCATE (data_length / 1024 / 1024, 2) AS '数据容量(MB)', + TRUNCATE (index_length / 1024 / 1024, 2) AS '索引容量(MB)', + TRUNCATE ((data_length + index_length) / 1024 / 1024 / 1024, 2) AS '总容量(GB)' +FROM + information_schema.TABLES +WHERE + table_schema = 'neteasemusic' +ORDER BY + table_rows DESC; + + + +-- 统计等待爬取的数据条数 2023.12.25 +SELECT 'comment' as wait_fetch, count(*) as `count` FROM `comment_progress` where current_status = 0 +UNION ALL +SELECT 'album', count(*) FROM `wait_fetch_album` +UNION ALL +SELECT 'artist', count(*) FROM `wait_fetch_artist` +UNION ALL +SELECT 'lyric', count(*) FROM `wait_fetch_lyric` diff --git a/netease_music/todo.txt b/netease_music/manual-script/todo.txt similarity index 96% rename from netease_music/todo.txt rename to netease_music/manual-script/todo.txt index 7e6eefb..1a93769 100644 --- a/netease_music/todo.txt +++ b/netease_music/manual-script/todo.txt @@ -1,74 +1,79 @@ -windows服务器 -cd C:\Users\Administrator\Desktop\tools\netease_music - -linux服务器 -cd /www/neteasemusic/tools - - -本地库测试 -node index --database neteasemusic_develop --utils song -node index --database neteasemusic_develop --utils album --min 10000000 -node index --database neteasemusic_develop --utils album --order desc -node index --database neteasemusic_develop --utils artist -node index --database neteasemusic_develop --utils playlist -node index --database neteasemusic_develop --utils comment --limit 10000 -node index --database neteasemusic_develop --utils lyric -node index --database neteasemusic_develop --utils assistant - - - -思路: -通过一首歌,查出对应的artist和album,然后顺藤摸瓜查出网易云的其他song, album, artist, lyric, comment等 - -插入rel表的时候同时插入 wait_check_xx 表,然后后续检查这个表,如果不存在,那么就插入对应的 wait_fetch_xxx 表 -之后查出 wait_fetch_xxx 表,进行数据拉取,形成闭环 - - - -后期: -歌单定时更新(rel表中添加一个del字段,先将歌单下面的全部置为删除状态,再插入的时候把已有歌曲的标记重新修改为正常状态) - -评论的更新 - -被删除的aritst和album回头再通过其他表中的数据反查回来 - -歌曲目前爬取之后,会有一部分没有image封面,还是需要用旧方法爬取到 - - - -说明: -song表中data_version=1的音乐是第一次爬取的时候存在,但是后面再爬取时不存在的音乐 - - - -后续分区(不能在现有表上修改,只能重新查出数据到新表) -alter table song add partition ( - PARTITION p1 VALUES LESS THAN ( 50000000), - PARTITION p2 VALUES LESS THAN (1000000000), - PARTITION p3 VALUES LESS THAN (1500000000), - PARTITION p4 VALUES LESS THAN (2000000000), - PARTITION p5 VALUES LESS THAN MAXVALUE -); - - - -SQL文件说明 -sql/structure.sql 中的SQL为最简,不包含字段的编码集 -sql/neteasemusic.sql 中的SQL为数据库导出,包含字段的编码集 -项目数据库 CHARACTER SET 统一使用 'utf8mb4',COLLATE 统一使用 'utf8mb4_general_ci' - - - - - -# # 查看列表 -# screen -ls - -# # 创建一个screen -# screen + - -# # 切换到指定屏幕 -# screen -r - -# # 切出屏幕 +windows服务器 +cd C:\Users\Administrator\Desktop\tools\netease_music + +linux服务器 +cd /www/neteasemusic/tools + +手机 Termux +pkg update +pkg install git +pkg install nodejs + + +本地库测试 +node index --database neteasemusic_develop --utils song +node index --database neteasemusic_develop --utils album --min 10000000 +node index --database neteasemusic_develop --utils album --order desc +node index --database neteasemusic_develop --utils artist +node index --database neteasemusic_develop --utils playlist +node index --database neteasemusic_develop --utils comment --limit 10000 +node index --database neteasemusic_develop --utils lyric +node index --database neteasemusic_develop --utils assistant + + + +思路: +通过一首歌,查出对应的artist和album,然后顺藤摸瓜查出网易云的其他song, album, artist, lyric, comment等 + +插入rel表的时候同时插入 wait_check_xx 表,然后后续检查这个表,如果不存在,那么就插入对应的 wait_fetch_xxx 表 +之后查出 wait_fetch_xxx 表,进行数据拉取,形成闭环 + + + +后期: +歌单定时更新(rel表中添加一个del字段,先将歌单下面的全部置为删除状态,再插入的时候把已有歌曲的标记重新修改为正常状态) + +评论的更新 + +被删除的aritst和album回头再通过其他表中的数据反查回来 + +歌曲目前爬取之后,会有一部分没有image封面,还是需要用旧方法爬取到 + + + +说明: +song表中data_version=1的音乐是第一次爬取的时候存在,但是后面再爬取时不存在的音乐 + + + +后续分区(不能在现有表上修改,只能重新查出数据到新表) +alter table song add partition ( + PARTITION p1 VALUES LESS THAN ( 50000000), + PARTITION p2 VALUES LESS THAN (1000000000), + PARTITION p3 VALUES LESS THAN (1500000000), + PARTITION p4 VALUES LESS THAN (2000000000), + PARTITION p5 VALUES LESS THAN MAXVALUE +); + + + +SQL文件说明 +sql/structure.sql 中的SQL为最简,不包含字段的编码集 +sql/neteasemusic.sql 中的SQL为数据库导出,包含字段的编码集 +项目数据库 CHARACTER SET 统一使用 'utf8mb4',COLLATE 统一使用 'utf8mb4_general_ci' + + + + + +# # 查看列表 +# screen -ls + +# # 创建一个screen +# screen + + +# # 切换到指定屏幕 +# screen -r + +# # 切出屏幕 # Ctrl + A D \ No newline at end of file diff --git a/netease_music/打印SQL.js b/netease_music/manual-script/打印SQL.js similarity index 97% rename from netease_music/打印SQL.js rename to netease_music/manual-script/打印SQL.js index 28397dc..b20074e 100644 --- a/netease_music/打印SQL.js +++ b/netease_music/manual-script/打印SQL.js @@ -1,40 +1,40 @@ - -// const mysql = require('mysql'); -// await new Promise(function (resolve, reject) { -// //通过MySQL中方法创建连接对象 -// var connection = mysql.createConnection({ -// "charset": "utf8mb4", -// "host": "localhost", -// "user": "root", -// "password": "123456", -// "port": 3306, -// "database": "" -// }); -// //开始连接 -// connection.connect(); -// var sql = ` -// INSERT INTO comment ( comment_id, parent_comment_id, user_id, song_id, content, time, like_count, comment_type ) VALUES ? -// ON DUPLICATE KEY UPDATE content = VALUES(content), like_count = VALUES(like_count), comment_type = GREATEST(comment_type, VALUES(comment_type)), modify_time = CURRENT_TIMESTAMP -// `; -// var params = commentInfoList.map(commentInfo => [ -// commentInfo.comment_id, -// commentInfo.parent_comment_id, -// commentInfo.user_id, -// commentInfo.song_id, -// commentInfo.content, -// commentInfo.time, -// commentInfo.like_count, -// commentInfo.comment_type -// ]); -// var formattedSql = connection.format(sql, [params]); // 返回一个格式化后的SQL字符串 -// console.log(params); // 打印原始SQL语句 -// console.log(formattedSql); // 打印原始SQL语句 -// //最后需要关闭连接 -// connection.end(); -// }); -// process.exit(0); - - -// node index --utils comment --min 1935500000 --max 1935550000 --limit 10 - - + +// const mysql = require('mysql'); +// await new Promise(function (resolve, reject) { +// //通过MySQL中方法创建连接对象 +// var connection = mysql.createConnection({ +// "charset": "utf8mb4", +// "host": "localhost", +// "user": "root", +// "password": "123456", +// "port": 3306, +// "database": "" +// }); +// //开始连接 +// connection.connect(); +// var sql = ` +// INSERT INTO comment ( comment_id, parent_comment_id, user_id, song_id, content, time, like_count, comment_type ) VALUES ? +// ON DUPLICATE KEY UPDATE content = VALUES(content), like_count = VALUES(like_count), comment_type = GREATEST(comment_type, VALUES(comment_type)), modify_time = CURRENT_TIMESTAMP +// `; +// var params = commentInfoList.map(commentInfo => [ +// commentInfo.comment_id, +// commentInfo.parent_comment_id, +// commentInfo.user_id, +// commentInfo.song_id, +// commentInfo.content, +// commentInfo.time, +// commentInfo.like_count, +// commentInfo.comment_type +// ]); +// var formattedSql = connection.format(sql, [params]); // 返回一个格式化后的SQL字符串 +// console.log(params); // 打印原始SQL语句 +// console.log(formattedSql); // 打印原始SQL语句 +// //最后需要关闭连接 +// connection.end(); +// }); +// process.exit(0); + + +// node index --utils comment --min 1935500000 --max 1935550000 --limit 10 + +