1
0
Code Issues Pull Requests Projects Releases Wiki Activity GitHub Gitee

更新 netease_music 脚本

This commit is contained in:
程序员小墨 2023-12-25 16:46:39 +08:00
parent e3ea397f03
commit fc46e76d74
10 changed files with 412 additions and 371 deletions

View File

@ -1,10 +0,0 @@
@REM Open one new cmd window per `--utils` task; /k keeps each window open after its command finishes.
start cmd /k "node index --utils assistant"
start cmd /k "node index --utils song"
start cmd /k "node index --utils album --limit 10000"
start cmd /k "node index --utils artist --limit 10000"
start cmd /k "node index --utils comment --limit 10000"
start cmd /k "node index --utils lyric --limit 10000"
@REM The playlist task is disabled (kept below for reference).
@REM start cmd /k "node index --utils playlist"
@REM Close this launcher shell; the windows started above keep running.
exit

View File

@ -0,0 +1 @@
comment id segment.txt

View File

@ -1,9 +1,9 @@
REPLACE INTO `comment_origin_` SELECT * FROM `comment`; -- 80.1G
REPLACE INTO `user_origin_` SELECT * FROM `user`; -- 5.20G
DELETE `comment`
FROM `comment` INNER JOIN `comment_origin_` ON `comment`.comment_id = `comment_origin_`.comment_id;
DELETE `user`
-- Archive step: copy every row of `comment` and `user` into the matching `*_origin_` table.
-- REPLACE INTO overwrites an archive row that has the same primary/unique key.
REPLACE INTO `comment_origin_` SELECT * FROM `comment`; -- 80.1G
REPLACE INTO `user_origin_` SELECT * FROM `user`; -- 5.20G
-- Purge step: delete from the working table every row whose id now exists in the archive.
DELETE `comment`
FROM `comment` INNER JOIN `comment_origin_` ON `comment`.comment_id = `comment_origin_`.comment_id;
DELETE `user`
FROM `user` INNER JOIN `user_origin_` ON `user`.user_id = `user_origin_`.user_id;

View File

@ -2,20 +2,16 @@
# cd ./netease_music
cd tools/netease_music/
# [IN PROGRESS]
# Each line below opens a separate cmd window running one `--utils` task.
start cmd /k "node index --utils assistant"
start cmd /k "node index --utils song"
start cmd /k "node index --utils artist --limit 50000"
start cmd /k "node index --utils album --limit 50000"
start cmd /k "node index --utils lyric"
start cmd /k "node index --utils lyric --limit 10000"
# start cmd /k "node index --utils comment --limit 10000"
start cmd /k "node index --utils playlist"
exit
# Bring in the incremental data
# node index --utils lyric --limit 500 --order desc
# node index --utils lyric --limit 500
# lyric_5
# [IN PROGRESS]
@ -99,39 +95,54 @@ node index --utils comment --min 2000000 --max 2500000 --limit 10000
# Comment-crawler runs split by --min/--max range (presumably song-id bounds; confirm in index.js).
# A trailing & runs the command in the background; the bracketed tags note which machine runs the group.
node index --utils comment --min 2500000 --max 3000000 --limit 10000
node index --utils comment --min 3000000 --max 3500000 --limit 10000
# comment_3
# comment_3 (configuration still to be updated)
# [running on Alibaba Cloud]
node index --utils comment --min 3500000 --max 4000000 --limit 10000 &
node index --utils comment --min 4000000 --max 4500000 --limit 10000 &
node index --utils comment --min 4500000 --max 5000000 --limit 10000 &
node index --utils comment --min 5000000 --max 5500000 --limit 10000 &
node index --utils comment --min 5500000 --max 6000000 --limit 10000 &
# node index --utils comment --min 5500000 --max 6000000 --limit 10000
# comment_4
# [running on Alibaba Cloud]
node index --utils comment --min 6000000 --max 6500000 --limit 10000 &
node index --utils comment --min 6500000 --max 7000000 --limit 10000 &
node index --utils comment --min 7000000 --max 7500000 --limit 10000 &
node index --utils comment --min 7500000 --max 8000000 --limit 10000 &
# comment_4 (configuration still to be updated)
# comment_5
node index --utils comment --min 8000000 --max 8500000 --limit 10000
node index --utils comment --min 8500000 --max 9000000 --limit 10000
# comment_5 (configuration still to be updated)
# [running on the office PC]
# node index --utils comment --min 6000000 --max 9000000 --limit 10000
node index --utils comment --min 9000000 --max 9500000 --limit 10000
node index --utils comment --min 9500000 --max 10000000 --limit 10000
# node index --utils comment --min 9500000 --max 10000000 --limit 10000
# comment_n
# [running on the office PC]
node index --utils comment --min 10000000 --max 20000000 --limit 10000
node index --utils comment --min 20000000 --max 30000000 --limit 10000
node index --utils comment --min 30000000 --max 40000000 --limit 10000
# node index --utils comment --min 40000000 --max 50000000 --limit 10000
node index --utils comment --min 50000000 --max 500000000 --limit 10000
# comment_2n
node index --utils comment --min 1000000000 --max 1500000000 --limit 10000
node index --utils comment --min 1500000000 --max 2000000000 --limit 10000
node index --utils comment --min 2000000000 --max 2500000000 --limit 10000
node index --utils comment --min 2500000000 --limit 10000
# comment_2n_1 (configuration still to be updated)
# [running on the office PC]
# node index --utils comment --min 1000000000 --max 1100000000 --limit 10000
# node index --utils comment --min 1100000000 --max 1200000000 --limit 10000
node index --utils comment --min 1200000000 --max 1300000000 --limit 10000
node index --utils comment --min 1300000000 --max 1400000000 --limit 10000
node index --utils comment --min 1400000000 --max 1500000000 --limit 10000
# comment_2n_2 (configuration still to be updated)
# [running on the phone]
node index --utils comment --min 1500000000 --max 1600000000 --limit 10000 &
node index --utils comment --min 1600000000 --max 1700000000 --limit 10000 &
node index --utils comment --min 1700000000 --max 1800000000 --limit 10000 &
node index --utils comment --min 1800000000 --max 1900000000 --limit 10000 &
node index --utils comment --min 1900000000 --max 2000000000 --limit 10000 &
# comment_2n_3 (configuration still to be updated)
# [running on Alibaba Cloud]
node index --utils comment --min 2000000000 --max 2100000000 --limit 10000 &
node index --utils comment --min 2100000000 --max 2200000000 --limit 10000 &
# node index --utils comment --min 2200000000 --max 2300000000 --limit 10000
# node index --utils comment --min 2300000000 --max 2400000000 --limit 10000
# node index --utils comment --min 2400000000 --max 2500000000 --limit 10000
# node index --utils comment --min 2500000000 --limit 10000
# # Still to be sorted out: 2000000 - 1999000000
# start cmd /k "node index --utils comment --limit 10000 --min --max " #

View File

@ -1,80 +1,103 @@
// Expands every base value listed in `a` into `splitCount` evenly spaced
// sub-segment start values (spacing `step`) and prints them one per line.
// Within each base the values are emitted from the highest offset down to 0.
const a = `1990000000
1980000000
1970000000
1960000000
1950000000
1940000000
1930000000
1920000000
1910000000
1900000000
1890000000
1880000000
1870000000
1860000000
1850000000
1840000000
1830000000
1820000000
1810000000
1800000000
1500000000
1490000000
1480000000
1470000000
1460000000
1450000000
1440000000
1430000000
1420000000
1410000000
1400000000
1390000000
1380000000
1370000000
1360000000
1350000000
1340000000
1330000000
1320000000
1310000000
1300000000
1290000000
860000000
570000000
560000000
550000000
540000000
530000000
520000000
510000000
500000000
490000000
480000000
470000000
460000000
450000000
440000000
430000000
420000000
410000000
400000000
390000000
40000000
30000000
20000000
10000000
0`
const splitCount = 1000
const step = 10000000 / splitCount
const b = []
// Offsets inside one base segment, largest first: (splitCount - 1) * step, ..., step, 0.
const offsets = Array.from({ length: splitCount }, (_, k) => (splitCount - 1 - k) * step)
for (const line of a.split('\n')) {
  const base = Number(line)
  for (const offset of offsets) {
    b.push(base + offset)
  }
}
console.log(b.join('\n'))
// -- Show the distribution of comments that still need to be crawled
// SELECT cast( FLOOR( song_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
// FROM comment_progress
// WHERE current_status != 2
// GROUP BY s
// ORDER BY s DESC;
// `a` holds the segment list obtained by running the SQL above.
const a = `2110000000
2100000000
2090000000
2080000000
2070000000
2060000000
2050000000
2040000000
2030000000
2020000000
2010000000
2000000000
1990000000
1980000000
1970000000
1960000000
1950000000
1940000000
1930000000
1920000000
1910000000
1900000000
1890000000
1880000000
1870000000
1860000000
1850000000
1840000000
1830000000
1820000000
1810000000
1800000000
1500000000
1490000000
1480000000
1470000000
1460000000
1450000000
1440000000
1430000000
1420000000
1410000000
1400000000
1390000000
1380000000
1370000000
1360000000
1350000000
1340000000
1330000000
1320000000
1310000000
1300000000
1290000000
860000000
570000000
560000000
550000000
540000000
530000000
520000000
510000000
500000000
490000000
480000000
470000000
460000000
450000000
440000000
430000000
420000000
410000000
400000000
390000000
30000000
20000000
10000000
0`
// Number of sub-segments generated per listed base value, and their spacing.
const splitCount = 1
const step = 10000000 / splitCount
// For each base value emit `splitCount` start values, highest offset first.
const b = a.split('\n').flatMap(line => {
  const base = Number(line)
  return Array.from({ length: splitCount }, (_, k) => base + (splitCount - 1 - k) * step)
})
// Write the generated segment values, one per line, to a text file.
// The relative path is resolved against the current working directory.
const fs = require('fs')
const content = b.join('\n')
// console.log(content)
fs.writeFileSync('comment id segment.txt', content, { encoding: 'utf-8' })

View File

@ -1,136 +1,147 @@
-- 更新统计数据
-- songCount 容易超时,有几张表查询时容易发生死锁,所以请在没有爬取时进行统计
-- 4G: 4294967296 (4 * 1024 * 1024 * 1024) 64M: 67108864
-- my.ini 配置文件中设置 innodb_buffer_pool_size=4G
show variables like "%innodb_buffer_pool_size%";
DELETE FROM analysis WHERE `key` LIKE '%_old';
UPDATE analysis SET `key`=concat(`key`,'_old'), modify_time=modify_time WHERE `key` NOT LIKE '%_old';
INSERT INTO analysis (`key`, `value`) VALUES ('songCount', (SELECT count(*) as count FROM song) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('songWaiting', (SELECT count(*) as count FROM wait_fetch_song) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('playlistCount', (SELECT count(*) AS count FROM playlist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('albumCount', (SELECT count(*) as count FROM album) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('albumWaiting', (SELECT count(*) as count FROM wait_fetch_album) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('artistCount', (SELECT count(*) AS count FROM artist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('artistWaiting', (SELECT count(*) as count FROM wait_fetch_artist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('lyricCount', (SELECT count(*) AS count FROM lyric) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('commentCount', (SELECT count( DISTINCT song_id ) AS count FROM comment) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('commentTotalCount', (SELECT count(*) AS count FROM comment) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('userCount', (SELECT count(*) AS count FROM user) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('songPlaylistCount', (SELECT count(*) AS count FROM song_playlist_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('songAlbumCount', (SELECT count(*) AS count FROM song_album_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('songArtistCount', (SELECT count(*) AS count FROM song_artist_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
-- 更新后初次全表扫描
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_artist (id) SELECT artist_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_album (id) SELECT album_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
-- 全量更新
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_artist_relation WHERE song_id NOT IN ( SELECT song_id FROM song );
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_album_relation WHERE song_id NOT IN ( SELECT song_id FROM song );
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_playlist_relation WHERE song_id NOT IN ( SELECT song_id FROM song );
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM lyric );
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM comment_progress );
INSERT IGNORE INTO wait_check_artist (id) SELECT artist_id FROM song_artist_relation WHERE artist_id NOT IN ( SELECT artist_id FROM artist );
INSERT IGNORE INTO wait_check_album (id) SELECT album_id FROM song_album_relation WHERE album_id NOT IN ( SELECT album_id FROM album );
-- 查看需要爬取的 song 的分布
SELECT cast( FLOOR( id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM wait_fetch_song
GROUP BY s
ORDER BY s DESC;
-- 查看需要爬取的 album 的分布
SELECT cast( FLOOR( id / 1000000 ) * 1000000 as UNSIGNED ) as s, count(*) as count
FROM wait_fetch_album
GROUP BY s
ORDER BY s DESC;
-- 查看需要爬取的 artist 的分布
SELECT cast( FLOOR(id / 100000 ) * 100000 as UNSIGNED ) as s, count(*) as count
FROM wait_fetch_artist
GROUP BY s
ORDER BY s DESC;
-- 查看需要爬取的 comment 的分布
SELECT cast( FLOOR( song_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM comment_progress
WHERE current_status != 2
GROUP BY s
ORDER BY s DESC;
-- 查看需要爬取的 lyric 的分布
SELECT cast( FLOOR( id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM wait_fetch_lyric
GROUP BY s
ORDER BY s DESC;
-- 查看本地已有 song 的分布
SELECT cast( FLOOR( song_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM song
GROUP BY s
ORDER BY s DESC;
-- 查看本地已有 user 的分布
SELECT cast( FLOOR( user_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM user
GROUP BY s
ORDER BY s DESC;
-- 查看本地已有 album 的分布
SELECT cast( FLOOR( album_id / 1000000 ) * 1000000 as UNSIGNED ) as s, count(*) as count
FROM album
GROUP BY s
ORDER BY s DESC;
-- 查看本地已有 artist 的分布
SELECT cast( FLOOR( artist_id / 2000000 ) * 2000000 as UNSIGNED ) as s, count(*) as count
FROM artist
GROUP BY s
ORDER BY s DESC;
-- 查看本地已有 playlist 的分布
SELECT cast( FLOOR( playlist_id / 2000000 ) * 2000000 as UNSIGNED ) as s, count(*) as count
FROM playlist
GROUP BY s
ORDER BY s DESC;
-- 查询单个数据库里面各个表所占磁盘空间大小包括其索引的大小
SELECT
table_schema AS '数据库',
table_name AS '表名',
table_rows AS '记录数',
TRUNCATE (data_length / 1024 / 1024, 2) AS '数据容量(MB)',
TRUNCATE (index_length / 1024 / 1024, 2) AS '索引容量(MB)',
TRUNCATE ((data_length + index_length) / 1024 / 1024 / 1024, 2) AS '总容量(GB)'
FROM
information_schema.TABLES
WHERE
table_schema = 'neteasemusic'
ORDER BY
table_rows DESC;
-- Refresh the statistics stored in the analysis table
-- songCount tends to time out and several tables are prone to deadlocks when queried, so run these statistics only while no crawl is in progress
-- 4G: 4294967296 (4 * 1024 * 1024 * 1024) 64M: 67108864
-- Set innodb_buffer_pool_size=4G in the my.ini configuration file
show variables like "%innodb_buffer_pool_size%";
-- Drop the previous '*_old' snapshot, then rename every current key to '<key>_old'.
-- modify_time=modify_time assigns the column to itself (presumably to keep an auto-updating timestamp unchanged; confirm against the table definition).
DELETE FROM analysis WHERE `key` LIKE '%_old';
UPDATE analysis SET `key`=concat(`key`,'_old'), modify_time=modify_time WHERE `key` NOT LIKE '%_old';
-- Recompute each counter; ON DUPLICATE KEY UPDATE overwrites the value when the key already exists.
INSERT INTO analysis (`key`, `value`) VALUES ('songCount', (SELECT count(*) as count FROM song) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('songWaiting', (SELECT count(*) as count FROM wait_fetch_song) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('playlistCount', (SELECT count(*) AS count FROM playlist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('albumCount', (SELECT count(*) as count FROM album) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('albumWaiting', (SELECT count(*) as count FROM wait_fetch_album) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('artistCount', (SELECT count(*) AS count FROM artist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('artistWaiting', (SELECT count(*) as count FROM wait_fetch_artist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('lyricCount', (SELECT count(*) AS count FROM lyric) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
-- commentCount = number of distinct song_id values in comment; commentTotalCount = number of comment rows.
INSERT INTO analysis (`key`, `value`) VALUES ('commentCount', (SELECT count( DISTINCT song_id ) AS count FROM comment) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('commentTotalCount', (SELECT count(*) AS count FROM comment) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('userCount', (SELECT count(*) AS count FROM user) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('songPlaylistCount', (SELECT count(*) AS count FROM song_playlist_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('songAlbumCount', (SELECT count(*) AS count FROM song_album_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('songArtistCount', (SELECT count(*) AS count FROM song_artist_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
-- First full-table scan after an update:
-- queue the ids found in rows created after 2022-10-28 into the wait_check_* tables.
-- INSERT IGNORE skips rows that would violate a key (presumably ids that are already queued).
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_artist (id) SELECT artist_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_album (id) SELECT album_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
-- Full refresh: queue every referenced id that has no row yet in its target table
-- (song, lyric, comment_progress, artist, album respectively).
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_artist_relation WHERE song_id NOT IN ( SELECT song_id FROM song );
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_album_relation WHERE song_id NOT IN ( SELECT song_id FROM song );
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_playlist_relation WHERE song_id NOT IN ( SELECT song_id FROM song );
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM lyric );
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM comment_progress );
INSERT IGNORE INTO wait_check_artist (id) SELECT artist_id FROM song_artist_relation WHERE artist_id NOT IN ( SELECT artist_id FROM artist );
INSERT IGNORE INTO wait_check_album (id) SELECT album_id FROM song_album_relation WHERE album_id NOT IN ( SELECT album_id FROM album );
-- Each query below buckets ids by flooring them to a multiple of the bucket width
-- and returns the row count per bucket, highest bucket first.
-- Distribution of songs waiting to be crawled (bucket width 10,000,000)
SELECT cast( FLOOR( id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM wait_fetch_song
GROUP BY s
ORDER BY s DESC;
-- Distribution of albums waiting to be crawled (bucket width 1,000,000)
SELECT cast( FLOOR( id / 1000000 ) * 1000000 as UNSIGNED ) as s, count(*) as count
FROM wait_fetch_album
GROUP BY s
ORDER BY s DESC;
-- Distribution of artists waiting to be crawled (bucket width 100,000)
SELECT cast( FLOOR(id / 100000 ) * 100000 as UNSIGNED ) as s, count(*) as count
FROM wait_fetch_artist
GROUP BY s
ORDER BY s DESC;
-- Distribution of comments waiting to be crawled: comment_progress rows whose current_status is not 2 (bucket width 10,000,000)
SELECT cast( FLOOR( song_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM comment_progress
WHERE current_status != 2
GROUP BY s
ORDER BY s DESC;
-- Distribution of lyrics waiting to be crawled (bucket width 10,000,000)
SELECT cast( FLOOR( id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM wait_fetch_lyric
GROUP BY s
ORDER BY s DESC;
-- Distribution of songs already stored locally (bucket width 10,000,000)
SELECT cast( FLOOR( song_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM song
GROUP BY s
ORDER BY s DESC;
-- Distribution of users already stored locally (bucket width 10,000,000)
SELECT cast( FLOOR( user_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM user
GROUP BY s
ORDER BY s DESC;
-- Distribution of albums already stored locally (bucket width 1,000,000)
SELECT cast( FLOOR( album_id / 1000000 ) * 1000000 as UNSIGNED ) as s, count(*) as count
FROM album
GROUP BY s
ORDER BY s DESC;
-- Distribution of artists already stored locally (bucket width 2,000,000)
SELECT cast( FLOOR( artist_id / 2000000 ) * 2000000 as UNSIGNED ) as s, count(*) as count
FROM artist
GROUP BY s
ORDER BY s DESC;
-- Distribution of playlists already stored locally (bucket width 2,000,000)
SELECT cast( FLOOR( playlist_id / 2000000 ) * 2000000 as UNSIGNED ) as s, count(*) as count
FROM playlist
GROUP BY s
ORDER BY s DESC;
-- Disk space used by each table of a single database, including the size of its indexes.
-- Sizes are converted from bytes and truncated to two decimals; rows are ordered by table_rows, largest first.
SELECT
table_schema AS '数据库',
table_name AS '表名',
table_rows AS '记录数',
TRUNCATE (data_length / 1024 / 1024, 2) AS '数据容量(MB)',
TRUNCATE (index_length / 1024 / 1024, 2) AS '索引容量(MB)',
TRUNCATE ((data_length + index_length) / 1024 / 1024 / 1024, 2) AS '总容量(GB)'
FROM
information_schema.TABLES
WHERE
table_schema = 'neteasemusic'
ORDER BY
table_rows DESC;
-- Count the records still waiting to be crawled (2023.12.25)
-- One result row per kind; 'comment' counts comment_progress rows with current_status = 0,
-- the other kinds count every row of their wait_fetch_* table.
SELECT 'comment' as wait_fetch, count(*) as `count` FROM `comment_progress` where current_status = 0
UNION ALL
SELECT 'album', count(*) FROM `wait_fetch_album`
UNION ALL
SELECT 'artist', count(*) FROM `wait_fetch_artist`
UNION ALL
SELECT 'lyric', count(*) FROM `wait_fetch_lyric`

View File

@ -1,74 +1,79 @@
windows服务器
cd C:\Users\Administrator\Desktop\tools\netease_music
linux服务器
cd /www/neteasemusic/tools
本地库测试
node index --database neteasemusic_develop --utils song
node index --database neteasemusic_develop --utils album --min 10000000
node index --database neteasemusic_develop --utils album --order desc
node index --database neteasemusic_develop --utils artist
node index --database neteasemusic_develop --utils playlist
node index --database neteasemusic_develop --utils comment --limit 10000
node index --database neteasemusic_develop --utils lyric
node index --database neteasemusic_develop --utils assistant
思路:
通过一首歌查出对应的artist和album然后顺藤摸瓜查出网易云的其他song, album, artist, lyric, comment等
插入rel表的时候同时插入 wait_check_xx 表,然后后续检查这个表,如果不存在,那么就插入对应的 wait_fetch_xxx 表
之后查出 wait_fetch_xxx 表,进行数据拉取,形成闭环
后期:
歌单定时更新rel表中添加一个del字段先将歌单下面的全部置为删除状态再插入的时候把已有歌曲的标记重新修改为正常状态
评论的更新
被删除的aritst和album回头再通过其他表中的数据反查回来
歌曲目前爬取之后会有一部分没有image封面还是需要用旧方法爬取到
说明:
song表中data_version=1的音乐是第一次爬取的时候存在但是后面再爬取时不存在的音乐
后续分区(不能在现有表上修改,只能重新查出数据到新表)
alter table song add partition (
PARTITION p1 VALUES LESS THAN ( 50000000),
PARTITION p2 VALUES LESS THAN (1000000000),
PARTITION p3 VALUES LESS THAN (1500000000),
PARTITION p4 VALUES LESS THAN (2000000000),
PARTITION p5 VALUES LESS THAN MAXVALUE
);
SQL文件说明
sql/structure.sql 中的SQL为最简不包含字段的编码集
sql/neteasemusic.sql 中的SQL为数据库导出包含字段的编码集
项目数据库 CHARACTER SET 统一使用 'utf8mb4'COLLATE 统一使用 'utf8mb4_general_ci'
# # 查看列表
# screen -ls
# # 创建一个screen
# screen + <Enter>
# # 切换到指定屏幕
# screen -r <screen_id>
# # 切出屏幕
windows服务器
cd C:\Users\Administrator\Desktop\tools\netease_music
linux服务器
cd /www/neteasemusic/tools
手机 Termux
pkg update
pkg install git
pkg install nodejs
本地库测试
node index --database neteasemusic_develop --utils song
node index --database neteasemusic_develop --utils album --min 10000000
node index --database neteasemusic_develop --utils album --order desc
node index --database neteasemusic_develop --utils artist
node index --database neteasemusic_develop --utils playlist
node index --database neteasemusic_develop --utils comment --limit 10000
node index --database neteasemusic_develop --utils lyric
node index --database neteasemusic_develop --utils assistant
思路:
通过一首歌查出对应的artist和album,然后顺藤摸瓜查出网易云的其他song, album, artist, lyric, comment等
插入rel表的时候同时插入 wait_check_xx 表,然后后续检查这个表,如果不存在,那么就插入对应的 wait_fetch_xxx 表
之后查出 wait_fetch_xxx 表,进行数据拉取,形成闭环
后期:
歌单定时更新:rel表中添加一个del字段,先将歌单下面的全部置为删除状态,再插入的时候把已有歌曲的标记重新修改为正常状态
评论的更新
被删除的artist和album,回头再通过其他表中的数据反查回来
歌曲目前爬取之后会有一部分没有image封面,还是需要用旧方法爬取到
说明:
song表中data_version=1的音乐,是第一次爬取的时候存在、但是后面再爬取时不存在的音乐
后续分区(不能在现有表上修改,只能重新查出数据到新表)
alter table song add partition (
PARTITION p1 VALUES LESS THAN ( 50000000),
PARTITION p2 VALUES LESS THAN (1000000000),
PARTITION p3 VALUES LESS THAN (1500000000),
PARTITION p4 VALUES LESS THAN (2000000000),
PARTITION p5 VALUES LESS THAN MAXVALUE
);
SQL文件说明
sql/structure.sql 中的SQL为最简版本,不包含字段的编码集
sql/neteasemusic.sql 中的SQL为数据库导出,包含字段的编码集
项目数据库 CHARACTER SET 统一使用 'utf8mb4',COLLATE 统一使用 'utf8mb4_general_ci'
# # 查看列表
# screen -ls
# # 创建一个screen
# screen + <Enter>
# # 切换到指定屏幕
# screen -r <screen_id>
# # 切出屏幕
# Ctrl + A D

View File

@ -1,40 +1,40 @@
// const mysql = require('mysql');
// await new Promise(function (resolve, reject) {
// //通过MySQL中方法创建连接对象
// var connection = mysql.createConnection({
// "charset": "utf8mb4",
// "host": "localhost",
// "user": "root",
// "password": "123456",
// "port": 3306,
// "database": ""
// });
// //开始连接
// connection.connect();
// var sql = `
// INSERT INTO comment ( comment_id, parent_comment_id, user_id, song_id, content, time, like_count, comment_type ) VALUES ?
// ON DUPLICATE KEY UPDATE content = VALUES(content), like_count = VALUES(like_count), comment_type = GREATEST(comment_type, VALUES(comment_type)), modify_time = CURRENT_TIMESTAMP
// `;
// var params = commentInfoList.map(commentInfo => [
// commentInfo.comment_id,
// commentInfo.parent_comment_id,
// commentInfo.user_id,
// commentInfo.song_id,
// commentInfo.content,
// commentInfo.time,
// commentInfo.like_count,
// commentInfo.comment_type
// ]);
// var formattedSql = connection.format(sql, [params]); // 返回一个格式化后的SQL字符串
// console.log(params); // 打印原始SQL语句
// console.log(formattedSql); // 打印原始SQL语句
// //最后需要关闭连接
// connection.end();
// });
// process.exit(0);
// node index --utils comment --min 1935500000 --max 1935550000 --limit 10
// Disabled debugging snippet: builds the bulk comment upsert with the `mysql`
// client and prints the bound rows and the formatted SQL; it never runs the query.
// NOTE(review): contains hard-coded local root credentials — replace them before re-enabling.
// const mysql = require('mysql');
// await new Promise(function (resolve, reject) {
// // Create the connection object through the mysql module
// var connection = mysql.createConnection({
// "charset": "utf8mb4",
// "host": "localhost",
// "user": "root",
// "password": "123456",
// "port": 3306,
// "database": ""
// });
// // Open the connection
// connection.connect();
// var sql = `
// INSERT INTO comment ( comment_id, parent_comment_id, user_id, song_id, content, time, like_count, comment_type ) VALUES ?
// ON DUPLICATE KEY UPDATE content = VALUES(content), like_count = VALUES(like_count), comment_type = GREATEST(comment_type, VALUES(comment_type)), modify_time = CURRENT_TIMESTAMP
// `;
// var params = commentInfoList.map(commentInfo => [
// commentInfo.comment_id,
// commentInfo.parent_comment_id,
// commentInfo.user_id,
// commentInfo.song_id,
// commentInfo.content,
// commentInfo.time,
// commentInfo.like_count,
// commentInfo.comment_type
// ]);
// var formattedSql = connection.format(sql, [params]); // returns the formatted SQL string
// console.log(params); // print the parameter rows
// console.log(formattedSql); // print the formatted SQL statement
// // Close the connection when done
// connection.end();
// });
// process.exit(0);
// node index --utils comment --min 1935500000 --max 1935550000 --limit 10