更新 netease_music 脚本
This commit is contained in:
parent
e3ea397f03
commit
fc46e76d74
@ -1,10 +0,0 @@
|
||||
start cmd /k "node index --utils assistant"
|
||||
|
||||
start cmd /k "node index --utils song"
|
||||
start cmd /k "node index --utils album --limit 10000"
|
||||
start cmd /k "node index --utils artist --limit 10000"
|
||||
start cmd /k "node index --utils comment --limit 10000"
|
||||
start cmd /k "node index --utils lyric --limit 10000"
|
||||
|
||||
@REM start cmd /k "node index --utils playlist"
|
||||
exit
|
1
netease_music/manual-script/.gitignore
vendored
Normal file
1
netease_music/manual-script/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
comment id segment.txt
|
@ -1,9 +1,9 @@
|
||||
REPLACE INTO `comment_origin_` SELECT * FROM `comment`; -- 80.1G
|
||||
REPLACE INTO `user_origin_` SELECT * FROM `user`; -- 5.20G
|
||||
|
||||
|
||||
DELETE `comment`
|
||||
FROM `comment` INNER JOIN `comment_origin_` ON `comment`.comment_id = `comment_origin_`.comment_id;
|
||||
|
||||
DELETE `user`
|
||||
REPLACE INTO `comment_origin_` SELECT * FROM `comment`; -- 80.1G
|
||||
REPLACE INTO `user_origin_` SELECT * FROM `user`; -- 5.20G
|
||||
|
||||
|
||||
DELETE `comment`
|
||||
FROM `comment` INNER JOIN `comment_origin_` ON `comment`.comment_id = `comment_origin_`.comment_id;
|
||||
|
||||
DELETE `user`
|
||||
FROM `user` INNER JOIN `user_origin_` ON `user`.user_id = `user_origin_`.user_id;
|
@ -2,20 +2,16 @@
|
||||
# cd ./netease_music
|
||||
cd tools/netease_music/
|
||||
|
||||
# 【ING】
|
||||
start cmd /k "node index --utils assistant"
|
||||
start cmd /k "node index --utils song"
|
||||
start cmd /k "node index --utils artist --limit 50000"
|
||||
start cmd /k "node index --utils album --limit 50000"
|
||||
start cmd /k "node index --utils lyric"
|
||||
start cmd /k "node index --utils lyric --limit 10000"
|
||||
# start cmd /k "node index --utils comment --limit 10000"
|
||||
|
||||
start cmd /k "node index --utils playlist"
|
||||
|
||||
exit
|
||||
|
||||
# 把增量数据带上来
|
||||
|
||||
# node index --utils lyric --limit 500 --order desc
|
||||
# node index --utils lyric --limit 500
|
||||
|
||||
# lyric_5
|
||||
# 【ING】
|
||||
@ -99,39 +95,54 @@ node index --utils comment --min 2000000 --max 2500000 --limit 10000
|
||||
node index --utils comment --min 2500000 --max 3000000 --limit 10000
|
||||
node index --utils comment --min 3000000 --max 3500000 --limit 10000
|
||||
|
||||
# comment_3
|
||||
# comment_3 配置待更新
|
||||
# 【阿里云ing】
|
||||
node index --utils comment --min 3500000 --max 4000000 --limit 10000 &
|
||||
node index --utils comment --min 4000000 --max 4500000 --limit 10000 &
|
||||
node index --utils comment --min 4500000 --max 5000000 --limit 10000 &
|
||||
node index --utils comment --min 5000000 --max 5500000 --limit 10000 &
|
||||
node index --utils comment --min 5500000 --max 6000000 --limit 10000 &
|
||||
# node index --utils comment --min 5500000 --max 6000000 --limit 10000
|
||||
|
||||
# comment_4
|
||||
# 【阿里云ing】
|
||||
node index --utils comment --min 6000000 --max 6500000 --limit 10000 &
|
||||
node index --utils comment --min 6500000 --max 7000000 --limit 10000 &
|
||||
node index --utils comment --min 7000000 --max 7500000 --limit 10000 &
|
||||
node index --utils comment --min 7500000 --max 8000000 --limit 10000 &
|
||||
# comment_4 配置待更新
|
||||
|
||||
# comment_5
|
||||
node index --utils comment --min 8000000 --max 8500000 --limit 10000
|
||||
node index --utils comment --min 8500000 --max 9000000 --limit 10000
|
||||
# comment_5 配置待更新
|
||||
# 【公司电脑ing】
|
||||
# node index --utils comment --min 6000000 --max 9000000 --limit 10000
|
||||
node index --utils comment --min 9000000 --max 9500000 --limit 10000
|
||||
node index --utils comment --min 9500000 --max 10000000 --limit 10000
|
||||
# node index --utils comment --min 9500000 --max 10000000 --limit 10000
|
||||
|
||||
# comment_n
|
||||
# 【公司电脑ing】
|
||||
node index --utils comment --min 10000000 --max 20000000 --limit 10000
|
||||
node index --utils comment --min 20000000 --max 30000000 --limit 10000
|
||||
node index --utils comment --min 30000000 --max 40000000 --limit 10000
|
||||
# node index --utils comment --min 40000000 --max 50000000 --limit 10000
|
||||
node index --utils comment --min 50000000 --max 500000000 --limit 10000
|
||||
|
||||
# comment_2n
|
||||
node index --utils comment --min 1000000000 --max 1500000000 --limit 10000
|
||||
node index --utils comment --min 1500000000 --max 2000000000 --limit 10000
|
||||
node index --utils comment --min 2000000000 --max 2500000000 --limit 10000
|
||||
node index --utils comment --min 2500000000 --limit 10000
|
||||
# comment_2n_1 配置待更新
|
||||
# 【公司电脑ing】
|
||||
# node index --utils comment --min 1000000000 --max 1100000000 --limit 10000
|
||||
# node index --utils comment --min 1100000000 --max 1200000000 --limit 10000
|
||||
node index --utils comment --min 1200000000 --max 1300000000 --limit 10000
|
||||
node index --utils comment --min 1300000000 --max 1400000000 --limit 10000
|
||||
node index --utils comment --min 1400000000 --max 1500000000 --limit 10000
|
||||
|
||||
# comment_2n_2 配置待更新
|
||||
# 【手机ing】
|
||||
node index --utils comment --min 1500000000 --max 1600000000 --limit 10000 &
|
||||
node index --utils comment --min 1600000000 --max 1700000000 --limit 10000 &
|
||||
node index --utils comment --min 1700000000 --max 1800000000 --limit 10000 &
|
||||
node index --utils comment --min 1800000000 --max 1900000000 --limit 10000 &
|
||||
node index --utils comment --min 1900000000 --max 2000000000 --limit 10000 &
|
||||
|
||||
# comment_2n_3 配置待更新
|
||||
# 【阿里云ing】
|
||||
node index --utils comment --min 2000000000 --max 2100000000 --limit 10000 &
|
||||
node index --utils comment --min 2100000000 --max 2200000000 --limit 10000 &
|
||||
# node index --utils comment --min 2200000000 --max 2300000000 --limit 10000
|
||||
# node index --utils comment --min 2300000000 --max 2400000000 --limit 10000
|
||||
# node index --utils comment --min 2400000000 --max 2500000000 --limit 10000
|
||||
# node index --utils comment --min 2500000000 --limit 10000
|
||||
|
||||
# # 待整理 2000000 - 1999000000
|
||||
# start cmd /k "node index --utils comment --limit 10000 --min --max " #
|
@ -1,80 +1,103 @@
|
||||
let a = `1990000000
|
||||
1980000000
|
||||
1970000000
|
||||
1960000000
|
||||
1950000000
|
||||
1940000000
|
||||
1930000000
|
||||
1920000000
|
||||
1910000000
|
||||
1900000000
|
||||
1890000000
|
||||
1880000000
|
||||
1870000000
|
||||
1860000000
|
||||
1850000000
|
||||
1840000000
|
||||
1830000000
|
||||
1820000000
|
||||
1810000000
|
||||
1800000000
|
||||
1500000000
|
||||
1490000000
|
||||
1480000000
|
||||
1470000000
|
||||
1460000000
|
||||
1450000000
|
||||
1440000000
|
||||
1430000000
|
||||
1420000000
|
||||
1410000000
|
||||
1400000000
|
||||
1390000000
|
||||
1380000000
|
||||
1370000000
|
||||
1360000000
|
||||
1350000000
|
||||
1340000000
|
||||
1330000000
|
||||
1320000000
|
||||
1310000000
|
||||
1300000000
|
||||
1290000000
|
||||
860000000
|
||||
570000000
|
||||
560000000
|
||||
550000000
|
||||
540000000
|
||||
530000000
|
||||
520000000
|
||||
510000000
|
||||
500000000
|
||||
490000000
|
||||
480000000
|
||||
470000000
|
||||
460000000
|
||||
450000000
|
||||
440000000
|
||||
430000000
|
||||
420000000
|
||||
410000000
|
||||
400000000
|
||||
390000000
|
||||
40000000
|
||||
30000000
|
||||
20000000
|
||||
10000000
|
||||
0`
|
||||
|
||||
// Number of equal sub-segments each value listed in `a` is divided into.
const splitCount = 1000
|
||||
// Width of one sub-segment; each value in `a` is treated as the start of a 10000000-wide range.
const step = 10000000 / splitCount
|
||||
|
||||
// Collected sub-segment start values, in output order.
let b = []
|
||||
// `a` (defined earlier in this file) is a newline-separated list of segment start values.
a.split('\n')
|
||||
.map(i => Number(i))
|
||||
.forEach(n => {
|
||||
// Count down so that, within one segment, the highest sub-segment start is pushed first.
for (let i = splitCount; i > 0; i--) {
|
||||
b.push(Number(n) + (i - 1) * step)
|
||||
}
|
||||
});
|
||||
// Print the resulting start values, one per line.
console.log(b.join('\n'))
|
||||
// -- 查看需要爬取的 comment 的分布
|
||||
// SELECT cast( FLOOR( song_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
|
||||
// FROM comment_progress
|
||||
// WHERE current_status != 2
|
||||
// GROUP BY s
|
||||
// ORDER BY s DESC;
|
||||
|
||||
// 变量 a 为通过执行以上SQL获取的分段
|
||||
let a = `2110000000
|
||||
2100000000
|
||||
2090000000
|
||||
2080000000
|
||||
2070000000
|
||||
2060000000
|
||||
2050000000
|
||||
2040000000
|
||||
2030000000
|
||||
2020000000
|
||||
2010000000
|
||||
2000000000
|
||||
1990000000
|
||||
1980000000
|
||||
1970000000
|
||||
1960000000
|
||||
1950000000
|
||||
1940000000
|
||||
1930000000
|
||||
1920000000
|
||||
1910000000
|
||||
1900000000
|
||||
1890000000
|
||||
1880000000
|
||||
1870000000
|
||||
1860000000
|
||||
1850000000
|
||||
1840000000
|
||||
1830000000
|
||||
1820000000
|
||||
1810000000
|
||||
1800000000
|
||||
1500000000
|
||||
1490000000
|
||||
1480000000
|
||||
1470000000
|
||||
1460000000
|
||||
1450000000
|
||||
1440000000
|
||||
1430000000
|
||||
1420000000
|
||||
1410000000
|
||||
1400000000
|
||||
1390000000
|
||||
1380000000
|
||||
1370000000
|
||||
1360000000
|
||||
1350000000
|
||||
1340000000
|
||||
1330000000
|
||||
1320000000
|
||||
1310000000
|
||||
1300000000
|
||||
1290000000
|
||||
860000000
|
||||
570000000
|
||||
560000000
|
||||
550000000
|
||||
540000000
|
||||
530000000
|
||||
520000000
|
||||
510000000
|
||||
500000000
|
||||
490000000
|
||||
480000000
|
||||
470000000
|
||||
460000000
|
||||
450000000
|
||||
440000000
|
||||
430000000
|
||||
420000000
|
||||
410000000
|
||||
400000000
|
||||
390000000
|
||||
30000000
|
||||
20000000
|
||||
10000000
|
||||
0`
|
||||
|
||||
// Number of equal sub-segments each value listed in `a` is divided into (1 keeps every segment whole).
const splitCount = 1
|
||||
// Width of one sub-segment; each value in `a` is treated as the start of a 10000000-wide range.
const step = 10000000 / splitCount
|
||||
|
||||
// Collected sub-segment start values, in output order.
let b = []
|
||||
// `a` (defined earlier in this file) is a newline-separated list of segment start values.
a.split('\n')
|
||||
.map(i => Number(i))
|
||||
.forEach(n => {
|
||||
// Count down so that, within one segment, the highest sub-segment start is pushed first.
for (let i = splitCount; i > 0; i--) {
|
||||
b.push(Number(n) + (i - 1) * step)
|
||||
}
|
||||
});
|
||||
// Join the start values, one per line.
let content = b.join('\n')
|
||||
// console.log(content)
|
||||
|
||||
const fs = require('fs')
|
||||
// Write the list synchronously to 'comment id segment.txt' (a relative path).
fs.writeFileSync('comment id segment.txt', content, 'utf-8')
|
@ -1,136 +1,147 @@
|
||||
-- 更新统计数据
|
||||
-- songCount 容易超时,有几张表查询时容易发生死锁,所以请在没有爬取时进行统计
|
||||
-- 4G: 4294967296 (4 * 1024 * 1024 * 1024) 64M: 67108864
|
||||
-- my.ini 配置文件中设置 innodb_buffer_pool_size=4G
|
||||
show variables like "%innodb_buffer_pool_size%";
|
||||
DELETE FROM analysis WHERE `key` LIKE '%_old';
|
||||
UPDATE analysis SET `key`=concat(`key`,'_old'), modify_time=modify_time WHERE `key` NOT LIKE '%_old';
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('songCount', (SELECT count(*) as count FROM song) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('songWaiting', (SELECT count(*) as count FROM wait_fetch_song) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('playlistCount', (SELECT count(*) AS count FROM playlist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('albumCount', (SELECT count(*) as count FROM album) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('albumWaiting', (SELECT count(*) as count FROM wait_fetch_album) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('artistCount', (SELECT count(*) AS count FROM artist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('artistWaiting', (SELECT count(*) as count FROM wait_fetch_artist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('lyricCount', (SELECT count(*) AS count FROM lyric) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('commentCount', (SELECT count( DISTINCT song_id ) AS count FROM comment) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('commentTotalCount', (SELECT count(*) AS count FROM comment) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('userCount', (SELECT count(*) AS count FROM user) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('songPlaylistCount', (SELECT count(*) AS count FROM song_playlist_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('songAlbumCount', (SELECT count(*) AS count FROM song_album_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('songArtistCount', (SELECT count(*) AS count FROM song_artist_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
|
||||
|
||||
|
||||
-- 更新后初次全表扫描
|
||||
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
|
||||
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song WHERE create_time > '2022-10-28 00:00:00';
|
||||
|
||||
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song WHERE create_time > '2022-10-28 00:00:00';
|
||||
|
||||
INSERT IGNORE INTO wait_check_artist (id) SELECT artist_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
|
||||
INSERT IGNORE INTO wait_check_album (id) SELECT album_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
|
||||
|
||||
|
||||
-- 全量更新
|
||||
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_artist_relation WHERE song_id NOT IN ( SELECT song_id FROM song );
|
||||
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_album_relation WHERE song_id NOT IN ( SELECT song_id FROM song );
|
||||
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_playlist_relation WHERE song_id NOT IN ( SELECT song_id FROM song );
|
||||
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM lyric );
|
||||
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM comment_progress );
|
||||
INSERT IGNORE INTO wait_check_artist (id) SELECT artist_id FROM song_artist_relation WHERE artist_id NOT IN ( SELECT artist_id FROM artist );
|
||||
INSERT IGNORE INTO wait_check_album (id) SELECT album_id FROM song_album_relation WHERE album_id NOT IN ( SELECT album_id FROM album );
|
||||
|
||||
|
||||
|
||||
-- 查看需要爬取的 song 的分布
|
||||
SELECT cast( FLOOR( id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM wait_fetch_song
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
-- 查看需要爬取的 album 的分布
|
||||
SELECT cast( FLOOR( id / 1000000 ) * 1000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM wait_fetch_album
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
-- 查看需要爬取的 artist 的分布
|
||||
SELECT cast( FLOOR(id / 100000 ) * 100000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM wait_fetch_artist
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
-- 查看需要爬取的 comment 的分布
|
||||
SELECT cast( FLOOR( song_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM comment_progress
|
||||
WHERE current_status != 2
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
-- 查看需要爬取的 lyric 的分布
|
||||
SELECT cast( FLOOR( id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM wait_fetch_lyric
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
|
||||
|
||||
|
||||
-- 查看本地已有 song 的分布
|
||||
SELECT cast( FLOOR( song_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM song
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
-- 查看本地已有 user 的分布
|
||||
SELECT cast( FLOOR( user_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM user
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
-- 查看本地已有 album 的分布
|
||||
SELECT cast( FLOOR( album_id / 1000000 ) * 1000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM album
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
-- 查看本地已有 artist 的分布
|
||||
SELECT cast( FLOOR( artist_id / 2000000 ) * 2000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM artist
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
-- 查看本地已有 playlist 的分布
|
||||
SELECT cast( FLOOR( playlist_id / 2000000 ) * 2000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM playlist
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
|
||||
|
||||
-- 查询单个数据库里面各个表所占磁盘空间大小包括其索引的大小
|
||||
SELECT
|
||||
table_schema AS '数据库',
|
||||
table_name AS '表名',
|
||||
table_rows AS '记录数',
|
||||
TRUNCATE (data_length / 1024 / 1024, 2) AS '数据容量(MB)',
|
||||
TRUNCATE (index_length / 1024 / 1024, 2) AS '索引容量(MB)',
|
||||
TRUNCATE ((data_length + index_length) / 1024 / 1024 / 1024, 2) AS '总容量(GB)'
|
||||
FROM
|
||||
information_schema.TABLES
|
||||
WHERE
|
||||
table_schema = 'neteasemusic'
|
||||
ORDER BY
|
||||
table_rows DESC;
|
||||
-- 更新统计数据
|
||||
-- songCount 容易超时,有几张表查询时容易发生死锁,所以请在没有爬取时进行统计
|
||||
-- 4G: 4294967296 (4 * 1024 * 1024 * 1024) 64M: 67108864
|
||||
-- my.ini 配置文件中设置 innodb_buffer_pool_size=4G
|
||||
show variables like "%innodb_buffer_pool_size%";
|
||||
DELETE FROM analysis WHERE `key` LIKE '%_old';
|
||||
UPDATE analysis SET `key`=concat(`key`,'_old'), modify_time=modify_time WHERE `key` NOT LIKE '%_old';
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('songCount', (SELECT count(*) as count FROM song) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('songWaiting', (SELECT count(*) as count FROM wait_fetch_song) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('playlistCount', (SELECT count(*) AS count FROM playlist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('albumCount', (SELECT count(*) as count FROM album) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('albumWaiting', (SELECT count(*) as count FROM wait_fetch_album) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('artistCount', (SELECT count(*) AS count FROM artist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('artistWaiting', (SELECT count(*) as count FROM wait_fetch_artist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('lyricCount', (SELECT count(*) AS count FROM lyric) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('commentCount', (SELECT count( DISTINCT song_id ) AS count FROM comment) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('commentTotalCount', (SELECT count(*) AS count FROM comment) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('userCount', (SELECT count(*) AS count FROM user) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('songPlaylistCount', (SELECT count(*) AS count FROM song_playlist_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('songAlbumCount', (SELECT count(*) AS count FROM song_album_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
INSERT INTO analysis (`key`, `value`) VALUES ('songArtistCount', (SELECT count(*) AS count FROM song_artist_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
|
||||
|
||||
|
||||
|
||||
-- 更新后初次全表扫描
|
||||
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
|
||||
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song WHERE create_time > '2022-10-28 00:00:00';
|
||||
|
||||
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song WHERE create_time > '2022-10-28 00:00:00';
|
||||
|
||||
INSERT IGNORE INTO wait_check_artist (id) SELECT artist_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
|
||||
INSERT IGNORE INTO wait_check_album (id) SELECT album_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
|
||||
|
||||
|
||||
|
||||
-- 全量更新
|
||||
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_artist_relation WHERE song_id NOT IN ( SELECT song_id FROM song );
|
||||
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_album_relation WHERE song_id NOT IN ( SELECT song_id FROM song );
|
||||
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_playlist_relation WHERE song_id NOT IN ( SELECT song_id FROM song );
|
||||
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM lyric );
|
||||
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM comment_progress );
|
||||
INSERT IGNORE INTO wait_check_artist (id) SELECT artist_id FROM song_artist_relation WHERE artist_id NOT IN ( SELECT artist_id FROM artist );
|
||||
INSERT IGNORE INTO wait_check_album (id) SELECT album_id FROM song_album_relation WHERE album_id NOT IN ( SELECT album_id FROM album );
|
||||
|
||||
|
||||
|
||||
-- 查看需要爬取的 song 的分布
|
||||
SELECT cast( FLOOR( id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM wait_fetch_song
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
-- 查看需要爬取的 album 的分布
|
||||
SELECT cast( FLOOR( id / 1000000 ) * 1000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM wait_fetch_album
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
-- 查看需要爬取的 artist 的分布
|
||||
SELECT cast( FLOOR(id / 100000 ) * 100000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM wait_fetch_artist
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
-- 查看需要爬取的 comment 的分布
|
||||
SELECT cast( FLOOR( song_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM comment_progress
|
||||
WHERE current_status != 2
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
-- 查看需要爬取的 lyric 的分布
|
||||
SELECT cast( FLOOR( id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM wait_fetch_lyric
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
|
||||
|
||||
|
||||
-- 查看本地已有 song 的分布
|
||||
SELECT cast( FLOOR( song_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM song
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
-- 查看本地已有 user 的分布
|
||||
SELECT cast( FLOOR( user_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM user
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
-- 查看本地已有 album 的分布
|
||||
SELECT cast( FLOOR( album_id / 1000000 ) * 1000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM album
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
-- 查看本地已有 artist 的分布
|
||||
SELECT cast( FLOOR( artist_id / 2000000 ) * 2000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM artist
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
-- 查看本地已有 playlist 的分布
|
||||
SELECT cast( FLOOR( playlist_id / 2000000 ) * 2000000 as UNSIGNED ) as s, count(*) as count
|
||||
FROM playlist
|
||||
GROUP BY s
|
||||
ORDER BY s DESC;
|
||||
|
||||
|
||||
|
||||
-- 查询单个数据库里面各个表所占磁盘空间大小包括其索引的大小
|
||||
SELECT
|
||||
table_schema AS '数据库',
|
||||
table_name AS '表名',
|
||||
table_rows AS '记录数',
|
||||
TRUNCATE (data_length / 1024 / 1024, 2) AS '数据容量(MB)',
|
||||
TRUNCATE (index_length / 1024 / 1024, 2) AS '索引容量(MB)',
|
||||
TRUNCATE ((data_length + index_length) / 1024 / 1024 / 1024, 2) AS '总容量(GB)'
|
||||
FROM
|
||||
information_schema.TABLES
|
||||
WHERE
|
||||
table_schema = 'neteasemusic'
|
||||
ORDER BY
|
||||
table_rows DESC;
|
||||
|
||||
|
||||
|
||||
-- 统计等待爬取的数据条数 2023.12.25
|
||||
SELECT 'comment' as wait_fetch, count(*) as `count` FROM `comment_progress` where current_status = 0
|
||||
UNION ALL
|
||||
SELECT 'album', count(*) FROM `wait_fetch_album`
|
||||
UNION ALL
|
||||
SELECT 'artist', count(*) FROM `wait_fetch_artist`
|
||||
UNION ALL
|
||||
SELECT 'lyric', count(*) FROM `wait_fetch_lyric`
|
@ -1,74 +1,79 @@
|
||||
windows服务器
|
||||
cd C:\Users\Administrator\Desktop\tools\netease_music
|
||||
|
||||
linux服务器
|
||||
cd /www/neteasemusic/tools
|
||||
|
||||
|
||||
本地库测试
|
||||
node index --database neteasemusic_develop --utils song
|
||||
node index --database neteasemusic_develop --utils album --min 10000000
|
||||
node index --database neteasemusic_develop --utils album --order desc
|
||||
node index --database neteasemusic_develop --utils artist
|
||||
node index --database neteasemusic_develop --utils playlist
|
||||
node index --database neteasemusic_develop --utils comment --limit 10000
|
||||
node index --database neteasemusic_develop --utils lyric
|
||||
node index --database neteasemusic_develop --utils assistant
|
||||
|
||||
|
||||
|
||||
思路:
|
||||
通过一首歌,查出对应的artist和album,然后顺藤摸瓜查出网易云的其他song, album, artist, lyric, comment等
|
||||
|
||||
插入rel表的时候同时插入 wait_check_xx 表,然后后续检查这个表,如果不存在,那么就插入对应的 wait_fetch_xxx 表
|
||||
之后查出 wait_fetch_xxx 表,进行数据拉取,形成闭环
|
||||
|
||||
|
||||
|
||||
后期:
|
||||
歌单定时更新(rel表中添加一个del字段,先将歌单下面的全部置为删除状态,再插入的时候把已有歌曲的标记重新修改为正常状态)
|
||||
|
||||
评论的更新
|
||||
|
||||
被删除的artist和album回头再通过其他表中的数据反查回来
|
||||
|
||||
歌曲目前爬取之后,会有一部分没有image封面,还是需要用旧方法爬取到
|
||||
|
||||
|
||||
|
||||
说明:
|
||||
song表中data_version=1的音乐是第一次爬取的时候存在,但是后面再爬取时不存在的音乐
|
||||
|
||||
|
||||
|
||||
后续分区(不能在现有表上修改,只能重新查出数据到新表)
|
||||
alter table song add partition (
|
||||
PARTITION p1 VALUES LESS THAN ( 50000000),
|
||||
PARTITION p2 VALUES LESS THAN (1000000000),
|
||||
PARTITION p3 VALUES LESS THAN (1500000000),
|
||||
PARTITION p4 VALUES LESS THAN (2000000000),
|
||||
PARTITION p5 VALUES LESS THAN MAXVALUE
|
||||
);
|
||||
|
||||
|
||||
|
||||
SQL文件说明
|
||||
sql/structure.sql 中的SQL为最简,不包含字段的编码集
|
||||
sql/neteasemusic.sql 中的SQL为数据库导出,包含字段的编码集
|
||||
项目数据库 CHARACTER SET 统一使用 'utf8mb4',COLLATE 统一使用 'utf8mb4_general_ci'
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# # 查看列表
|
||||
# screen -ls
|
||||
|
||||
# # 创建一个screen
|
||||
# screen + <Enter>
|
||||
|
||||
# # 切换到指定屏幕
|
||||
# screen -r <screen_id>
|
||||
|
||||
# # 切出屏幕
|
||||
windows服务器
|
||||
cd C:\Users\Administrator\Desktop\tools\netease_music
|
||||
|
||||
linux服务器
|
||||
cd /www/neteasemusic/tools
|
||||
|
||||
手机 Termux
|
||||
pkg update
|
||||
pkg install git
|
||||
pkg install nodejs
|
||||
|
||||
|
||||
本地库测试
|
||||
node index --database neteasemusic_develop --utils song
|
||||
node index --database neteasemusic_develop --utils album --min 10000000
|
||||
node index --database neteasemusic_develop --utils album --order desc
|
||||
node index --database neteasemusic_develop --utils artist
|
||||
node index --database neteasemusic_develop --utils playlist
|
||||
node index --database neteasemusic_develop --utils comment --limit 10000
|
||||
node index --database neteasemusic_develop --utils lyric
|
||||
node index --database neteasemusic_develop --utils assistant
|
||||
|
||||
|
||||
|
||||
思路:
|
||||
通过一首歌,查出对应的artist和album,然后顺藤摸瓜查出网易云的其他song, album, artist, lyric, comment等
|
||||
|
||||
插入rel表的时候同时插入 wait_check_xx 表,然后后续检查这个表,如果不存在,那么就插入对应的 wait_fetch_xxx 表
|
||||
之后查出 wait_fetch_xxx 表,进行数据拉取,形成闭环
|
||||
|
||||
|
||||
|
||||
后期:
|
||||
歌单定时更新(rel表中添加一个del字段,先将歌单下面的全部置为删除状态,再插入的时候把已有歌曲的标记重新修改为正常状态)
|
||||
|
||||
评论的更新
|
||||
|
||||
被删除的artist和album回头再通过其他表中的数据反查回来
|
||||
|
||||
歌曲目前爬取之后,会有一部分没有image封面,还是需要用旧方法爬取到
|
||||
|
||||
|
||||
|
||||
说明:
|
||||
song表中data_version=1的音乐是第一次爬取的时候存在,但是后面再爬取时不存在的音乐
|
||||
|
||||
|
||||
|
||||
后续分区(不能在现有表上修改,只能重新查出数据到新表)
|
||||
alter table song add partition (
|
||||
PARTITION p1 VALUES LESS THAN ( 50000000),
|
||||
PARTITION p2 VALUES LESS THAN (1000000000),
|
||||
PARTITION p3 VALUES LESS THAN (1500000000),
|
||||
PARTITION p4 VALUES LESS THAN (2000000000),
|
||||
PARTITION p5 VALUES LESS THAN MAXVALUE
|
||||
);
|
||||
|
||||
|
||||
|
||||
SQL文件说明
|
||||
sql/structure.sql 中的SQL为最简,不包含字段的编码集
|
||||
sql/neteasemusic.sql 中的SQL为数据库导出,包含字段的编码集
|
||||
项目数据库 CHARACTER SET 统一使用 'utf8mb4',COLLATE 统一使用 'utf8mb4_general_ci'
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# # 查看列表
|
||||
# screen -ls
|
||||
|
||||
# # 创建一个screen
|
||||
# screen + <Enter>
|
||||
|
||||
# # 切换到指定屏幕
|
||||
# screen -r <screen_id>
|
||||
|
||||
# # 切出屏幕
|
||||
# Ctrl + A D
|
@ -1,40 +1,40 @@
|
||||
|
||||
// const mysql = require('mysql');
|
||||
// await new Promise(function (resolve, reject) {
|
||||
// //通过MySQL中方法创建连接对象
|
||||
// var connection = mysql.createConnection({
|
||||
// "charset": "utf8mb4",
|
||||
// "host": "localhost",
|
||||
// "user": "root",
|
||||
// "password": "123456",
|
||||
// "port": 3306,
|
||||
// "database": ""
|
||||
// });
|
||||
// //开始连接
|
||||
// connection.connect();
|
||||
// var sql = `
|
||||
// INSERT INTO comment ( comment_id, parent_comment_id, user_id, song_id, content, time, like_count, comment_type ) VALUES ?
|
||||
// ON DUPLICATE KEY UPDATE content = VALUES(content), like_count = VALUES(like_count), comment_type = GREATEST(comment_type, VALUES(comment_type)), modify_time = CURRENT_TIMESTAMP
|
||||
// `;
|
||||
// var params = commentInfoList.map(commentInfo => [
|
||||
// commentInfo.comment_id,
|
||||
// commentInfo.parent_comment_id,
|
||||
// commentInfo.user_id,
|
||||
// commentInfo.song_id,
|
||||
// commentInfo.content,
|
||||
// commentInfo.time,
|
||||
// commentInfo.like_count,
|
||||
// commentInfo.comment_type
|
||||
// ]);
|
||||
// var formattedSql = connection.format(sql, [params]); // 返回一个格式化后的SQL字符串
|
||||
// console.log(params); // 打印参数列表
|
||||
// console.log(formattedSql); // 打印原始SQL语句
|
||||
// //最后需要关闭连接
|
||||
// connection.end();
|
||||
// });
|
||||
// process.exit(0);
|
||||
|
||||
|
||||
// node index --utils comment --min 1935500000 --max 1935550000 --limit 10
|
||||
|
||||
|
||||
|
||||
// const mysql = require('mysql');
|
||||
// await new Promise(function (resolve, reject) {
|
||||
// //通过MySQL中方法创建连接对象
|
||||
// var connection = mysql.createConnection({
|
||||
// "charset": "utf8mb4",
|
||||
// "host": "localhost",
|
||||
// "user": "root",
|
||||
// "password": "123456",
|
||||
// "port": 3306,
|
||||
// "database": ""
|
||||
// });
|
||||
// //开始连接
|
||||
// connection.connect();
|
||||
// var sql = `
|
||||
// INSERT INTO comment ( comment_id, parent_comment_id, user_id, song_id, content, time, like_count, comment_type ) VALUES ?
|
||||
// ON DUPLICATE KEY UPDATE content = VALUES(content), like_count = VALUES(like_count), comment_type = GREATEST(comment_type, VALUES(comment_type)), modify_time = CURRENT_TIMESTAMP
|
||||
// `;
|
||||
// var params = commentInfoList.map(commentInfo => [
|
||||
// commentInfo.comment_id,
|
||||
// commentInfo.parent_comment_id,
|
||||
// commentInfo.user_id,
|
||||
// commentInfo.song_id,
|
||||
// commentInfo.content,
|
||||
// commentInfo.time,
|
||||
// commentInfo.like_count,
|
||||
// commentInfo.comment_type
|
||||
// ]);
|
||||
// var formattedSql = connection.format(sql, [params]); // 返回一个格式化后的SQL字符串
|
||||
// console.log(params); // 打印参数列表
|
||||
// console.log(formattedSql); // 打印原始SQL语句
|
||||
// //最后需要关闭连接
|
||||
// connection.end();
|
||||
// });
|
||||
// process.exit(0);
|
||||
|
||||
|
||||
// node index --utils comment --min 1935500000 --max 1935550000 --limit 10
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user