1
0
Code Issues Pull Requests Projects Releases Wiki Activity GitHub Gitee

更新 netease_music 脚本

This commit is contained in:
2023-12-25 16:46:39 +08:00
parent e3ea397f03
commit fc46e76d74
10 changed files with 412 additions and 371 deletions

View File

@@ -0,0 +1 @@
comment id segment.txt

View File

@@ -0,0 +1 @@
start cmd

View File

@@ -0,0 +1,9 @@
-- Move the contents of `comment` and `user` into the `*_origin_` tables.
-- Step 1: copy every row. REPLACE INTO overwrites a target row that has the
-- same primary/unique key; SELECT * requires both tables to share a column layout.
REPLACE INTO `comment_origin_` SELECT * FROM `comment`; -- 80.1G (presumably the table size at the time; TODO confirm)
REPLACE INTO `user_origin_` SELECT * FROM `user`; -- 5.20G (presumably the table size at the time; TODO confirm)
-- Step 2: delete from the source tables only the rows that now have a match
-- (by id) in the corresponding `*_origin_` table.
DELETE `comment`
FROM `comment` INNER JOIN `comment_origin_` ON `comment`.comment_id = `comment_origin_`.comment_id;
DELETE `user`
FROM `user` INNER JOIN `user_origin_` ON `user`.user_id = `user_origin_`.user_id;

View File

@@ -0,0 +1,13 @@
# Environment setup steps (the original note points at https://shell.aliyun.com/).
# https://shell.aliyun.com/
# Fetch the project and enter the checkout.
git clone https://git.only4.work/coder-xiaomo/tools
cd tools
# Write the MySQL connection settings read by the scripts.
# NOTE(review): this stores the database host, root user and password in plain
# text in a committed file; consider moving them to a secret/env variable.
echo '{"mysql":{"charset":"utf8mb4","host":"124.220.172.110","user":"root","password":"123456","port":5204,"database":"","connectTimeout": 3600000,"acquireTimeout": 3600000,"timeout": 3600000}}' > config.json
# Use the npmmirror registry for package downloads.
npm config set registry https://registry.npmmirror.com
# Print both settings back to verify them before installing.
cat config.json
npm config get registry
# Install the dependencies.
npm i
# cd netease_music
# Absolute path variant; assumes the repository was cloned into the home directory.
cd ~/tools/netease_music/

View File

@@ -0,0 +1,148 @@
# 本地
# cd ./netease_music
cd tools/netease_music/
# 【ING】
start cmd /k "node index --utils assistant"
start cmd /k "node index --utils song"
start cmd /k "node index --utils artist --limit 50000"
start cmd /k "node index --utils album --limit 50000"
start cmd /k "node index --utils lyric --limit 10000"
# start cmd /k "node index --utils comment --limit 10000"
start cmd /k "node index --utils playlist"
# lyric_5
# 【ING】
# node index --utils lyric --min 2100000000 --limit 10000
node index --utils lyric --min 2090000000 --max 2100000000 --limit 10000
node index --utils lyric --min 2080000000 --max 2090000000 --limit 10000
node index --utils lyric --min 2070000000 --max 2080000000 --limit 10000
node index --utils lyric --min 2000000000 --max 2070000000 --limit 10000
node index --utils lyric --min 1970000000 --max 2000000000 --limit 10000
node index --utils lyric --min 1960000000 --max 1970000000 --limit 10000
node index --utils lyric --min 1950000000 --max 1960000000 --limit 10000
node index --utils lyric --min 1940000000 --max 1950000000 --limit 10000
# lyric_4
node index --utils lyric --min 1930000000 --max 1940000000 --limit 10000
node index --utils lyric --min 1920000000 --max 1930000000 --limit 10000
node index --utils lyric --min 1910000000 --max 1920000000 --limit 10000
node index --utils lyric --min 1900000000 --max 1910000000 --limit 10000
node index --utils lyric --min 1890000000 --max 1900000000 --limit 10000
node index --utils lyric --min 1880000000 --max 1890000000 --limit 10000
node index --utils lyric --min 1870000000 --max 1880000000 --limit 10000
node index --utils lyric --min 1860000000 --max 1870000000 --limit 10000
node index --utils lyric --min 1850000000 --max 1860000000 --limit 10000
node index --utils lyric --min 1840000000 --max 1850000000 --limit 10000
# lyric_3
node index --utils lyric --min 1830000000 --max 1840000000 --limit 10000
node index --utils lyric --min 1820000000 --max 1830000000 --limit 10000
node index --utils lyric --min 1810000000 --max 1820000000 --limit 10000
node index --utils lyric --min 1800000000 --max 1810000000 --limit 10000
node index --utils lyric --min 1500000000 --max 1800000000 --limit 10000
node index --utils lyric --min 1490000000 --max 1500000000 --limit 10000
node index --utils lyric --min 1480000000 --max 1490000000 --limit 10000
node index --utils lyric --min 1470000000 --max 1480000000 --limit 10000
node index --utils lyric --min 1460000000 --max 1470000000 --limit 10000
node index --utils lyric --min 1450000000 --max 1460000000 --limit 10000
# lyric_2
# 【ING】
node index --utils lyric --min 1440000000 --max 1450000000 --limit 10000
node index --utils lyric --min 1430000000 --max 1440000000 --limit 10000
node index --utils lyric --min 1420000000 --max 1430000000 --limit 10000
node index --utils lyric --min 1410000000 --max 1420000000 --limit 10000
node index --utils lyric --min 1400000000 --max 1410000000 --limit 10000
node index --utils lyric --min 1390000000 --max 1400000000 --limit 10000
node index --utils lyric --min 1380000000 --max 1390000000 --limit 10000
node index --utils lyric --min 1370000000 --max 1380000000 --limit 10000
node index --utils lyric --min 1360000000 --max 1370000000 --limit 10000
node index --utils lyric --min 1350000000 --max 1360000000 --limit 10000
# lyric_1
# 【ING】
node index --utils lyric --min 1340000000 --max 1350000000 --limit 10000
node index --utils lyric --min 1330000000 --max 1340000000 --limit 10000
node index --utils lyric --min 1320000000 --max 1330000000 --limit 10000
node index --utils lyric --min 1310000000 --max 1320000000 --limit 10000
node index --utils lyric --min 1300000000 --max 1310000000 --limit 10000
node index --utils lyric --min 570000000 --max 1300000000 --limit 10000
node index --utils lyric --min 560000000 --max 570000000 --limit 10000
node index --utils lyric --min 550000000 --max 560000000 --limit 10000
node index --utils lyric --min 540000000 --max 550000000 --limit 10000
node index --utils lyric --min 530000000 --max 540000000 --limit 10000
node index --utils lyric --max 530000000 --limit 10000
# ###################################################################################
# comment_1
# 【ING】
# node index --utils comment --min 0 --max 100000 --limit 10000
node index --utils comment --min 100000 --max 200000 --limit 10000
node index --utils comment --min 200000 --max 400000 --limit 10000
node index --utils comment --min 400000 --max 600000 --limit 10000
node index --utils comment --min 600000 --max 800000 --limit 10000
# node index --utils comment --min 900000 --max 1000000 --limit 10000
# comment_2
# 【ING】
node index --utils comment --min 1000000 --max 1500000 --limit 10000
node index --utils comment --min 1500000 --max 2000000 --limit 10000
node index --utils comment --min 2000000 --max 2500000 --limit 10000
node index --utils comment --min 2500000 --max 3000000 --limit 10000
node index --utils comment --min 3000000 --max 3500000 --limit 10000
# comment_3 配置待更新
# 【阿里云ing】
node index --utils comment --min 3500000 --max 4000000 --limit 10000 &
node index --utils comment --min 4000000 --max 4500000 --limit 10000 &
node index --utils comment --min 4500000 --max 5000000 --limit 10000 &
node index --utils comment --min 5000000 --max 5500000 --limit 10000 &
# node index --utils comment --min 5500000 --max 6000000 --limit 10000
# comment_4 配置待更新
# comment_5 配置待更新
# 【公司电脑ing】
# node index --utils comment --min 6000000 --max 9000000 --limit 10000
node index --utils comment --min 9000000 --max 9500000 --limit 10000
# node index --utils comment --min 9500000 --max 10000000 --limit 10000
# comment_n
# 【公司电脑ing】
node index --utils comment --min 10000000 --max 20000000 --limit 10000
node index --utils comment --min 20000000 --max 30000000 --limit 10000
node index --utils comment --min 30000000 --max 40000000 --limit 10000
# node index --utils comment --min 40000000 --max 50000000 --limit 10000
node index --utils comment --min 50000000 --max 500000000 --limit 10000
# comment_2n_1 配置待更新
# 【公司电脑ing】
# node index --utils comment --min 1000000000 --max 1100000000 --limit 10000
# node index --utils comment --min 1100000000 --max 1200000000 --limit 10000
node index --utils comment --min 1200000000 --max 1300000000 --limit 10000
node index --utils comment --min 1300000000 --max 1400000000 --limit 10000
node index --utils comment --min 1400000000 --max 1500000000 --limit 10000
# comment_2n_2 配置待更新
# 【手机ing】
node index --utils comment --min 1500000000 --max 1600000000 --limit 10000 &
node index --utils comment --min 1600000000 --max 1700000000 --limit 10000 &
node index --utils comment --min 1700000000 --max 1800000000 --limit 10000 &
node index --utils comment --min 1800000000 --max 1900000000 --limit 10000 &
node index --utils comment --min 1900000000 --max 2000000000 --limit 10000 &
# comment_2n_3 配置待更新
# 【阿里云ing】
node index --utils comment --min 2000000000 --max 2100000000 --limit 10000 &
node index --utils comment --min 2100000000 --max 2200000000 --limit 10000 &
# node index --utils comment --min 2200000000 --max 2300000000 --limit 10000
# node index --utils comment --min 2300000000 --max 2400000000 --limit 10000
# node index --utils comment --min 2400000000 --max 2500000000 --limit 10000
# node index --utils comment --min 2500000000 --limit 10000
# # 待整理 2000000 - 1999000000
# start cmd /k "node index --utils comment --limit 10000 --min --max " #

View File

@@ -0,0 +1,103 @@
// Expands a list of segment start ids into (optionally finer) sub-segment
// starts and writes them, one per line, to 'comment id segment.txt'.
//
// -- Distribution of the comments that still have to be crawled
// SELECT cast( FLOOR( song_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
// FROM comment_progress
// WHERE current_status != 2
// GROUP BY s
// ORDER BY s DESC;

// Segment starts obtained by running the SQL above (one value per line, descending).
const segmentStartList = `2110000000
2100000000
2090000000
2080000000
2070000000
2060000000
2050000000
2040000000
2030000000
2020000000
2010000000
2000000000
1990000000
1980000000
1970000000
1960000000
1950000000
1940000000
1930000000
1920000000
1910000000
1900000000
1890000000
1880000000
1870000000
1860000000
1850000000
1840000000
1830000000
1820000000
1810000000
1800000000
1500000000
1490000000
1480000000
1470000000
1460000000
1450000000
1440000000
1430000000
1420000000
1410000000
1400000000
1390000000
1380000000
1370000000
1360000000
1350000000
1340000000
1330000000
1320000000
1310000000
1300000000
1290000000
860000000
570000000
560000000
550000000
540000000
530000000
520000000
510000000
500000000
490000000
480000000
470000000
460000000
450000000
440000000
430000000
420000000
410000000
400000000
390000000
30000000
20000000
10000000
0`

// Width of one segment; matches the 10000000 bucket used by the SQL above.
const SEGMENT_SIZE = 10000000
// Number of equal parts each segment is divided into (1 = keep segments as they are).
const splitCount = 1

/**
 * Turns a newline-separated list of segment starts into sub-segment starts.
 *
 * For every start `n` the result contains `n + (parts - 1) * step, ..., n + step, n`
 * (highest first), where `step = segmentSize / parts`. Blank lines and
 * surrounding whitespace (for example a trailing newline or '\r' from a
 * pasted query result) are ignored instead of being turned into a bogus `0`.
 *
 * @param {string} text - Segment starts, one per line.
 * @param {number} segmentSize - Width of one input segment.
 * @param {number} parts - Number of sub-segments per segment; positive integer.
 * @returns {number[]} Sub-segment starts in input order.
 * @throws {RangeError} If `parts` is not a positive integer.
 * @throws {TypeError} If a non-blank line is not a finite number.
 */
function expandSegmentStarts(text, segmentSize, parts) {
    if (!Number.isInteger(parts) || parts < 1) {
        throw new RangeError(`parts must be a positive integer, got ${parts}`)
    }
    const step = segmentSize / parts
    const starts = []
    for (const line of text.split('\n')) {
        const value = line.trim()
        if (value === '') {
            continue
        }
        const base = Number(value)
        if (!Number.isFinite(base)) {
            throw new TypeError(`invalid segment start: ${JSON.stringify(value)}`)
        }
        // Highest sub-segment first, so a descending input stays descending.
        for (let part = parts - 1; part >= 0; part--) {
            starts.push(base + part * step)
        }
    }
    return starts
}

const content = expandSegmentStarts(segmentStartList, SEGMENT_SIZE, splitCount).join('\n')
// console.log(content)
const fs = require('fs')
fs.writeFileSync('comment id segment.txt', content, 'utf-8')

View File

@@ -0,0 +1,147 @@
-- Refresh the statistics stored in the `analysis` table.
-- songCount tends to time out and several tables are prone to deadlocks when
-- queried, so run these statistics only while no crawler is running.
-- 4G: 4294967296 (4 * 1024 * 1024 * 1024) 64M: 67108864
-- Set innodb_buffer_pool_size=4G in the my.ini configuration file.
show variables like "%innodb_buffer_pool_size%";
-- Drop the previous snapshot (keys ending in '_old') ...
DELETE FROM analysis WHERE `key` LIKE '%_old';
-- ... and turn the current values into the new snapshot by appending '_old' to
-- their keys. modify_time is assigned to itself, presumably to keep an
-- auto-updating timestamp column unchanged (TODO confirm against the schema).
UPDATE analysis SET `key`=concat(`key`,'_old'), modify_time=modify_time WHERE `key` NOT LIKE '%_old';
-- Recompute each counter; an existing key has its value overwritten.
INSERT INTO analysis (`key`, `value`) VALUES ('songCount', (SELECT count(*) as count FROM song) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('songWaiting', (SELECT count(*) as count FROM wait_fetch_song) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('playlistCount', (SELECT count(*) AS count FROM playlist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('albumCount', (SELECT count(*) as count FROM album) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('albumWaiting', (SELECT count(*) as count FROM wait_fetch_album) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('artistCount', (SELECT count(*) AS count FROM artist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('artistWaiting', (SELECT count(*) as count FROM wait_fetch_artist) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('lyricCount', (SELECT count(*) AS count FROM lyric) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
-- commentCount = number of distinct songs that have comments; commentTotalCount = number of comment rows.
INSERT INTO analysis (`key`, `value`) VALUES ('commentCount', (SELECT count( DISTINCT song_id ) AS count FROM comment) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('commentTotalCount', (SELECT count(*) AS count FROM comment) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('userCount', (SELECT count(*) AS count FROM user) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('songPlaylistCount', (SELECT count(*) AS count FROM song_playlist_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('songAlbumCount', (SELECT count(*) AS count FROM song_album_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
INSERT INTO analysis (`key`, `value`) VALUES ('songArtistCount', (SELECT count(*) AS count FROM song_artist_relation) ) ON DUPLICATE KEY UPDATE `value` = VALUES(`value`);
-- First full-table scan after an update:
-- queue into the wait_check_* tables every id referenced by rows created after
-- 2022-10-28. INSERT IGNORE skips ids that would violate a unique key.
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song_playlist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_artist (id) SELECT artist_id FROM song_artist_relation WHERE create_time > '2022-10-28 00:00:00';
INSERT IGNORE INTO wait_check_album (id) SELECT album_id FROM song_album_relation WHERE create_time > '2022-10-28 00:00:00';
-- Full update:
-- queue every id that is referenced by a relation table (or by `song`) but has
-- no row yet in the corresponding entity table.
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_artist_relation WHERE song_id NOT IN ( SELECT song_id FROM song );
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_album_relation WHERE song_id NOT IN ( SELECT song_id FROM song );
INSERT IGNORE INTO wait_check_song (id) SELECT song_id FROM song_playlist_relation WHERE song_id NOT IN ( SELECT song_id FROM song );
INSERT IGNORE INTO wait_check_lyric (id) SELECT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM lyric );
INSERT IGNORE INTO wait_check_comment (id) SELECT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM comment_progress );
INSERT IGNORE INTO wait_check_artist (id) SELECT artist_id FROM song_artist_relation WHERE artist_id NOT IN ( SELECT artist_id FROM artist );
INSERT IGNORE INTO wait_check_album (id) SELECT album_id FROM song_album_relation WHERE album_id NOT IN ( SELECT album_id FROM album );
-- Each query below groups ids into fixed-width buckets (s = bucket start) and
-- counts the rows per bucket, highest bucket first.
-- Distribution of the songs waiting to be crawled (bucket width 10000000)
SELECT cast( FLOOR( id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM wait_fetch_song
GROUP BY s
ORDER BY s DESC;
-- Distribution of the albums waiting to be crawled (bucket width 1000000)
SELECT cast( FLOOR( id / 1000000 ) * 1000000 as UNSIGNED ) as s, count(*) as count
FROM wait_fetch_album
GROUP BY s
ORDER BY s DESC;
-- Distribution of the artists waiting to be crawled (bucket width 100000)
SELECT cast( FLOOR(id / 100000 ) * 100000 as UNSIGNED ) as s, count(*) as count
FROM wait_fetch_artist
GROUP BY s
ORDER BY s DESC;
-- Distribution of the comments waiting to be crawled, bucketed by song_id
-- (bucket width 10000000); rows with current_status = 2 are excluded.
SELECT cast( FLOOR( song_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM comment_progress
WHERE current_status != 2
GROUP BY s
ORDER BY s DESC;
-- Distribution of the lyrics waiting to be crawled (bucket width 10000000)
SELECT cast( FLOOR( id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM wait_fetch_lyric
GROUP BY s
ORDER BY s DESC;
-- Distribution of the songs already stored locally (bucket width 10000000)
SELECT cast( FLOOR( song_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM song
GROUP BY s
ORDER BY s DESC;
-- Distribution of the users already stored locally (bucket width 10000000)
SELECT cast( FLOOR( user_id / 10000000 ) * 10000000 as UNSIGNED ) as s, count(*) as count
FROM user
GROUP BY s
ORDER BY s DESC;
-- Distribution of the albums already stored locally (bucket width 1000000)
SELECT cast( FLOOR( album_id / 1000000 ) * 1000000 as UNSIGNED ) as s, count(*) as count
FROM album
GROUP BY s
ORDER BY s DESC;
-- Distribution of the artists already stored locally (bucket width 2000000)
SELECT cast( FLOOR( artist_id / 2000000 ) * 2000000 as UNSIGNED ) as s, count(*) as count
FROM artist
GROUP BY s
ORDER BY s DESC;
-- Distribution of the playlists already stored locally (bucket width 2000000)
SELECT cast( FLOOR( playlist_id / 2000000 ) * 2000000 as UNSIGNED ) as s, count(*) as count
FROM playlist
GROUP BY s
ORDER BY s DESC;
-- Disk space used by each table of one database ('neteasemusic'), including
-- the size of its indexes, ordered by row count (largest first).
-- Column aliases: 数据库 = database, 表名 = table name, 记录数 = row count,
-- 数据容量(MB) = data size in MB, 索引容量(MB) = index size in MB, 总容量(GB) = total size in GB.
-- Sizes are truncated (not rounded) to two decimals.
-- NOTE: for InnoDB tables table_rows is an estimate, not an exact count
-- (see the MySQL information_schema.TABLES documentation).
SELECT
table_schema AS '数据库',
table_name AS '表名',
table_rows AS '记录数',
TRUNCATE (data_length / 1024 / 1024, 2) AS '数据容量(MB)',
TRUNCATE (index_length / 1024 / 1024, 2) AS '索引容量(MB)',
TRUNCATE ((data_length + index_length) / 1024 / 1024 / 1024, 2) AS '总容量(GB)'
FROM
information_schema.TABLES
WHERE
table_schema = 'neteasemusic'
ORDER BY
table_rows DESC;
-- Count the rows still waiting to be crawled, one result row per data type (2023.12.25).
-- For comments only comment_progress rows with current_status = 0 are counted;
-- for the other types every row of the wait_fetch_* table is counted.
SELECT 'comment' as wait_fetch, count(*) as `count` FROM `comment_progress` where current_status = 0
UNION ALL
SELECT 'album', count(*) FROM `wait_fetch_album`
UNION ALL
SELECT 'artist', count(*) FROM `wait_fetch_artist`
UNION ALL
SELECT 'lyric', count(*) FROM `wait_fetch_lyric`

View File

@@ -0,0 +1,79 @@
windows服务器
cd C:\Users\Administrator\Desktop\tools\netease_music
linux服务器
cd /www/neteasemusic/tools
手机 Termux
pkg update
pkg install git
pkg install nodejs
本地库测试
node index --database neteasemusic_develop --utils song
node index --database neteasemusic_develop --utils album --min 10000000
node index --database neteasemusic_develop --utils album --order desc
node index --database neteasemusic_develop --utils artist
node index --database neteasemusic_develop --utils playlist
node index --database neteasemusic_develop --utils comment --limit 10000
node index --database neteasemusic_develop --utils lyric
node index --database neteasemusic_develop --utils assistant
思路:
通过一首歌查出对应的artist和album然后顺藤摸瓜查出网易云的其他song, album, artist, lyric, comment等
插入rel表的时候同时插入 wait_check_xx 表,然后后续检查这个表,如果不存在,那么就插入对应的 wait_fetch_xxx 表
之后查出 wait_fetch_xxx 表,进行数据拉取,形成闭环
后期:
歌单定时更新:rel表中添加一个del字段,先将歌单下面的全部置为删除状态,再插入的时候把已有歌曲的标记重新修改为正常状态
评论的更新
被删除的artist和album,回头再通过其他表中的数据反查回来
歌曲目前爬取之后会有一部分没有image封面,还是需要用旧方法爬取到
说明:
song表中data_version=1的音乐是第一次爬取的时候存在但是后面再爬取时不存在的音乐
后续分区(不能在现有表上修改,只能重新查出数据到新表)
alter table song add partition (
PARTITION p1 VALUES LESS THAN ( 50000000),
PARTITION p2 VALUES LESS THAN (1000000000),
PARTITION p3 VALUES LESS THAN (1500000000),
PARTITION p4 VALUES LESS THAN (2000000000),
PARTITION p5 VALUES LESS THAN MAXVALUE
);
SQL文件说明
sql/structure.sql 中的SQL为最简,不包含字段的编码集
sql/neteasemusic.sql 中的SQL为数据库导出,包含字段的编码集
项目数据库 CHARACTER SET 统一使用 'utf8mb4',COLLATE 统一使用 'utf8mb4_general_ci'
# # 查看列表
# screen -ls
# # 创建一个screen
# screen + <Enter>
# # 切换到指定屏幕
# screen -r <screen_id>
# # 切出屏幕
# Ctrl + A D

View File

@@ -0,0 +1,40 @@
// const mysql = require('mysql');
// await new Promise(function (resolve, reject) {
// //通过MySQL中方法创建连接对象
// var connection = mysql.createConnection({
// "charset": "utf8mb4",
// "host": "localhost",
// "user": "root",
// "password": "123456",
// "port": 3306,
// "database": ""
// });
// //开始连接
// connection.connect();
// var sql = `
// INSERT INTO comment ( comment_id, parent_comment_id, user_id, song_id, content, time, like_count, comment_type ) VALUES ?
// ON DUPLICATE KEY UPDATE content = VALUES(content), like_count = VALUES(like_count), comment_type = GREATEST(comment_type, VALUES(comment_type)), modify_time = CURRENT_TIMESTAMP
// `;
// var params = commentInfoList.map(commentInfo => [
// commentInfo.comment_id,
// commentInfo.parent_comment_id,
// commentInfo.user_id,
// commentInfo.song_id,
// commentInfo.content,
// commentInfo.time,
// commentInfo.like_count,
// commentInfo.comment_type
// ]);
// var formattedSql = connection.format(sql, [params]); // 返回一个格式化后的SQL字符串
// console.log(params); // 打印参数列表
// console.log(formattedSql); // 打印格式化后的SQL语句
// //最后需要关闭连接
// connection.end();
// });
// process.exit(0);
// node index --utils comment --min 1935500000 --max 1935550000 --limit 10