1
0
Code Issues Pull Requests Projects Releases Wiki Activity GitHub Gitee

通过命令行指定爬取参数,不用再修改代码了

This commit is contained in:
2022-10-06 21:06:09 +08:00
parent 9db9383934
commit be2658375c
11 changed files with 202 additions and 48 deletions

View File

@@ -1,13 +1,42 @@
-- Distribution of songs that still need to be crawled.
-- Candidate song ids are collected from both relation tables; UNION already
-- removes duplicates, so every missing song is counted exactly once.
-- Each id is bucketed to the NEAREST multiple of 10,000,000 (rounding, not
-- flooring). ROUND() is used instead of FORMAT(): FORMAT() returns a string
-- with thousands separators (e.g. '1,234'), which would be truncated at the
-- comma when implicitly converted back to a number for the multiplication.
-- NOT EXISTS is used instead of NOT IN so a NULL song.song_id cannot empty
-- the whole result.
SELECT
    CAST(ROUND(t_tmp.song_id / 10000000) * 10000000 AS UNSIGNED) AS s,
    COUNT(*) AS count
FROM (
    SELECT song_id FROM song_album_relation
    UNION
    SELECT song_id FROM song_artist_relation
) AS t_tmp
WHERE NOT EXISTS (
    SELECT 1
    FROM song
    WHERE song.song_id = t_tmp.song_id
)
GROUP BY s
ORDER BY s DESC;
-- Distribution of albums that still need to be crawled.
-- album_id is bucketed to the NEAREST multiple of 1,000,000. ROUND() replaces
-- FORMAT(), whose string result contains thousands separators that break the
-- implicit numeric conversion once the bucket index reaches 1000.
-- NOT EXISTS replaces NOT IN so a NULL album.album_id cannot empty the result.
-- NOTE(review): COUNT(*) counts song_album_relation rows, so an album that is
-- referenced by several songs is counted once per song — confirm whether
-- COUNT(DISTINCT album_id) is what is actually wanted here.
SELECT
    CAST(ROUND(rel.album_id / 1000000) * 1000000 AS UNSIGNED) AS s,
    COUNT(*) AS count
FROM song_album_relation AS rel
WHERE NOT EXISTS (
    SELECT 1
    FROM album
    WHERE album.album_id = rel.album_id
)
GROUP BY s
ORDER BY s DESC;
-- Distribution of artists that still need to be crawled.
-- artist_id is bucketed to the NEAREST multiple of 2,000,000. ROUND() replaces
-- FORMAT(), whose string result contains thousands separators that break the
-- implicit numeric conversion once the bucket index reaches 1000.
-- NOT EXISTS replaces NOT IN so a NULL artist.artist_id cannot empty the result.
-- NOTE(review): COUNT(*) counts song_artist_relation rows, so an artist that
-- is referenced by several songs is counted once per song — confirm whether
-- COUNT(DISTINCT artist_id) is what is actually wanted here.
SELECT
    CAST(ROUND(rel.artist_id / 2000000) * 2000000 AS UNSIGNED) AS s,
    COUNT(*) AS count
FROM song_artist_relation AS rel
WHERE NOT EXISTS (
    SELECT 1
    FROM artist
    WHERE artist.artist_id = rel.artist_id
)
GROUP BY s
ORDER BY s DESC;
-- Distribution of songs whose comments still need to be crawled.
-- Rows of comment_progress with current_status other than 2 are counted
-- (presumably 2 marks a finished crawl — TODO confirm against the crawler).
-- song_id is bucketed to the NEAREST multiple of 10,000,000. ROUND() replaces
-- FORMAT(), whose string result contains thousands separators that break the
-- implicit numeric conversion once the bucket index reaches 1000.
SELECT
    CAST(ROUND(song_id / 10000000) * 10000000 AS UNSIGNED) AS s,
    COUNT(*) AS count
FROM comment_progress
WHERE current_status <> 2
GROUP BY s
ORDER BY s DESC;
-- Distribution of songs whose lyrics still need to be crawled:
-- songs present in `song` that have no row in `lyric`.
-- song_id is bucketed to the NEAREST multiple of 10,000,000. ROUND() replaces
-- FORMAT(), whose string result contains thousands separators that break the
-- implicit numeric conversion once the bucket index reaches 1000.
-- NOT EXISTS replaces NOT IN so a NULL lyric.song_id cannot empty the result.
SELECT
    CAST(ROUND(song.song_id / 10000000) * 10000000 AS UNSIGNED) AS s,
    COUNT(*) AS count
FROM song
WHERE NOT EXISTS (
    SELECT 1
    FROM lyric
    WHERE lyric.song_id = song.song_id
)
GROUP BY s
ORDER BY s DESC;
-- Table maintenance: run OPTIMIZE TABLE on the album table.
OPTIMIZE TABLE album;