1
0
Code Issues Pull Requests Projects Releases Wiki Activity GitHub Gitee

通过命令行指定爬取参数,不用再修改代码了

This commit is contained in:
2022-10-06 21:06:09 +08:00
parent 9db9383934
commit be2658375c
11 changed files with 202 additions and 48 deletions

View File

@@ -23,19 +23,33 @@ async function getFromDatabase({ albumId }) {
// 正常应该查不出记录才对
/*
SELECT * FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:[:space:]*?。,更多.*$'
SELECT * FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:([:space:]*?|[ ]*?)。,更多.*$'
*/
async function fetchAll({ isUpdate = false }) {
console.log("start fetching albums ...")
var albumIds = await dbUtils.query(isUpdate
? `SELECT DISTINCT album_id FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:[:space:]*?。,更多.*$'`
: `SELECT DISTINCT album_id FROM song_album_relation WHERE album_id NOT IN ( SELECT album_id FROM album )`, []);
async function fetchAll({ args = {}, isUpdate = false }) {
console.log("start fetching albums ...");
if (isUpdate) {
var sql = `SELECT DISTINCT album_id FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:[:space:]*?。,更多.*$'`;
} else {
let whereClause = [
args.min ? `album_id > ${args.min}` : '1=1',
args.max ? `album_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql = `
SELECT DISTINCT album_id FROM song_album_relation WHERE ${whereClause} AND album_id NOT IN ( SELECT album_id FROM album )
${args.order ? `ORDER BY album_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
console.log(sql);
}
var albumIds = await dbUtils.query(sql, []);
albumIds = albumIds.map(item => item.album_id);
for (let i = 0; i < albumIds.length; i++) {
await global.checkIsExit();
const albumId = albumIds[i];
console.log(`${i + 1}/${albumIds.length} | album: ${albumId}`);
console.log(`${i + 1}/${albumIds.length} | album: ${albumId} | ${args.min ?? "?"}-${args.max ?? "?"}`);
try {
await fetch({ albumId: albumId, update: isUpdate });
} catch (err) {

View File

@@ -22,16 +22,25 @@ async function getFromDatabase({ artistId }) {
}
// 从数据库中查出还缺少的歌手,并进行爬取
async function fetchAll() {
console.log("start fetching artists ...")
var artistIds = await dbUtils.query(`
SELECT DISTINCT artist_id FROM song_artist_relation WHERE artist_id NOT IN ( SELECT artist_id FROM artist )
`, []);
async function fetchAll({ args = {} }) {
console.log("start fetching artists ...");
let whereClause = [
args.min ? `artist_id > ${args.min}` : '1=1',
args.max ? `artist_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql = `
SELECT DISTINCT artist_id FROM song_artist_relation WHERE ${whereClause} AND artist_id NOT IN ( SELECT artist_id FROM artist )
${args.order ? `ORDER BY artist_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
console.log(sql);
var artistIds = await dbUtils.query(sql, []);
artistIds = artistIds.map(item => item.artist_id);
for (let i = 0; i < artistIds.length; i++) {
await global.checkIsExit();
const artistId = artistIds[i];
console.log(`${i + 1}/${artistIds.length} | artist: ${artistId}`);
console.log(`${i + 1}/${artistIds.length} | artist: ${artistId} | ${args.min ?? "?"}-${args.max ?? "?"}`);
try {
await fetch({ artistId: artistId });
} catch (err) {

View File

@@ -11,28 +11,33 @@ const dbUtils = global.dbUtils;
// https://github.com/Binaryify/NeteaseCloudMusicApi
const { comment_music } = require('NeteaseCloudMusicApi');
async function fetchAll() {
console.log("start fetching comment ...")
async function fetchAll({ args = {} }) {
console.log("start fetching comment ...");
// 首先将需要爬取的song_id导入comment_progress表
await dbUtils.query(`
INSERT INTO comment_progress ( song_id )
INSERT IGNORE INTO comment_progress ( song_id )
SELECT DISTINCT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM comment_progress )
`, []);
let whereClause = [
args.min ? `song_id > ${args.min}` : '1=1',
args.max ? `song_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql = `
SELECT song_id FROM comment_progress WHERE ${whereClause} AND current_status != 2
ORDER BY current_status DESC${args.order ? `, song_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
console.log(sql);
// 首先查询有无正在爬取中的记录
var songIds = await dbUtils.query(`
-- 本机
SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id <= 30000000 LIMIT 1000
-- SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id < 30000000 ORDER BY current_status DESC
-- 服务器
-- SELECT song_id FROM comment_progress WHERE current_status != 2 AND song_id > 30000000 ORDER BY current_status DESC
`, []);
var songIds = await dbUtils.query(sql, []);
songIds = songIds.map(item => item.song_id);
for (let i = 0; i < songIds.length; i++) {
await global.checkIsExit();
const songId = songIds[i];
console.log(`${i + 1}/${songIds.length} | comment: ${songId}`);
console.log(`${i + 1}/${songIds.length} | comment: ${songId} | ${args.min ?? "?"}-${args.max ?? "?"}`);
try {
await fetch({ songId: songId });
} catch (err) {

View File

@@ -7,11 +7,20 @@ const sleepUtils = require('../../../utils/sleepUtils');
const dbUtils = global.dbUtils;
// 从数据库中查出还缺少的歌词,并进行爬取
async function fetchAll() {
async function fetchAll({ args = {} }) {
console.log("start fetching lyrics ...");
var songIds = await dbUtils.query(`
SELECT DISTINCT song_id FROM song WHERE song_id NOT IN ( SELECT song_id FROM lyric )
`, []);
let whereClause = [
args.min ? `song_id > ${args.min}` : '1=1',
args.max ? `song_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql = `
SELECT DISTINCT song_id FROM song WHERE ${whereClause} AND song_id NOT IN ( SELECT song_id FROM lyric )
${args.order ? `ORDER BY song_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
console.log(sql);
var songIds = await dbUtils.query(sql, []);
songIds = songIds.map(song => song.song_id);
for (let i = 0; i < songIds.length; i++) {
await global.checkIsExit();

View File

@@ -24,18 +24,27 @@ async function getFromDatabase({ songId }) {
}
// 从数据库中查出还缺少的歌曲,并进行爬取
async function fetchAll() {
async function fetchAll({ args = {} }) {
console.log("start fetching songs ...");
var songIds = await dbUtils.query(`
SELECT DISTINCT song_id FROM song_artist_relation WHERE song_id NOT IN ( SELECT song_id FROM song )
let whereClause = [
args.min ? `song_id > ${args.min}` : '1=1',
args.max ? `song_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql = `
SELECT DISTINCT song_id FROM song_artist_relation WHERE ${whereClause} AND song_id NOT IN ( SELECT song_id FROM song )
UNION
SELECT DISTINCT song_id FROM song_album_relation WHERE song_id NOT IN ( SELECT song_id FROM song )
`, []);
SELECT DISTINCT song_id FROM song_album_relation WHERE ${whereClause} AND song_id NOT IN ( SELECT song_id FROM song )
${args.order ? `ORDER BY song_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
console.log(sql);
var songIds = await dbUtils.query(sql, []);
songIds = songIds.map(item => item.song_id);
for (let i = 0; i < songIds.length; i++) {
await global.checkIsExit();
const songId = songIds[i];
console.log(`${i + 1}/${songIds.length} | song: ${songId}`);
console.log(`${i + 1}/${songIds.length} | song: ${songId} | ${args.min ?? "?"}-${args.max ?? "?"}`);
try {
await fetch({ songId: songId });
} catch (err) {