1
0
Code Issues Pull Requests Projects Releases Wiki Activity GitHub Gitee

插入关联表时同时插入 wait_check 表;将查询待爬取 id 的代码统一移到 dataManager.js

This commit is contained in:
2022-10-25 19:36:05 +08:00
parent 4753fd55ae
commit 3660fefda4
9 changed files with 192 additions and 119 deletions

View File

@@ -29,28 +29,7 @@ SELECT * FROM album WHERE (full_description = '' or full_description is null) an
async function fetchAll({ args = {}, isUpdate = false }) {
console.log("start fetching albums ...");
if (isUpdate) {
var sql = `
SELECT album_id FROM album WHERE (full_description = '' or full_description is null) and description like '%专辑《%》,简介:%' and description not regexp '^.*?专辑《.*?》,简介:[:space:]*?。,更多.*$'
`;
} else {
let whereClause = [
args.min ? `album_id > ${args.min}` : '1=1',
args.max ? `album_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql = `
-- 查出来通过代码去重,提高速度
SELECT album_id FROM song_album_relation WHERE ${whereClause} AND album_id NOT IN ( SELECT album_id FROM album )
${args.order ? `ORDER BY album_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
console.log(sql);
}
var albumIds = await dbUtils.query(sql, []);
albumIds = albumIds.map(item => item.album_id);
albumIds = Array.from(new Set(albumIds));
let albumIds = await dataManager.album.getIdsToFetch(args, isUpdate);
for (let i = 0; i < albumIds.length; i++) {
await global.checkIsExit();
const albumId = albumIds[i];
@@ -147,7 +126,7 @@ async function fetch({ albumId, debug = false, update = false }) {
let image = /<meta property="og:image" content="http:\/\/p.\.music\.126\.net\/(.*?)" \/>/.exec(html)[1];
let songListJSONString = /<textarea id="song-list-pre-data" style="display:none;">(.*?)<\/textarea>/.exec(html)[1];
let songList = JSON.parse(songListJSONString);
let songIds = songList.map(song => song.id);
let songIds = songList.map(song => Number(song.id));
let albumInfo = {
album_id: albumId,
@@ -160,8 +139,10 @@ async function fetch({ albumId, debug = false, update = false }) {
version: 1
};
// console.log("albumInfo", albumInfo);
await dataManager.wait_check.insert("song", songIds);
if (albumId > 0) {
let songAlbumRel = songIds.map(songId => [Number(songId), albumId]);
let songAlbumRel = songIds.map(songId => [songId, albumId]);
await dataManager.song_album.insertCollection(songAlbumRel);
}

View File

@@ -25,22 +25,7 @@ async function getFromDatabase({ artistId }) {
// 从数据库中查出还缺少的歌手,并进行爬取
async function fetchAll({ args = {} }) {
console.log("start fetching artists ...");
let whereClause = [
args.min ? `artist_id > ${args.min}` : '1=1',
args.max ? `artist_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql = `
-- 查出来通过代码去重,提高速度
-- SELECT DISTINCT artist_id FROM song_artist_relation WHERE ${whereClause} AND artist_id NOT IN ( SELECT artist_id FROM artist )
SELECT artist_id FROM song_artist_relation WHERE ${whereClause} AND artist_id NOT IN ( SELECT artist_id FROM artist )
${args.order ? `ORDER BY artist_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
console.log(sql);
var artistIds = await dbUtils.query(sql, []);
artistIds = artistIds.map(item => item.artist_id);
artistIds = Array.from(new Set(artistIds));
let artistIds = await dataManager.artist.getIdsToFetch(args);
for (let i = 0; i < artistIds.length; i++) {
await global.checkIsExit();
const artistId = artistIds[i];
@@ -97,7 +82,7 @@ async function fetch({ artistId, debug = false }) {
try {
let songListJSONString = /<textarea id="song-list-pre-data" style="display:none;">(.*?)<\/textarea>/.exec(html)[1];
let songList = JSON.parse(songListJSONString);
songIds = songList.map(song => song.id);
songIds = songList.map(song => Number(song.id));
} catch (error) {
// 可能是歌手下面没有音乐 例如https://music.163.com/#/artist?id=30032762
}
@@ -111,8 +96,9 @@ async function fetch({ artistId, debug = false }) {
};
// console.log("artistInfo", artistInfo);
await dataManager.wait_check.insert("song", songIds);
if (artistId > 0) {
let songArtistRel = songIds.map(songId => [Number(songId), artistId]);
let songArtistRel = songIds.map(songId => [songId, artistId]);
await dataManager.song_artist.insertCollection(songArtistRel);
}

View File

@@ -173,6 +173,7 @@ async function fetch({ playlistId, debug = false }) {
process.exit(0);
}
await dataManager.wait_check.insert("song", playlist.trackIds.map(track => track.id));
let trackIds = playlist.trackIds.map(track => [track.id, playlist.id, track.alg, track.rcmdReason]);
await dataManager.song_playlist.insertCollection(trackIds);
await dataManager.playlist.insertCollection(playlistInfo);

View File

@@ -1,6 +1,7 @@
const fs = require('fs');
const path = require('path');
const requestUtils = require('../../../utils/requestUtils');
const sleepUtils = require('../../../utils/sleepUtils');
const dataManager = require('../dataManager');
@@ -11,35 +12,7 @@ const { song_detail } = require('NeteaseCloudMusicApi');
// 从数据库中查出还缺少的歌曲,并进行爬取
async function fetchAll({ args = {} }) {
console.log("start fetching songs ...");
let whereClause = [
args.min ? `song_id > ${args.min}` : '1=1',
args.max ? `song_id <= ${args.max}` : '1=1',
].join(' AND ');
var sql1 = `
SELECT song_id FROM song_artist_relation WHERE ${whereClause} AND song_id NOT IN ( SELECT song_id FROM song )
${args.order ? `ORDER BY song_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
var sql2 = `
SELECT song_id FROM song_album_relation WHERE ${whereClause} AND song_id NOT IN ( SELECT song_id FROM song )
${args.order ? `ORDER BY song_id ${args.order}` : ''}
${args.limit ? `LIMIT ${args.limit}` : ''}
`;
// // 更新现有数据
// sql = `SELECT song_id FROM song WHERE data_version = 1`;
// 测试用
// sql = `SELECT song_id FROM song_artist_relation group by song_id limit 10`;
console.log(sql1);
var songIds1 = await dbUtils.query(sql1, []);
songIds1 = songIds1.map(item => item.song_id);
console.log(sql2);
var songIds2 = await dbUtils.query(sql2, []);
songIds2 = songIds2.map(item => item.song_id);
var songIds = songIds1.concat(songIds2);
songIds = Array.from(new Set(songIds)); // 去重
let songIds = await dataManager.song.getIdsToFetch(args);
// 0 - 100, 200 - 399, 400 - ..., ... - songIds.length-1
// 0 1 2 count-1
var step = 1000;
@@ -71,9 +44,14 @@ async function fetch({ songIdArray, debug = false }) {
}
// console.log(songResult.body.songs.map(item => JSON.stringify(item)));
let albumIds = [], artistIds = [];
let songAlbumRel = [], songArtistRel = [];
let songInfoList = songResult.body.songs.map(song => {
song.ar.forEach(item => songArtistRel.push([song.id, item.id]));
song.ar.forEach(item => {
artistIds.push(item.id);
songArtistRel.push([song.id, item.id])
});
albumIds.push(song.al.id || 0);
songAlbumRel.push([song.id, song.al.id || 0])
return {
title: song.name, // 歌曲标题
@@ -107,6 +85,8 @@ async function fetch({ songIdArray, debug = false }) {
if (songInfoList.length == 0) return;
console.log("插入数据库");
await dataManager.wait_check.insert("album", albumIds);
await dataManager.wait_check.insert("artist", artistIds);
await dataManager.song_album.insertCollection(songAlbumRel);
await dataManager.song_artist.insertCollection(songArtistRel);
await dataManager.song.insertCollection(songInfoList); // image 因为接口没有返回,所以不更新