2022-09-30 08:06:14 +08:00
const fs = require ( 'fs' ) ;
const path = require ( 'path' ) ;
const dbUtils = require ( '../utils/dbUtils' ) ;
const requestUtils = require ( '../utils/requestUtils' ) ;
const sleepUtils = require ( '../utils/sleepUtils' ) ;
/**
 * Entry point of the music.163.com crawler.
 *
 * Selects the "neteaseMusic" database and then runs crawl rounds forever by
 * awaiting startGetMusic(100) in a loop. The loop has no exit condition of
 * its own, so the returned promise only settles by rejecting with whatever a
 * crawl round throws.
 *
 * @returns {Promise<void>} never resolves; rejects when a crawl round throws.
 */
async function main() {
    console.log("neteaseMusic Starting...");

    // Specify the database.
    dbUtils.create("neteaseMusic");

    // Manual one-off calls kept around for debugging:
    // getMusicInfo({ songId: "1855221507" });
    // getMusicInfo({ songId: "1855221517" });
    // getMusicInfo({ songId: "1861632812" });
    // getArtistInfo({ artistId: "1079074" });
    // getArtistInfo({ artistId: "1079075" });
    // getAlbumInfo({ albumId: "74268047" });
    // getAlbumInfo({ albumId: "129327797" });
    // Not every artist has a personal home page, e.g. https://music.163.com/#/artist?id=1079075
    // getUserInfo({ userId: "37365202" });
    // getUserInfo({ userId: "29879272" });

    // Each round is awaited, so rounds run strictly one after another.
    while (true) {
        await startGetMusic(100);
    }
}
2022-09-30 21:33:46 +08:00
/*
Cleanup SQL for relation rows that carry a zero id:
DELETE FROM song_artist_relation WHERE song_id = 0 OR artist_id = 0
-- DELETE FROM song_album_relation WHERE song_id = 0 OR album_id = 0
*/
/**
 * Run one crawl round: fetch every song, album and artist that a relation
 * table references but that has no row in its own table yet.
 *
 * The phases run in the order songs, albums, artists, and each phase queries
 * the database only after the previous phase has finished. After every
 * fetched item the function sleeps and then checks the stop flag file.
 *
 * @param {number} sleepTime - value handed to sleepUtils.sleep after each item
 *     (presumably milliseconds — confirm in sleepUtils).
 * @returns {Promise<void>}
 * @throws {Error} with message "Stopped" when stop.txt contains "1".
 */
async function startGetMusic(sleepTime) {
    // Songs referenced by a relation row but missing from the song table.
    console.log("start fetching songs ...");
    const songRows = await dbUtils.query(`
        SELECT DISTINCT song_id FROM song_artist_relation WHERE song_id NOT IN (SELECT DISTINCT song_id FROM song)
        UNION
        SELECT DISTINCT song_id FROM song_album_relation WHERE song_id NOT IN (SELECT DISTINCT song_id FROM song)
    `, []);
    await crawlMissingIds("song", songRows.map(item => item.song_id), songId => getMusicInfo({ songId: songId }), sleepTime);

    // Albums referenced by a relation row but missing from the album table.
    console.log("start fetching albums ...");
    const albumRows = await dbUtils.query(`
        SELECT DISTINCT album_id FROM song_album_relation WHERE album_id NOT IN (SELECT DISTINCT album_id FROM album)
    `, []);
    await crawlMissingIds("album", albumRows.map(item => item.album_id), albumId => getAlbumInfo({ albumId: albumId }), sleepTime);

    // Artists referenced by a relation row but missing from the artist table.
    console.log("start fetching artists ...");
    const artistRows = await dbUtils.query(`
        SELECT DISTINCT artist_id FROM song_artist_relation WHERE artist_id NOT IN (SELECT DISTINCT artist_id FROM artist)
    `, []);
    await crawlMissingIds("artist", artistRows.map(item => item.artist_id), artistId => getArtistInfo({ artistId: artistId }), sleepTime);
}

// Fetch the given ids one at a time: log progress, fetch, sleep, check the stop flag.
async function crawlMissingIds(label, ids, fetchOne, sleepTime) {
    for (let i = 0; i < ids.length; i++) {
        const id = ids[i];
        console.log(`${i}/${ids.length} | ${label}: ${id} | ${await statistics()}`);
        await fetchOne(id);
        await sleepUtils.sleep(sleepTime);
        if (isStopRequested()) {
            throw new Error(`Stopped`);
        }
    }
}

// True when stop.txt holds "1" (surrounding whitespace ignored); a missing file means "keep going".
function isStopRequested() {
    try {
        return fs.readFileSync('stop.txt', 'utf8').trim() === "1";
    } catch (err) {
        if (err.code === 'ENOENT') {
            return false;
        }
        throw err;
    }
}
2022-09-30 21:33:46 +08:00
/**
 * Build a one-line summary of the row counts of the five crawler tables
 * (song, album, artist, song_album_relation, song_artist_relation).
 *
 * All counts are read with a single query that joins five one-row subqueries.
 *
 * @returns {Promise<string>} text of the form
 *     "song: N, album: N, artist: N | songAlbum: N, songArtist: N".
 */
async function statistics() {
    const sql = `
        SELECT
            song_count,
            album_count,
            artist_count,
            song_album_count,
            song_artist_count
        FROM
            (SELECT count(*) AS song_count FROM song) t1,
            (SELECT count(*) AS album_count FROM album) t2,
            (SELECT count(*) AS artist_count FROM artist) t3,
            (SELECT count(*) AS song_album_count FROM song_album_relation) t4,
            (SELECT count(*) AS song_artist_count FROM song_artist_relation) t5`;
    const rows = await dbUtils.query(sql, []);
    const {
        song_count: songCount,
        album_count: albumCount,
        artist_count: artistCount,
        song_album_count: songAlbumCount,
        song_artist_count: songArtistCount,
    } = rows[0];
    return `song: ${songCount}, album: ${albumCount}, artist: ${artistCount} | songAlbum: ${songAlbumCount}, songArtist: ${songArtistCount}`;
}
2022-09-30 08:06:14 +08:00
// Fetch the details of a single song.
/**
 * Download the page of one song from music.163.com, extract its metadata and
 * store the song together with its album and artist relations.
 *
 * Nothing is fetched when the song table already has a row for songId. The
 * downloaded HTML is also written to ../temp/song-<songId>.html.
 *
 * @param {{songId: (string|number)}} param0 - id of the song to fetch.
 * @returns {Promise<object|undefined>} the extracted song info (songId, title,
 *     image, pubDate, artist, artistIds, album, albumId, duration), or
 *     undefined when the song is already stored, the download fails, or the
 *     page does not contain the expected tags.
 */
async function getMusicInfo({ songId }) {
    // Capture group 1 of the first match, or null when the pattern is absent.
    const firstGroup = (pattern, text) => {
        const found = pattern.exec(text);
        return found === null ? null : found[1];
    };
    // Relation rows are only written for ids that are non-zero numbers.
    const isUsableId = (id) => {
        const numeric = Number(id);
        return !Number.isNaN(numeric) && numeric !== 0;
    };

    const existing = await dbUtils.query('SELECT count(*) as count FROM song WHERE song_id = ?', [songId]);
    if (existing[0].count > 0) {
        console.log(`数据库中已有数据,跳过 songId: ${songId}`);
        return;
    }

    const url = `https://music.163.com/song?id=${songId}`;
    let html;
    try {
        html = await requestUtils.getApiResult(url);
        fs.writeFileSync(path.join(__dirname, "../temp", `song-${songId}.html`), html);
    } catch (errors) {
        console.error(errors);
        return;
    }

    // Regex extraction of the required fields.
    const songInfoJSONString = firstGroup(/\<script type\=\"application\/ld\+json\"\>([\S\s]*?)\<\/script\>/, html);
    const title = firstGroup(/<meta property="og:title" content="(.*?)" \/>/, html);
    const image = firstGroup(/<meta property="og:image" content="http:\/\/p.\.music\.126\.net\/(.*?)" \/>/, html);
    const artist = firstGroup(/<meta property="og:music:artist" content="(.*?)" \/>/, html);
    const duration = firstGroup(/<meta property="music:duration" content="(.*?)"\/>/, html);
    if ([songInfoJSONString, title, image, artist, duration].includes(null)) {
        // Skip instead of crashing the whole crawl on a page with an unexpected layout.
        console.error(`getMusicInfo: expected tags not found, skipping songId: ${songId}`);
        return;
    }

    let songInfoDict;
    try {
        songInfoDict = JSON.parse(songInfoJSONString);
    } catch (err) {
        console.error(`getMusicInfo: invalid ld+json, skipping songId: ${songId}`, err);
        return;
    }

    // The song may not belong to an album; the album id is only looked up
    // when the album title tag is present.
    const album = firstGroup(/<meta property="og:music:album" content="(.*?)"\/>/, html);
    const albumId = album === null
        ? null
        : firstGroup(/<meta property="music:album" content="https:\/\/music\.163\.com\/album\?id=(.*?)"\/>/, html);

    // A song can list several musicians, so collect every match.
    const reg = /<meta property="music:musician" content="https:\/\/music\.163\.com\/artist\?id=(.*?)"\/>/g;
    const artistIds = [];
    let matched = null;
    while ((matched = reg.exec(html)) !== null) {
        artistIds.push(matched[1]);
    }

    const songInfo = {
        songId: songId,
        title: title,
        image: image,
        pubDate: songInfoDict.pubDate,
        artist: artist,
        artistIds: artistIds,
        album: album || null,
        albumId: albumId || null,
        duration: duration,
    };

    // Await every insert so failures reach the caller and the rows exist
    // before the caller runs its next query.
    await dbUtils.query('INSERT IGNORE INTO song SET ?', {
        song_id: songInfo.songId,
        title: songInfo.title,
        image: songInfo.image,
        pub_date: songInfo.pubDate,
    });
    if (isUsableId(songInfo.albumId)) {
        await dbUtils.query('INSERT IGNORE INTO song_album_relation SET ?', {
            song_id: songInfo.songId,
            album_id: songInfo.albumId,
        });
    }
    for (const artistId of artistIds) {
        if (!isUsableId(artistId)) {
            continue;
        }
        await dbUtils.query('INSERT IGNORE INTO song_artist_relation SET ?', {
            song_id: songInfo.songId,
            artist_id: artistId,
        });
    }
    return songInfo;
}
// Fetch the details of a single artist.
/**
 * Download the page of one artist from music.163.com, extract its metadata
 * and song list, and store the artist together with its song relations.
 *
 * Nothing is fetched when the artist table already has a row for artistId.
 * The downloaded HTML is also written to ../temp/artist-<artistId>.html.
 *
 * @param {{artistId: (string|number)}} param0 - id of the artist to fetch.
 * @returns {Promise<object|undefined>} the extracted artist info (artistId,
 *     title, image, description, pubDate, songIds), or undefined when the
 *     artist is already stored, the download fails, or the page does not
 *     contain the expected tags.
 */
async function getArtistInfo({ artistId }) {
    // Capture group 1 of the first match, or null when the pattern is absent.
    const firstGroup = (pattern, text) => {
        const found = pattern.exec(text);
        return found === null ? null : found[1];
    };
    // Relation rows are only written for ids that are non-zero numbers.
    const isUsableId = (id) => {
        const numeric = Number(id);
        return !Number.isNaN(numeric) && numeric !== 0;
    };

    const existing = await dbUtils.query('SELECT count(*) as count FROM artist WHERE artist_id = ?', [artistId]);
    if (existing[0].count > 0) {
        console.log(`数据库中已有数据,跳过 artistId: ${artistId}`);
        return;
    }

    const url = `https://music.163.com/artist?id=${artistId}`;
    let html;
    try {
        html = await requestUtils.getApiResult(url);
        fs.writeFileSync(path.join(__dirname, "../temp", `artist-${artistId}.html`), html);
    } catch (errors) {
        console.error(errors);
        return;
    }

    // Regex extraction of the required fields.
    const artistInfoJSONString = firstGroup(/\<script type\=\"application\/ld\+json\"\>([\S\s]*?)\<\/script\>/, html);
    const image = firstGroup(/<meta property="og:image" content="http:\/\/p.\.music\.126\.net\/(.*?)" \/>/, html);
    const songListJSONString = firstGroup(/<textarea id="song-list-pre-data" style="display:none;">(.*?)<\/textarea>/, html);
    if ([artistInfoJSONString, image, songListJSONString].includes(null)) {
        // Skip instead of crashing the whole crawl on a page with an unexpected layout.
        console.error(`getArtistInfo: expected tags not found, skipping artistId: ${artistId}`);
        return;
    }

    let artistInfoDict;
    let songIds;
    try {
        artistInfoDict = JSON.parse(artistInfoJSONString);
        songIds = JSON.parse(songListJSONString).map(song => song.id);
    } catch (err) {
        console.error(`getArtistInfo: invalid embedded JSON, skipping artistId: ${artistId}`, err);
        return;
    }

    const artistInfo = {
        artistId: artistId,
        title: artistInfoDict.title,
        image: image,
        description: artistInfoDict.description,
        pubDate: artistInfoDict.pubDate,
        songIds: songIds,
    };

    // Await every insert so failures reach the caller and the rows exist
    // before the caller runs its next query.
    await dbUtils.query('INSERT IGNORE INTO artist SET ?', {
        artist_id: artistInfo.artistId,
        title: artistInfo.title,
        description: artistInfo.description,
        image: artistInfo.image,
        pub_date: artistInfo.pubDate,
    });

    // The artist id is the same for every relation row, so validate it once.
    if (isUsableId(artistId)) {
        for (const songId of songIds) {
            if (!isUsableId(songId)) {
                continue;
            }
            await dbUtils.query('INSERT IGNORE INTO song_artist_relation SET ?', {
                song_id: songId,
                artist_id: artistId,
            });
        }
    }
    return artistInfo;
}
// Fetch the details of a single album.
/**
 * Download the page of one album from music.163.com, extract its metadata
 * and song list, and store the album together with its song relations.
 *
 * Nothing is fetched when the album table already has a row for albumId. The
 * downloaded HTML is also written to ../temp/album-<albumId>.html.
 *
 * @param {{albumId: (string|number)}} param0 - id of the album to fetch.
 * @returns {Promise<object|undefined>} the extracted album info (albumId,
 *     title, image, description, pubDate, company, songIds), or undefined
 *     when the album is already stored, the download fails, or the page does
 *     not contain the expected tags.
 */
async function getAlbumInfo({ albumId }) {
    // Capture group 1 of the first match, or null when the pattern is absent.
    const firstGroup = (pattern, text) => {
        const found = pattern.exec(text);
        return found === null ? null : found[1];
    };
    // Relation rows are only written for ids that are non-zero numbers.
    const isUsableId = (id) => {
        const numeric = Number(id);
        return !Number.isNaN(numeric) && numeric !== 0;
    };

    const existing = await dbUtils.query('SELECT count(*) as count FROM album WHERE album_id = ?', [albumId]);
    if (existing[0].count > 0) {
        console.log(`数据库中已有数据,跳过 albumId: ${albumId}`);
        return;
    }

    const url = `https://music.163.com/album?id=${albumId}`;
    let html;
    try {
        html = await requestUtils.getApiResult(url);
        fs.writeFileSync(path.join(__dirname, "../temp", `album-${albumId}.html`), html);
    } catch (errors) {
        console.error(errors);
        return;
    }

    // Regex extraction of the required fields.
    const albumInfoJSONString = firstGroup(/\<script type\=\"application\/ld\+json\"\>([\S\s]*?)\<\/script\>/, html);
    const image = firstGroup(/<meta property="og:image" content="http:\/\/p.\.music\.126\.net\/(.*?)" \/>/, html);
    const songListJSONString = firstGroup(/<textarea id="song-list-pre-data" style="display:none;">(.*?)<\/textarea>/, html);
    if ([albumInfoJSONString, image, songListJSONString].includes(null)) {
        // Skip instead of crashing the whole crawl on a page with an unexpected layout.
        console.error(`getAlbumInfo: expected tags not found, skipping albumId: ${albumId}`);
        return;
    }

    let albumInfoDict;
    let songIds;
    try {
        albumInfoDict = JSON.parse(albumInfoJSONString);
        songIds = JSON.parse(songListJSONString).map(song => song.id);
    } catch (err) {
        console.error(`getAlbumInfo: invalid embedded JSON, skipping albumId: ${albumId}`, err);
        return;
    }

    // The publishing company paragraph is optional; null when the page has none.
    const company = firstGroup(/<p class="intr"><b>发行公司:<\/b>\n(.*?)\n<\/p>/, html);

    const albumInfo = {
        albumId: albumId,
        title: albumInfoDict.title,
        image: image,
        description: albumInfoDict.description,
        pubDate: albumInfoDict.pubDate,
        company: company,
        songIds: songIds,
    };

    // Await every insert so failures reach the caller and the rows exist
    // before the caller runs its next query.
    await dbUtils.query('INSERT IGNORE INTO album SET ?', {
        album_id: albumInfo.albumId,
        title: albumInfo.title,
        description: albumInfo.description,
        image: albumInfo.image,
        pub_date: albumInfo.pubDate,
        company: albumInfo.company,
    });

    // Both ids of a relation row must be usable; the album id is the same for
    // every row, so it is validated once.
    if (isUsableId(albumId)) {
        for (const songId of songIds) {
            if (!isUsableId(songId)) {
                continue;
            }
            await dbUtils.query('INSERT IGNORE INTO song_album_relation SET ?', {
                song_id: songId,
                album_id: albumId,
            });
        }
    }
    return albumInfo;
}
// // Fetch a user's home page (disabled draft)
// async function getUserInfo({ userId }) {
// let url = `https://music.163.com/user/home?id=${userId}`;
// try {
// var html = fs.readFileSync(path.join(__dirname, "../temp", ` user-${userId}.html`), 'utf8');
// } catch (errors) {
// var html = await requestUtils.getApiResult(url);
// fs.writeFileSync(path.join(__dirname, "../temp", ` user-${userId}.html`), html);
// }
// // console.log(html);
// }
// Public API of this module: only the crawler entry point is exported.
module.exports = { main };