// Scraper for hifini.com: crawls forum list pages, parses the APlayer
// configuration embedded in each thread page, and resolves the real
// (redirected) URL of every music file. Results are persisted through
// ./src/dataManager.
//
// NOTE: this file must stay a non-strict CommonJS script. getDetail() relies
// on a direct eval() leaking a `var` into the enclosing function scope, which
// does not happen in strict mode or in ES modules.
const fs = require('fs');
const { getApiResult } = require('../utils/requestUtils');
const dbUtils = require("../utils/dbPoolUtils");
const sleepUtils = require("../utils/sleepUtils");

// Database connection pool.
dbUtils.create({
    database: "neteasemusic", // database to use
    connectionLimit: 10, // number of connections in the pool
});
global.dbUtils = dbUtils;

// Required only after global.dbUtils has been set — presumably dataManager
// reads that global when it is loaded; verify before reordering these lines.
const dataManager = require('./src/dataManager');
const requestUtils = require('../utils/requestUtils');

/**
 * Runs `task` over and over, waiting `delayMs` after each run finishes.
 *
 * A rejected run is logged and the loop keeps going; without the try/catch a
 * single failure would skip the reschedule and surface as an unhandled
 * rejection.
 *
 * @param {string} label - Name used in the failure log line.
 * @param {() => Promise<void>} task - Async job to repeat.
 * @param {number} [delayMs=2000] - Pause between runs, in milliseconds.
 */
function pollForever(label, task, delayMs = 2000) {
    const tick = async () => {
        try {
            await task();
        } catch (e) {
            console.error(`${label} failed, retrying in ${delayMs}ms`, e);
        }
        setTimeout(tick, delayMs);
    };
    // tick() handles its own errors, so this promise never rejects.
    tick();
}

/**
 * Entry point: publishes the `order` / `limit` CLI options on `global.args`
 * and starts the two polling loops (detail fetch and real-URL fetch).
 */
async function main() {
    const args = require('minimist')(process.argv.slice(2));
    global.args = {
        "order": args.order,
        "limit": args.limit,
    };

    // The list crawl is currently disabled; call `await getList()` here to
    // (re)collect thread ids from the forum list pages.

    pollForever("startFetchDetail", startFetchDetail);
    pollForever("startFetchRealUrl", startFetchRealUrl);
}

/**
 * Crawls forum list pages and stores the threads found on each page.
 *
 * @param {number} [forumId=1] - Forum (category) id.
 * @param {number} [beginPage=1] - First page to crawl (inclusive).
 * @param {number} [endPage=23] - Last page to crawl (inclusive).
 */
async function getList(forumId = 1, beginPage = 1, endPage = 23) {
    for (let page = beginPage; page <= endPage; page++) {
        // orderby=tid sorts the list by posting time.
        const url = `https://hifini.com/forum-${forumId}-${page}.htm?orderby=tid`;
        console.log(`getList \t| ${beginPage}/${page}/${endPage} | forumId: ${forumId} | ${url}`);

        const html = await getApiResult(url);

        // NOTE(review): this pattern has a single capture group, yet groups 1
        // and 2 are read below (thread id and title). The pattern looks
        // truncated — confirm against the intended thread-link markup.
        const threadList = [];
        for (const m of html.matchAll(/(.*?)<\/a>/g)) {
            threadList.push({
                forum_id: forumId,
                thread_id: Number(m[1]),
                title: m[2]
            });
        }

        await dataManager.thread.insertCollection(threadList);
        // Pause between pages.
        await sleepUtils.sleep(1000);
    }
}

/**
 * Fetches and parses the detail page of every thread returned by
 * dataManager.thread.getIdsToFetch(), one at a time.
 */
async function startFetchDetail() {
    const rows = await dataManager.thread.getIdsToFetch();
    const idsToFetch = rows.map(item => item.thread_id);

    for (const [i, threadId] of idsToFetch.entries()) {
        console.log(`getDetail\t| ${i + 1}/${idsToFetch.length} | threadId: ${threadId}`);
        await getDetail(threadId);
    }
}

/**
 * Downloads one thread page, extracts its APlayer music list and tags, and
 * stores them.
 *
 * Returns without storing music data when the request fails or the player
 * configuration cannot be evaluated. When no player is found, the thread row
 * at music index 0 is updated with a placeholder title.
 *
 * @param {number} threadId - Id of the thread to fetch.
 */
async function getDetail(threadId) {
    const url = `https://hifini.com/thread-${threadId}.htm`;

    let html;
    try {
        html = await getApiResult(url, { timeout: 3000 });
    } catch (e) {
        console.error("请求失败,可能是请求超时", e);
        return;
    }

    // Locate the APlayer constructor call that carries the music info.
    const playerMatch = /var ap4 = new APlayer\(([\S\s]*?)\);/.exec(html);
    if (!playerMatch) {
        await dataManager.thread.update(threadId, 0, {
            music_title: "未解析到音乐"
        });
        console.log("未解析到音乐,跳过");
        return;
    }

    let musicArr;
    try {
        const arrStr = playerMatch[1];
        // SECURITY: this evaluates text scraped from a remote page, so that
        // page can run arbitrary code in this process. Kept as-is because the
        // configuration is a JS object literal, not JSON; consider replacing
        // it with a sandboxed or purpose-built parser.
        // The direct eval declares `var arr` in this function's scope
        // (non-strict mode only); `document` is stubbed for the literal's
        // getElementById call.
        eval(`let document = { getElementById: () => {} }; var arr = ${arrStr};`);
        musicArr = arr.music;
    } catch (e) {
        console.error("解析失败", e);
        return;
    }

    // NOTE(review): single capture group here as well, but groups 1 and 2 are
    // read as tag id and tag name — the pattern looks truncated; confirm.
    const tagList = [];
    for (const m of html.matchAll(/<\/i>(.*?)<\/a>/g)) {
        tagList.push({
            tag_id: Number(m[1]),
            tag_name: m[2]
        });
    }

    await dataManager.tag.insertCollection(tagList);
    await dataManager.thread_tag.insertCollection(tagList.map(tag => {
        return {
            thread_id: threadId,
            tag_id: tag.tag_id
        };
    }));

    // A thread holding several tracks gets one row per music index before the
    // per-track updates below.
    if (musicArr.length > 1) {
        console.log("典型:thread_id:", threadId);
        await dataManager.thread.insertCollection(musicArr.map((music, i) => {
            return {
                thread_id: threadId,
                music_index: i
            };
        }));
    }

    for (let i = 0; i < musicArr.length; i++) {
        const music = musicArr[i];
        await dataManager.thread.update(threadId, i, {
            music_title: music.title,
            music_author: music.author || "",
            music_url: music.url,
            music_pic: music.pic || ""
        });
    }
}

/**
 * Resolves the real URL of every track returned by
 * dataManager.thread.getIdsToFetchRealUrl(), one at a time.
 */
async function startFetchRealUrl() {
    const urlsToFetch = await dataManager.thread.getIdsToFetchRealUrl();

    for (let i = 0; i < urlsToFetch.length; i++) {
        const { thread_id, music_index, music_url } = urlsToFetch[i];
        console.log(`getRealUrl\t| ${i + 1}/${urlsToFetch.length} | threadId: ${thread_id} | music_index: ${music_index}`);
        await getRealUrl(thread_id, music_index, music_url);
    }
}

/**
 * Follows the redirect behind a track's site-relative URL and stores the
 * result as `music_real_url`. If the redirect lookup fails, the placeholder
 * "原地址已失效" is stored instead.
 *
 * @param {number} threadId - Thread the track belongs to.
 * @param {number} musicIndex - Index of the track within the thread.
 * @param {string} fakeUrl - Site-relative URL taken from the player config.
 */
async function getRealUrl(threadId, musicIndex, fakeUrl) {
    let url = "原地址已失效";
    try {
        url = await requestUtils.getRedirectUrl(`https://hifini.com/${fakeUrl}`);
    } catch (e) {
        console.log("重定向地址获取失败");
    }
    // The update's return value is unused; the original assigned it to an
    // undeclared `result`, which created an implicit global.
    await dataManager.thread.update(threadId, musicIndex, {
        music_real_url: url
    });
}

main().catch((e) => {
    // Fail loudly rather than leaving an unhandled rejection.
    console.error(e);
    process.exit(1);
});