diff --git a/hifini_music/index.js b/hifini_music/index.js
new file mode 100644
index 0000000..099f59d
--- /dev/null
+++ b/hifini_music/index.js
@@ -0,0 +1,190 @@
+const fs = require('fs');
+const { getApiResult } = require('../utils/requestUtils');
+const dbUtils = require("../utils/dbPoolUtils");
+const sleepUtils = require("../utils/sleepUtils");
+
+// Database connection pool. It is published on `global` before dataManager is
+// required because ./src/dataManager reads `global.dbUtils` when it is loaded.
+dbUtils.create({
+    database: "neteasemusic", // database to use
+    connectionLimit: 10, // size of the connection pool
+});
+global.dbUtils = dbUtils;
+
+const dataManager = require('./src/dataManager');
+const requestUtils = require('../utils/requestUtils');
+
+// Delay between two consecutive runs of the same stage, in milliseconds.
+const STAGE_INTERVAL_MS = 2000;
+
+/**
+ * Runs `task` over and over, waiting STAGE_INTERVAL_MS after each run.
+ * A failed run is logged and the next run is still scheduled; otherwise one
+ * rejected promise would stop that stage for good.
+ *
+ * @param {string} name - label used in the error log.
+ * @param {() => Promise<void>} task - one pass of the stage.
+ */
+async function runForever(name, task) {
+    try {
+        await task();
+    } catch (e) {
+        console.error(`${name} failed`, e);
+    }
+    setTimeout(() => runForever(name, task), STAGE_INTERVAL_MS);
+}
+
+/**
+ * Starts the three crawl stages. They run concurrently and hand work to each
+ * other through the database: list pages, thread details, real music URLs.
+ */
+async function main() {
+    runForever("getList", getList);
+    runForever("startFetchDetail", startFetchDetail);
+    runForever("startFetchRealUrl", startFetchRealUrl);
+}
+
+/**
+ * Crawls the forum list pages and stores the threads found on each page.
+ * Threads whose title does not end in a bracketed tag such as "[FLAC]" are
+ * skipped.
+ */
+async function getList() {
+    const forumId = 12; // forum (category) id
+    const beginPage = 125; // first page
+    const endPage = 165; // last page, inclusive
+    for (let page = beginPage; page <= endPage; page++) {
+        const url = `https://hifini.com/forum-${forumId}-${page}.htm?orderby=tid`; // sorted by posting time
+        console.log(`getList \t| ${beginPage}/${page}/${endPage} | forumId: ${forumId} | ${url}`);
+
+        const html = await getApiResult(url);
+
+        // NOTE(review): the loop reads capture groups 1 (thread id) and 2
+        // (title), but this pattern has a single group, so part of it looks
+        // lost. Restore the full pattern before relying on the results.
+        const threadList = [];
+        for (const m of html.matchAll(/(.*?)<\/a>/g)) {
+            if (!/^.*?\[[-\/\.A-Za-z0-9]+?\]$/.exec(m[2])) {
+                console.log(`跳过 ${m[2]}`);
+                continue;
+            }
+            threadList.push({
+                forum_id: forumId,
+                thread_id: Number(m[1]),
+                title: m[2]
+            });
+        }
+        await dataManager.thread.insertCollection(threadList);
+        await sleepUtils.sleep(1000);
+    }
+}
+
+/**
+ * Fetches the detail page of every thread returned by
+ * dataManager.thread.getIdsToFetch(), one thread at a time.
+ */
+async function startFetchDetail() {
+    const rows = await dataManager.thread.getIdsToFetch();
+    const idsToFetch = rows.map(item => item.thread_id);
+    for (let i = 0; i < idsToFetch.length; i++) {
+        const threadId = idsToFetch[i];
+        console.log(`getDetail\t| ${i + 1}/${idsToFetch.length} | threadId: ${threadId}`);
+        await getDetail(threadId);
+        await sleepUtils.sleep(1000);
+    }
+}
+
+/**
+ * Parses one thread page: extracts the first APlayer music entry and the
+ * tags, then stores the tags, the thread/tag relations and the music fields.
+ *
+ * @param {number} threadId - id of the thread to fetch.
+ */
+async function getDetail(threadId) {
+    const url = `https://hifini.com/thread-${threadId}.htm`;
+    const html = await getApiResult(url);
+
+    // Grab the argument passed to `new APlayer(...)` in the page script.
+    const playerMatch = /var ap4 = new APlayer\(([\S\s]*?)\);/.exec(html);
+    if (!playerMatch) {
+        await dataManager.thread.update(threadId, { music_title: "未解析到音乐" });
+        console.log("未解析到音乐,跳过");
+        return;
+    }
+
+    let music;
+    try {
+        // SECURITY(review): this eval()s script text downloaded from the
+        // site, so the page can run arbitrary code in this process. Kept
+        // unchanged here; it should be replaced by a real parser.
+        // `document` is stubbed, presumably because the snippet calls
+        // document.getElementById. `var arr` in a direct non-strict eval is
+        // declared in this function's scope, which is how it is read below.
+        eval(`let document = { getElementById: () => {} }; var arr = ${playerMatch[1]};`);
+        music = arr.music[0];
+        if (!music) {
+            // An empty list would otherwise crash on `music.title` below.
+            throw new Error("empty music list");
+        }
+    } catch (e) {
+        console.error("解析失败", e);
+        return;
+    }
+
+    // NOTE(review): groups 1 (tag id) and 2 (tag name) are read, but this
+    // pattern has a single group; it looks truncated like the one in getList.
+    const tagList = [];
+    for (const m of html.matchAll(/<\/i>(.*?)<\/a>/g)) {
+        tagList.push({
+            tag_id: Number(m[1]),
+            tag_name: m[2]
+        });
+    }
+
+    await dataManager.tag.insertCollection(tagList);
+
+    await dataManager.thread_tag.insertCollection(tagList.map(tag => ({
+        thread_id: threadId,
+        tag_id: tag.tag_id
+    })));
+
+    await dataManager.thread.update(threadId, {
+        music_title: music.title,
+        music_author: music.author || "",
+        music_url: music.url,
+        music_pic: music.pic || ""
+    });
+}
+
+/**
+ * Resolves the real music URL of every thread returned by
+ * dataManager.thread.getIdsToFetchRealUrl(), one thread at a time.
+ */
+async function startFetchRealUrl() {
+    const rows = await dataManager.thread.getIdsToFetchRealUrl();
+    const urlsToFetch = rows.map(item => ({ threadId: item.thread_id, fakeUrl: item.music_url }));
+    for (let i = 0; i < urlsToFetch.length; i++) {
+        const urlToFetch = urlsToFetch[i];
+        console.log(`getRealUrl\t| ${i + 1}/${urlsToFetch.length} | threadId: ${urlToFetch.threadId} | ${urlToFetch.fakeUrl}`);
+        await getRealUrl(urlToFetch);
+        await sleepUtils.sleep(1000);
+    }
+}
+
+/**
+ * Looks up the redirect target of one music URL and stores it on the thread.
+ * A failure is logged and the thread is left as it was.
+ *
+ * @param {{threadId: number, fakeUrl: string}} urlToFetch - thread id and site-relative URL.
+ */
+async function getRealUrl(urlToFetch) {
+    const { threadId, fakeUrl } = urlToFetch;
+    try {
+        const url = await requestUtils.getRedirectUrl(`https://hifini.com/${fakeUrl}`);
+        await dataManager.thread.update(threadId, { music_real_url: url });
+    } catch (e) {
+        console.log("重定向地址获取失败", e);
+    }
+}
+
+main();
\ No newline at end of file
diff --git a/hifini_music/src/dataManager.js b/hifini_music/src/dataManager.js
new file mode 100644
index 0000000..4fdcdcf
--- /dev/null
+++ b/hifini_music/src/dataManager.js
@@ -0,0 +1,59 @@
+// Shared pool. index.js must set `global.dbUtils` before requiring this module.
+const dbUtils = global.dbUtils;
+
+/**
+ * Bulk upsert: inserts `dataList` into `tableName` and, on a duplicate key,
+ * overwrites the same columns with the incoming values.
+ * Column names are taken from the keys of the first item, so every item must
+ * carry the same keys in the same order.
+ *
+ * @param {string} tableName - table name; interpolated into the SQL, so it must be trusted.
+ * @param {Object[]} dataList - rows to write; nothing is done when it is empty.
+ * @returns {Promise<*>} result of dbUtils.query, or undefined for an empty list.
+ */
+const insertCollectionTemplate = async (tableName, dataList) => {
+    if (dataList.length === 0) return;
+    // Column names are backtick-quoted in both the INSERT and the UPDATE clause.
+    const columns = Object.keys(dataList[0]).map(field => `\`${field}\``);
+    return await dbUtils.query(`
+        INSERT INTO ${tableName} ( ${columns.join(",")} ) VALUES ?
+        ON DUPLICATE KEY UPDATE ${columns.map(column => `${column}=VALUES(${column})`).join(", ")}
+    `, [dataList.map(item => Object.values(item))]);
+};
+
+module.exports = {
+
+    thread: {
+        insertCollection: async (threadList) => {
+            return await insertCollectionTemplate("hifini_thread", threadList);
+        },
+
+        // The thread id is bound as a placeholder instead of being spliced into the SQL text.
+        update: async (threadId, threadInfo) => {
+            return await dbUtils.query(`UPDATE hifini_thread SET ? WHERE thread_id = ?`, [threadInfo, threadId]);
+        },
+
+        // Threads whose music fields are all still empty.
+        getIdsToFetch: async () => {
+            return await dbUtils.query(`SELECT thread_id FROM hifini_thread where music_title='' and music_pic='' and music_url=''`);
+        },
+
+        // Threads holding a `get_music.php` link whose real URL is still empty.
+        getIdsToFetchRealUrl: async () => {
+            return await dbUtils.query(`SELECT thread_id,music_url FROM hifini_thread where music_url like 'get_music.php?key=%' and music_real_url=''`);
+        }
+    },
+
+    tag: {
+        insertCollection: async (tagList) => {
+            return await insertCollectionTemplate("hifini_tag", tagList);
+        },
+    },
+
+    thread_tag: {
+        insertCollection: async (tagList) => {
+            return await insertCollectionTemplate("hifini_thread_tag_relation", tagList);
+        },
+    },
+
+};
diff --git a/utils/requestUtils.js b/utils/requestUtils.js
index 4450172..c5d11bb 100644
--- a/utils/requestUtils.js
+++ b/utils/requestUtils.js
@@ -35,8 +35,33 @@ async function query(opts) {
     return return_data;
 }
 
+/**
+ * Requests `url` without following redirects and resolves with the value of
+ * the response's `Location` header (undefined when the header is absent).
+ *
+ * @param {string} url - URL to request.
+ * @returns {Promise<string|undefined>} redirect target.
+ */
+async function getRedirectUrl(url) {
+    return new Promise((resolve, reject) => {
+        request({
+            url: url,
+            followRedirect: false
+        }, function (err, res, body) {
+            if (err) {
+                // Stop here: falling through would read `res.headers` even
+                // though the request failed, and throw inside the callback.
+                reject(err);
+                return;
+            }
+            resolve(res.headers.location);
+        });
+    });
+}
+
 module.exports = {
     get: get,
     getApiResult: getApiResult,
     query: query,
+    getRedirectUrl: getRedirectUrl,
 }