From 6ce6b0cd469930a2dd021a6710e38a49ea6b78bd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=A8=8B=E5=BA=8F=E5=91=98=E5=B0=8F=E5=A2=A8?=
<2291200076@qq.com>
Date: Fri, 28 Oct 2022 00:23:50 +0800
Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0hifini=5Fmusic=E7=88=AC?=
=?UTF-8?q?=E8=99=AB=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
hifini_music/index.js | 159 ++++++++++++++++++++++++++++++++
hifini_music/src/dataManager.js | 43 +++++++++
utils/requestUtils.js | 16 ++++
3 files changed, 218 insertions(+)
create mode 100644 hifini_music/index.js
create mode 100644 hifini_music/src/dataManager.js
diff --git a/hifini_music/index.js b/hifini_music/index.js
new file mode 100644
index 0000000..099f59d
--- /dev/null
+++ b/hifini_music/index.js
@@ -0,0 +1,159 @@
+const fs = require('fs');
+const { getApiResult } = require('../utils/requestUtils');
+const dbUtils = require("../utils/dbPoolUtils");
+const sleepUtils = require("../utils/sleepUtils");
+
+// Database connection pool.
+dbUtils.create({
+ database: "neteasemusic", // database to use
+ connectionLimit: 10, // number of connections in the pool
+});
+// Published globally before ./src/dataManager is required below: that module
+// reads global.dbUtils once, at load time.
+global.dbUtils = dbUtils;
+
+const dataManager = require('./src/dataManager');
+const requestUtils = require('../utils/requestUtils');
+
+async function main() {
+ async function timeout1() {
+ await getList();
+ setTimeout(timeout1, 2000);
+ }
+ timeout1();
+
+ async function timeout2() {
+ await startFetchDetail();
+ setTimeout(timeout2, 2000);
+ }
+ timeout2();
+
+ async function timeout3() {
+ await startFetchRealUrl();
+ setTimeout(timeout3, 2000);
+ }
+ timeout3();
+}
+
+// 爬取列表页,获得歌曲详情页
+async function getList() {
+
+ let forumId = 12; // 分类id
+ let beginPage = 125; // 起始页
+ let endPage = 165; // 结束页
+ for (let page = beginPage; page <= endPage; page++) {
+ let url = `https://hifini.com/forum-${forumId}-${page}.htm?orderby=tid`; // 按照发帖时间排序
+ console.log(`getList \t| ${beginPage}/${page}/${endPage} | forumId: ${forumId} | ${url}`);
+
+ // let html = fs.readFileSync("./1.html", "utf8");
+ let html = await getApiResult(url);
+ // fs.writeFileSync("./1.html", html);
+
+ var matcher = html.matchAll(/(.*?)<\/a>/g);
+ var m = matcher.next();
+ var threadList = [];
+ while (!m.done) {
+ if (!/^.*?\[[-\/\.A-Za-z0-9]+?\]$/.exec(m.value[2])) {
+ console.log(`跳过 ${m.value[2]}`);
+ } else {
+ threadList.push({
+ forum_id: forumId,
+ thread_id: Number(m.value[1]),
+ title: m.value[2]
+ });
+ }
+ m = matcher.next();
+ }
+ await dataManager.thread.insertCollection(threadList);
+ await sleepUtils.sleep(1000);
+ }
+}
+
+async function startFetchDetail() {
+ let idsToFetch = await dataManager.thread.getIdsToFetch();
+ idsToFetch = idsToFetch.map(item => item.thread_id);
+ // console.log(idsToFetch);
+ for (let i = 0; i < idsToFetch.length; i++) {
+ const threadId = idsToFetch[i];
+ console.log(`getDetail\t| ${i + 1}/${idsToFetch.length} | threadId: ${threadId}`);
+ await getDetail(threadId);
+ await sleepUtils.sleep(1000);
+ }
+}
+
+async function getDetail(threadId) {
+
+ let url = `https://hifini.com/thread-${threadId}.htm`;
+
+ // let html = fs.readFileSync("./1.html", "utf8");
+ let html = await getApiResult(url);
+ // fs.writeFileSync("./1.html", html);
+
+ // 解析到音乐信息
+ var matcher = /var ap4 = new APlayer\(([\S\s]*?)\);/.exec(html);
+ if (!matcher) {
+ await dataManager.thread.update(threadId, { music_title: "未解析到音乐" });
+ console.log("未解析到音乐,跳过");
+ return;
+ }
+ try {
+ let arrStr = matcher[1];
+ // console.log(arrStr);
+ eval(`let document = { getElementById: () => {} }; var arr = ${arrStr};`);
+ var music = arr.music[0];
+ // console.log(music);
+ } catch (e) {
+ console.error("解析失败", e);
+ return;
+ }
+
+ var matcher = html.matchAll(/<\/i>(.*?)<\/a>/g);
+ var m = matcher.next();
+ var tagList = [];
+ while (!m.done) {
+ tagList.push({
+ tag_id: Number(m.value[1]),
+ tag_name: m.value[2]
+ });
+ m = matcher.next();
+ }
+
+ await dataManager.tag.insertCollection(tagList);
+
+ await dataManager.thread_tag.insertCollection(tagList.map(tag => {
+ return {
+ thread_id: threadId,
+ tag_id: tag.tag_id
+ };
+ }));
+
+ await dataManager.thread.update(threadId, {
+ music_title: music.title,
+ music_author: music.author || "",
+ music_url: music.url,
+ music_pic: music.pic || ""
+ });
+ // console.log("done");
+}
+
+async function startFetchRealUrl() {
+ let urlsToFetch = await dataManager.thread.getIdsToFetchRealUrl();
+ // console.log(urlsToFetch.map(item => item.thread_id));
+ urlsToFetch = urlsToFetch.map(item => { return { threadId: item.thread_id, fakeUrl: item.music_url } });
+ for (let i = 0; i < urlsToFetch.length; i++) {
+ const urlToFetch = urlsToFetch[i];
+ console.log(`getRealUrl\t| ${i + 1}/${urlsToFetch.length} | threadId: ${urlToFetch.threadId} | ${urlToFetch.fakeUrl}`);
+ await getRealUrl(urlToFetch);
+ await sleepUtils.sleep(1000);
+ }
+}
+
+async function getRealUrl(urlToFetch) {
+ let { threadId, fakeUrl } = urlToFetch;
+ try {
+ let url = await requestUtils.getRedirectUrl(`https://hifini.com/${fakeUrl}`);
+ result = await dataManager.thread.update(threadId, { music_real_url: url });
+ } catch (e) {
+ console.log("重定向地址获取失败");
+ }
+}
+
+main();
\ No newline at end of file
diff --git a/hifini_music/src/dataManager.js b/hifini_music/src/dataManager.js
new file mode 100644
index 0000000..4fdcdcf
--- /dev/null
+++ b/hifini_music/src/dataManager.js
@@ -0,0 +1,43 @@
+const dbUtils = global.dbUtils;
+
+let insertCollectionTemplate = async (tableName, dataList) => {
+ if (dataList.length == 0) return;
+ return await dbUtils.query(`
+ INSERT INTO ${tableName} ( ${Object.keys(dataList[0]).map(field => `\`${field}\``).join(",")} ) VALUES ?
+ ON DUPLICATE KEY UPDATE ${Object.keys(dataList[0]).map(field => `${field}=VALUES(${field})`).join(", ")}
+ `, [dataList.map(item => Object.values(item))]);
+}
+
+module.exports = {
+
+ thread: {
+ insertCollection: async (threadList) => {
+ return await insertCollectionTemplate("hifini_thread", threadList);
+ },
+
+ update: async (threadId, threadInfo) => {
+ return await dbUtils.query(`UPDATE hifini_thread SET ? WHERE thread_id = ${threadId}`, threadInfo);
+ },
+
+ getIdsToFetch: async () => {
+ return await dbUtils.query(`SELECT thread_id FROM hifini_thread where music_title='' and music_pic='' and music_url=''`);
+ },
+
+ getIdsToFetchRealUrl: async () => {
+ return await dbUtils.query(`SELECT thread_id,music_url FROM hifini_thread where music_url like 'get_music.php?key=%' and music_real_url=''`);
+ }
+ },
+
+ tag: {
+ insertCollection: async (tagList) => {
+ return await insertCollectionTemplate("hifini_tag", tagList);
+ },
+ },
+
+ thread_tag: {
+ insertCollection: async (tagList) => {
+ return await insertCollectionTemplate("hifini_thread_tag_relation", tagList);
+ },
+ },
+
+};
diff --git a/utils/requestUtils.js b/utils/requestUtils.js
index 4450172..c5d11bb 100644
--- a/utils/requestUtils.js
+++ b/utils/requestUtils.js
@@ -35,8 +35,24 @@ async function query(opts) {
return return_data;
}
+async function getRedirectUrl(url) {
+ return await new Promise((resolve, reject) => {
+ request({
+ url: url,
+ followRedirect: false
+ }, function (err, res, body) {
+ if (err) {
+ reject(err);
+ }
+ // console.log(res.headers.location);
+ resolve(res.headers.location);
+ });
+ });
+}
+
module.exports = {
get: get,
getApiResult: getApiResult,
query: query,
+ getRedirectUrl: getRedirectUrl,
}