245 lines
9.1 KiB
JavaScript
245 lines
9.1 KiB
JavaScript
// 导入需要的模块
|
||
const request = require('request');
|
||
const fs = require('fs');
|
||
const xpath = require('xpath');
|
||
const dom = require('xmldom').DOMParser;
|
||
const mysql = require('mysql');
|
||
|
||
// 要爬取的网页地址
|
||
// https://data.rmtc.org.cn/gis/PubIndexM.html?type=0
|
||
// https://data.rmtc.org.cn/gis/PubIndexM.html?type=1
|
||
|
||
const mysqlConfig = {
|
||
host: 'localhost', // 数据库地址
|
||
user: 'root', // 数据库用户名
|
||
password: '123456', // 数据库密码
|
||
database: 'open_data' // 数据库名称
|
||
}
|
||
|
||
main()
|
||
|
||
async function main() {
|
||
await doFetch('0')
|
||
await doFetch('1')
|
||
console.log('爬取完成')
|
||
}
|
||
|
||
//
|
||
async function doFetch(type) {
|
||
const url = 'https://data.rmtc.org.cn/gis/PubIndexM.html?type=' + type
|
||
await new Promise((resolve) => {
|
||
// 发送请求,获取网页内容
|
||
request(url, async (error, response, body) => {
|
||
if (error) {
|
||
console.error(error);
|
||
return
|
||
}
|
||
|
||
// fs.writeFileSync('output/html.json', body, 'utf-8');
|
||
|
||
// 使用dom模块解析body内容,生成文档对象
|
||
const doc = new dom().parseFromString(body);
|
||
|
||
// 修改了这个xpath表达式,用li标签而不是text(),因为text()会匹配所有文本节点,包括空白和换行
|
||
const lis = xpath.select("//ul[@data-role='listview']/li", doc);
|
||
|
||
// 定义一个空数组,用于存储转换后的数据
|
||
const data = [];
|
||
|
||
const childrenPage = [];
|
||
|
||
// 遍历提取到的数据,将其转换为对象,并存入数组中
|
||
for (let i = 0; i < lis.length; i++) {
|
||
const li = lis[i]
|
||
|
||
/*
|
||
<li><a data-ajax="false" href="PubStationlistM.html?type=1&id=2410283902&">
|
||
<h2><span class="epfy">红沿河核电厂</span> (<span class="epfy">复州城</span>)</h2>
|
||
<p>
|
||
<span class="span-count">83 nGy/h</span>
|
||
<span class="showtime">2023-08-25</span> </p>
|
||
</a>
|
||
|
||
</li>
|
||
*/
|
||
// 使用nodeValue属性获取文本内容
|
||
const name = xpath.select1(".//h2/span[@class='epfy'][1]/text()", li).nodeValue.trim();
|
||
const location = xpath.select1(".//h2/span[@class='epfy'][2]/text()", li).nodeValue.trim();
|
||
const value = xpath.select1(".//p/span[@class='span-count']/text()", li).nodeValue.trim();
|
||
const date = xpath.select1(".//p/span[@class='showtime']/text()", li).nodeValue.trim();
|
||
|
||
// 使用正则表达式匹配type和id的值
|
||
const href = xpath.select1("./a/@href", li).nodeValue.trim();
|
||
const regex = /type=(\d+)&id=(\d+)/;
|
||
const match = regex.exec(href);
|
||
const type = match[1];
|
||
const id = match[2];
|
||
|
||
// 创建一个对象,存储每一条数据
|
||
const item = {
|
||
name,
|
||
location,
|
||
value,
|
||
date,
|
||
parent_id: null,
|
||
type,
|
||
id
|
||
};
|
||
// 将对象推入数组中
|
||
data.push(item);
|
||
|
||
childrenPage.push({ type, id, href, name })
|
||
}
|
||
// 打印转换后的数据
|
||
// console.log(data);
|
||
// console.log('主表数据完成', JSON.stringify(data));
|
||
console.log('主表数据完成');
|
||
|
||
saveToDb(data)
|
||
// fs.writeFileSync('output/data.json', JSON.stringify(data, null, 4), 'utf-8');
|
||
|
||
console.log('childrenPage', childrenPage)
|
||
for (let page of childrenPage) {
|
||
console.log('#######################################################')
|
||
console.log('page', page)
|
||
await doFetch2(page.type, page.id, page.name, page.href)
|
||
console.log('#######################################################')
|
||
await new Promise((resolve) => {
|
||
setTimeout(resolve, 1000)
|
||
})
|
||
}
|
||
|
||
resolve()
|
||
});
|
||
})
|
||
}
|
||
|
||
|
||
async function doFetch2(parentType, parentId, parentName, relativeUrl) {
|
||
const url = 'https://data.rmtc.org.cn/gis/' + relativeUrl
|
||
// 发送请求,获取网页内容
|
||
let data = await new Promise((resolve) => {
|
||
request(url, (error, response, body) => {
|
||
if (error) {
|
||
console.error(error);
|
||
return
|
||
}
|
||
|
||
const doc = new dom().parseFromString(body);
|
||
const lis = xpath.select("//ul[@data-role='listview']/li", doc);
|
||
const data = [];
|
||
|
||
// 遍历提取到的数据,将其转换为对象,并存入数组中
|
||
for (let i = 0; i < lis.length; i++) {
|
||
const li = lis[i]
|
||
|
||
/*
|
||
<li><a stid="42974" time="2023-08-26 00:00:00" itemkey="43061" itemcode="0102060301" itemname="辐射剂量率" class="showboxlink" href="#drawdialog">
|
||
<h2>老渔窝</h2>
|
||
<p>
|
||
<span class="span-count">71 nGy/h</span>
|
||
<span class="showtime">2023-08-25</span> </p>
|
||
</a>
|
||
|
||
</li>
|
||
*/
|
||
// 使用nodeValue属性获取文本内容
|
||
const time = xpath.select1(".//a/@time", li).nodeValue.trim();
|
||
const itemkey = xpath.select1(".//a/@itemkey", li).nodeValue.trim();
|
||
const itemcode = xpath.select1(".//a/@itemcode", li).nodeValue.trim();
|
||
const itemname = xpath.select1(".//a/@itemname", li).nodeValue.trim();
|
||
|
||
const location = xpath.select1(".//h2/text()", li).nodeValue.trim();
|
||
const value = xpath.select1(".//p/span[@class='span-count']/text()", li).nodeValue.trim();
|
||
const date = xpath.select1(".//p/span[@class='showtime']/text()", li).nodeValue.trim();
|
||
|
||
// 创建一个对象,存储每一条数据
|
||
const item = {
|
||
name: parentName,
|
||
location,
|
||
value,
|
||
date,
|
||
parent_id: parentId,
|
||
type: parentType,
|
||
id: null,
|
||
itemkey: itemkey,
|
||
itemcode: itemcode,
|
||
itemname: itemname,
|
||
time,
|
||
};
|
||
|
||
// 将对象推入数组中
|
||
data.push(item);
|
||
}
|
||
// 打印转换后的数据
|
||
// console.log(data);
|
||
// console.log('二级分类数据完成', parentType, parentId, JSON.stringify(data));
|
||
console.log('二级分类数据完成', parentType, parentId);
|
||
|
||
resolve(data)
|
||
// fs.writeFileSync('output/data.json', JSON.stringify(data, null, 4), 'utf-8');
|
||
});
|
||
})
|
||
saveToDb(data)
|
||
}
|
||
|
||
function saveToDb(data) {
|
||
if (!data || data.length == 0) {
|
||
console.error("没有数据需要保存")
|
||
return
|
||
}
|
||
|
||
// 创建一个数据库连接对象
|
||
const connection = mysql.createConnection(mysqlConfig);
|
||
|
||
// 连接数据库
|
||
connection.connect((err) => {
|
||
if (err) {
|
||
console.error(err);
|
||
} else {
|
||
console.log('数据库连接成功');
|
||
|
||
const sql = `INSERT IGNORE INTO nuclear_data (${Object.keys(data[0]).join(', ')}) VALUES ?`;
|
||
// `REPLACE INTO nuclear_data (name, location, value, date, parent_id, type, id) VALUES ?`;
|
||
// `INSERT IGNORE INTO nuclear_data (name, location, value, date, parent_id, type, id) VALUES ?`;
|
||
|
||
// 将数据转换为二维数组,方便插入
|
||
const values = data.map(item => Object.values(item));
|
||
|
||
// let data = [
|
||
// {
|
||
// name: '红沿河核电厂',
|
||
// location: '复州城',
|
||
// value: '83 nGy/h',
|
||
// date: '2023-08-25',
|
||
// parent_id: '1',
|
||
// type: '1',
|
||
// id: '2410283902'
|
||
// },
|
||
// ];
|
||
|
||
// const values = [
|
||
// [
|
||
// "红沿河核电厂",
|
||
// "复州城",
|
||
// "83 nGy/h",
|
||
// "2023-08-25",
|
||
// "1",
|
||
// "1",
|
||
// "2410283902"
|
||
// ]
|
||
// ]
|
||
|
||
// 执行插入或替换数据的语句,传入values作为参数
|
||
connection.query(sql, [values], (err, result) => {
|
||
if (err) {
|
||
console.error(err);
|
||
} else {
|
||
console.log('成功');
|
||
// 关闭数据库连接
|
||
connection.end();
|
||
}
|
||
});
|
||
}
|
||
});
|
||
} |