|
|
|
|
import os
import re

import pymysql
import requests
from bs4 import BeautifulSoup
from lxml import etree
|
|
|
|
|
|
|
|
|
|
# MySQL connection settings passed verbatim to ``pymysql.connect``.
# Each credential can be overridden through an ENROLL_DB_* environment
# variable so secrets need not live in source control; the literals are the
# fallbacks used when the variable is unset.
db_config = {
    'host': os.environ.get('ENROLL_DB_HOST', 'localhost'),
    'user': os.environ.get('ENROLL_DB_USER', 'root'),
    # NOTE(review): hard-coded fallback password kept for backward
    # compatibility; set ENROLL_DB_PASSWORD and drop this default.
    'password': os.environ.get('ENROLL_DB_PASSWORD', '091020'),
    'database': os.environ.get('ENROLL_DB_NAME', 'jdbc'),
    'charset': 'utf8mb4',
    # DictCursor returns each result row as a dict keyed by column name.
    'cursorclass': pymysql.cursors.DictCursor,
}
|
|
|
|
|
|
|
|
|
|
# Per-year mapping from <td> position to record field, in reading order.
# NOTE(review): the 2020 layout maps the 2nd/3rd cells to
# ProDirection/FamCategories while every other year maps them the other way
# round; confirm against the live 2020 page.
_FIELDS_2020 = (
    'category', 'ProDirection', 'FamCategories', 'EnrollmentPlan',
    'ControlLine', 'AdmissionLine', 'ProAdmissionLine', 'ReferenceRanking',
)
_FIELDS_2021 = (
    'category', 'FamCategories', 'ProDirection', 'EnrollmentPlan',
    'ControlLine', 'AdmissionLine', 'ProAdmissionLine', 'ReferenceRanking',
)
# Layout shared by 2022 and 2023, whose rows carry no category cell.
_FIELDS_NO_CATEGORY = (
    'FamCategories', 'ProDirection', 'EnrollmentPlan', 'ControlLine',
    'AdmissionLine', 'ProAdmissionLine', 'ReferenceRanking',
)

# Patterns used for the 2022 page. The cell pattern has no DOTALL flag, so a
# cell whose content spans several lines is not matched.
_ROW_RE = re.compile(r'<tr.*?>(.*?)</tr>', flags=re.DOTALL)
_CELL_RE = re.compile(r'<td.*?>(.*?)</td>')


def _make_record(year, fields, values):
    """Build one record dict; ``category`` stays None unless ``fields`` sets it."""
    record = {'year': year, 'category': None}
    record.update(zip(fields, values))
    return record


def _first_cell_text(row, index):
    """Return the first direct text node of the ``index``-th <td>, or ''.

    An absent cell and a cell without direct text both yield '' instead of
    raising IndexError.
    """
    texts = row.xpath('.//td[{}]/text()'.format(index))
    return texts[0] if texts else ''


def _parse_xpath_table(year, html_text, fields, header_text, url):
    """Extract records from the first <table> of the page using lxml/XPath.

    Rows whose first cell equals ``header_text`` are treated as header rows
    and skipped.
    """
    root = etree.HTML(html_text)
    tables = root.xpath('//table') if root is not None else []
    if not tables:
        raise ValueError('no <table> element found at {}'.format(url))

    records = []
    for row in tables[0].xpath('.//tbody/tr'):
        values = [
            _first_cell_text(row, index)
            for index in range(1, len(fields) + 1)
        ]
        if values[0] == header_text:
            continue
        records.append(_make_record(year, fields, values))
    return records


def _parse_soup_table(year, html_text, url):
    """Extract records from the table inside ``div.blog-content`` (2021 page)."""
    soup = BeautifulSoup(html_text, 'html.parser')
    container = soup.find('div', class_='blog-content')
    table = container.find('table') if container is not None else None
    if table is None:
        raise ValueError(
            'no table inside div.blog-content found at {}'.format(url)
        )

    records = []
    # The first two rows are skipped as title/header rows.
    for row in table.find_all('tr')[2:]:
        cells = row.find_all('td')
        if len(cells) != len(_FIELDS_2021):
            continue  # only rows with exactly eight cells are data rows
        values = [cell.text for cell in cells]
        records.append(_make_record(year, _FIELDS_2021, values))
    return records


def _parse_regex_rows(year, html_text):
    """Extract records from every <tr> in the page with regexes (2022 page)."""
    records = []
    # The first two rows are skipped as title/header rows.
    for row_html in _ROW_RE.findall(html_text)[2:]:
        cells = _CELL_RE.findall(row_html)
        if len(cells) != len(_FIELDS_NO_CATEGORY):
            continue  # only rows with exactly seven cells are data rows
        records.append(_make_record(year, _FIELDS_NO_CATEGORY, cells))
    return records


# Crawl one year's page and save the extracted rows to MySQL.
def crawl_and_save(year, url):
    """Download ``url``, parse the score table for ``year`` and store it.

    Each of the years 2020-2023 has its own parser; for any other year the
    page is still fetched but no rows are extracted.

    Args:
        year: Admission year; selects the parser and is stored in each row.
        url: Address of the page to download.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        UnicodeDecodeError: If the response body is not valid UTF-8.
        ValueError: If the table the parser expects is not in the page.
    """
    # A timeout keeps an unresponsive server from hanging the crawl forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Decode the raw bytes directly; round-tripping ``response.text`` through
    # ISO-8859-1 only works while the server declares no charset.
    html_text = response.content.decode('utf-8')

    if year == 2020:
        records = _parse_xpath_table(
            year, html_text, _FIELDS_2020, '类别', url)
    elif year == 2021:
        records = _parse_soup_table(year, html_text, url)
    elif year == 2022:
        records = _parse_regex_rows(year, html_text)
    elif year == 2023:
        records = _parse_xpath_table(
            year, html_text, _FIELDS_NO_CATEGORY, '科类', url)
    else:
        records = []

    save_to_mysql(records)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Save extracted rows to MySQL.
def save_to_mysql(data_list):
    """Insert every record of ``data_list`` into the ``enroll`` table.

    All rows are sent in one ``executemany`` call and committed together.
    The connection is closed without a commit if the insert fails.

    Args:
        data_list: Iterable of dicts holding the keys ``year``, ``category``,
            ``ProDirection``, ``FamCategories``, ``EnrollmentPlan``,
            ``ControlLine``, ``AdmissionLine``, ``ProAdmissionLine`` and
            ``ReferenceRanking``.

    Raises:
        KeyError: If a record lacks one of the keys above (raised before any
            database connection is opened).
    """
    columns = (
        'year', 'category', 'ProDirection', 'FamCategories',
        'EnrollmentPlan', 'ControlLine', 'AdmissionLine',
        'ProAdmissionLine', 'ReferenceRanking',
    )
    rows = [tuple(data[column] for column in columns) for data in data_list]
    if not rows:
        # Nothing to store: do not open a database connection at all.
        return

    insert_sql = """
        INSERT INTO `enroll` (`year`, `category`, `ProDirection`, `FamCategories`, `EnrollmentPlan`, `ControlLine`, `AdmissionLine`, `ProAdmissionLine`, `ReferenceRanking`)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
    """

    connection = pymysql.connect(**db_config)
    try:
        with connection.cursor() as cursor:
            cursor.executemany(insert_sql, rows)
        # Commit the transaction.
        connection.commit()
    finally:
        connection.close()
|
|
|
|
|
|
|
|
|
|
def main():
    """Crawl each configured admission-score page and store its rows."""
    # (year, page URL) pairs, processed in this order.
    pages = (
        (2020, 'https://zjc.ncu.edu.cn/zszx/lnfs/3e5d5e97e5924d1794fd0eba09b79bf2.htm'),
        (2021, 'https://zjc.ncu.edu.cn/zszx/lnfs/47c901697a5549e998baf512a0c384f5.htm'),
        (2022, 'https://zjc.ncu.edu.cn/zszx/lnfs/9ba4f80248874172af2937017620226b.htm'),
        (2023, 'https://zjc.ncu.edu.cn/zszx/lnfs/5f3d9db4a5304265be3e31725a290b5c.htm'),
    )
    for page_year, page_url in pages:
        crawl_and_save(page_year, page_url)
|
|
|
|
|
|
|
|
|
|
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|