cx00000/南昌大学专业录取分数线.py

import requests
from lxml import etree
import pymysql
import re
from bs4 import BeautifulSoup

# MySQL database configuration
db_config = {
    'host': 'localhost',
    'user': 'root',
    'password': '091020',
    'database': 'jdbc',
    'charset': 'utf8mb4',
    'cursorclass': pymysql.cursors.DictCursor  # this cursor returns each result row as a dict keyed by column name
}
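
# For reference, a plausible schema for the target table (an assumption; the
# actual `enroll` DDL is not part of this script):
#
# CREATE TABLE `enroll` (
#     `year`             INT,
#     `category`         VARCHAR(50),   -- admission category
#     `ProDirection`     VARCHAR(100),  -- major / specialization
#     `FamCategories`    VARCHAR(50),   -- subject category
#     `EnrollmentPlan`   VARCHAR(50),
#     `ControlLine`      VARCHAR(50),
#     `AdmissionLine`    VARCHAR(50),
#     `ProAdmissionLine` VARCHAR(50),
#     `ReferenceRanking` VARCHAR(50)
# ) CHARACTER SET utf8mb4;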
# Crawl one year's admission page and save the extracted rows to MySQL
def crawl_and_save(year, url):
    datas = []
    response = requests.get(url)
    response.raise_for_status()  # raise an exception if the request failed
    # requests falls back to iso-8859-1 when the server declares no charset;
    # re-encode and decode to recover the UTF-8 Chinese text
    htmlTxt = response.text.encode('iso-8859-1').decode('utf-8')
    html = etree.HTML(htmlTxt)
    table = html.xpath('//table')[0]
    # Extract the data; each year's page uses a slightly different layout
    if year == 2020:
        # Use XPath to pull every row out of the table body
        rows = table.xpath('.//tbody/tr')
        # Walk the rows and collect the fields from each one
        for row in rows:
            data = {
                'year': year,
                'category': row.xpath('.//td[1]/text()')[0] if row.xpath('.//td[1]') else '',
                'ProDirection': row.xpath('.//td[2]/text()')[0] if row.xpath('.//td[2]') else '',
                'FamCategories': row.xpath('.//td[3]/text()')[0] if row.xpath('.//td[3]') else '',
                'EnrollmentPlan': row.xpath('.//td[4]/text()')[0] if row.xpath('.//td[4]') else '',
                'ControlLine': row.xpath('.//td[5]/text()')[0] if row.xpath('.//td[5]') else '',
                'AdmissionLine': row.xpath('.//td[6]/text()')[0] if row.xpath('.//td[6]') else '',
                'ProAdmissionLine': row.xpath('.//td[7]/text()')[0] if row.xpath('.//td[7]') else '',
                'ReferenceRanking': row.xpath('.//td[8]/text()')[0] if row.xpath('.//td[8]') else '',
            }
            if data['category'] == '类别':  # skip the header row
                continue
            datas.append(data)
    elif year == 2021:
        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(htmlTxt, 'html.parser')
        # Find the div that contains the table
        div = soup.find('div', class_='blog-content')
        # Locate the table and extract its rows
        table = div.find('table')
        rows = table.find_all('tr')
        for row in rows[2:]:  # skip the caption and header rows
            cells = row.find_all('td')
            if len(cells) == 8:  # keep only rows with exactly 8 cells
                category, category_type, major, recruitment, control_line, file_line, major_line, rank = cells
                data = {
                    'year': year,
                    'category': category.text,
                    'FamCategories': category_type.text,
                    'ProDirection': major.text,
                    'EnrollmentPlan': recruitment.text,
                    'ControlLine': control_line.text,
                    'AdmissionLine': file_line.text,
                    'ProAdmissionLine': major_line.text,
                    'ReferenceRanking': rank.text,
                }
                datas.append(data)
    elif year == 2022:
        # Regular-expression pattern for a table cell (DOTALL so cells that
        # span multiple lines still match)
        pattern = re.compile(r'<td.*?>(.*?)</td>', flags=re.DOTALL)
        # Find all table rows, then pull the cells out of each row
        rows = re.findall(r'<tr.*?>(.*?)</tr>', htmlTxt, flags=re.DOTALL)
        for row in rows[2:]:  # skip the caption and header rows
            cells = pattern.findall(row)
            if len(cells) == 7:
                data = {
                    'year': year,
                    'category': None,  # the 2022 table has no category column
                    'FamCategories': cells[0],
                    'ProDirection': cells[1],
                    'EnrollmentPlan': cells[2],
                    'ControlLine': cells[3],
                    'AdmissionLine': cells[4],
                    'ProAdmissionLine': cells[5],
                    'ReferenceRanking': cells[6],
                }
                datas.append(data)
    elif year == 2023:
        # Use XPath to pull every row out of the table body
        rows = table.xpath('.//tbody/tr')
        # Walk the rows and collect the fields from each one
        for row in rows:
            data = {
                'year': year,
                'category': None,  # the 2023 table has no category column
                'FamCategories': row.xpath('.//td[1]/text()')[0] if row.xpath('.//td[1]') else '',
                'ProDirection': row.xpath('.//td[2]/text()')[0] if row.xpath('.//td[2]') else '',
                'EnrollmentPlan': row.xpath('.//td[3]/text()')[0] if row.xpath('.//td[3]') else '',
                'ControlLine': row.xpath('.//td[4]/text()')[0] if row.xpath('.//td[4]') else '',
                'AdmissionLine': row.xpath('.//td[5]/text()')[0] if row.xpath('.//td[5]') else '',
                'ProAdmissionLine': row.xpath('.//td[6]/text()')[0] if row.xpath('.//td[6]') else '',
                'ReferenceRanking': row.xpath('.//td[7]/text()')[0] if row.xpath('.//td[7]') else '',
            }
            if data['FamCategories'] == '科类':  # skip the header row
                continue
            datas.append(data)
    # Save everything to MySQL
    save_to_mysql(datas)
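
# Note: text scraped this way often carries stray whitespace or non-breaking
# spaces (\xa0). A small helper along these lines (an optional sketch, not
# part of the original script) could normalize each value before insertion:
def clean_cell(value):
    """Strip ordinary and non-breaking whitespace from a scraped cell value."""
    return value.replace('\xa0', ' ').strip() if isinstance(value, str) else value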
# Save a list of row dicts to MySQL
def save_to_mysql(data_list):
    connection = pymysql.connect(**db_config)
    try:
        with connection.cursor() as cursor:
            # Insert the rows one by one
            insert_sql = """
                INSERT INTO `enroll` (`year`, `category`, `ProDirection`, `FamCategories`, `EnrollmentPlan`, `ControlLine`, `AdmissionLine`, `ProAdmissionLine`, `ReferenceRanking`)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
            """
            for data in data_list:
                cursor.execute(insert_sql, (
                    data['year'],
                    data['category'],
                    data['ProDirection'],
                    data['FamCategories'],
                    data['EnrollmentPlan'],
                    data['ControlLine'],
                    data['AdmissionLine'],
                    data['ProAdmissionLine'],
                    data['ReferenceRanking']
                ))
        # Commit the transaction
        connection.commit()
    finally:
        connection.close()
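
# An alternative sketch (assuming the same `enroll` schema as above): batch
# all rows in a single round trip with executemany instead of per-row
# execute calls.
def save_to_mysql_batch(data_list):
    connection = pymysql.connect(**db_config)
    try:
        with connection.cursor() as cursor:
            insert_sql = """
                INSERT INTO `enroll` (`year`, `category`, `ProDirection`, `FamCategories`, `EnrollmentPlan`, `ControlLine`, `AdmissionLine`, `ProAdmissionLine`, `ReferenceRanking`)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
            """
            cursor.executemany(insert_sql, [
                (d['year'], d['category'], d['ProDirection'], d['FamCategories'],
                 d['EnrollmentPlan'], d['ControlLine'], d['AdmissionLine'],
                 d['ProAdmissionLine'], d['ReferenceRanking'])
                for d in data_list
            ])
        connection.commit()
    finally:
        connection.close()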
def main():
    years_urls = {
        2020: 'https://zjc.ncu.edu.cn/zszx/lnfs/3e5d5e97e5924d1794fd0eba09b79bf2.htm',
        2021: 'https://zjc.ncu.edu.cn/zszx/lnfs/47c901697a5549e998baf512a0c384f5.htm',
        2022: 'https://zjc.ncu.edu.cn/zszx/lnfs/9ba4f80248874172af2937017620226b.htm',
        2023: 'https://zjc.ncu.edu.cn/zszx/lnfs/5f3d9db4a5304265be3e31725a290b5c.htm'
    }
    # Crawl and save the data for each year's URL
    for year, url in years_urls.items():
        crawl_and_save(year, url)

if __name__ == '__main__':
    main()