diff --git a/南昌大学专业录取分数线.py b/南昌大学专业录取分数线.py
new file mode 100644
index 0000000..256463a
--- /dev/null
+++ b/南昌大学专业录取分数线.py
@@ -0,0 +1,157 @@
+import requests
+from lxml import etree
+import pymysql
+import re
+from bs4 import BeautifulSoup
+
# MySQL connection settings; unpacked as keyword arguments into
# pymysql.connect() when rows are saved.
# NOTE(review): the credentials are hard-coded in source - consider loading
# them from the environment or a config file that is not committed.
db_config = dict(
    host='localhost',
    user='root',
    password='091020',
    database='jdbc',
    charset='utf8mb4',
    # DictCursor returns each result row as a dict keyed by column name.
    cursorclass=pymysql.cursors.DictCursor,
)
+
# Crawl one year's admission-score page and save the parsed rows to MySQL.

# Seconds to wait for the server before a request is abandoned.
_REQUEST_TIMEOUT = 30

# Layouts of the pages that are parsed with XPath.  For each year:
#   (record keys in <td> order, key holding the header marker, header text).
# A row whose marker cell equals the header text is the table header and is
# skipped.
_XPATH_LAYOUTS = {
    2020: (
        (
            'category',
            'ProDirection',
            'FamCategories',
            'EnrollmentPlan',
            'ControlLine',
            'AdmissionLine',
            'ProAdmissionLine',
            'ReferenceRanking',
        ),
        'category',
        '类别',
    ),
    2023: (
        (
            'FamCategories',
            'ProDirection',
            'EnrollmentPlan',
            'ControlLine',
            'AdmissionLine',
            'ProAdmissionLine',
            'ReferenceRanking',
        ),
        'FamCategories',
        '科类',
    ),
}

# Record keys, in cell order, of the 8-cell rows on the 2021 page.
_KEYS_2021 = (
    'category',
    'FamCategories',
    'ProDirection',
    'EnrollmentPlan',
    'ControlLine',
    'AdmissionLine',
    'ProAdmissionLine',
    'ReferenceRanking',
)

# Record keys, in cell order, of the 7-cell rows on the 2022 page.
_KEYS_2022 = (
    'FamCategories',
    'ProDirection',
    'EnrollmentPlan',
    'ControlLine',
    'AdmissionLine',
    'ProAdmissionLine',
    'ReferenceRanking',
)

# NOTE(review): the original row/cell patterns were lost - the HTML tags
# inside the regex literals had been stripped, leaving an unterminated string
# and a pattern that only matched empty strings.  The <tr>/<td> patterns below
# are a reconstruction; verify them against the live 2022 page.
_ROW_RE = re.compile(r'<tr\b[^>]*>(.*?)</tr>', re.DOTALL | re.IGNORECASE)
_CELL_RE = re.compile(r'<td\b[^>]*>(.*?)</td>', re.DOTALL | re.IGNORECASE)


def _first_text(node, path):
    """Return the first result of XPath *path* under *node*, or '' if none."""
    found = node.xpath(path)
    return found[0] if found else ''


def _parse_xpath_rows(tree, year):
    """Extract records from the first <table> of an lxml tree (2020/2023)."""
    keys, marker_key, header_text = _XPATH_LAYOUTS[year]
    # Raises IndexError when the page contains no <table> at all.
    table = tree.xpath('//table')[0]
    records = []
    for row in table.xpath('.//tbody/tr'):
        # 'category' stays None for layouts that have no such column.
        record = {'year': year, 'category': None}
        for position, key in enumerate(keys, start=1):
            record[key] = _first_text(row, './/td[%d]/text()' % position)
        if record[marker_key] == header_text:
            continue  # header row, not data
        records.append(record)
    return records


def _parse_2021_rows(html_text, year):
    """Extract records from the table inside div.blog-content (2021)."""
    soup = BeautifulSoup(html_text, 'html.parser')
    # AttributeError is raised below if the div or its table is missing.
    container = soup.find('div', class_='blog-content')
    table = container.find('table')
    records = []
    for row in table.find_all('tr')[2:]:  # skip the title and header rows
        cells = row.find_all('td')
        if len(cells) != 8:  # only full 8-cell rows carry data
            continue
        record = {'year': year}
        record.update(zip(_KEYS_2021, (cell.text for cell in cells)))
        records.append(record)
    return records


def _parse_2022_rows(html_text, year):
    """Extract records from the raw HTML with regular expressions (2022)."""
    records = []
    for row in _ROW_RE.findall(html_text)[2:]:  # skip title and header rows
        # Surrounding whitespace is trimmed because a cell's markup may span
        # several lines (assumption - see the note on the patterns above).
        cells = [cell.strip() for cell in _CELL_RE.findall(row)]
        if len(cells) != 7:  # only full 7-cell rows carry data
            continue
        record = {'year': year, 'category': None}
        record.update(zip(_KEYS_2022, cells))
        records.append(record)
    return records


def crawl_and_save(year, url):
    """Download the score page at *url*, parse it and store its rows.

    The parsing strategy depends on *year*: 2020 and 2023 use XPath, 2021
    uses BeautifulSoup and 2022 uses regular expressions.  For any other year
    nothing is parsed and an empty list is passed to ``save_to_mysql``.

    Args:
        year: Admission year; selects the parser and is stored in each record.
        url: Address of the page to download.

    Raises:
        requests.RequestException: The request failed, timed out or returned
            an HTTP error status.
        UnicodeDecodeError: The response body is not valid UTF-8.
        IndexError: A 2020/2023 page contains no ``<table>``.
        AttributeError: The 2021 page lacks the expected div or table.
    """
    # A timeout keeps an unresponsive server from blocking the crawl forever.
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()  # fail on HTTP error statuses

    # Decode the raw bytes directly.  Re-encoding ``response.text`` as
    # ISO-8859-1 only works when requests guessed exactly that charset and
    # raises UnicodeEncodeError otherwise.
    html_text = response.content.decode('utf-8')

    if year in _XPATH_LAYOUTS:
        datas = _parse_xpath_rows(etree.HTML(html_text), year)
    elif year == 2021:
        datas = _parse_2021_rows(html_text, year)
    elif year == 2022:
        datas = _parse_2022_rows(html_text, year)
    else:
        datas = []

    # Persist whatever was parsed.
    save_to_mysql(datas)
+
+
# Persist parsed admission records into the MySQL `enroll` table.
def save_to_mysql(data_list):
    """Insert every record of *data_list* into the ``enroll`` table.

    Args:
        data_list: Iterable of dicts, each providing the keys ``year``,
            ``category``, ``ProDirection``, ``FamCategories``,
            ``EnrollmentPlan``, ``ControlLine``, ``AdmissionLine``,
            ``ProAdmissionLine`` and ``ReferenceRanking``.

    Returns:
        None.  When *data_list* is empty no database connection is opened.

    Raises:
        KeyError: A record lacks one of the required keys.
    """
    if not data_list:
        # Nothing to store: do not open a connection just to commit nothing.
        return

    columns = (
        'year',
        'category',
        'ProDirection',
        'FamCategories',
        'EnrollmentPlan',
        'ControlLine',
        'AdmissionLine',
        'ProAdmissionLine',
        'ReferenceRanking',
    )
    # Build the parameter tuples before connecting so that a malformed record
    # fails fast, without touching the database.
    rows = [tuple(data[column] for column in columns) for data in data_list]

    insert_sql = """
    INSERT INTO `enroll` (`year`, `category`, `ProDirection`, `FamCategories`, `EnrollmentPlan`, `ControlLine`, `AdmissionLine`, `ProAdmissionLine`, `ReferenceRanking`)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
    """

    connection = pymysql.connect(**db_config)
    try:
        with connection.cursor() as cursor:
            # One bulk call replaces the per-row execute() loop.
            cursor.executemany(insert_sql, rows)

        # Commit the transaction once all rows are inserted.
        connection.commit()
    finally:
        connection.close()
+
def main():
    """Crawl the score page of every configured year and store its rows."""
    # (admission year, page URL) pairs, processed in the order listed.
    pages = (
        (2020, 'https://zjc.ncu.edu.cn/zszx/lnfs/3e5d5e97e5924d1794fd0eba09b79bf2.htm'),
        (2021, 'https://zjc.ncu.edu.cn/zszx/lnfs/47c901697a5549e998baf512a0c384f5.htm'),
        (2022, 'https://zjc.ncu.edu.cn/zszx/lnfs/9ba4f80248874172af2937017620226b.htm'),
        (2023, 'https://zjc.ncu.edu.cn/zszx/lnfs/5f3d9db4a5304265be3e31725a290b5c.htm'),
    )
    for page_year, page_url in pages:
        crawl_and_save(page_year, page_url)


# Run the crawl only when the file is executed as a script.
if __name__ == '__main__':
    main()