From 4e92cd75115027b5b2d85b551bae2263f0288167 Mon Sep 17 00:00:00 2001
From: p94xcago5 <2609106649@qq.com>
Date: Thu, 13 Jun 2024 08:44:55 +0800
Subject: [PATCH] ADD file via upload

---
 南昌大学专业录取分数线.py | 157 +++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 南昌大学专业录取分数线.py

diff --git a/南昌大学专业录取分数线.py b/南昌大学专业录取分数线.py
new file mode 100644
index 0000000..256463a
--- /dev/null
+++ b/南昌大学专业录取分数线.py
@@ -0,0 +1,157 @@
+import requests
+from lxml import etree
+import pymysql
+import re
+from bs4 import BeautifulSoup
+
+# MySQL database configuration
+db_config = {
+    'host': 'localhost',
+    'user': 'root',
+    'password': '091020',
+    'database': 'jdbc',
+    'charset': 'utf8mb4',
+    'cursorclass': pymysql.cursors.DictCursor  # DictCursor returns each result row as a dict keyed by column name
+}
+
+# Crawl one year's admission page and save its rows to MySQL
+def crawl_and_save(year, url):
+    datas = []
+    response = requests.get(url)
+    response.raise_for_status()  # raise if the request failed
+    html = etree.HTML(response.text.encode('iso-8859-1').decode('utf-8'))  # the page is UTF-8 but requests guesses ISO-8859-1, so re-decode the raw bytes
+    htmlTxt = response.text.encode('iso-8859-1').decode('utf-8')
+
+    table = html.xpath('//table')[0]
+    # Extract the data; each year's page uses a different layout
+    if year == 2020:
+        # use XPath to grab every row of the table
+        rows = table.xpath('.//tbody/tr')
+        # walk the rows and pull out each cell
+        for row in rows:
+            data = {
+                'year': year,
+                'category': row.xpath('.//td[1]/text()')[0] if row.xpath('.//td[1]') else '',
+                'ProDirection': row.xpath('.//td[2]/text()')[0] if row.xpath('.//td[2]') else '',
+                'FamCategories': row.xpath('.//td[3]/text()')[0] if row.xpath('.//td[3]') else '',
+                'EnrollmentPlan': row.xpath('.//td[4]/text()')[0] if row.xpath('.//td[4]') else '',
+                'ControlLine': row.xpath('.//td[5]/text()')[0] if row.xpath('.//td[5]') else '',
+                'AdmissionLine': row.xpath('.//td[6]/text()')[0] if row.xpath('.//td[6]') else '',
+                'ProAdmissionLine': row.xpath('.//td[7]/text()')[0] if row.xpath('.//td[7]') else '',
+                'ReferenceRanking': row.xpath('.//td[8]/text()')[0] if row.xpath('.//td[8]') else '',
+            }
+            if data['category'] == '类别':  # '类别' marks the header row, so skip it
+                continue
+            datas.append(data)
+
+    elif year == 2021:
+        # parse the HTML with BeautifulSoup
+        soup = BeautifulSoup(htmlTxt, 'html.parser')
+        # find the div that wraps the table
+        div = soup.find('div', class_='blog-content')
+        # locate the table and extract its rows
+        table = div.find('table')
+        rows = table.find_all('tr')
+        for row in rows[2:]:  # skip the header and title rows
+            cells = row.find_all('td')
+            if len(cells) == 8:  # make sure the row has 8 cells
+                category, category_type, major, recruitment, control_line, file_line, major_line, rank = cells
+                data = {
+                    'year': year,
+                    'category': category.text,
+                    'FamCategories': category_type.text,
+                    'ProDirection': major.text,
+                    'EnrollmentPlan': recruitment.text,
+                    'ControlLine': control_line.text,
+                    'AdmissionLine': file_line.text,
+                    'ProAdmissionLine': major_line.text,
+                    'ReferenceRanking': rank.text,
+                }
+                datas.append(data)
+
+    elif year == 2022:
+        # regex pattern for one table cell
+        pattern = re.compile(r'<td[^>]*>(.*?)</td>', re.DOTALL)
+        # extract the rows
+        rows = re.findall(r'<tr[^>]*>(.*?)</tr>', htmlTxt, flags=re.DOTALL)  # find every row
+        for row in rows[2:]:  # skip the header and title rows
+            cells = pattern.findall(row)
+            if len(cells) == 7:
+                data = {
+                    'year': year,
+                    'category': None,
+                    'FamCategories': cells[0],
+                    'ProDirection': cells[1],
+                    'EnrollmentPlan': cells[2],
+                    'ControlLine': cells[3],
+                    'AdmissionLine': cells[4],
+                    'ProAdmissionLine': cells[5],
+                    'ReferenceRanking': cells[6],
+                }
+                datas.append(data)
+
+    elif year == 2023:
+        # use XPath to grab every row of the table
+        rows = table.xpath('.//tbody/tr')
+        # walk the rows and pull out each cell
+        for row in rows:
+            data = {
+                'year': year,
+                'category': None,
+                'FamCategories': row.xpath('.//td[1]/text()')[0] if row.xpath('.//td[1]') else '',
+                'ProDirection': row.xpath('.//td[2]/text()')[0] if row.xpath('.//td[2]') else '',
+                'EnrollmentPlan': row.xpath('.//td[3]/text()')[0] if row.xpath('.//td[3]') else '',
+                'ControlLine': row.xpath('.//td[4]/text()')[0] if row.xpath('.//td[4]') else '',
+                'AdmissionLine': row.xpath('.//td[5]/text()')[0] if row.xpath('.//td[5]') else '',
+                'ProAdmissionLine': row.xpath('.//td[6]/text()')[0] if row.xpath('.//td[6]') else '',
+                'ReferenceRanking': row.xpath('.//td[7]/text()')[0] if row.xpath('.//td[7]') else '',
+            }
+            if data['FamCategories'] == '科类':  # '科类' marks the header row, so skip it
+                continue
+            datas.append(data)
+
+    # Save to MySQL
+    save_to_mysql(datas)
+
+
+# Write the collected rows to MySQL
+def save_to_mysql(data_list):
+    connection = pymysql.connect(**db_config)
+    try:
+        with connection.cursor() as cursor:
+            # parameterized insert into the enroll table
+            insert_sql = """
+            INSERT INTO `enroll` (`year`, `category`, `ProDirection`, `FamCategories`, `EnrollmentPlan`, `ControlLine`, `AdmissionLine`, `ProAdmissionLine`, `ReferenceRanking`)
+            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
+            """
+            for data in data_list:
+                cursor.execute(insert_sql, (
+                    data['year'],
+                    data['category'],
+                    data['ProDirection'],
+                    data['FamCategories'],
+                    data['EnrollmentPlan'],
+                    data['ControlLine'],
+                    data['AdmissionLine'],
+                    data['ProAdmissionLine'],
+                    data['ReferenceRanking']
+                ))
+
+        # commit the transaction
+        connection.commit()
+    finally:
+        connection.close()
+
+def main():
+    years_urls = {
+        2020: 'https://zjc.ncu.edu.cn/zszx/lnfs/3e5d5e97e5924d1794fd0eba09b79bf2.htm',
+        2021: 'https://zjc.ncu.edu.cn/zszx/lnfs/47c901697a5549e998baf512a0c384f5.htm',
+        2022: 'https://zjc.ncu.edu.cn/zszx/lnfs/9ba4f80248874172af2937017620226b.htm',
+        2023: 'https://zjc.ncu.edu.cn/zszx/lnfs/5f3d9db4a5304265be3e31725a290b5c.htm'
+    }
+    # crawl and save each year's page
+    for year, url in years_urls.items():
+        crawl_and_save(year, url)

+if __name__ == '__main__':
+    main()
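
Note: the INSERT statement above assumes an `enroll` table already exists in the `jdbc` database, but the patch ships no DDL for it. Below is a minimal sketch of a compatible schema, created through pymysql; the column names are taken from the INSERT statement, while the column types and the surrogate `id` key are assumptions, since every scraped value arrives as text.

import pymysql

# Hypothetical schema matching the columns the script inserts; types are guesses,
# with VARCHAR everywhere because the scraper stores raw cell text.
DDL = """
CREATE TABLE IF NOT EXISTS `enroll` (
    `id` INT AUTO_INCREMENT PRIMARY KEY,  -- assumed surrogate key, not in the patch
    `year` INT,
    `category` VARCHAR(64),
    `ProDirection` VARCHAR(128),
    `FamCategories` VARCHAR(64),
    `EnrollmentPlan` VARCHAR(32),
    `ControlLine` VARCHAR(32),
    `AdmissionLine` VARCHAR(32),
    `ProAdmissionLine` VARCHAR(32),
    `ReferenceRanking` VARCHAR(32)
) CHARACTER SET utf8mb4
"""

def create_table():
    # same connection settings as the script's db_config
    connection = pymysql.connect(host='localhost', user='root',
                                 password='091020', database='jdbc',
                                 charset='utf8mb4')
    try:
        with connection.cursor() as cursor:
            cursor.execute(DDL)
        connection.commit()
    finally:
        connection.close()

if __name__ == '__main__':
    create_table()

Running this once before the scraper gives the INSERTs a table to land in; adjusting the VARCHAR widths to the real data is left to the author.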