From 9d3bfb934625dd2922c54d8c47c2d6eb5e53201d Mon Sep 17 00:00:00 2001
From: p94xcago5 <2609106649@qq.com>
Date: Thu, 13 Jun 2024 10:19:09 +0800
Subject: [PATCH] =?UTF-8?q?Delete=20'=E5=8D=97=E6=98=8C=E5=A4=A7=E5=AD=A6?=
 =?UTF-8?q?=E4=B8=93=E4=B8=9A=E5=BD=95=E5=8F=96=E5=88=86=E6=95=B0=E7=BA=BF?=
 =?UTF-8?q?.py'?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 南昌大学专业录取分数线.py | 157 ---------------------------
 1 file changed, 157 deletions(-)
 delete mode 100644 南昌大学专业录取分数线.py

diff --git a/南昌大学专业录取分数线.py b/南昌大学专业录取分数线.py
deleted file mode 100644
index 256463a..0000000
--- a/南昌大学专业录取分数线.py
+++ /dev/null
@@ -1,157 +0,0 @@
-import requests
-from lxml import etree
-import pymysql
-import re
-from bs4 import BeautifulSoup
-
-# MySQL database configuration
-db_config = {
-    'host': 'localhost',
-    'user': 'root',
-    'password': '091020',
-    'database': 'jdbc',
-    'charset': 'utf8mb4',
-    'cursorclass': pymysql.cursors.DictCursor  # this cursor returns results as a list of dicts, one per row, keyed by column name
-}
-
-# Crawl one year's page and save the data to MySQL
-def crawl_and_save(year, url):
-    datas = []
-    response = requests.get(url)
-    response.raise_for_status()  # check that the request succeeded
-    html = etree.HTML(response.text.encode('iso-8859-1').decode('utf-8'))  # re-decode the mislabeled response as UTF-8
-    htmlTxt = response.text.encode('iso-8859-1').decode('utf-8')
-
-    table = html.xpath('//table')[0]
-    # Extract the data
-    if year == 2020:
-        # use XPath to extract every row of the table
-        rows = table.xpath('.//tbody/tr')
-        # iterate over all rows and extract each row's data
-        for row in rows:
-            data = {
-                'year': year,
-                'category': row.xpath('.//td[1]/text()')[0] if row.xpath('.//td[1]') else '',
-                'ProDirection': row.xpath('.//td[2]/text()')[0] if row.xpath('.//td[2]') else '',
-                'FamCategories': row.xpath('.//td[3]/text()')[0] if row.xpath('.//td[3]') else '',
-                'EnrollmentPlan': row.xpath('.//td[4]/text()')[0] if row.xpath('.//td[4]') else '',
-                'ControlLine': row.xpath('.//td[5]/text()')[0] if row.xpath('.//td[5]') else '',
-                'AdmissionLine': row.xpath('.//td[6]/text()')[0] if row.xpath('.//td[6]') else '',
-                'ProAdmissionLine': row.xpath('.//td[7]/text()')[0] if row.xpath('.//td[7]') else '',
-                'ReferenceRanking': row.xpath('.//td[8]/text()')[0] if row.xpath('.//td[8]') else '',
-            }
-            if data['category'] == '类别':  # skip the header row ('类别' means "category")
-                continue
-            datas.append(data)
-
-    elif year == 2021:
-        # parse the HTML with BeautifulSoup
-        soup = BeautifulSoup(htmlTxt, 'html.parser')
-        # find the div element that contains the table
-        div = soup.find('div', class_='blog-content')
-        # locate the table and extract the information
-        table = div.find('table')
-        rows = table.find_all('tr')
-        for row in rows[2:]:  # skip the header and title rows
-            cells = row.find_all('td')
-            if len(cells) == 8:  # make sure the row has 8 cells
-                category, category_type, major, recruitment, control_line, file_line, major_line, rank = cells
-                data = {
-                    'year': year,
-                    'category': category.text,
-                    'FamCategories': category_type.text,
-                    'ProDirection': major.text,
-                    'EnrollmentPlan': recruitment.text,
-                    'ControlLine': control_line.text,
-                    'AdmissionLine': file_line.text,
-                    'ProAdmissionLine': major_line.text,
-                    'ReferenceRanking': rank.text,
-                }
-                datas.append(data)
-
-    elif year == 2022:
-        # regex pattern that captures the text inside each <td> cell
-        pattern = re.compile(r'<td.*?>(.*?)</td>', re.DOTALL)
-        # extract the information
-        rows = re.findall(r'<tr.*?>(.*?)</tr>', htmlTxt, flags=re.DOTALL)  # find all rows
-        for row in rows[2:]:  # skip the header and title rows
-            cells = pattern.findall(row)
-            if len(cells) == 7:
-                data = {
-                    'year': year,
-                    'category': None,
-                    'FamCategories': cells[0],
-                    'ProDirection': cells[1],
-                    'EnrollmentPlan': cells[2],
-                    'ControlLine': cells[3],
-                    'AdmissionLine': cells[4],
-                    'ProAdmissionLine': cells[5],
-                    'ReferenceRanking': cells[6],
-                }
-                datas.append(data)
-
-    elif year == 2023:
-        # use XPath to extract every row of the table
-        rows = table.xpath('.//tbody/tr')
-        # iterate over all rows and extract each row's data
-        for row in rows:
-            data = {
-                'year': year,
-                'category': None,
-                'FamCategories': row.xpath('.//td[1]/text()')[0] if row.xpath('.//td[1]') else '',
-                'ProDirection': row.xpath('.//td[2]/text()')[0] if row.xpath('.//td[2]') else '',
-                'EnrollmentPlan': row.xpath('.//td[3]/text()')[0] if row.xpath('.//td[3]') else '',
-                'ControlLine': row.xpath('.//td[4]/text()')[0] if row.xpath('.//td[4]') else '',
-                'AdmissionLine': row.xpath('.//td[5]/text()')[0] if row.xpath('.//td[5]') else '',
-                'ProAdmissionLine': row.xpath('.//td[6]/text()')[0] if row.xpath('.//td[6]') else '',
-                'ReferenceRanking': row.xpath('.//td[7]/text()')[0] if row.xpath('.//td[7]') else '',
-            }
-            if data['FamCategories'] == '科类':  # skip the header row ('科类' means "subject category")
-                continue
-            datas.append(data)
-
-    # save to MySQL
-    save_to_mysql(datas)
-
-
-# Save the scraped rows to MySQL
-def save_to_mysql(data_list):
-    connection = pymysql.connect(**db_config)
-    try:
-        with connection.cursor() as cursor:
-            # insert the data
-            insert_sql = """
-            INSERT INTO `enroll` (`year`, `category`, `ProDirection`, `FamCategories`, `EnrollmentPlan`, `ControlLine`, `AdmissionLine`, `ProAdmissionLine`, `ReferenceRanking`)
-            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
-            """
-            for data in data_list:
-                cursor.execute(insert_sql, (
-                    data['year'],
-                    data['category'],
-                    data['ProDirection'],
-                    data['FamCategories'],
-                    data['EnrollmentPlan'],
-                    data['ControlLine'],
-                    data['AdmissionLine'],
-                    data['ProAdmissionLine'],
-                    data['ReferenceRanking']
-                ))
-
-            # commit the transaction
-            connection.commit()
-    finally:
-        connection.close()
-
-def main():
-    years_urls = {
-        2020: 'https://zjc.ncu.edu.cn/zszx/lnfs/3e5d5e97e5924d1794fd0eba09b79bf2.htm',
-        2021: 'https://zjc.ncu.edu.cn/zszx/lnfs/47c901697a5549e998baf512a0c384f5.htm',
-        2022: 'https://zjc.ncu.edu.cn/zszx/lnfs/9ba4f80248874172af2937017620226b.htm',
-        2023: 'https://zjc.ncu.edu.cn/zszx/lnfs/5f3d9db4a5304265be3e31725a290b5c.htm'
-    }
-    # iterate over the years and their URLs, crawling and saving each year's data
-    for year, url in years_urls.items():
-        crawl_and_save(year, url)
-
-if __name__ == '__main__':
-    main()