"""Scrape Nanchang University per-major admission score lines into MySQL.

For each of the years 2020-2023 the script locates the Jiangxi-province
score-line article on the admissions site's listing pages, parses the table
on that article and inserts one row per table line into the ``enroll`` table.
Each year's article uses a different markup, hence one parser per year.
"""
import re
from urllib.parse import urljoin

import pymysql
import requests
from bs4 import BeautifulSoup
from lxml import etree

# MySQL connection settings.
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a config file that is not committed.
db_config = {
    'host': 'localhost',
    'user': 'root',
    'password': '091020',
    'database': 'jdbc',
    'charset': 'utf8mb4',
    # DictCursor returns each result row as a dict keyed by column name.
    'cursorclass': pymysql.cursors.DictCursor,
}

# Maps year -> article URL (or NOT_FOUND); filled in by find().
urls = {}

# Base that article hrefs found on the listing pages are resolved against.
LIST_BASE_URL = "https://zjc.ncu.edu.cn/zszx/lnfs/"
# Sentinel stored in ``urls`` when no matching link exists on the listing page.
NOT_FOUND = "not find"
# Seconds to wait for the server; without it requests can block forever.
REQUEST_TIMEOUT = 10

# Column order of the ``enroll`` INSERT statement.
_INSERT_COLUMNS = (
    'year',
    'category',
    'ProDirection',
    'FamCategories',
    'EnrollmentPlan',
    'ControlLine',
    'AdmissionLine',
    'ProAdmissionLine',
    'ReferenceRanking',
)

# Field held by each <td>, left to right, on the 2020 article.
_COLUMNS_2020 = (
    'category',
    'ProDirection',
    'FamCategories',
    'EnrollmentPlan',
    'ControlLine',
    'AdmissionLine',
    'ProAdmissionLine',
    'ReferenceRanking',
)

# Field held by each <td> on the 2021 article (columns 2 and 3 are swapped
# relative to 2020).
_COLUMNS_2021 = (
    'category',
    'FamCategories',
    'ProDirection',
    'EnrollmentPlan',
    'ControlLine',
    'AdmissionLine',
    'ProAdmissionLine',
    'ReferenceRanking',
)

# Field held by each <td> on the 2022 and 2023 articles (no category column).
_COLUMNS_NO_CATEGORY = (
    'FamCategories',
    'ProDirection',
    'EnrollmentPlan',
    'ControlLine',
    'AdmissionLine',
    'ProAdmissionLine',
    'ReferenceRanking',
)

# NOTE(review): the patterns in the reviewed source were the bare ``(.*?)``,
# which only ever produces empty matches, so the 2022 branch could never
# collect a row.  The <tr>/<td> delimiters are reconstructed here from the
# surrounding logic (row list, 7 cells per row) -- confirm against the page.
_ROW_RE = re.compile(r'<tr\b[^>]*>(.*?)</tr>', re.DOTALL | re.IGNORECASE)
_CELL_RE = re.compile(r'<td\b[^>]*>(.*?)</td>', re.DOTALL | re.IGNORECASE)


def find(Iurl, Iyear, Istr):
    """Locate the article link for one year and record it in ``urls``.

    Args:
        Iurl: URL of the listing page to search.
        Iyear: Year used as the key in ``urls``.
        Istr: Text that the link's string must contain.

    Raises:
        requests.RequestException: If the listing page cannot be fetched.

    Stores the resolved article URL under ``urls[Iyear]``, or ``NOT_FOUND``
    when no anchor matches or the matching anchor has no ``href``.
    """
    response = requests.get(Iurl, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, 'html.parser')
    # The callable receives the anchor's ``.string``; the isinstance() guard
    # rejects anchors for which that value is not a plain string (e.g. None).
    a_tag = soup.find(
        'a',
        string=lambda text: isinstance(text, str) and Istr in text,
    )
    href_value = a_tag.get('href') if a_tag else None
    if href_value:
        # urljoin handles relative, root-relative and absolute hrefs alike.
        urls[Iyear] = urljoin(LIST_BASE_URL, href_value)
    else:
        urls[Iyear] = NOT_FOUND


def _cell_text(row, index):
    """Return the first text node of the row's ``index``-th <td>, or ''."""
    texts = row.xpath(f'.//td[{index}]/text()')
    # A cell may exist but hold no direct text (empty, or text nested in a
    # child element); treat that the same as a missing cell.
    return texts[0] if texts else ''


def _parse_xpath_table(html_text, year, columns, header_field, header_label):
    """Parse the first <table> of the page with XPath.

    Args:
        html_text: Page markup.
        year: Value stored under ``'year'`` in every record.
        columns: Field name for each <td>, left to right.
        header_field: Field whose value identifies a header row.
        header_label: Header text; rows carrying it are skipped.

    Returns:
        A list of record dicts, one per data row.

    Raises:
        ValueError: If the page contains no table.
    """
    root = etree.HTML(html_text)
    tables = root.xpath('//table') if root is not None else []
    if not tables:
        raise ValueError(f"no <table> found on the {year} page")

    records = []
    for row in tables[0].xpath('.//tbody/tr'):
        # 'category' defaults to None for layouts without that column.
        record = {'year': year, 'category': None}
        for position, field in enumerate(columns, start=1):
            record[field] = _cell_text(row, position)
        if record[header_field].strip() == header_label:
            continue
        records.append(record)
    return records


def _parse_2020(html_text, year):
    """Parse the 2020 layout: eight columns, header row marked '类别'."""
    return _parse_xpath_table(html_text, year, _COLUMNS_2020, 'category', '类别')


def _parse_2021(html_text, year):
    """Parse the 2021 layout: table inside ``div.blog-content``.

    Raises:
        ValueError: If the container div or its table is missing.
    """
    soup = BeautifulSoup(html_text, 'html.parser')
    container = soup.find('div', class_='blog-content')
    table = container.find('table') if container else None
    if table is None:
        raise ValueError(f"no score table found on the {year} page")

    records = []
    # The first two rows are the title and the column headers.
    for row in table.find_all('tr')[2:]:
        cells = row.find_all('td')
        if len(cells) != len(_COLUMNS_2021):
            continue  # merged or partial row
        record = {'year': year}
        for field, cell in zip(_COLUMNS_2021, cells):
            record[field] = cell.text
        records.append(record)
    return records


def _parse_2022(html_text, year):
    """Parse the 2022 layout with regular expressions.

    Cell values are the raw markup between ``<td ...>`` and ``</td>``.
    """
    records = []
    # The first two rows are the title and the column headers.
    for row_markup in _ROW_RE.findall(html_text)[2:]:
        cells = _CELL_RE.findall(row_markup)
        if len(cells) != len(_COLUMNS_NO_CATEGORY):
            continue
        record = {'year': year, 'category': None}
        record.update(zip(_COLUMNS_NO_CATEGORY, cells))
        records.append(record)
    return records


def _parse_2023(html_text, year):
    """Parse the 2023 layout: seven columns, header row marked '科类'."""
    return _parse_xpath_table(
        html_text, year, _COLUMNS_NO_CATEGORY, 'FamCategories', '科类'
    )


# Year -> parser for that year's article markup.
_PARSERS = {
    2020: _parse_2020,
    2021: _parse_2021,
    2022: _parse_2022,
    2023: _parse_2023,
}


def crawl_and_save(year, url):
    """Download one year's article, parse its table and store the rows.

    Args:
        year: Year of the article; selects the parser.  Years without a
            parser produce no rows.
        url: URL of the article page.

    Raises:
        requests.RequestException: If the page cannot be fetched.
        ValueError: If the expected table is missing from the page.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.encoding = "utf-8"
    response.raise_for_status()  # fail loudly on HTTP errors

    parser = _PARSERS.get(year)
    datas = parser(response.text, year) if parser else []

    save_to_mysql(datas)


def save_to_mysql(data_list):
    """Insert the given records into the ``enroll`` table.

    Args:
        data_list: Iterable of dicts holding every key in ``_INSERT_COLUMNS``.

    All rows are sent in one transaction; nothing is committed if any insert
    fails.  An empty list is a no-op and opens no connection.
    """
    rows = [
        tuple(data[column] for column in _INSERT_COLUMNS)
        for data in data_list
    ]
    if not rows:
        return

    insert_sql = """
        INSERT INTO `enroll` (`year`, `category`, `ProDirection`, `FamCategories`, `EnrollmentPlan`, `ControlLine`, `AdmissionLine`, `ProAdmissionLine`, `ReferenceRanking`)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    connection = pymysql.connect(**db_config)
    try:
        with connection.cursor() as cursor:
            cursor.executemany(insert_sql, rows)
        connection.commit()
    finally:
        connection.close()


def main():
    """Resolve the article URL for each year, then crawl and store each one."""
    find("https://zjc.ncu.edu.cn/zszx/lnfs/index1.htm", 2020, "南昌大学2020年江西省专业录取分数线")
    find("https://zjc.ncu.edu.cn/zszx/lnfs/index1.htm", 2021, "南昌大学2021年江西省专业录取线")
    find("https://zjc.ncu.edu.cn/zszx/lnfs/index.htm", 2022, "2022年南昌大学江西省第一批本科专业录取线")
    find("https://zjc.ncu.edu.cn/zszx/lnfs/index.htm", 2023, "2023年南昌大学江西省第一批本科专业录取线")

    for year, url in urls.items():
        if url == NOT_FOUND:
            # The sentinel is not a URL; requesting it would raise.
            print(f"{year}: article link not found, skipping")
            continue
        crawl_and_save(year, url)


if __name__ == '__main__':
    main()