From e5cb8802777d623b87065c1468659b44e15146c6 Mon Sep 17 00:00:00 2001
From: p94xcago5 <2609106649@qq.com>
Date: Thu, 13 Jun 2024 10:19:36 +0800
Subject: [PATCH] ADD file via upload

---
 南昌大学专业录取分数线.py | 169 +++++++++++++++++++++++++++
 1 file changed, 169 insertions(+)
 create mode 100644 南昌大学专业录取分数线.py

diff --git a/南昌大学专业录取分数线.py b/南昌大学专业录取分数线.py
new file mode 100644
index 0000000..39aaeb1
--- /dev/null
+++ b/南昌大学专业录取分数线.py
@@ -0,0 +1,169 @@
+import requests
+from lxml import etree
+import pymysql
+import re
+from bs4 import BeautifulSoup
+
+# MySQL database configuration
+db_config = {
+    'host': 'localhost',
+    'user': 'root',
+    'password': '091020',
+    'database': 'jdbc',
+    'charset': 'utf8mb4',
+    'cursorclass': pymysql.cursors.DictCursor  # DictCursor returns each row as a dict keyed by column name
+}
+urls = {}
+
+def find(Iurl, Iyear, Istr):
+    # Fetch the index page and record the link for the given year.
+    response = requests.get(Iurl)
+    response.encoding = "utf-8"
+    soup = BeautifulSoup(response.text, 'html.parser')
+    # Match an <a> tag whose string contains Istr.
+    a_tag = soup.find('a', string=lambda text: isinstance(text, str) and Istr in text)
+    if a_tag:
+        href_value = a_tag.get('href')
+        urls[Iyear] = f"https://zjc.ncu.edu.cn/zszx/lnfs/{href_value}"
+    else:
+        urls[Iyear] = "not find"  # sentinel: no matching link on the page
+
+# Crawl one year's page and collect its rows for MySQL; the page layout differs by year.
+def crawl_and_save(year, url):
+    datas = []
+    response = requests.get(url)
+    response.raise_for_status()  # abort on HTTP errors
+    response.encoding = "utf-8"
+    html = etree.HTML(response.text)
+    htmlTxt = response.text
+    table = html.xpath('//table')[0]
+    # Extract the data
+    if year == 2020:
+        # Pull every row of the table with XPath
+        rows = table.xpath('.//tbody/tr')
+        # Walk the rows; `or ['']` guards against cells with no text node
+        for row in rows:
+            data = {
+                'year': year,
+                'category': (row.xpath('.//td[1]/text()') or [''])[0],
+                'ProDirection': (row.xpath('.//td[2]/text()') or [''])[0],
+                'FamCategories': (row.xpath('.//td[3]/text()') or [''])[0],
+                'EnrollmentPlan': (row.xpath('.//td[4]/text()') or [''])[0],
+                'ControlLine': (row.xpath('.//td[5]/text()') or [''])[0],
+                'AdmissionLine': (row.xpath('.//td[6]/text()') or [''])[0],
+                'ProAdmissionLine': (row.xpath('.//td[7]/text()') or [''])[0],
+                'ReferenceRanking': (row.xpath('.//td[8]/text()') or [''])[0],
+            }
+            if data['category'] == '类别':  # skip the header row (first cell reads "类别"/Category)
+                continue
+            datas.append(data)
+
+    elif year == 2021:
+        # Parse the HTML with BeautifulSoup
+        soup = BeautifulSoup(htmlTxt, 'html.parser')
+        # Locate the div that contains the table
+        div = soup.find('div', class_='blog-content')
+        # Find the table and extract its rows
+        table = div.find('table')
+        rows = table.find_all('tr')
+        for row in rows[2:]:  # skip the header and title rows
+            cells = row.find_all('td')
+            if len(cells) == 8:  # only process rows with exactly 8 cells
+                category, category_type, major, recruitment, control_line, file_line, major_line, rank = cells
+                data = {
+                    'year': year,
+                    'category': category.text,
+                    'FamCategories': category_type.text,
+                    'ProDirection': major.text,
+                    'EnrollmentPlan': recruitment.text,
+                    'ControlLine': control_line.text,
+                    'AdmissionLine': file_line.text,
+                    'ProAdmissionLine': major_line.text,
+                    'ReferenceRanking': rank.text,
+                }
+                datas.append(data)
+
+    elif year == 2022:
+        # Regular-expression pattern
+        pattern = re.compile(r'
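
[The hunk is truncated above, so the patch does not show how the collected rows reach MySQL. For orientation, below is a minimal sketch of one way the remaining code could persist them with pymysql, reusing the db_config defined in the file. The function name save_to_mysql and the admission_lines table with its columns are assumptions for illustration, not part of the patch.]

# Hypothetical companion sketch (not part of the patch): insert the dicts
# that crawl_and_save() accumulates in `datas` into MySQL.
# Table name `admission_lines` and its schema are assumed, not from the patch.
def save_to_mysql(datas):
    connection = pymysql.connect(**db_config)
    try:
        with connection.cursor() as cursor:
            # %(key)s placeholders let pymysql bind each dict in `datas` directly
            sql = (
                "INSERT INTO admission_lines "
                "(year, category, ProDirection, FamCategories, EnrollmentPlan, "
                " ControlLine, AdmissionLine, ProAdmissionLine, ReferenceRanking) "
                "VALUES (%(year)s, %(category)s, %(ProDirection)s, %(FamCategories)s, "
                " %(EnrollmentPlan)s, %(ControlLine)s, %(AdmissionLine)s, "
                " %(ProAdmissionLine)s, %(ReferenceRanking)s)"
            )
            cursor.executemany(sql, datas)  # one parameterized INSERT per row dict
        connection.commit()
    finally:
        connection.close()

[Under the same assumptions, a driver would first call find(index_url, year, keyword) to populate urls, then crawl_and_save(year, urls[year]) for each year whose entry is not the "not find" sentinel.]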