From 4e92cd75115027b5b2d85b551bae2263f0288167 Mon Sep 17 00:00:00 2001 From: p94xcago5 <2609106649@qq.com> Date: Thu, 13 Jun 2024 08:44:55 +0800 Subject: [PATCH] ADD file via upload --- 南昌大学专业录取分数线.py | 157 +++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 南昌大学专业录取分数线.py diff --git a/南昌大学专业录取分数线.py b/南昌大学专业录取分数线.py new file mode 100644 index 0000000..256463a --- /dev/null +++ b/南昌大学专业录取分数线.py @@ -0,0 +1,157 @@ +import requests +from lxml import etree +import pymysql +import re +from bs4 import BeautifulSoup + +# MySQL database connection settings. NOTE(review): the user name and password are hard-coded in this dict; consider loading them from the environment or a config file kept out of version control. +db_config = { + 'host': 'localhost', + 'user': 'root', + 'password': '091020', + 'database': 'jdbc', + 'charset': 'utf8mb4', + 'cursorclass': pymysql.cursors.DictCursor # cursor type whose results are a list of dicts: one dict per row, keyed by column name +} + +# 爬取并保存数据到MySQL的函数 +def crawl_and_save(year, url): + datas = [] + response = requests.get(url) + response.raise_for_status() # 检查请求是否成功 + html = etree.HTML(response.text.encode('iso-8859-1').decode('utf-8')) + htmlTxt = response.text.encode('iso-8859-1').decode('utf-8') + + table = html.xpath('//table')[0] + # 提取数据 + if year == 2020: + # 使用XPath提取表格的所有行 + rows = table.xpath('.//tbody/tr') + # 遍历所有行,并提取每行的数据 + for row in rows: + data = { + 'year': year, + 'category': row.xpath('.//td[1]/text()')[0] if row.xpath('.//td[1]') else '', + 'ProDirection': row.xpath('.//td[2]/text()')[0] if row.xpath('.//td[2]') else '', + 'FamCategories': row.xpath('.//td[3]/text()')[0] if row.xpath('.//td[3]') else '', + 'EnrollmentPlan': row.xpath('.//td[4]/text()')[0] if row.xpath('.//td[4]') else '', + 'ControlLine': row.xpath('.//td[5]/text()')[0] if row.xpath('.//td[5]') else '', + 'AdmissionLine': row.xpath('.//td[6]/text()')[0] if row.xpath('.//td[6]') else '', + 'ProAdmissionLine': row.xpath('.//td[7]/text()')[0] if row.xpath('.//td[7]') else '', + 'ReferenceRanking': row.xpath('.//td[8]/text()')[0] if row.xpath('.//td[8]') else '', + } + if data['category'] == '类别': + continue + datas.append(data) + + elif year == 
2021: + # 使用BeautifulSoup解析HTML代码 + soup = BeautifulSoup(htmlTxt, 'html.parser') + # 找到包含表格的div元素 + div = soup.find('div', class_='blog-content') + # 找到表格并提取信息 + table = div.find('table') + rows = table.find_all('tr') + for row in rows[2:]: # 跳过表头和标题行 + cells = row.find_all('td') + if len(cells) == 8: # 确保行中有8个单元格 + category, category_type, major, recruitment, control_line, file_line, major_line, rank = cells + data = { + 'year': year, + 'category': category.text, + 'FamCategories': category_type.text, + 'ProDirection': major.text, + 'EnrollmentPlan': recruitment.text, + 'ControlLine': control_line.text, + 'AdmissionLine': file_line.text, + 'ProAdmissionLine': major_line.text, + 'ReferenceRanking': rank.text, + } + datas.append(data) + + elif year == 2022: + # 正则表达式模式 + pattern = re.compile(r'