You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
cx00000/南昌大学专业录取分数线.py

170 lines
7.2 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import requests
from lxml import etree
import pymysql
import re
from bs4 import BeautifulSoup
# MySQL database connection settings
db_config = {
'host': 'localhost',
'user': 'root',
'password': '091020',  # NOTE(review): hard-coded credential — consider loading from env/config
'database': 'jdbc',
'charset': 'utf8mb4',
'cursorclass': pymysql.cursors.DictCursor # DictCursor returns each row as a dict keyed by column name
}
# year -> article URL (or the sentinel string "not find"); filled by find(), read by main()
urls = {}
def find(Iurl, Iyear, Istr):
    """Locate the admission-score article link for one year on a list page.

    Fetches the index page *Iurl*, finds the first ``<a>`` tag whose text
    contains *Istr*, and stores the absolute article URL in the module-level
    ``urls`` dict under key *Iyear*.  When no matching link exists, the
    sentinel string ``"not find"`` is stored instead (callers must check
    for it before trying to fetch the URL).

    Args:
        Iurl: URL of the index page listing yearly score articles.
        Iyear: Year (int) used as the key into ``urls``.
        Istr: Substring that must appear in the link text.
    """
    response = requests.get(Iurl)
    response.raise_for_status()  # fail fast on HTTP errors, consistent with crawl_and_save
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, 'html.parser')
    # string= is called with each tag's text; keep only real strings containing Istr
    a_tag = soup.find('a', string=lambda text: isinstance(text, str) and Istr in text)
    if a_tag:
        href_value = a_tag.get('href')
        urls[Iyear] = f"https://zjc.ncu.edu.cn/zszx/lnfs/{href_value}"
    else:
        urls[Iyear] = "not find"
# 爬取并保存数据到MySQL的函数
def _xpath_cell(row, idx):
    """Text of the *idx*-th (1-based) <td> of an lxml row, or '' if the cell is missing."""
    return row.xpath(f'.//td[{idx}]/text()')[0] if row.xpath(f'.//td[{idx}]') else ''


def _parse_2020(table):
    """Parse the 2020 table (lxml element): 8 columns, first column is the category."""
    datas = []
    for row in table.xpath('.//tbody/tr'):
        data = {
            'year': 2020,
            'category': _xpath_cell(row, 1),
            'ProDirection': _xpath_cell(row, 2),
            'FamCategories': _xpath_cell(row, 3),
            'EnrollmentPlan': _xpath_cell(row, 4),
            'ControlLine': _xpath_cell(row, 5),
            'AdmissionLine': _xpath_cell(row, 6),
            'ProAdmissionLine': _xpath_cell(row, 7),
            'ReferenceRanking': _xpath_cell(row, 8),
        }
        if data['category'] == '类别':  # header row repeated inside tbody — skip it
            continue
        datas.append(data)
    return datas


def _parse_2021(html_text):
    """Parse the 2021 page with BeautifulSoup: 8-cell rows inside div.blog-content."""
    soup = BeautifulSoup(html_text, 'html.parser')
    table = soup.find('div', class_='blog-content').find('table')
    datas = []
    for row in table.find_all('tr')[2:]:  # skip the title and header rows
        cells = row.find_all('td')
        if len(cells) != 8:  # guard against malformed / spanning rows
            continue
        category, fam, major, plan, control, admission, pro_admission, rank = cells
        datas.append({
            'year': 2021,
            'category': category.text,
            'FamCategories': fam.text,
            'ProDirection': major.text,
            'EnrollmentPlan': plan.text,
            'ControlLine': control.text,
            'AdmissionLine': admission.text,
            'ProAdmissionLine': pro_admission.text,
            'ReferenceRanking': rank.text,
        })
    return datas


def _parse_2022(html_text):
    """Parse the 2022 page with regexes: 7-cell rows, no category column."""
    td_pattern = re.compile(r'<td.*?>(.*?)</td>')
    datas = []
    # DOTALL lets a <tr> span several source lines
    for row in re.findall(r'<tr.*?>(.*?)</tr>', html_text, flags=re.DOTALL)[2:]:
        cells = td_pattern.findall(row)
        if len(cells) == 7:
            datas.append({
                'year': 2022,
                'category': None,  # this year's table has no category column
                'FamCategories': cells[0],
                'ProDirection': cells[1],
                'EnrollmentPlan': cells[2],
                'ControlLine': cells[3],
                'AdmissionLine': cells[4],
                'ProAdmissionLine': cells[5],
                'ReferenceRanking': cells[6],
            })
    return datas


def _parse_2023(table):
    """Parse the 2023 table (lxml element): 7 columns, no category column."""
    datas = []
    for row in table.xpath('.//tbody/tr'):
        data = {
            'year': 2023,
            'category': None,  # this year's table has no category column
            'FamCategories': _xpath_cell(row, 1),
            'ProDirection': _xpath_cell(row, 2),
            'EnrollmentPlan': _xpath_cell(row, 3),
            'ControlLine': _xpath_cell(row, 4),
            'AdmissionLine': _xpath_cell(row, 5),
            'ProAdmissionLine': _xpath_cell(row, 6),
            'ReferenceRanking': _xpath_cell(row, 7),
        }
        if data['FamCategories'] == '科类':  # header row repeated inside tbody — skip it
            continue
        datas.append(data)
    return datas


def crawl_and_save(year, url):
    """Download the score page for *year*, parse it, and persist the rows.

    The site markup changed every year, so each year uses its own parsing
    strategy (XPath for 2020/2023, BeautifulSoup for 2021, regex for 2022).
    All extracted records are handed to save_to_mysql().  The lxml table is
    only extracted for the years that need it, so pages without an
    lxml-visible <table> (2021/2022) no longer risk an IndexError.

    Args:
        year: One of 2020-2023; any other value saves an empty list.
        url: Absolute URL of that year's score article.
    """
    response = requests.get(url)
    response.encoding = "utf-8"
    response.raise_for_status()  # abort on HTTP errors before parsing
    html_text = response.text
    if year == 2020:
        datas = _parse_2020(etree.HTML(html_text).xpath('//table')[0])
    elif year == 2021:
        datas = _parse_2021(html_text)
    elif year == 2022:
        datas = _parse_2022(html_text)
    elif year == 2023:
        datas = _parse_2023(etree.HTML(html_text).xpath('//table')[0])
    else:
        datas = []  # unknown year: nothing to parse, still hits the save path
    save_to_mysql(datas)
# 保存到MySQL函数
def save_to_mysql(data_list):
    """Insert every record of *data_list* into the `enroll` table.

    Each record is a dict with the keys year, category, ProDirection,
    FamCategories, EnrollmentPlan, ControlLine, AdmissionLine,
    ProAdmissionLine and ReferenceRanking.  Opens a fresh connection per
    call, commits once after all inserts, and always closes the connection.

    Args:
        data_list: List of record dicts (may be empty — then nothing is sent).
    """
    insert_sql = """
    INSERT INTO `enroll` (`year`, `category`, `ProDirection`, `FamCategories`, `EnrollmentPlan`, `ControlLine`, `AdmissionLine`, `ProAdmissionLine`, `ReferenceRanking`)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    rows = [
        (
            data['year'],
            data['category'],
            data['ProDirection'],
            data['FamCategories'],
            data['EnrollmentPlan'],
            data['ControlLine'],
            data['AdmissionLine'],
            data['ProAdmissionLine'],
            data['ReferenceRanking'],
        )
        for data in data_list
    ]
    connection = pymysql.connect(**db_config)
    try:
        with connection.cursor() as cursor:
            # executemany batches all rows instead of one round-trip per record
            cursor.executemany(insert_sql, rows)
        connection.commit()
    finally:
        connection.close()
def main():
    """Resolve each year's article URL, then crawl and persist every page."""
    find("https://zjc.ncu.edu.cn/zszx/lnfs/index1.htm", 2020, "南昌大学2020年江西省专业录取分数线")
    find("https://zjc.ncu.edu.cn/zszx/lnfs/index1.htm", 2021, "南昌大学2021年江西省专业录取线")
    find("https://zjc.ncu.edu.cn/zszx/lnfs/index.htm", 2022, "2022年南昌大学江西省第一批本科专业录取线")
    find("https://zjc.ncu.edu.cn/zszx/lnfs/index.htm", 2023, "2023年南昌大学江西省第一批本科专业录取线")
    for year, url in urls.items():
        # find() stores the sentinel "not find" when no link matched;
        # requests.get("not find") would raise MissingSchema, so skip those.
        if url == "not find":
            print(f"No article link found for {year}; skipping")
            continue
        crawl_and_save(year, url)


if __name__ == '__main__':
    main()