ADD file via upload

main
p94xcago5 5 months ago
parent 9d3bfb9346
commit e5cb880277

@ -0,0 +1,169 @@
import os
import re
from urllib.parse import urljoin

import pymysql
import requests
from bs4 import BeautifulSoup
from lxml import etree
# MySQL connection settings passed verbatim to ``pymysql.connect(**db_config)``.
# Each credential can be overridden through an environment variable so that
# secrets do not have to live in the source file.
db_config = {
    'host': os.environ.get('MYSQL_HOST', 'localhost'),
    'user': os.environ.get('MYSQL_USER', 'root'),
    # NOTE(review): the literal fallback is kept only so existing setups keep
    # working; set MYSQL_PASSWORD and drop the default once deployments are updated.
    'password': os.environ.get('MYSQL_PASSWORD', '091020'),
    'database': os.environ.get('MYSQL_DATABASE', 'jdbc'),
    'charset': 'utf8mb4',
    # DictCursor returns every result row as a dict keyed by column name.
    'cursorclass': pymysql.cursors.DictCursor,
}

# Maps an admission year to the detail-page URL recorded by find(), or to the
# sentinel string "not find" when no matching link was located.
urls = {}
def find(Iurl, Iyear, Istr, timeout=30):
    """Look up the detail-page link for one year and record it in ``urls``.

    Downloads the index page at ``Iurl``, decodes it as UTF-8 and searches for
    the first ``<a>`` tag whose string contains ``Istr``.

    Args:
        Iurl: URL of the index page to search.
        Iyear: Key under which the result is stored in the module-level
            ``urls`` dict.
        Istr: Text that the link's string must contain.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Side effects:
        Sets ``urls[Iyear]`` to the absolute link URL, or to the sentinel
        string ``"not find"`` when no matching link with an ``href`` exists.
    """
    # Without a timeout a stalled server would block the whole script forever.
    response = requests.get(Iurl, timeout=timeout)
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, 'html.parser')

    # ``string=`` is matched against the <a> tag's ``.string``; the isinstance
    # guard keeps the ``in`` test from failing when a non-string value is
    # handed to the predicate.
    a_tag = soup.find('a', string=lambda text: isinstance(text, str) and Istr in text)

    # An anchor without an href is as useless as no anchor at all; treating it
    # as "not find" avoids building a bogus ".../None" URL.
    href_value = a_tag.get('href') if a_tag else None
    if href_value:
        # Resolve relative to the index page itself so relative, parent-relative
        # and absolute hrefs are all handled.
        urls[Iyear] = urljoin(Iurl, href_value)
    else:
        urls[Iyear] = "not find"
# Column order (1-based <td> position -> record field) of the 2020 table.
_COLUMNS_2020 = (
    'category',
    'ProDirection',
    'FamCategories',
    'EnrollmentPlan',
    'ControlLine',
    'AdmissionLine',
    'ProAdmissionLine',
    'ReferenceRanking',
)

# Column order shared by the 2022 and 2023 tables, which have no 'category' column.
_COLUMNS_NO_CATEGORY = (
    'FamCategories',
    'ProDirection',
    'EnrollmentPlan',
    'ControlLine',
    'AdmissionLine',
    'ProAdmissionLine',
    'ReferenceRanking',
)


def _cell_text(row, position):
    """Return the first direct text node of the ``position``-th <td> of ``row``.

    Returns '' when the cell is missing or holds no direct text node, so an
    empty cell no longer raises IndexError.
    """
    texts = row.xpath(f'.//td[{position}]/text()')
    return texts[0] if texts else ''


def _rows_from_first_table(page_text, year, columns, header_field, header_label):
    """Extract records from the ``tbody`` rows of the first <table> via lxml.

    Rows whose ``header_field`` equals ``header_label`` are column-header rows
    and are skipped.
    """
    tables = etree.HTML(page_text).xpath('//table')
    if not tables:
        raise ValueError(f"no <table> found on the {year} page")

    records = []
    for row in tables[0].xpath('.//tbody/tr'):
        # 'category' stays None for layouts that have no such column.
        record = {'year': year, 'category': None}
        for position, field in enumerate(columns, start=1):
            record[field] = _cell_text(row, position)
        if record[header_field] == header_label:
            continue
        records.append(record)
    return records


def _rows_2021(page_text, year):
    """Extract records from the table inside ``div.blog-content`` via BeautifulSoup."""
    soup = BeautifulSoup(page_text, 'html.parser')
    container = soup.find('div', class_='blog-content')
    table = container.find('table') if container is not None else None
    if table is None:
        raise ValueError(f"no table inside div.blog-content on the {year} page")

    records = []
    # The first two rows are skipped (presumably the title and header rows).
    for row in table.find_all('tr')[2:]:
        cells = row.find_all('td')
        if len(cells) != 8:  # only complete 8-column rows are data rows
            continue
        category, category_type, major, recruitment, control_line, file_line, major_line, rank = cells
        records.append({
            'year': year,
            'category': category.text,
            'FamCategories': category_type.text,
            'ProDirection': major.text,
            'EnrollmentPlan': recruitment.text,
            'ControlLine': control_line.text,
            'AdmissionLine': file_line.text,
            'ProAdmissionLine': major_line.text,
            'ReferenceRanking': rank.text,
        })
    return records


def _rows_2022(page_text, year):
    """Extract records from the raw HTML with regular expressions."""
    cell_pattern = re.compile(r'<td.*?>(.*?)</td>')
    row_bodies = re.findall(r'<tr.*?>(.*?)</tr>', page_text, flags=re.DOTALL)

    records = []
    # The first two rows are skipped (presumably the title and header rows).
    for body in row_bodies[2:]:
        cells = cell_pattern.findall(body)
        if len(cells) != 7:  # only complete 7-column rows are data rows
            continue
        record = {'year': year, 'category': None}
        record.update(zip(_COLUMNS_NO_CATEGORY, cells))
        records.append(record)
    return records


def crawl_and_save(year, url, timeout=30):
    """Download one year's admission-score page, parse it and store the rows.

    Each supported year (2020-2023) is parsed with its own strategy; any other
    year yields no rows. The parsed records are handed to ``save_to_mysql``.

    Args:
        year: Admission year; selects the parsing strategy and is stored in
            every record.
        url: Address of the page holding that year's table.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the expected table (or its container) is missing.
    """
    # Without a timeout a stalled server would block the whole script forever.
    response = requests.get(url, timeout=timeout)
    response.encoding = "utf-8"
    response.raise_for_status()  # stop early on 4xx/5xx responses
    page_text = response.text

    # The page is parsed only by the strategy that needs it, so a page that
    # lxml cannot handle no longer breaks the BeautifulSoup/regex years.
    if year == 2020:
        datas = _rows_from_first_table(page_text, year, _COLUMNS_2020, 'category', '类别')
    elif year == 2021:
        datas = _rows_2021(page_text, year)
    elif year == 2022:
        datas = _rows_2022(page_text, year)
    elif year == 2023:
        datas = _rows_from_first_table(page_text, year, _COLUMNS_NO_CATEGORY, 'FamCategories', '科类')
    else:
        datas = []

    # Persist the parsed rows to MySQL.
    save_to_mysql(datas)
def save_to_mysql(data_list):
    """Insert the given records into the ``enroll`` table in one transaction.

    Args:
        data_list: List of dicts, each providing the keys ``year``,
            ``category``, ``ProDirection``, ``FamCategories``,
            ``EnrollmentPlan``, ``ControlLine``, ``AdmissionLine``,
            ``ProAdmissionLine`` and ``ReferenceRanking``.

    Raises:
        KeyError: If a record lacks one of the required keys (raised before
            any database connection is opened).
    """
    # Nothing to store: do not open a database connection for no work.
    if not data_list:
        return

    insert_sql = """
        INSERT INTO `enroll` (`year`, `category`, `ProDirection`, `FamCategories`, `EnrollmentPlan`, `ControlLine`, `AdmissionLine`, `ProAdmissionLine`, `ReferenceRanking`)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    # Field order must match the column list of the INSERT statement above.
    field_order = (
        'year',
        'category',
        'ProDirection',
        'FamCategories',
        'EnrollmentPlan',
        'ControlLine',
        'AdmissionLine',
        'ProAdmissionLine',
        'ReferenceRanking',
    )
    params = [tuple(data[field] for field in field_order) for data in data_list]

    connection = pymysql.connect(**db_config)
    try:
        with connection.cursor() as cursor:
            # executemany sends the rows as a batch instead of one round trip per row.
            cursor.executemany(insert_sql, params)
        # Commit once so that either all rows are stored or none are.
        connection.commit()
    finally:
        connection.close()
def main():
    """Discover the per-year detail pages, then crawl and store each of them."""
    # (index page, year, link text to look for) for every year to collect.
    targets = (
        ("https://zjc.ncu.edu.cn/zszx/lnfs/index1.htm", 2020, "南昌大学2020年江西省专业录取分数线"),
        ("https://zjc.ncu.edu.cn/zszx/lnfs/index1.htm", 2021, "南昌大学2021年江西省专业录取线"),
        ("https://zjc.ncu.edu.cn/zszx/lnfs/index.htm", 2022, "2022年南昌大学江西省第一批本科专业录取线"),
        ("https://zjc.ncu.edu.cn/zszx/lnfs/index.htm", 2023, "2023年南昌大学江西省第一批本科专业录取线"),
    )
    for index_url, year, link_text in targets:
        find(index_url, year, link_text)

    # Crawl every discovered page and store its rows.
    for year, url in urls.items():
        # find() records the sentinel "not find" when no link matched; passing
        # it on would make requests raise and abort the remaining years.
        if url == "not find":
            print(f"skip {year}: no matching link found on the index page")
            continue
        crawl_and_save(year, url)


if __name__ == '__main__':
    main()
Loading…
Cancel
Save