import requests
from lxml import etree
import pymysql
import re
from bs4 import BeautifulSoup

# MySQL database configuration
db_config = {
    'host': 'localhost',
    'user': 'root',
    'password': '091020',
    'database': 'jdbc',
    'charset': 'utf8mb4',
    # DictCursor returns each row as a dict keyed by column name
    'cursorclass': pymysql.cursors.DictCursor
}


# Crawl one year's admission-score page and save the extracted rows to MySQL.
# Each year's page uses a different layout, so each year gets its own parser.
def crawl_and_save(year, url):
    datas = []
    response = requests.get(url)
    response.raise_for_status()  # raise an HTTPError if the request failed
    # requests decodes the body as ISO-8859-1, but the pages are UTF-8;
    # re-encode and decode to undo the mojibake
    htmlTxt = response.text.encode('iso-8859-1').decode('utf-8')
    html = etree.HTML(htmlTxt)
    table = html.xpath('//table')[0]

    if year == 2020:
        # Extract every row of the table with XPath
        rows = table.xpath('.//tbody/tr')
        # Walk the rows and pull out each cell's text
        for row in rows:
            data = {
                'year': year,
                'category': row.xpath('.//td[1]/text()')[0] if row.xpath('.//td[1]') else '',
                'ProDirection': row.xpath('.//td[2]/text()')[0] if row.xpath('.//td[2]') else '',
                'FamCategories': row.xpath('.//td[3]/text()')[0] if row.xpath('.//td[3]') else '',
                'EnrollmentPlan': row.xpath('.//td[4]/text()')[0] if row.xpath('.//td[4]') else '',
                'ControlLine': row.xpath('.//td[5]/text()')[0] if row.xpath('.//td[5]') else '',
                'AdmissionLine': row.xpath('.//td[6]/text()')[0] if row.xpath('.//td[6]') else '',
                'ProAdmissionLine': row.xpath('.//td[7]/text()')[0] if row.xpath('.//td[7]') else '',
                'ReferenceRanking': row.xpath('.//td[8]/text()')[0] if row.xpath('.//td[8]') else '',
            }
            if data['category'] == '类别':  # skip the header row ('类别' = "Category")
                continue
            datas.append(data)
    elif year == 2021:
        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(htmlTxt, 'html.parser')
        # Locate the div that wraps the table
        div = soup.find('div', class_='blog-content')
        # Find the table and read its rows
        table = div.find('table')
        rows = table.find_all('tr')
        for row in rows[2:]:  # skip the title and header rows
            cells = row.find_all('td')
            if len(cells) == 8:  # keep only rows with all 8 cells
                category, category_type, major, recruitment, control_line, file_line, major_line, rank = cells
                data = {
                    'year': year,
                    'category': category.text,
                    'FamCategories': category_type.text,
                    'ProDirection': major.text,
                    'EnrollmentPlan': recruitment.text,
                    'ControlLine': control_line.text,
                    'AdmissionLine': file_line.text,
                    'ProAdmissionLine': major_line.text,
                    'ReferenceRanking': rank.text,
                }
                datas.append(data)
    elif year == 2022:
        # Regex that captures the text of one table cell
        pattern = re.compile(r'<td[^>]*>(.*?)</td>', re.DOTALL)
        # Capture the body of every table row
        rows = re.findall(r'<tr[^>]*>(.*?)</tr>', htmlTxt, flags=re.DOTALL)
        for row in rows[2:]:  # skip the title and header rows
            cells = pattern.findall(row)
            if len(cells) == 7:
                data = {
                    'year': year,
                    'category': None,
                    'FamCategories': cells[0],
                    'ProDirection': cells[1],
                    'EnrollmentPlan': cells[2],
                    'ControlLine': cells[3],
                    'AdmissionLine': cells[4],
                    'ProAdmissionLine': cells[5],
                    'ReferenceRanking': cells[6],
                }
                datas.append(data)
    elif year == 2023:
        # Extract every row of the table with XPath; this layout has no
        # category column, so category is stored as NULL
        rows = table.xpath('.//tbody/tr')
        for row in rows:
            data = {
                'year': year,
                'category': None,
                'FamCategories': row.xpath('.//td[1]/text()')[0] if row.xpath('.//td[1]') else '',
                'ProDirection': row.xpath('.//td[2]/text()')[0] if row.xpath('.//td[2]') else '',
                'EnrollmentPlan': row.xpath('.//td[3]/text()')[0] if row.xpath('.//td[3]') else '',
                'ControlLine': row.xpath('.//td[4]/text()')[0] if row.xpath('.//td[4]') else '',
                'AdmissionLine': row.xpath('.//td[5]/text()')[0] if row.xpath('.//td[5]') else '',
                'ProAdmissionLine': row.xpath('.//td[6]/text()')[0] if row.xpath('.//td[6]') else '',
                'ReferenceRanking': row.xpath('.//td[7]/text()')[0] if row.xpath('.//td[7]') else '',
            }
            if data['FamCategories'] == '科类':  # skip the header row ('科类' = "Subject category")
                continue
            datas.append(data)

    # Save the collected rows to MySQL
    save_to_mysql(datas)
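
# Optional sketch: crawl_and_save() above hard-codes the
# iso-8859-1 -> utf-8 round-trip. An alternative is to let requests
# re-detect the charset from the response body via Response.apparent_encoding.
# Whether that is necessary depends on the Content-Type header the server
# sends, so treat this helper as an assumption, not a drop-in replacement.
def fetch_html(url):
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    resp.encoding = resp.apparent_encoding  # charset detected from the body
    return resp.text
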
# Insert the collected rows into the `enroll` table
def save_to_mysql(data_list):
    connection = pymysql.connect(**db_config)
    try:
        with connection.cursor() as cursor:
            # Parameterized insert; values are passed separately to avoid SQL injection
            insert_sql = """
                INSERT INTO `enroll`
                    (`year`, `category`, `ProDirection`, `FamCategories`, `EnrollmentPlan`,
                     `ControlLine`, `AdmissionLine`, `ProAdmissionLine`, `ReferenceRanking`)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
            """
            for data in data_list:
                cursor.execute(insert_sql, (
                    data['year'], data['category'], data['ProDirection'],
                    data['FamCategories'], data['EnrollmentPlan'], data['ControlLine'],
                    data['AdmissionLine'], data['ProAdmissionLine'], data['ReferenceRanking']
                ))
        # Commit the transaction
        connection.commit()
    finally:
        connection.close()


def main():
    years_urls = {
        2020: 'https://zjc.ncu.edu.cn/zszx/lnfs/3e5d5e97e5924d1794fd0eba09b79bf2.htm',
        2021: 'https://zjc.ncu.edu.cn/zszx/lnfs/47c901697a5549e998baf512a0c384f5.htm',
        2022: 'https://zjc.ncu.edu.cn/zszx/lnfs/9ba4f80248874172af2937017620226b.htm',
        2023: 'https://zjc.ncu.edu.cn/zszx/lnfs/5f3d9db4a5304265be3e31725a290b5c.htm',
    }
    # Crawl and save each year's page
    for year, url in years_urls.items():
        crawl_and_save(year, url)


if __name__ == '__main__':
    main()
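
# For reference: save_to_mysql() assumes an `enroll` table already exists in
# the `jdbc` database. A minimal compatible schema might look like the sketch
# below; the column types are assumptions inferred from the INSERT statement,
# not taken from the original DDL.
#
#   CREATE TABLE IF NOT EXISTS `enroll` (
#       `id`               INT AUTO_INCREMENT PRIMARY KEY,
#       `year`             INT,
#       `category`         VARCHAR(64),
#       `ProDirection`     VARCHAR(128),
#       `FamCategories`    VARCHAR(64),
#       `EnrollmentPlan`   VARCHAR(64),
#       `ControlLine`      VARCHAR(64),
#       `AdmissionLine`    VARCHAR(64),
#       `ProAdmissionLine` VARCHAR(64),
#       `ReferenceRanking` VARCHAR(64)
#   ) DEFAULT CHARSET = utf8mb4;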