Delete '南昌大学专业录取分数线.py'

5 months ago · 9d3bfb9346
parent 4e92cd7511
commit 9d3bfb9346
1 changed files with 0 additions and 157 deletions
--- a/南昌大学专业录取分数线.py
+++ b/南昌大学专业录取分数线.py
@@ -1,157 +0,0 @@
-import requests
-from lxml import etree
-import pymysql
-import re
-from bs4 import BeautifulSoup
-
-# MySQL connection settings (NOTE(review): credentials are hard-coded).
-db_config = dict(
-    host='localhost',
-    user='root',
-    password='091020',
-    database='jdbc',
-    charset='utf8mb4',
-    cursorclass=pymysql.cursors.DictCursor,  # rows come back as dicts
-)
-
-# Field order of the table columns on each supported year's page.
-# NOTE(review): 2020 maps column 2 to ProDirection and column 3 to
-# FamCategories, the reverse of 2021 -- confirm against the live page.
-_COLUMNS = {
-    2020: (
-        'category',
-        'ProDirection',
-        'FamCategories',
-        'EnrollmentPlan',
-        'ControlLine',
-        'AdmissionLine',
-        'ProAdmissionLine',
-        'ReferenceRanking',
-    ),
-    2021: (
-        'category',
-        'FamCategories',
-        'ProDirection',
-        'EnrollmentPlan',
-        'ControlLine',
-        'AdmissionLine',
-        'ProAdmissionLine',
-        'ReferenceRanking',
-    ),
-    2022: (
-        'FamCategories',
-        'ProDirection',
-        'EnrollmentPlan',
-        'ControlLine',
-        'AdmissionLine',
-        'ProAdmissionLine',
-        'ReferenceRanking',
-    ),
-}
-_COLUMNS[2023] = _COLUMNS[2022]
-# First header cell of the pages read with XPath; that row is skipped.
-_HEADER_LABELS = {2020: '类别', 2023: '科类'}
-
-
-def _cell_text(row, position):
-    """Return the stripped text of the row's position-th <td>, or ''."""
-    # string() also gathers text nested in child tags, so such cells no
-    # longer raise the IndexError that indexing text() nodes did.
-    return row.xpath('string(./td[%d])' % position).strip()
-
-
-def crawl_and_save(year, url):
-    """Download one year's admission-score page and save its table rows.
-
-    Args:
-        year: Admission year; 2020-2023 each select a page-specific parser.
-            Any other year yields no rows.
-        url: Address of the page that holds the year's score table.
-
-    Raises:
-        requests.RequestException: If the download fails or times out.
-        IndexError: If a 2020/2023 page contains no <table>.
-    """
-    response = requests.get(url, timeout=30)  # never hang on a dead server
-    response.raise_for_status()
-    # Decode the raw bytes once. Re-encoding response.text as ISO-8859-1
-    # raised UnicodeEncodeError whenever requests had already detected a
-    # Unicode charset for the page.
-    htmlTxt = response.content.decode('utf-8')
-    columns = _COLUMNS.get(year, ())
-    datas = []
-
-    def add_row(values):
-        # Years without a category column keep 'category' as None.
-        data = {'year': year, 'category': None}
-        data.update(zip(columns, values))
-        datas.append(data)
-
-    # 2020 and 2023 share one XPath-based parser; only the columns differ.
-    if year in _HEADER_LABELS:
-        table = etree.HTML(htmlTxt).xpath('//table')[0]
-        for row in table.xpath('.//tbody/tr'):
-            values = [_cell_text(row, n) for n in range(1, len(columns) + 1)]
-            if values[0] != _HEADER_LABELS[year]:
-                add_row(values)
-    elif year == 2021:
-        soup = BeautifulSoup(htmlTxt, 'html.parser')
-        table = soup.find('div', class_='blog-content').find('table')
-        for row in table.find_all('tr')[2:]:  # skip title and header rows
-            cells = row.find_all('td')
-            if len(cells) == len(columns):
-                add_row([cell.text.strip() for cell in cells])
-    elif year == 2022:
-        # DOTALL lets a cell span several lines; without it such cells were
-        # missed and the whole row was dropped by the length check.
-        cell_pattern = re.compile(r'<td.*?>(.*?)</td>', flags=re.DOTALL)
-        rows = re.findall(r'<tr.*?>(.*?)</tr>', htmlTxt, flags=re.DOTALL)
-        for row in rows[2:]:  # skip title and header rows
-            cells = cell_pattern.findall(row)
-            if len(cells) == len(columns):
-                add_row([cell.strip() for cell in cells])
-
-    save_to_mysql(datas)
-
-
-# Store scraped rows in MySQL.
-def save_to_mysql(data_list):
-    """Insert every row of data_list into the `enroll` table.
-
-    Args:
-        data_list: Dicts keyed by the `enroll` column names (year, category,
-            ProDirection, FamCategories, EnrollmentPlan, ControlLine,
-            AdmissionLine, ProAdmissionLine, ReferenceRanking).
-    """
-    if not data_list:
-        return  # nothing to store, so do not open a connection at all
-    columns = ('year', 'category', 'ProDirection', 'FamCategories',
-               'EnrollmentPlan', 'ControlLine', 'AdmissionLine',
-               'ProAdmissionLine', 'ReferenceRanking')
-    insert_sql = 'INSERT INTO `enroll` (%s) VALUES (%s)' % (
-        ', '.join('`%s`' % name for name in columns),
-        ', '.join(['%s'] * len(columns)),
-    )
-    rows = [tuple(data[name] for name in columns) for data in data_list]
-    connection = pymysql.connect(**db_config)
-    try:
-        with connection.cursor() as cursor:
-            # One batched statement instead of a round trip per row.
-            cursor.executemany(insert_sql, rows)
-        connection.commit()
-    finally:
-        connection.close()
-
-def main():
-    """Crawl every known admission year and store its rows in MySQL."""
-    targets = (
-        (2020, 'https://zjc.ncu.edu.cn/zszx/lnfs/3e5d5e97e5924d1794fd0eba09b79bf2.htm'),
-        (2021, 'https://zjc.ncu.edu.cn/zszx/lnfs/47c901697a5549e998baf512a0c384f5.htm'),
-        (2022, 'https://zjc.ncu.edu.cn/zszx/lnfs/9ba4f80248874172af2937017620226b.htm'),
-        (2023, 'https://zjc.ncu.edu.cn/zszx/lnfs/5f3d9db4a5304265be3e31725a290b5c.htm'),
-    )
-    for admission_year, page_url in targets:
-        crawl_and_save(admission_year, page_url)
-
-if __name__ == '__main__':
-    main()