ADD file via upload

5 months ago · e5cb880277
parent 9d3bfb9346
commit e5cb880277
1 changed files with 169 additions and 0 deletions
--- a/南昌大学专业录取分数线.py
+++ b/南昌大学专业录取分数线.py
@@ -0,0 +1,169 @@
+import os
+import re
+from urllib.parse import urljoin
+
+import pymysql
+import requests
+from bs4 import BeautifulSoup
+from lxml import etree
+
+# MySQL数据库配置
+db_config = {
+    'host': 'localhost',
+    'user': 'root',
+    'password': '091020',
+    'database': 'jdbc',
+    'charset': 'utf8mb4',
+    'cursorclass': pymysql.cursors.DictCursor # 游标类型返回的结果是一个字典列表，其中每个字典都包含一行结果，字典的键是列名
+}
+urls = {}
+
+def find(Iurl,Iyear,Istr):
+    url = Iurl
+    response = requests.get(url)
+    response.encoding = "utf-8"
+    html = response.text
+    soup = BeautifulSoup(html, 'html.parser')
+    a_tag = soup.find('a', string=lambda text: isinstance(text, str) and Istr in text)  # 检查每个<a>标签的文本内容（包括其所有子元素的文本）
+    if a_tag:
+        href_value = a_tag.get('href')
+        urls[Iyear] = f"https://zjc.ncu.edu.cn/zszx/lnfs/{href_value}"
+    else:
+        urls[Iyear] = "not find"
+
+# 爬取并保存数据到MySQL的函数
+def crawl_and_save(year, url):
+    datas = []
+    response = requests.get(url)
+    response.encoding = "utf-8"
+    response.raise_for_status()  # 检查请求是否成功
+    html = etree.HTML(response.text)
+    htmlTxt = response.text
+    table = html.xpath('//table')[0]
+    # 提取数据
+    if year == 2020:
+        # 使用XPath提取表格的所有行
+        rows = table.xpath('.//tbody/tr')
+        # 遍历所有行，并提取每行的数据
+        for row in rows:
+            data = {
+                'year': year,
+                'category': row.xpath('.//td[1]/text()')[0] if row.xpath('.//td[1]') else '',
+                'ProDirection': row.xpath('.//td[2]/text()')[0] if row.xpath('.//td[2]') else '',
+                'FamCategories': row.xpath('.//td[3]/text()')[0] if row.xpath('.//td[3]') else '',
+                'EnrollmentPlan': row.xpath('.//td[4]/text()')[0] if row.xpath('.//td[4]') else '',
+                'ControlLine': row.xpath('.//td[5]/text()')[0] if row.xpath('.//td[5]') else '',
+                'AdmissionLine': row.xpath('.//td[6]/text()')[0] if row.xpath('.//td[6]') else '',
+                'ProAdmissionLine': row.xpath('.//td[7]/text()')[0] if row.xpath('.//td[7]') else '',
+                'ReferenceRanking': row.xpath('.//td[8]/text()')[0] if row.xpath('.//td[8]') else '',
+            }
+            if data['category'] == '类别':
+                continue
+            datas.append(data)
+
+    elif year == 2021:
+        # 使用BeautifulSoup解析HTML代码
+        soup = BeautifulSoup(htmlTxt, 'html.parser')
+        # 找到包含表格的div元素
+        div = soup.find('div', class_='blog-content')
+        # 找到表格并提取信息
+        table = div.find('table')
+        rows = table.find_all('tr')
+        for row in rows[2:]:  # 跳过表头和标题行
+            cells = row.find_all('td')
+            if len(cells) == 8:  # 确保行中有8个单元格
+                category, category_type, major, recruitment, control_line, file_line, major_line, rank = cells
+                data = {
+                    'year': year,
+                    'category': category.text,
+                    'FamCategories': category_type.text,
+                    'ProDirection': major.text,
+                    'EnrollmentPlan': recruitment.text,
+                    'ControlLine': control_line.text,
+                    'AdmissionLine': file_line.text,
+                    'ProAdmissionLine': major_line.text,
+                    'ReferenceRanking': rank.text,
+                }
+                datas.append(data)
+
+    elif year == 2022:
+        # 正则表达式模式
+        pattern = re.compile(r'<td.*?>(.*?)</td>')
+        # 提取信息
+        rows = re.findall(r'<tr.*?>(.*?)</tr>', htmlTxt, flags=re.DOTALL)  # 找到所有行
+        for row in rows[2:]:  # 跳过表头和标题行
+            cells = pattern.findall(row)
+            if len(cells) == 7:
+                data = {
+                    'year': year,
+                    'category': None,
+                    'FamCategories': cells[0],
+                    'ProDirection': cells[1],
+                    'EnrollmentPlan': cells[2],
+                    'ControlLine': cells[3],
+                    'AdmissionLine': cells[4],
+                    'ProAdmissionLine': cells[5],
+                    'ReferenceRanking': cells[6],
+                }
+                datas.append(data)
+
+    elif year == 2023:
+        # 使用XPath提取表格的所有行
+        rows = table.xpath('.//tbody/tr')
+        # 遍历所有行，并提取每行的数据
+        for row in rows:
+            data = {
+                'year': year,
+                'category': None,
+                'FamCategories': row.xpath('.//td[1]/text()')[0] if row.xpath('.//td[1]') else '',
+                'ProDirection': row.xpath('.//td[2]/text()')[0] if row.xpath('.//td[2]') else '',
+                'EnrollmentPlan': row.xpath('.//td[3]/text()')[0] if row.xpath('.//td[3]') else '',
+                'ControlLine': row.xpath('.//td[4]/text()')[0] if row.xpath('.//td[4]') else '',
+                'AdmissionLine': row.xpath('.//td[5]/text()')[0] if row.xpath('.//td[5]') else '',
+                'ProAdmissionLine': row.xpath('.//td[6]/text()')[0] if row.xpath('.//td[6]') else '',
+                'ReferenceRanking': row.xpath('.//td[7]/text()')[0] if row.xpath('.//td[7]') else '',
+            }
+            if data['FamCategories'] == '科类':
+                continue
+            datas.append(data)
+
+    # 保存到MySQL
+    save_to_mysql(datas)
+
+# 保存到MySQL函数
+def save_to_mysql(data_list):
+    connection = pymysql.connect(**db_config)
+    try:
+        with connection.cursor() as cursor:
+            # 插入数据
+            insert_sql = """
+            INSERT INTO `enroll` (`year`, `category`, `ProDirection`, `FamCategories`, `EnrollmentPlan`, `ControlLine`, `AdmissionLine`, `ProAdmissionLine`, `ReferenceRanking`)  
+            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)  
+            """
+            for data in data_list:
+                cursor.execute(insert_sql, (
+                    data['year'],
+                    data['category'],
+                    data['ProDirection'],
+                    data['FamCategories'],
+                    data['EnrollmentPlan'],
+                    data['ControlLine'],
+                    data['AdmissionLine'],
+                    data['ProAdmissionLine'],
+                    data['ReferenceRanking']
+                ))
+
+                # 提交事务
+        connection.commit()
+    finally:
+        connection.close()
+
+def main():
+    find("https://zjc.ncu.edu.cn/zszx/lnfs/index1.htm", 2020, "南昌大学2020年江西省专业录取分数线")
+    find("https://zjc.ncu.edu.cn/zszx/lnfs/index1.htm", 2021, "南昌大学2021年江西省专业录取线")
+    find("https://zjc.ncu.edu.cn/zszx/lnfs/index.htm", 2022, "2022年南昌大学江西省第一批本科专业录取线")
+    find("https://zjc.ncu.edu.cn/zszx/lnfs/index.htm", 2023, "2023年南昌大学江西省第一批本科专业录取线")
+
+    # 遍历年份和对应的URL，爬取并保存数据
+    for year, url in urls.items():
+        crawl_and_save(year, url)
+
+if __name__ == '__main__':
+    main()