"""Scrape the notice list (新闻中心/文件通知) of Jiangxi Vocational College of
Mechanical & Electrical Technology and collect rows into ``results``.

Each final row has the shape [序列, 标题, 时间, 链接, 新闻来源], matching the
CSV header written later in this script.
"""
import csv
import re
import time
from urllib.parse import urljoin

import pymysql
import requests
from bs4 import BeautifulSoup
from lxml import etree

# Browser-like User-Agent so the site serves the normal HTML pages.
h = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0",
}

# Warm-up request against the site's encrypted landing URL (presumably an
# anti-bot / session check — TODO confirm); the decoded body is not used further.
source = requests.get(
    "https://www.jxmtc.edu.cn/cl%2BAPThawuXTWHX4ydhQKA%3D%3D?encrypt=1",
    headers=h,
)
source_text = source.content.decode('utf-8')
time.sleep(2)

# The notice list is paginated: wjtz.htm is the newest page, wjtz/13.htm down
# to wjtz/1.htm are the older ones.
# NOTE(review): the original code mixed the hosts "ww.jxmtc.edu.cn" and
# "www.jxmtc.edu.cn"; "www" (the host the warm-up request uses) is used
# consistently here — verify both resolve before relying on this.
base = "https://www.jxmtc.edu.cn"
url_lists = [f"{base}/xwzx/wjtz.htm"]
url_lists += [f"{base}/xwzx/wjtz/{pn}.htm" for pn in range(1, 14)]

results = []
for page_url in url_lists:
    res = requests.get(page_url, headers=h)
    res_dom = etree.HTML(res.content.decode('utf-8'))
    # The <a> nodes carry title/href; the embedded <span> holds the publish date.
    res_title = res_dom.xpath('//div[@class="ej_zw_right_nfont"]/ul/li/a')
    res_time = res_dom.xpath('//div[@class="ej_zw_right_nfont"]/ul/li/a/span/text()')
    for pub_date, anchor in zip(res_time, res_title):
        title = anchor.get('title')
        href = anchor.get('href')
        # urljoin resolves relative links (including "../" segments) against the
        # page they appeared on and passes absolute http(s) links through
        # unchanged; the original manual split broke on "../../"-style hrefs.
        full_urls = urljoin(page_url, href)
        results.append([pub_date, title, full_urls])

# Post-process each row from [date, title, url] into
# [index, title, date, url, source] to match the CSV header order.
for j, row in enumerate(results):
    row.append('江西制造职业技术学院')
    row.insert(0, str(j))
    row[1], row[2] = row[2], row[1]  # put the title before the date
# ---- persist the scraped rows -------------------------------------------

# CSV export ("w": the file is rewritten on every run).
with open('jxzz.csv', "w", encoding='utf-8', newline='') as f:
    w = csv.writer(f)
    w.writerow(["序列", "标题", "时间", "链接", "新闻来源"])
    w.writerows(results)

# Plain-text export ("a": appended across runs, so history accumulates).
with open('jxzz.txt', 'a', encoding='utf-8') as file:
    for row in results:
        file.write(', '.join(map(str, row)) + "\n")

# MySQL persistence: create database/table if needed, then bulk-insert rows.
con = pymysql.Connect(host="localhost", user="root", password="123456", charset="utf8")
try:
    with con.cursor() as cursor:
        cursor.execute("CREATE DATABASE IF NOT EXISTS JXZZ;")
        cursor.execute("use jxzz;")
        # IF NOT EXISTS so re-running the script does not abort on an
        # already-existing table (the original CREATE TABLE failed on rerun).
        cursor.execute('''CREATE TABLE IF NOT EXISTS news (
            sequence INT,
            title VARCHAR(255) NOT NULL,
            time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            link VARCHAR(255) NOT NULL,
            news_source VARCHAR(255) NOT NULL
        );''')
        # Parameterised bulk insert ("values" is the standard SQL keyword;
        # the original used the non-standard "value").
        cursor.executemany(
            "insert into news values(%s,%s,%s,%s,%s);",
            [tuple(row) for row in results],
        )
    # Commit once after all inserts succeed.
    con.commit()
finally:
    # BUG FIX: the original script ended with con.cursor(), which opened a
    # fresh cursor and left the connection dangling; close it instead.
    con.close()