From 7b9e1f9976d58cbf97854cb2db5bdb9929b028cd Mon Sep 17 00:00:00 2001
From: pqk5c82f3 <2042437448@qq.com>
Date: Tue, 11 Jun 2024 19:04:14 +0800
Subject: [PATCH] ADD file via upload

---
 zz.py | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 zz.py

diff --git a/zz.py b/zz.py
new file mode 100644
index 0000000..ff3d82b
--- /dev/null
+++ b/zz.py
@@ -0,0 +1,72 @@
+import csv
+import time
+from urllib.parse import urljoin
+
+import pymysql
+import requests
+from lxml import etree
+
+# A desktop browser User-Agent so the site serves the normal pages.
+h = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"}
+
+# The notice list is paginated:
+# https://ww.jxmtc.edu.cn/xwzx/wjtz.htm      (first page)
+# https://ww.jxmtc.edu.cn/xwzx/wjtz/13.htm
+# ...
+# https://ww.jxmtc.edu.cn/xwzx/wjtz/1.htm
+url_lists = ["https://ww.jxmtc.edu.cn/xwzx/wjtz.htm"]
+for pn in range(1, 14):
+    url_lists.append(f"https://ww.jxmtc.edu.cn/xwzx/wjtz/{pn}.htm")
+
+results = []
+for page_url in url_lists:
+    res = requests.get(page_url, headers=h)
+    res_dom = etree.HTML(res.content.decode('utf-8'))
+    # Each list item carries the publish time, link, and title.
+    res_title = res_dom.xpath('//div[@class="ej_zw_right_nfont"]/ul/li/a')
+    res_time = res_dom.xpath('//div[@class="ej_zw_right_nfont"]/ul/li/a/span/text()')
+    for t, a in zip(res_time, res_title):
+        title = a.get('title')
+        # urljoin resolves relative hrefs (e.g. ../info/...) against the page URL.
+        full_url = urljoin(page_url, a.get('href'))
+        results.append([t.strip(), title, full_url])
+    time.sleep(2)  # be polite between page requests
+
+# Prepend a sequence number, append the source name, and swap title/time
+# so each row reads [sequence, title, time, link, news_source].
+for idx, row in enumerate(results):
+    row.append('江西制造职业技术学院')
+    row.insert(0, str(idx))
+    row[1], row[2] = row[2], row[1]
+
+with open('jxzz.csv', 'w', encoding='utf-8', newline='') as f:
+    w = csv.writer(f)
+    w.writerow(["序列", "标题", "时间", "链接", "新闻来源"])
+    w.writerows(results)
+
+with open('jxzz.txt', 'a', encoding='utf-8') as file:
+    for row in results:
+        file.write(', '.join(map(str, row)) + "\n")
+
+# Database connection
+con = pymysql.Connect(host="localhost", user="root", password="123456", charset="utf8")
+cursor = con.cursor()
+cursor.execute("CREATE DATABASE IF NOT EXISTS JXZZ;")
+con.select_db("JXZZ")
+# Create the table; the scraped date strings are stored as-is.
+cursor.execute('''CREATE TABLE IF NOT EXISTS news (
+    sequence INT,
+    title VARCHAR(255) NOT NULL,
+    time VARCHAR(32),
+    link VARCHAR(255) NOT NULL,
+    news_source VARCHAR(255) NOT NULL
+);''')
+
+for row in results:
+    cursor.execute("INSERT INTO news VALUES (%s, %s, %s, %s, %s);", row)
+
+# Commit the changes and close the connection.
+con.commit()
+cursor.close()
+con.close()
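
A note on the database step: the row-at-a-time INSERT loop works, but a
pymysql cursor also offers executemany, which runs the same parameterized
statement over a whole sequence of rows in one call. A minimal sketch,
assuming the results list and the news schema created in the patch:

    # Each row is [sequence, title, time, link, news_source], matching
    # the column order of the news table.
    cursor.executemany(
        "INSERT INTO news VALUES (%s, %s, %s, %s, %s);",
        results,
    )
    con.commit()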
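
The scraper also assumes every requests.get succeeds; one dead page or a
timeout would crash the whole run. Below is a hedged sketch of a fetch
helper with a timeout, a status check, and a simple retry; the name
fetch_page and the retry/timeout values are illustrative choices, not part
of the patch:

    import time
    import requests

    def fetch_page(url, headers, retries=3, timeout=10):
        """Return the decoded page body, retrying transient failures."""
        for attempt in range(retries):
            try:
                res = requests.get(url, headers=headers, timeout=timeout)
                res.raise_for_status()  # raise on 4xx/5xx responses
                return res.content.decode('utf-8')
            except requests.RequestException:
                if attempt == retries - 1:
                    raise
                time.sleep(2 ** attempt)  # back off before retrying

Each iteration of the page loop could then start with
res_dom = etree.HTML(fetch_page(page_url, h)).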