parent
c737631e07
commit
1afc199029
@@ -0,0 +1,112 @@
import csv
import re
import pymysql
import requests
from bs4 import BeautifulSoup
from lxml import etree
import time

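# Browser-style User-Agent header sent with every request.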
h = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0"}
source = requests.get("https://www.jxmtc.edu.cn/cl%2BAPThawuXTWHX4ydhQKA%3D%3D?encrypt=1", headers=h)
source_text = source.content.decode('utf-8')
time.sleep(2)
# print(source_text)

# https://ww.jxmtc.edu.cn/xwzx/wjtz/13.htm
# https://ww.jxmtc.edu.cn/xwzx/wjtz/12.htm
# https://ww.jxmtc.edu.cn/xwzx/wjtz/1.htm

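# Build the list of listing pages: the section index plus numbered pages 1-13.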
url_lists = ["https://ww.jxmtc.edu.cn/xwzx/wjtz.htm"]
|
||||
for pn in range(1, 14):
|
||||
wy = f"https://ww.jxmtc.edu.cn/xwzx/wjtz/{pn}.htm"
|
||||
url_lists.append(wy)
|
||||
# print(wy)
|
||||
# print(url_lists)
|
||||
|
||||
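# Fetch every listing page and collect the publish date, title and full link of each notice.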
results = []
for i in url_lists:
    res = requests.get(i, headers=h)
    res_text = res.content.decode('utf-8')
    res_dom = etree.HTML(res_text)
    # print(res_text)
    # publish time, link, title
    res_title = res_dom.xpath('//div[@class="ej_zw_right_nfont"]/ul/li/a')
    res_time = res_dom.xpath('//div[@class="ej_zw_right_nfont"]/ul/li/a/span/text()')
    # print(res_time)
    # print(res_title)
    for j, x in zip(res_time, res_title):
        title = x.get('title')
        href = x.get('href')
        if not href.startswith(('http:', 'https:')):
            url_name = href.split("/", 1)[-1]
            base_url = "https://www.jxmtc.edu.cn/"
            full_urls = f"{base_url}{url_name}"
        else:
            full_urls = href
        # print(full_urls)
        results.append([j, title, full_urls])

# def get_html_content(url):
#     response = requests.get(url)
#     text = response.content.decode('utf-8')
#     return text
#
#
# lists = []
# for url in full_urls:
#     html_content = get_html_content(full_urls)
#     soup = BeautifulSoup(html_content, 'lxml')
#     result = soup.find_all('div', class_="date")
#     # print(result)
#     for all in result:
#         a = all.find_all('span')
#         # print(len(a))

# print(results)
for j, i in enumerate(results):
    i.append('江西制造职业技术学院')
    i.insert(0, str(j))
    # swap the title and time columns so the row order matches the CSV header
    i[1], i[2] = i[2], i[1]

# for i in results:
#     print(i)

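# Write the rows to a CSV file with a header row.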
with open('jxzz.csv', "w", encoding='utf-8', newline='') as f:
    w = csv.writer(f)
    w.writerow(["序列", "标题", "时间", "链接", "新闻来源"])
    w.writerows(results)

with open('jxzz.txt', 'a', encoding='utf-8') as file:
    for row in results:
        row_str = ', '.join(map(str, row))
        file.write(row_str + "\n")


# connect to the MySQL database
con = pymysql.Connect(host="localhost", user="root", password="123456", charset="utf8")

cursor = con.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS JXZZ;")
cursor.execute("use jxzz;")
# create the table
cursor.execute('''CREATE TABLE IF NOT EXISTS news (
    sequence INT,
    title VARCHAR(255) NOT NULL,
    time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    link VARCHAR(255) NOT NULL,
    news_source VARCHAR(255) NOT NULL
);''')

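# Insert every scraped row into the news table.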
for i in results:
    cursor.execute("insert into news values (%s, %s, %s, %s, %s);", (i[0], i[1], i[2], i[3], i[4]))

# commit the changes and close the connection
con.commit()
cursor.close()
con.close()