You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
113 lines
3.4 KiB
113 lines
3.4 KiB
import csv
|
|
import re
|
|
import pymysql
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from lxml import etree
|
|
import time
|
|
|
|
|
|
# Browser-like request headers; passed as `headers=` to the requests.get calls
# in this script.
h = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0",
}

# Fetch this URL once, decode the body as UTF-8, then pause for two seconds.
# NOTE(review): `source_text` is not read anywhere else in the visible script;
# confirm whether this warm-up request is still needed.
# The timeout keeps the script from hanging forever on an unresponsive server
# (requests applies no timeout by default).
source = requests.get(
    "https://www.jxmtc.edu.cn/cl%2BAPThawuXTWHX4ydhQKA%3D%3D?encrypt=1",
    headers=h,
    timeout=10,
)
source_text = source.content.decode('utf-8')
time.sleep(2)
# Listing pages to crawl: the index page plus the numbered pages
# https://ww.jxmtc.edu.cn/xwzx/wjtz/1.htm ... https://ww.jxmtc.edu.cn/xwzx/wjtz/13.htm
url_lists = ["https://ww.jxmtc.edu.cn/xwzx/wjtz.htm"]
url_lists.extend(
    f"https://ww.jxmtc.edu.cn/xwzx/wjtz/{page_no}.htm"
    for page_no in range(1, 14)
)
# Scrape every listing page into rows of [publish_time, title, absolute_link].
results = []
for page_url in url_lists:
    # The timeout keeps one stalled page from blocking the whole crawl.
    res = requests.get(page_url, headers=h, timeout=10)
    res_dom = etree.HTML(res.content.decode('utf-8'))
    if res_dom is None:
        # Nothing parsable came back for this page; skip it instead of
        # failing on `.xpath` of None.
        continue

    # Publish time, link and title are all read from the <a> of each list item.
    for anchor in res_dom.xpath('//div[@class="ej_zw_right_nfont"]/ul/li/a'):
        # Read the date from this anchor's own <span> so a list item without
        # a date cannot shift the time/title pairing of the items after it.
        span_texts = anchor.xpath('./span/text()')
        href = anchor.get('href')
        if not span_texts or href is None:
            continue

        if href.startswith(('http:', 'https:')):
            full_urls = href
        else:
            # Drop everything up to and including the first "/" (e.g. a
            # leading "../") and re-root the remainder on the site base URL.
            # NOTE(review): deeper relative prefixes such as "../../" keep one
            # "../" in the result; confirm the server resolves those links.
            url_name = href.split("/", 1)[-1]
            base_url = "https://www.jxmtc.edu.cn/"
            full_urls = f"{base_url}{url_name}"

        results.append([span_texts[0], anchor.get('title'), full_urls])
# def get_html_content(url):
#     response = requests.get(url)
#     text = response.content.decode('utf-8')
#     return text
#
# lists = []
# for url in full_urls:
#     html_content = get_html_content(full_urls)
#     soup = BeautifulSoup(html_content, 'lxml')
#     result = soup.find_all('div', class_="date")
#     # print(result)
#     for all in result:
#         a = all.find_all('span')
#         # print(len(a))

# print(results)
# Finalise each scraped row in place: [time, title, link] becomes
# [sequence, title, time, link, news_source], the column order used by the
# CSV header and the database insert.
for seq, row in enumerate(results):
    row.append('江西制造职业技术学院')
    row.insert(0, str(seq))
    # Swap time and title so that the title comes right after the sequence.
    row[1], row[2] = row[2], row[1]
# Export all rows to jxzz.csv (overwritten on each run), header row first.
# newline='' lets the csv module control line endings itself.
with open('jxzz.csv', "w", encoding='utf-8', newline='') as f:
    w = csv.writer(f)
    w.writerow(["序列", "标题", "时间", "链接", "新闻来源"])
    w.writerows(results)
# Append the same rows to jxzz.txt, one ", "-separated line per row.
# Mode 'a' means repeated runs accumulate lines rather than replace the file.
with open('jxzz.txt', 'a', encoding='utf-8') as file:
    file.writelines(', '.join(map(str, row)) + "\n" for row in results)
# Database connection.
# NOTE(review): the credentials are hard-coded; consider loading them from
# configuration or the environment.
con = pymysql.Connect(host="localhost", user="root", password="123456", charset="utf8")
try:
    cursor = con.cursor()
    try:
        # Create and select the database with the same spelling: MySQL
        # database names are case-sensitive on case-sensitive file systems.
        cursor.execute("CREATE DATABASE IF NOT EXISTS JXZZ;")
        cursor.execute("USE JXZZ;")

        # Create the table only when it is missing so the script can be re-run.
        cursor.execute('''CREATE TABLE IF NOT EXISTS news (
            sequence INT,
            title VARCHAR(255) NOT NULL,
            time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            link VARCHAR(255) NOT NULL,
            news_source VARCHAR(255) NOT NULL
        );''')

        # Insert every row; naming the columns keeps the statement valid even
        # if the table later gains extra columns.
        cursor.executemany(
            "INSERT INTO news (sequence, title, time, link, news_source) "
            "VALUES (%s, %s, %s, %s, %s);",
            [(row[0], row[1], row[2], row[3], row[4]) for row in results],
        )

        # Commit the changes.
        con.commit()
    finally:
        cursor.close()
finally:
    # Close the connection (the original called con.cursor() here by mistake).
    con.close()