You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
94 lines
3.9 KiB
94 lines
3.9 KiB
import requests
|
|
import re
|
|
import tkinter as tk
|
|
from tkinter import Toplevel
|
|
|
|
# Default HTTP request headers: identify as a desktop browser via User-Agent.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/109.0.0.0 Safari/537.36 "
        "SLBrowser/9.0.3.1311 SLBChan/11"
    ),
}
|
|
|
|
def shicigefu():
    """Entry point of what appears to be the classical-poetry scraper GUI.

    NOTE(review): indentation was lost in this copy of the file, so the
    extent of this function's body cannot be read from the text. The
    definitions and Tk setup that follow are presumably nested inside it;
    confirm against the upstream file.
    """
|
|
def _parse_poems(content):
    """Extract poem records from the HTML text of one listing page.

    Args:
        content: HTML source of a listing page.

    Returns:
        A list of dicts with the keys "title", "author", "dynasty" and
        "poetic". The four field lists are combined with zip(), so the
        result is as long as the shortest of them.
    """
    titles = re.findall('<b>(.*?)</b>', content, re.DOTALL)
    # NOTE(review): the author and dynasty patterns below are identical, so
    # both fields always receive the same text. One of them presumably
    # should capture the other <a> inside <p class="source">; confirm
    # against the live page markup before changing either pattern.
    authors = re.findall('<p class="source">.*?<a.*?<a.*?>(.*?)</a>', content, re.DOTALL)
    dynastys = re.findall('<p class="source">.*?<a.*?<a.*?>(.*?)</a>', content, re.DOTALL)
    poetics = re.findall('<div class="contson.*?">(.*?)</div>', content, re.DOTALL)
    # Drop every remaining tag (e.g. <br />) from the poem body.
    new_poetics = [''.join(re.split('<.*?>|<.*? />', p)).strip() for p in poetics]

    return [
        {"title": title, "author": author, "dynasty": dynasty, "poetic": poetic}
        for title, author, dynasty, poetic in zip(titles, authors, dynastys, new_poetics)
    ]


def fetch_data():
    """Download listing pages 1-4 of gushiwen.cn and collect their poems.

    Returns:
        A list of dicts with the keys "title", "author", "dynasty" and
        "poetic", in page order.

    Raises:
        requests.exceptions.RequestException: on connection failure or when
            a page does not answer within the timeout.
    """
    gushici = []
    for page in range(1, 5):
        url = f"https://www.gushiwen.cn/default_{page}.aspx"
        # A timeout keeps a stalled server from blocking the caller forever
        # (this runs inside a Tk button callback).
        response = requests.get(url, headers=headers, timeout=10)
        if not response.ok:
            # Skip error pages instead of scraping their HTML.
            continue
        gushici.extend(_parse_poems(response.text))
    return gushici
|
|
|
|
def is_data_unique(new_data, existing_data):
    """Check whether none of the new records is already stored.

    Two records are the same when their "title", "author", "dynasty" and
    "poetic" values are all equal; any other keys are ignored.

    Args:
        new_data: Iterable of record dicts to check.
        existing_data: Iterable of record dicts already stored.

    Returns:
        False if at least one record of new_data also occurs in
        existing_data, True otherwise (including when new_data is empty).
    """
    fields = ("title", "author", "dynasty", "poetic")
    # Index the stored records once so each lookup is O(1) instead of a
    # full scan of existing_data per new record.
    known = {tuple(item[field] for field in fields) for item in existing_data}
    return not any(
        tuple(item[field] for field in fields) in known for item in new_data
    )
|
|
|
|
def read_existing_data_from_file(filename="gushi.txt"):
    """Read the poem records already stored in a text file.

    Each line is expected in the layout written by save_data_to_file:
    "标题: <title>, 作者: <author>, 朝代: <dynasty>, 内容: <content>".
    Lines that do not match this layout are skipped.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        A list of dicts with the keys "title", "author", "dynasty" and
        "poetic"; an empty list when the file does not exist.
    """
    # Anchor on the field labels rather than splitting on ", " and ": ":
    # poem text may itself contain those separators, which previously made
    # the whole line be dropped or the value be truncated.
    record = re.compile(
        r"标题: (?P<title>.*?), 作者: (?P<author>.*?), "
        r"朝代: (?P<dynasty>.*?), 内容: (?P<poetic>.*)"
    )
    existing_data = []
    try:
        with open(filename, "r", encoding="utf-8") as file:
            for line in file:
                # Strip only the line terminator so an empty content field
                # ("内容: ") still parses.
                match = record.fullmatch(line.rstrip("\r\n"))
                if match:
                    existing_data.append(match.groupdict())
    except FileNotFoundError:
        pass  # No file yet means nothing has been stored.
    return existing_data
|
|
|
|
def save_data_to_file(data, filename="gushi.txt"):
    """Append the records of data that are not yet stored in the file.

    A record counts as stored when a record with the same "title",
    "author", "dynasty" and "poetic" values is already in the file or
    occurred earlier in data. Previously a single already-stored record
    prevented every record of the batch from being written.

    Args:
        data: Iterable of record dicts with the four keys above.
        filename: Path of the UTF-8 text file to append to.
    """
    fields = ("title", "author", "dynasty", "poetic")
    seen = {
        tuple(item[field] for field in fields)
        for item in read_existing_data_from_file(filename)
    }

    fresh = []
    for item in data:
        key = tuple(item[field] for field in fields)
        if key not in seen:
            seen.add(key)  # also filters duplicates within this batch
            fresh.append(item)
    if not fresh:
        return

    # NOTE(review): values are written verbatim, so a poem containing a
    # newline spans several lines and will not be recognised as stored by a
    # line-based reader on the next run.
    with open(filename, "a", encoding="utf-8") as file:
        file.writelines(
            f"标题: {item['title']}, 作者: {item['author']}, 朝代: {item['dynasty']}, 内容: {item['poetic']}\n"
            for item in fresh
        )
|
|
|
|
def show_data_window(data):
    """Open a child window listing at most the first ten poem records.

    Args:
        data: Sequence of record dicts with the keys "title", "author",
            "dynasty" and "poetic".
    """
    window = Toplevel(root)
    window.title("古诗词信息")
    text_widget = tk.Text(window)
    text_widget.pack(expand=True, fill='both')

    for item in data[:10]:
        text_widget.insert(tk.END, f"标题: {item['title']}\n作者: {item['author']}\n朝代: {item['dynasty']}\n内容: {item['poetic']}\n\n")
    # No window.mainloop() here: this function runs from a button callback,
    # i.e. inside root.mainloop(), which already services the Toplevel.
    # Starting another loop would nest one event loop per click.
|
|
|
|
def fetch_and_save():
    """Scrape the poems, store them, then display them in a new window."""
    poems = fetch_data()
    save_data_to_file(poems)
    # show_data_window is expected to be resolvable from this scope.
    show_data_window(poems)
|
|
|
|
def save_button_action():
    """Scrape the poems and store them without opening a display window."""
    save_data_to_file(fetch_data())
|
|
|
|
# Build the main window.
root = tk.Tk()
root.title("古诗词爬虫")

# Create both action buttons first, then lay them out top to bottom.
fetch_button = tk.Button(root, text="开始爬取并显示", command=fetch_and_save)
save_button = tk.Button(root, text="仅储存新数据", command=save_button_action)
fetch_button.pack(pady=20)
save_button.pack(pady=10)

# Hand control to the Tk event loop.
root.mainloop()