# Update 提取title.py
# Commit 4878db0351 (parent 655b3d0c84) by pwmglvzfq, 2 years ago — branch "turtle"
# Diff hunk: @ -1,77 +1,76 @@
# NOTE(review): the lines below were captured from a side-by-side web diff view,
# so old and new revisions were merged per line; the code blocks have been
# reconstructed to the post-commit (new) version.
"""Scraper for XJTU dean's-office notices.

Author: @阳春布泽
Date: 23-1-5
Builds a list of (date, title, url) tuples. NOTE (from the author): only the
crawler part is finished — front-end push (QQ bot / WeChat mini-program) is
still open; collaborators welcome.
"""
import requests
from bs4 import BeautifulSoup
import time

# Landing page of the dean's office; notices appear as <li><a> items.
url = 'http://dean.xjtu.edu.cn/'
# Desktop Chrome user-agent so the site serves the regular HTML page.
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
}
title_list = []   # accumulates (date, title, href) tuples scraped from the site
output_list = []  # tuples whose date is not yet recorded in path_data
path = 'jwc.txt'            # human-readable output file for new notices
path_data = 'old_data.txt'  # '//'-separated store of already-seen dates
def get_title(url, header, title_list):
    """Scrape (date, title, href) tuples from every notice link on *url*.

    Parameters:
        url: page to fetch.
        header: HTTP headers dict (user-agent spoof).
        title_list: list the tuples are appended to; also returned.
    """
    r = requests.get(url, headers=header)
    # The page may not declare its charset correctly; trust requests' guess.
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')
    for li in soup.find_all(name='li'):
        # First 4 characters of the <li> text serve as the "date" key —
        # assumed to be a date prefix; TODO confirm the exact format.
        date = li.get_text()[:4]
        try:
            for a in li.find_all(name='a', attrs={'target': "_blank"}):
                # a['title'] raises KeyError when the attribute is absent;
                # catch only that instead of the old bare `except:`, which
                # silently swallowed every possible error.
                title_list.append((date, a['title'], a['href']))
        except KeyError:
            print('failed')
    return title_list
def compare_title(title_list, path_data, output_list):
    """Append to *output_list* every tuple whose date is not yet recorded.

    Parameters:
        title_list: (date, title, href) tuples from get_title.
        path_data: path to a '//'-separated file of previously-seen dates
            (the format store_title writes).
        output_list: list the unseen tuples are appended to; also returned.

    NOTE(review): deduplication is by date (title[0]) only, so two distinct
    notices on the same date collapse — this matches what store_title
    persists, so the behavior is kept unchanged.
    """
    # 'with' guarantees the handle is closed even if reading raises
    # (the original leaked it on any exception before db.close()).
    with open(path_data, 'r', encoding='utf-8') as db:
        db_list = db.read().split('//')
    for title in title_list:
        if str(title[0]) not in db_list:
            output_list.append(title)
    return output_list
def store_title(title_list, db_path=None):
    """Persist the date of every tuple to the seen-dates file.

    Writes each date followed by '//' (so the file always ends with '//'),
    the exact byte layout the old 'wb' loop produced.

    Parameters:
        title_list: (date, title, href) tuples.
        db_path: destination file; defaults to the module-level path_data so
            existing callers are unaffected (backward-compatible extension
            that also makes the function testable).
    """
    if db_path is None:
        db_path = path_data  # module-level default store
    # Text mode + utf-8 emits the same bytes as the old manual bytes()
    # calls (the payload contains no newlines), and 'with' closes the
    # handle even if a write fails (the original leaked it on error).
    with open(db_path, 'w', encoding='utf-8') as db:
        db.write(''.join(f'{t[0]}//' for t in title_list))
def show_msg(output_list, path):
    """Write the new notices to *path*, one tuple per line.

    Every field is followed by a tab (including the last), then '\n' ends
    the line — i.e. 'date\ttitle\thref\t\n', reproducing the original
    byte layout exactly.
    """
    # Binary mode keeps '\n' verbatim on every platform, and 'with'
    # closes the file even if a write raises (the old code leaked it).
    with open(path, 'wb') as f:
        for record in output_list:
            for field in record:
                f.write(field.encode('utf-8'))
                f.write(b'\t')
            f.write(b'\n')
def print_msg(output_list):
    """Report how many new notices were delivered, then pause briefly."""
    count = len(output_list)
    print('已为您更新{}条通知'.format(count))
    # Short pause so the console message can be read before the script exits.
    time.sleep(3)
def main(url, header, title_list, path_data, output_list, path):
    """Run one full scrape cycle: fetch, diff against the store, persist,
    write the report file, and print a summary."""
    titles = get_title(url, header, title_list)
    fresh = compare_title(titles, path_data, output_list)
    store_title(titles)         # record what we have now seen
    show_msg(fresh, path)       # write only the unseen notices
    print_msg(fresh)
# Run only when executed as a script, not when imported as a module
# (the unconditional top-level call made importing this file trigger a scrape).
if __name__ == '__main__':
    main(url, header, title_list, path_data, output_list, path)

# (Web-editor UI residue captured with the page: "Loading… / Cancel / Save" —
# not part of the source file.)