Update 提取title.py

turtle
pwmglvzfq 2 years ago
parent 655b3d0c84
commit 4878db0351

@@ -1,77 +1,76 @@
'''
Author: @阳春布泽
Date: 2023-01-05
Builds a list of (date, title, URL) tuples for each notice
and writes the extracted date keys to disk.
'''
import requests
from bs4 import BeautifulSoup
import time
url = 'http://dean.xjtu.edu.cn/'
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
}
title_list = []
output_list = []
path = 'C:\\Users\\LZH\\Desktop\\jwc.txt'
path_data = 'C:\\Python\\程序\\爬取教务处通知\\old_data.txt'
def get_title(url, header, title_list):
    # Fetch the notice page and collect (date, title, href) for every link.
    r = requests.get(url, headers=header, timeout=10)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')
    for i in soup.find_all(name='li'):
        # The first four characters of the list item text serve as the date key.
        date = i.get_text()[:4]
        try:
            for j in i.find_all(name='a', attrs={'target': "_blank"}):
                title_list.append((date, j['title'], j['href']))
        except KeyError:
            # Links without a title attribute are skipped.
            print('failed')
    return title_list
def compare_title(title_list, path_data, output_list):
    # A notice counts as new if its date key has not been recorded yet.
    with open(path_data, 'r', encoding='utf-8') as db:
        db_list = db.read().split('//')
    for title in title_list:
        if str(title[0]) not in db_list:
            output_list.append(title)
    return output_list
def store_title(title_list, path_data):
    # Record every date key seen in this run, separated by '//'.
    with open(path_data, 'w', encoding='utf-8') as db:
        for title_tuple in title_list:
            db.write(title_tuple[0])
            db.write('//')
def show_msg(output_list, path):
    # Write each new notice as a tab-separated line: date, title, URL.
    with open(path, 'w', encoding='utf-8') as f:
        for i in output_list:
            for j in i:
                f.write(j)
                f.write('\t')
            f.write('\n')
def print_msg(output_list):
    print(f'Fetched {len(output_list)} new notice(s) for you')
    # Keep the message on screen for a moment before the script exits.
    time.sleep(3)
def main(url, header, title_list, path_data, output_list, path):
    title_list = get_title(url, header, title_list)
    output_list = compare_title(title_list, path_data, output_list)
    store_title(title_list, path_data)
    show_msg(output_list, path)
    print_msg(output_list)
main(url, header, title_list, path_data, output_list, path)
'''
Author: @阳春布泽
Date: 2023-01-05
Note: only the crawler part is finished so far; pushing the notifications to a
front end still needs to be solved. Anyone familiar with QQ bots or WeChat mini
programs is welcome to collaborate.
'''
import requests
from bs4 import BeautifulSoup
import time
url = 'http://dean.xjtu.edu.cn/'
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
}
title_list = []
output_list = []
path = 'jwc.txt'
path_data = 'old_data.txt'
def get_title(url, header, title_list):
    # Fetch the notice page and collect (date, title, href) for every link.
    r = requests.get(url, headers=header, timeout=10)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')
    for i in soup.find_all(name='li'):
        # The first four characters of the list item text serve as the date key.
        date = i.get_text()[:4]
        try:
            for j in i.find_all(name='a', attrs={'target': "_blank"}):
                title_list.append((date, j['title'], j['href']))
        except KeyError:
            # Links without a title attribute are skipped.
            print('failed')
    return title_list
def compare_title(title_list, path_data, output_list):
    # A notice counts as new if its date key has not been recorded yet.
    with open(path_data, 'r', encoding='utf-8') as db:
        db_list = db.read().split('//')
    for title in title_list:
        if str(title[0]) not in db_list:
            output_list.append(title)
    return output_list
def store_title(title_list, path_data):
    # Record every date key seen in this run, separated by '//'.
    with open(path_data, 'w', encoding='utf-8') as db:
        for title_tuple in title_list:
            db.write(title_tuple[0])
            db.write('//')
def show_msg(output_list, path):
    # Write each new notice as a tab-separated line: date, title, URL.
    with open(path, 'w', encoding='utf-8') as f:
        for i in output_list:
            for j in i:
                f.write(j)
                f.write('\t')
            f.write('\n')
def print_msg(output_list):
    print(f'Fetched {len(output_list)} new notice(s) for you')
    # Keep the message on screen for a moment before the script exits.
    time.sleep(3)
def main(url, header, title_list, path_data, output_list, path):
    title_list = get_title(url, header, title_list)
    output_list = compare_title(title_list, path_data, output_list)
    store_title(title_list, path_data)
    show_msg(output_list, path)
    print_msg(output_list)
main(url, header, title_list, path_data, output_list, path)
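
Both data files the script maintains are plain text: old_data.txt holds the date keys already seen, joined by '//', and jwc.txt holds one notice per line as tab-separated date, title, and URL. As a minimal sketch of consuming that output (the load_notices helper is hypothetical, not part of the script), jwc.txt can be read back like this:

# Sketch: read jwc.txt back into (date, title, url) tuples.
# Assumes the tab-separated layout written by show_msg above.
def load_notices(path='jwc.txt'):
    notices = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # show_msg writes a trailing tab before the newline, so drop empty fields.
            fields = [x for x in line.rstrip('\n').split('\t') if x]
            if len(fields) == 3:
                notices.append(tuple(fields))
    return notices

print(load_notices())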
