"""Fetch notices from the XJTU dean's office page and report the new ones.

Author: @阳春布泽
Date: 23-1-5

Scraped notices are kept as a list of ``(date, title, url)`` tuples.
After each run the extracted dates are written to a history file, and the
next run uses that file to decide which notices are new.
"""
import requests
from bs4 import BeautifulSoup
import time

url = 'http://dean.xjtu.edu.cn/'
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
}
title_list = []
output_list = []
# Report file listing the notices found to be new on this run.
path = 'C:\\Users\\LZH\\Desktop\\jwc.txt'
# History file holding the dates seen on the previous run, '//'-separated.
path_data = 'C:\\Python(旧)\\程序\\爬取教务处通知\\old_data.txt'

# Upper bound (seconds) for the HTTP request so a dead server cannot hang
# the script forever.
_REQUEST_TIMEOUT = 10


def get_title(url, header, title_list):
    """Download *url* and append every notice found on it to *title_list*.

    A notice is an ``<a target="_blank">`` element nested in an ``<li>``.
    For each one the tuple ``(date, title, href)`` is appended, where
    ``date`` is the first four characters of the ``<li>`` text and
    ``title`` / ``href`` are the anchor's attributes.

    Args:
        url: Address of the page to scrape.
        header: HTTP headers sent with the request.
        title_list: List that is extended in place with the notices.

    Returns:
        The same *title_list* object, for convenience.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
    """
    response = requests.get(url, headers=header, timeout=_REQUEST_TIMEOUT)
    # Refuse to parse an error page: it would yield zero notices and the
    # caller would then overwrite the history file with nothing.
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'html.parser')
    for item in soup.find_all(name='li'):
        # NOTE(review): presumably the <li> text starts with the date;
        # confirm against the live page markup.
        date = item.get_text()[:4]
        for link in item.find_all(name='a', attrs={'target': "_blank"}):
            try:
                title_list.append((date, link['title'], link['href']))
            except KeyError:
                # Anchor without a title/href attribute: skip it, keep going.
                print('failed')
    return title_list


def compare_title(title_list, path_data, output_list):
    """Append to *output_list* the notices whose date is not in the history.

    Args:
        title_list: Notices as ``(date, title, url)`` tuples.
        path_data: Path of the history file ('//'-separated dates, UTF-8).
            A missing file is treated as an empty history, so the first
            run reports every notice as new.
        output_list: List that is extended in place with the new notices.

    Returns:
        The same *output_list* object.
    """
    try:
        with open(path_data, 'r', encoding='utf-8') as db:
            seen_dates = set(db.read().split('//'))
    except FileNotFoundError:
        seen_dates = set()
    # NOTE(review): novelty is decided by date alone, so a new notice that
    # shares its date with an already-stored one is not reported.
    output_list.extend(
        title for title in title_list if str(title[0]) not in seen_dates
    )
    return output_list


def store_title(title_list, data_path=None):
    """Overwrite the history file with the dates of *title_list*.

    Each date is written followed by ``//``, encoded as UTF-8.

    Args:
        title_list: Notices as ``(date, title, url)`` tuples.
        data_path: History file to write. Defaults to the module-level
            ``path_data`` when omitted.
    """
    target = path_data if data_path is None else data_path
    # newline='' disables newline translation so the bytes on disk match
    # what a binary-mode write would produce.
    with open(target, 'w', encoding='utf-8', newline='') as db:
        db.write(''.join(entry[0] + '//' for entry in title_list))


def show_msg(output_list, path):
    """Write *output_list* to *path*, one notice per line.

    Every field of a notice is followed by a tab and every notice by a
    newline; the file is UTF-8 encoded and overwritten on each call.

    Args:
        output_list: Notices as tuples of strings.
        path: Destination file.
    """
    with open(path, 'w', encoding='utf-8', newline='') as report:
        report.writelines(
            ''.join(field + '\t' for field in notice) + '\n'
            for notice in output_list
        )


def print_msg(output_list):
    """Print how many notices were new, then pause for three seconds."""
    print(f'已为您更新{len(output_list)}条通知')
    # Leaves the message visible for a moment before the script exits.
    time.sleep(3)


def main(url, header, title_list, path_data, output_list, path):
    """Scrape the page, report the new notices and refresh the history.

    Args:
        url: Page to scrape.
        header: HTTP headers for the request.
        title_list: List receiving every scraped notice.
        path_data: History file of previously seen dates.
        output_list: List receiving the notices judged new.
        path: Report file for the new notices.
    """
    title_list = get_title(url, header, title_list)
    output_list = compare_title(title_list, path_data, output_list)
    # Write the report before touching the history: if the report cannot
    # be written, the next run will still see these notices as new.
    show_msg(output_list, path)
    store_title(title_list, path_data)
    print_msg(output_list)


if __name__ == '__main__':
    main(url, header, title_list, path_data, output_list, path)