# Update 提取title.py
# Commit 4878db0351 (parent 655b3d0c84) by pwmglvzfq, 2 years ago — branch "turtle"
# Diff hunk: @ -1,77 +1,76 @@
# NOTE(review): the lines below were captured from a side-by-side web diff view,
# so old and new revisions were merged per line; the code blocks have been
# reconstructed to the post-commit (new) version.
"""Scraper for XJTU dean's-office notices.

Author: @阳春布泽
Date: 23-1-5
Builds a list of (date, title, url) tuples. NOTE (from the author): only the
crawler part is finished — front-end push (QQ bot / WeChat mini-program) is
still open; collaborators welcome.
"""
import requests
from bs4 import BeautifulSoup
import time

# Landing page of the dean's office; notices appear as <li><a> items.
url = 'http://dean.xjtu.edu.cn/'
# Desktop Chrome user-agent so the site serves the regular HTML page.
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
}
title_list = []   # accumulates (date, title, href) tuples scraped from the site
output_list = []  # tuples whose date is not yet recorded in path_data
path = 'jwc.txt'            # human-readable output file for new notices
path_data = 'old_data.txt'  # '//'-separated store of already-seen dates
def get_title(url, header, title_list):
    """Scrape (date, title, href) tuples from every notice link on *url*.

    Parameters:
        url: page to fetch.
        header: HTTP headers dict (user-agent spoof).
        title_list: list the tuples are appended to; also returned.
    """
    r = requests.get(url, headers=header)
    # The page may not declare its charset correctly; trust requests' guess.
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')
    for li in soup.find_all(name='li'):
        # First 4 characters of the <li> text serve as the "date" key —
        # assumed to be a date prefix; TODO confirm the exact format.
        date = li.get_text()[:4]
        try:
            for a in li.find_all(name='a', attrs={'target': "_blank"}):
                # a['title'] raises KeyError when the attribute is absent;
                # catch only that instead of the old bare `except:`, which
                # silently swallowed every possible error.
                title_list.append((date, a['title'], a['href']))
        except KeyError:
            print('failed')
    return title_list
def compare_title(title_list, path_data, output_list):
    """Append to *output_list* every tuple whose date is not yet recorded.

    Parameters:
        title_list: (date, title, href) tuples from get_title.
        path_data: path to a '//'-separated file of previously-seen dates
            (the format store_title writes).
        output_list: list the unseen tuples are appended to; also returned.

    NOTE(review): deduplication is by date (title[0]) only, so two distinct
    notices on the same date collapse — this matches what store_title
    persists, so the behavior is kept unchanged.
    """
    # 'with' guarantees the handle is closed even if reading raises
    # (the original leaked it on any exception before db.close()).
    with open(path_data, 'r', encoding='utf-8') as db:
        db_list = db.read().split('//')
    for title in title_list:
        if str(title[0]) not in db_list:
            output_list.append(title)
    return output_list
def store_title(title_list, db_path=None):
    """Persist the date of every tuple to the seen-dates file.

    Writes each date followed by '//' (so the file always ends with '//'),
    the exact byte layout the old 'wb' loop produced.

    Parameters:
        title_list: (date, title, href) tuples.
        db_path: destination file; defaults to the module-level path_data so
            existing callers are unaffected (backward-compatible extension
            that also makes the function testable).
    """
    if db_path is None:
        db_path = path_data  # module-level default store
    # Text mode + utf-8 emits the same bytes as the old manual bytes()
    # calls (the payload contains no newlines), and 'with' closes the
    # handle even if a write fails (the original leaked it on error).
    with open(db_path, 'w', encoding='utf-8') as db:
        db.write(''.join(f'{t[0]}//' for t in title_list))
def show_msg(output_list, path):
    """Write the new notices to *path*, one tuple per line.

    Every field is followed by a tab (including the last), then '\n' ends
    the line — i.e. 'date\ttitle\thref\t\n', reproducing the original
    byte layout exactly.
    """
    # Binary mode keeps '\n' verbatim on every platform, and 'with'
    # closes the file even if a write raises (the old code leaked it).
    with open(path, 'wb') as f:
        for record in output_list:
            for field in record:
                f.write(field.encode('utf-8'))
                f.write(b'\t')
            f.write(b'\n')
def print_msg(output_list):
    """Report how many new notices were delivered, then pause briefly."""
    count = len(output_list)
    print('已为您更新{}条通知'.format(count))
    # Short pause so the console message can be read before the script exits.
    time.sleep(3)
def main(url, header, title_list, path_data, output_list, path):
    """Run one full scrape cycle: fetch, diff against the store, persist,
    write the report file, and print a summary."""
    titles = get_title(url, header, title_list)
    fresh = compare_title(titles, path_data, output_list)
    store_title(titles)         # record what we have now seen
    show_msg(fresh, path)       # write only the unseen notices
    print_msg(fresh)
# Run only when executed as a script, not when imported as a module
# (the unconditional top-level call made importing this file trigger a scrape).
if __name__ == '__main__':
    main(url, header, title_list, path_data, output_list, path)

# (Web-editor UI residue captured with the page: "Loading… / Cancel / Save" —
# not part of the source file.)