Add xd

1 year ago · 899aa287a9
parent d78f8b6341
commit 899aa287a9
1 changed files with 53 additions and 0 deletions
--- a/53
+++ b/53
@ -0,0 +1,53 @@
 import re
 import sys

 import requests
 # Request headers: a desktop-browser User-Agent string, passed to
 # requests.get() through its `headers` argument.
 h = {
     'User-Agent':
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.61'
 }
 # Read the locally saved HTML page that holds the page-selector element.
 with open('江西现代职业技术学院要闻.html', 'r', encoding='utf8') as f:
    html = f.read()
 # Grab every <select class="page_zdy"> ... </select> block; re.S lets `.`
 # cross line breaks because the element spans several lines of markup.
 url_re = r'<select class="page_zdy">.*?</select>'
 urls = re.findall(url_re, html, re.S)
 if not urls:
    # Without the selector there is nothing to crawl; stop with a clear
    # message instead of failing on urls[0] with a bare IndexError.
    raise SystemExit('no <select class="page_zdy"> block found in the saved page')
 # Each <option> carries a URL in its value attribute; only the options of the
 # first selector are read, and only the first 20 of them are kept.
 urls_list_re = r'<option id=".*?" value="(.*?)"'
 urls_list = re.findall(urls_list_re, urls[0], re.S)[:20]
 # print(urls_list)
 # title_re = '<li><a href="(.*?)" title=".*?">.*?<img src=".*?" alt="" style="margin-right:10px;">(.*?)</span><span class="fr timee">(.*?)</span></a></li>'
 # title_list = re.findall(title_re,html,re.S)
 # print(title_list)
 # Crawl each URL in urls_list once and print the (title, date) pairs whose
 # date starts with 2024-03.
 #
 # The earlier warm-up loop that fetched every URL a second time (without
 # headers or a timeout) and threw the response away has been removed: it
 # doubled the network traffic and produced no output.
 #
 # The patterns below do not change between pages, so they are defined once.
 # First <div class="youc-li"> ... </div>; non-greedy, so it ends at the first
 # closing </div> that follows.
 title_re_h = r'<div class="youc-li">.*?</div>'
 # (href, title, date) for every link inside that div.
 title_r = r'<a href="(.*?)" title="(.*?)">.*?</span>.*?<span class="fr timee">(.*?)</span>'
 # (href, title, date) only for titles containing 产教融合 with at least one
 # character on each side of it.
 title_re = r'<a href="([^>]+)" title="([^>]+产教融合+[^<]+)">.*?</span>.*?<span class="fr timee">(.*?)</span>'
 # (title, date) only for dates that start with 2024-03.
 # NOTE(review): in `2024-03+` the `+` quantifies just the digit 3 - confirm
 # that a literal "2024-03" prefix is what was meant.
 title_re_1 = r'<a href="[^>]+" title="([^>]+)">.*?</span>.*?<span class="fr timee">(2024-03+[^<]+)</span>'
 for i in urls_list:
    try:
        # A timeout keeps one unresponsive page from hanging the whole run.
        i_html = requests.get(i, headers=h, timeout=10)
        i_html.raise_for_status()
    except requests.RequestException as exc:
        # Report the failed page on stderr and carry on with the rest.
        print('skipping {}: {}'.format(i, exc), file=sys.stderr)
        continue
    # Decode the raw bytes explicitly as UTF-8 rather than relying on the
    # encoding requests guesses for response.text.
    i_html = i_html.content.decode('utf8')
    title_re_h_d = re.findall(title_re_h, i_html, re.S)
    if not title_re_h_d:
        # Page has no listing container; indexing [0] would raise IndexError.
        print('skipping {}: no <div class="youc-li"> block found'.format(i),
              file=sys.stderr)
        continue
    # Only the first matching container is searched. title_lis and title_list
    # are computed as before but not printed.
    title_lis = re.findall(title_r, title_re_h_d[0], re.S)
    title_list = re.findall(title_re, title_re_h_d[0], re.S)
    title_list_1 = re.findall(title_re_1, title_re_h_d[0], re.S)
    print(title_list_1)