jxxd/xd

import re
import requests
# HTTP request headers: a desktop-browser (Edge on Windows) User-Agent string,
# passed as ``headers=h`` when list pages are fetched.
h = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.61'
    ),
}
# Read the locally stored HTML page that contains the pagination drop-down.
with open('江西现代职业技术学院要闻.html','r',encoding='utf8') as f:
    html = f.read()

# Grab the whole <select class="page_zdy"> element; re.S lets '.' span the
# line breaks between its <option> children.
url_re = r'<select class="page_zdy">.*?</select>'
urls = re.findall(url_re, html, re.S)

# Each <option> carries one page URL in its "value" attribute. Only the first
# 20 values are kept. When the page has no pagination <select>, fall back to
# an empty list instead of failing with IndexError on urls[0].
urls_list_re = r'<option id=".*?" value="(.*?)"'
urls_list = re.findall(urls_list_re, urls[0], re.S)[:20] if urls else []

# Probe pass: request every collected page URL once. The response is only kept
# in `res` (the inspection prints were debugging aids), and `c` marks whether
# the first response has been seen.
c = 1
for url in urls_list:
    # Send the same browser headers as the scraping loop, and bound the wait:
    # requests applies no timeout by default, so an unresponsive server would
    # otherwise hang the script indefinitely.
    res = requests.get(url, headers=h, timeout=10)
    if c == 1:
        # Only the first response is singled out; later iterations skip this.
        c += 1


# Patterns do not change between pages, so they are defined once up front.
# Container <div> that wraps the news list of a page.
title_re_h = r'<div class="youc-li">.*?</div>'
# Every entry of the list: (href, title, date).
title_r = r'<a href="(.*?)" title="(.*?)">.*?</span>.*?<span class="fr timee">(.*?)</span>'
# Entries whose title contains "产教融合": (href, title, date).
# The former "融合+" matched the same strings, because the following [^<]+
# already absorbs any repeated character.
title_re = r'<a href="([^>]+)" title="([^>]+产教融合[^<]+)">.*?</span>.*?<span class="fr timee">(.*?)</span>'
# Entries whose date starts with "2024-03": (title, date).
# The former "03+" was redundant for the same reason.
title_re_1 = r'<a href="[^>]+" title="([^>]+)">.*?</span>.*?<span class="fr timee">(2024-03[^<]+)</span>'

for i in urls_list:
    # Bound the wait: requests applies no timeout by default.
    i_html = requests.get(i, headers=h, timeout=10)
    i_html = i_html.content.decode('utf8')

    title_re_h_d = re.findall(title_re_h, i_html, re.S)
    if not title_re_h_d:
        # No list container on this page: skip it instead of failing with
        # IndexError on title_re_h_d[0].
        print('no "youc-li" block found, skipped: ' + i)
        continue

    # Only the first container is searched.
    title_lis = re.findall(title_r, title_re_h_d[0], re.S)
    title_list = re.findall(title_re, title_re_h_d[0], re.S)
    title_list_1 = re.findall(title_re_1, title_re_h_d[0], re.S)
    # Only the date-filtered result is printed, one list per page.
    print(title_list_1)
Add xd 7 months ago			`import re`
			`import requests`
			`h = {`
			`'User-Agent':`
			`'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.61'`
			`}`
			`with open('江西现代职业技术学院要闻.html','r',encoding='utf8') as f:`
			`html = f.read()`
			`# print(html)`

			`url_re = r'<select class="page_zdy">.*?</select>'`
			`urls = re.findall(url_re,html,re.S)`

			`urls_list_re = r'<option id=".*?" value="(.*?)"'`
			`urls_list = re.findall(urls_list_re,urls[0],re.S)[:20]`
			`# print(urls_list)`
			`# title_re = '<li><a href="(.*?)" title=".*?">.*?<img src=".*?" alt="" style="margin-right:10px;">(.*?)</span><span class="fr timee">(.*?)</span></a></li>'`
			`# title_list = re.findall(title_re,html,re.S)`
			`# print(title_list)`

			`c = 1`
			`for url in urls_list:`
			`res = requests.get(url)`
			`# print(res.encoding)`
			`# print(res.status_code)`
			`if c == 1:`
			`#print(res.text)`
			`c = c+1`


			`for i in urls_list:`
			`i_html = requests.get(i, headers=h)`
			`i_html = i_html.content.decode('utf8')`
			`# print(i)`
			`# print(i_html)`
			`# title_re = r'<li>.*?<span class="fr timee">(.*?)</span>.*?</li>'`
			`title_re_h = r'<div class="youc-li">.*?</div>'`
			`title_re_h_d = re.findall(title_re_h,i_html,re.S)`

			`# print(title_re_h_d)`
			`title_r = r'<a href="(.*?)" title="(.*?)">.*?</span>.*?<span class="fr timee">(.*?)</span>'`
			`title_lis = re.findall(title_r, title_re_h_d[0], re.S)`
			`# print(title_lis)`

			`title_re = r'<a href="([^>]+)" title="([^>]+产教融合+[^<]+)">.*?</span>.*?<span class="fr timee">(.*?)</span>'`
			`title_list = re.findall(title_re,title_re_h_d[0],re.S)`
			`# print(title_list)`

			`title_re_1 = r'<a href="[^>]+" title="([^>]+)">.*?</span>.*?<span class="fr timee">(2024-03+[^<]+)</span>'`
			`title_list_1 = re.findall(title_re_1, title_re_h_d[0], re.S)`
			`print(title_list_1)`