You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

54 lines
1.8 KiB

7 months ago
import re
import requests
# HTTP request headers: a desktop browser User-Agent (Edge 118 on Windows 10).
h = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.61'
    ),
}
# Load the locally saved copy of the news index page; its page selector
# (<select class="page_zdy">) carries one <option> per listing page.
with open('江西现代职业技术学院要闻.html', 'r', encoding='utf8') as f:
    html = f.read()

# Isolate the page selector first so that <option> tags elsewhere in the
# document are not picked up.
url_re = r'<select class="page_zdy">.*?</select>'
urls = re.findall(url_re, html, re.S)
if not urls:
    # Without the selector there is nothing to crawl; stop with a clear
    # message instead of failing on urls[0] with a bare IndexError.
    raise SystemExit('no <select class="page_zdy"> element found in the saved page')

# Each option's value attribute is the URL of one listing page.
urls_list_re = r'<option id=".*?" value="(.*?)"'
# Only the first 20 listing pages are processed.
urls_list = re.findall(urls_list_re, urls[0], re.S)[:20]
# The first <div class="youc-li"> block of a listing page holds the news entries.
_LIST_RE = re.compile(r'<div class="youc-li">.*?</div>', re.S)

# One news entry: captures the anchor's title attribute and the text of its
# date span. The date is captured unconditionally so that every match stays
# inside a single entry.
_ENTRY_RE = re.compile(
    r'<a href="[^"]*" title="([^"]*)">.*?</span>.*?'
    r'<span class="fr timee">([^<]*)</span>',
    re.S,
)


def extract_titles(page_html, date_prefix='2024-03'):
    """Return the (title, date) pairs of news entries dated with *date_prefix*.

    Args:
        page_html: Full HTML text of one listing page.
        date_prefix: Leading text the entry's date must start with,
            e.g. ``'2024-03'`` for March 2024.

    Returns:
        A list of ``(title, date)`` tuples in page order. The list is empty
        when the page has no ``<div class="youc-li">`` block or no entry
        matches.
    """
    section = _LIST_RE.search(page_html)
    if section is None:
        return []
    # Filter after extraction. Matching the date inside the regex would let
    # the lazy `.*?` run past a non-matching entry's date and pair that
    # entry's title with the date of a later entry.
    return [
        (title, date)
        for title, date in _ENTRY_RE.findall(section.group(0))
        if date.startswith(date_prefix)
    ]


def fetch_page(url, headers=None, timeout=10):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Address of the listing page.
        headers: Optional HTTP headers to send with the request.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.content.decode('utf8')


if __name__ == '__main__':
    # Each listing page is fetched once, with the browser headers, and its
    # matching entries are printed as one list per page.
    for page_url in urls_list:
        print(extract_titles(fetch_page(page_url, headers=h)))