You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
54 lines
1.8 KiB
54 lines
1.8 KiB
import re
|
|
import requests
|
|
# Browser-like User-Agent sent with every listing-page request.
h = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.61'
}

# Locally saved copy of the news index page; its page selector holds the
# URLs of the individual listing pages.
INDEX_FILE = '江西现代职业技术学院要闻.html'
MAX_PAGES = 20          # only the first 20 listing pages are visited
REQUEST_TIMEOUT = 10    # seconds; without a timeout requests.get can hang forever
TARGET_MONTH = '2024-03'
KEYWORD = '产教融合'

# All patterns use re.S because the matched HTML fragments span several lines.
PAGE_SELECT_RE = re.compile(r'<select class="page_zdy">.*?</select>', re.S)
PAGE_OPTION_RE = re.compile(r'<option id=".*?" value="(.*?)"', re.S)
NEWS_BLOCK_RE = re.compile(r'<div class="youc-li">.*?</div>', re.S)
# One match per news entry: (href, title, date).  Every group is lazy, so an
# entry is always paired with its own (nearest) date span.
ENTRY_RE = re.compile(
    r'<a href="(.*?)" title="(.*?)">.*?</span>.*?<span class="fr timee">(.*?)</span>',
    re.S,
)


def extract_page_urls(index_html, limit=MAX_PAGES):
    """Return the listing-page URLs found in the index page's page selector.

    Args:
        index_html: HTML text of the index page.
        limit: maximum number of URLs to return.

    Returns:
        The ``value`` attributes of the ``<option>`` tags inside the first
        ``<select class="page_zdy">`` element, truncated to ``limit``; an
        empty list when no such element exists.
    """
    selects = PAGE_SELECT_RE.findall(index_html)
    if not selects:
        return []
    return PAGE_OPTION_RE.findall(selects[0])[:limit]


def parse_entries(page_html):
    """Return ``(href, title, date)`` tuples for every news entry on a page.

    Only the first ``<div class="youc-li">`` block is inspected.  An empty
    list is returned when the page contains no such block.
    """
    blocks = NEWS_BLOCK_RE.findall(page_html)
    if not blocks:
        return []
    return ENTRY_RE.findall(blocks[0])


def titles_in_month(entries, month=TARGET_MONTH):
    """Return ``(title, date)`` pairs whose date string starts with ``month``.

    Filtering already-parsed entries (instead of embedding the month in the
    regex) prevents a title from being paired with a later entry's date when
    its own date does not match.
    """
    return [(title, date) for _href, title, date in entries
            if date.startswith(month)]


def entries_with_keyword(entries, keyword=KEYWORD):
    """Return the ``(href, title, date)`` entries whose title contains ``keyword``."""
    return [entry for entry in entries if keyword in entry[1]]


def fetch_page(url):
    """Download ``url`` with the browser-like headers and decode it as UTF-8."""
    response = requests.get(url, headers=h, timeout=REQUEST_TIMEOUT)
    return response.content.decode('utf8')


def main():
    """Print the news titles dated in TARGET_MONTH, one list per listing page."""
    with open(INDEX_FILE, 'r', encoding='utf8') as f:
        html = f.read()

    for url in extract_page_urls(html):
        print(titles_in_month(parse_entries(fetch_page(url))))


if __name__ == '__main__':
    main()