parent
d78f8b6341
commit
899aa287a9
@ -0,0 +1,53 @@
|
|||||||
|
import re
|
||||||
|
import requests
|
||||||
|
# Browser-like request headers sent with every page request.
h = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.61',
}

# Locally saved HTML page whose <select class="page_zdy"> pager lists the
# URLs of the pages to crawl.
INDEX_HTML_PATH = '江西现代职业技术学院要闻.html'
# Only the first MAX_PAGES pager entries are crawled.
MAX_PAGES = 20
# Seconds to wait for a server before giving up; without a timeout
# requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10
# Entries are kept when their date text starts with this prefix.
DATE_PREFIX = '2024-03'

_PAGER_RE = re.compile(r'<select class="page_zdy">.*?</select>', re.S)
_OPTION_URL_RE = re.compile(r'<option id=".*?" value="(.*?)"', re.S)
_LISTING_RE = re.compile(r'<div class="youc-li">.*?</div>', re.S)
# One entry == one <a ...>...</a>.  Matching anchor by anchor keeps a title
# from being paired with the date of a *later* entry, which a single lazy
# pattern spanning title and date does as soon as an entry's own date fails
# the date filter.
_ITEM_RE = re.compile(r'<a href="[^>]+" title="([^>]+)">(.*?)</a>', re.S)
_DATE_RE = re.compile(r'<span class="fr timee">([^<]+)</span>')


def extract_page_urls(index_html, limit=MAX_PAGES):
    """Return up to *limit* page URLs from the pager of *index_html*.

    The URLs are the ``value`` attributes of the ``<option>`` elements inside
    the first ``<select class="page_zdy">``.  Returns an empty list when the
    page has no such pager.
    """
    pager = _PAGER_RE.search(index_html)
    if pager is None:
        return []
    return _OPTION_URL_RE.findall(pager.group(0))[:limit]


def extract_news(page_html, date_prefix=DATE_PREFIX):
    """Return ``(title, date)`` tuples for entries dated with *date_prefix*.

    Only the first ``<div class="youc-li">`` block of *page_html* is
    inspected.  Entries without a date span, and pages without that block,
    contribute nothing.
    """
    listing = _LISTING_RE.search(page_html)
    if listing is None:
        return []

    matches = []
    for title, inner_html in _ITEM_RE.findall(listing.group(0)):
        date = _DATE_RE.search(inner_html)
        if date is not None and date.group(1).startswith(date_prefix):
            matches.append((title, date.group(1)))
    return matches


def fetch_page(url):
    """Download *url* with the shared headers and return it decoded as UTF-8."""
    response = requests.get(url, headers=h, timeout=REQUEST_TIMEOUT)
    return response.content.decode('utf8')


def main():
    """Print, page by page, the entries whose date starts with DATE_PREFIX."""
    with open(INDEX_HTML_PATH, 'r', encoding='utf8') as f:
        index_html = f.read()

    # Each page is fetched exactly once, always with the browser headers.
    for page_url in extract_page_urls(index_html):
        print(extract_news(fetch_page(page_url)))


if __name__ == '__main__':
    main()
|
||||||
|
|
||||||
|
|
Loading…
Reference in new issue