You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
url/正则表达式代码.txt

36 lines
1.8 KiB

import re
import requests
from lxml import etree
# Scrape the paginated "xiaoyuanyaowen" news list of www.jxxdxy.edu.cn and
# pull the list entries out of each page with regular expressions.

# Browser-like User-Agent header sent with every request.
h = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 SLBrowser/9.0.3.1311 SLBChan/109"}

# Without a timeout, requests waits forever on a stalled server; the timeout
# makes the call raise instead.
source = requests.get("http://www.jxxdxy.edu.cn/", headers=h, timeout=10).text

# Page 1 has no numeric suffix; later pages follow the pattern
#   https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen-2.html
#   https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen-3.html
url_lists = ["https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen.html"]
for pn in range(2, 21):  # pages 2..20 inclusive
    xh = f"https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen-{pn}.html"
    url_lists.append(xh)

# The patterns do not depend on the page being processed, so they are defined
# once, ahead of the fetch loop.

# Every list entry: captures (href, text of the "fl" span, text of the
# "fr timee" span).
lianjie = r'<li><a href="(.*?)" title=".*?"><span class="fl"><img src="/statics/xdxy/jnjs/images/ty.png" alt="" style="margin-right:10px;" >(.*?)</span><span class="fr timee">(.*?)</span></a></li>'

# Entries whose title attribute contains "产教融合": captures (href, title,
# text of the "fr timee" span).
# NOTE(review): the trailing "+" in "产教融合+" repeats only the last
# character, and the surrounding "[^<]+" / "[^>]+" require at least one
# character on each side of the phrase -- confirm both are intended.
cjrh = (r'<li><a href="([^>]+)" title="([^<]+产教融合+[^>]+)"><span class="fl"><img src="/statics/xdxy/jnjs/images/ty.png" alt="" style="margin-right:10px;" >.*?</span><span class="fr timee">(.*?)</span></a></li>')

# Entries whose "fr timee" span text starts with "2024-03": captures
# (href, title, text of the "fl" span, text of the "fr timee" span).
sy = (r'<li><a href="(.*?)" title="(.*?)"><span class="fl"><img src="/statics/xdxy/jnjs/images/ty.png" alt="" style="margin-right:10px;" >(.*?)</span><span class="fr timee">(2024-03.*?)</span></a></li>')

# One entry per fetched page, in the same order as url_lists.
encoding_lists = []
status_lists = []

for i in url_lists:
    res = requests.get(i, headers=h, timeout=10)
    encoding_lists.append(res.encoding)
    status_lists.append(res.status_code)

    # Decode the raw bytes explicitly as UTF-8 rather than relying on the
    # encoding requests derives for res.text.
    res_text = res.content.decode('utf-8')
    res_dom = etree.HTML(res_text)

    # NOTE(review): result / result1 / result2 are overwritten on every
    # iteration, so after the loop they hold only the last page's matches.
    # Accumulate into lists if matches from all pages are needed.
    result = re.findall(lianjie, res_text, re.S)
    result1 = re.findall(cjrh, res_text, re.S)
    result2 = re.findall(sy, res_text, re.S)