"""Download the news-list-xiaoyuanyaowen listing pages of www.jxxdxy.edu.cn.

The script fetches the site's home page, builds the URLs of listing pages
1-20, downloads each page, records the encoding and HTTP status code that
``requests`` reports for it, and runs three regular expressions over the
UTF-8 decoded page text.
"""
import re

import requests
from lxml import etree

# Seconds to wait for the server before giving up.  Without a timeout,
# ``requests`` may block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Browser-like User-Agent sent with every request.
h = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/109.0.0.0 Safari/537.36 "
        "SLBrowser/9.0.3.1311 SLBChan/109"
    )
}

# Home page text.  It is fetched here but not read again by this script.
source = requests.get(
    "http://www.jxxdxy.edu.cn/", headers=h, timeout=REQUEST_TIMEOUT
).text

# Listing page URLs: the first page has no numeric suffix, pages 2..20 do:
#   https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen.html
#   https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen-2.html
#   https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen-3.html
url_lists = ["https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen.html"]
url_lists.extend(
    f"https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen-{pn}.html"
    for pn in range(2, 21)
)

# NOTE(review): the three pattern literals below are reproduced exactly as
# they appear in the reviewed copy of this file.  The HTML tags that
# presumably surrounded the capture groups look like they were stripped when
# the file was extracted (only bullets and line breaks remain) -- restore the
# original markup from version control before relying on these matches.
lianjie = r'''
  • (.*?)(.*?)
  • '''
cjrh = (r'''
  • .*?(.*?)
  • ''')
sy = (r'''
  • (.*?)(2024-03.*?)
  • ''')

# Compile once, outside the download loop; re.S lets "." match newlines too.
lianjie_re = re.compile(lianjie, re.S)
cjrh_re = re.compile(cjrh, re.S)
sy_re = re.compile(sy, re.S)

encoding_lists = []  # encoding reported by requests, one entry per page
status_lists = []    # HTTP status code, one entry per page

for i in url_lists:
    res = requests.get(i, headers=h, timeout=REQUEST_TIMEOUT)
    encoding_lists.append(res.encoding)
    status_lists.append(res.status_code)

    # Decode the raw bytes explicitly as UTF-8 instead of using res.text,
    # which would use the encoding guessed by requests.
    res_text = res.content.decode('utf-8')
    res_dom = etree.HTML(res_text)  # parsed tree; not used further below

    # NOTE(review): result, result1 and result2 are overwritten on every
    # iteration, so after the loop they hold the matches of the last page only.
    result = lianjie_re.findall(res_text)
    result1 = cjrh_re.findall(res_text)
    result2 = sy_re.findall(res_text)