"""Scrape the paginated news listing of www.jxxdxy.edu.cn and summarise it.

The script downloads the first listing page, reads the page selector to find
further listing pages (at most ``MAX_PAGES`` in total), and then, for every
page, records the response encoding and status code and extracts:

* every article as ``[publish time, title, link]``;
* articles matched by the "产教融合" pattern;
* articles whose publish time matches the March 2024 pattern.

All collected data is printed at the end.
"""
import re
from urllib.parse import urljoin

import requests
from lxml import etree

LIST_URL = 'https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen.html'

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"}

# Total number of listing pages to visit, the first page included.
MAX_PAGES = 20

# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block the script forever.
REQUEST_TIMEOUT = 10

# NOTE(review): the two article patterns below are reproduced exactly as found
# in the file. They contain a single capture group, but the extraction code
# reads three groups per match, so the HTML markup that originally surrounded
# the groups appears to have been lost. Restore the full patterns (three
# capture groups each) before running; main() refuses to start otherwise.
#
# Publish time, title and link of every article on a listing page.
ARTICLE_PATTERN = re.compile(r'(.*?)')
# Title, publish time and link of articles whose title contains "产教融合".
CJRH_PATTERN = re.compile(r'(.*?)')
# Publish times that fall in March 2024.
MARCH_PATTERN = re.compile(r'(2024-03.*?)')


def fetch(url):
    """Return the HTTP response for *url*, sent with the browser-like headers."""
    return requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)


def collect_page_urls(first_page_html):
    """Return the listing-page URLs to visit, starting with ``LIST_URL``.

    The remaining URLs come from the ``value`` attribute of the ``<option>``
    elements of the page selector. The first option is skipped (the first page
    is already in the list) and at most ``MAX_PAGES`` URLs are returned.
    """
    urls = [LIST_URL]
    dom = etree.HTML(first_page_html)
    if dom is None:
        # Nothing could be parsed, so only the first page is known.
        return urls
    options = dom.xpath('//div[@class="pages_con"]/span/select/option')
    for option in options[1:MAX_PAGES]:
        value = option.get('value')
        if not value:
            # An option without a target cannot be requested.
            continue
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # against the listing URL.
        urls.append(urljoin(LIST_URL, value))
    return urls


def check_three_groups(pattern, label):
    """Raise ``ValueError`` unless *pattern* has exactly three capture groups.

    ``findall`` only yields 3-tuples for such patterns; with any other group
    count the per-match indexing in main() would fail or read wrong fields.
    """
    if pattern.groups != 3:
        raise ValueError(
            f"{label} pattern {pattern.pattern!r} has {pattern.groups} "
            "capture group(s); exactly 3 are required"
        )


def main():
    """Fetch the listing pages, extract the article data and print it."""
    # Fail before any network traffic if the patterns cannot work.
    check_three_groups(ARTICLE_PATTERN, "article")
    check_three_groups(CJRH_PATTERN, "cjrh")

    first_response = fetch(LIST_URL)
    page_urls = collect_page_urls(first_response.content.decode('utf-8'))

    encodings = []
    status_codes = []
    cjrh_articles = []
    march_articles = []
    articles = []
    march_titles = set()

    for index, url in enumerate(page_urls):
        # The first page was already downloaded to read the page selector.
        res = first_response if index == 0 else fetch(url)
        encodings.append(res.encoding)
        status_codes.append(res.status_code)
        if index == 0:
            print(res.text)
        page_html = res.content.decode('utf-8')

        # Publish time, title and link.
        for match in ARTICLE_PATTERN.findall(page_html):
            articles.append([match[2], match[0], match[1]])

        # Articles whose title contains "产教融合": title, publish time, link.
        for match in CJRH_PATTERN.findall(page_html):
            cjrh_articles.append([match[0], match[1], match[2]])

        # Articles published in March 2024: every article collected so far
        # whose publish time equals one of the March dates found on this page.
        march_dates = set(MARCH_PATTERN.findall(page_html))
        if march_dates:
            for row in articles:
                if row[0] in march_dates and row[1] not in march_titles:
                    march_titles.add(row[1])
                    march_articles.append(row)

    print(encodings)
    print(status_codes)
    print(cjrh_articles)
    print(march_articles)
    print(articles)
    print(march_titles)


if __name__ == "__main__":
    main()