# Scrape the "xiaoyuanyaowen" news-list pages of www.jxxdxy.edu.cn, extract
# the entries matched by `my_re` from every page, and print the entries
# whose third captured field contains the target month, plus their count.
import re
import sys

import requests

url_first = 'https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen'
url_last = '.html'

# Page 1 has no numeric suffix; pages 2..20 are "<prefix>-<n>.html".
urls = [url_first + url_last]
urls.extend(url_first + '-' + str(page) + url_last for page in range(2, 21))

# Step 2: print the URLs of the 20 pages.
# for url in urls:
#     print(url)

# NOTE(review): "Chrom/" looks like a typo for "Chrome/"; the header value is
# left exactly as it was because it is sent to the server at runtime.
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrom/124.0.0.0 Safari/537.36 Edg/124.0.0.0'
}

# res = requests.get(urls[0], headers=head)
# res.encoding = 'utf-8'
# Step 3: print the HTML source of the first page.
# print(res.text)

# Print the status code and detected encoding of every response.
# for url in urls:
#     response = requests.get(url, headers=head)
#     print(response.status_code)
#     print(response.encoding)

# Download every list page. A timeout is passed because requests otherwise
# waits indefinitely on a stalled connection. Pages that fail (network error
# or non-2xx status) are reported on stderr and skipped, so an error page is
# never parsed as if it were a news list.
# NOTE(review): the exploratory snippet above forced utf-8 on the first page;
# this loop relies on the encoding requests picks for each response -- confirm
# the printed titles are not garbled.
texts = []
for url in urls:
    try:
        response = requests.get(url, headers=head, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('warning: skipping ' + url + ': ' + str(exc), file=sys.stderr)
        continue
    texts.append(response.text)

# NOTE(review): this pattern appears to have lost its HTML tags (only a
# bullet and a single capture group remain). The filters below index fields
# [1] and [2] of every match, which needs a pattern with at least three
# capture groups -- restore the original pattern before relying on the output.
my_re = '\n  • (.*?)'

# Step 4: collect the link, title and publication time of every entry.
# Compile once; re.S lets "." span line breaks inside the HTML.
entry_pattern = re.compile(my_re, re.S)
arr = [entry_pattern.findall(text) for text in texts]
# for result in arr:
#     print(result)
#     print(len(result))

# Step 5: print title, publication time and link of every article whose
# title contains the keyword below ("industry-education integration").
# keyword = '产教融合'
# for text in arr:
#     for i in text:
#         if keyword in i[1]:
#             print(i)

# Step 6: print every article published in March 2024 and how many there are.
month = '2024-03'
month_count = 0
for page_entries in arr:
    for entry in page_entries:
        # entry[2] is expected to be the publication-date field.
        if month in entry[2]:
            print(entry)
            month_count += 1
print('Articles published in ' + month + ': ' + str(month_count))