"""Fetch the first list pages of a campus-news section and print a summary.

The script downloads the start page, reads the pagination ``<option>``
elements to discover further list pages (at most ``MAX_PAGES`` in total),
fetches each page, and prints:

1. the encoding ``requests`` reports for every page,
2. the HTTP status code of every page,
3. the rows matched by ``CJRH_PATTERN``,
4. ``march_articles`` (see the NOTE in ``main`` -- it is never filled),
5. every row matched by ``ARTICLE_PATTERN``,
6. the set of second fields of rows whose first field is a ``2024-03`` match.
"""
import re
from itertools import islice
from urllib.parse import urljoin

import requests
from lxml import etree

START_URL = 'https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen.html'
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"}
PAGER_OPTIONS_XPATH = '//div[@class="pages_con"]/span/select/option'
MAX_PAGES = 20
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server blocks forever

# NOTE(review): the two row patterns below contain a single capture group,
# but the code reads three fields (match[0], match[1], match[2]) from every
# match.  They look like they lost their surrounding HTML markup somewhere;
# restore the real patterns before running.  The strings are reproduced
# exactly as found.
# Intended to capture publication time, title and link of every article.
ARTICLE_PATTERN = re.compile(r'(.*?)')
# Intended to capture title, publication time and link of articles whose
# title mentions industry-education integration.
CJRH_PATTERN = re.compile(r'(.*?)')
# Captures text starting with "2024-03" (publication dates in March 2024).
MARCH_PATTERN = re.compile(r'(2024-03.*?)')


def fetch(url):
    """Download *url* with the browser-like headers and a timeout.

    The status code is deliberately not checked here: the script reports the
    status of every page instead of aborting on the first non-200 response.
    """
    return requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)


def collect_page_urls(first_page_html):
    """Return the list-page URLs to crawl, starting with ``START_URL``.

    The first pagination option is skipped (the start page is already in the
    list) and at most ``MAX_PAGES`` URLs are returned in total.
    """
    urls = [START_URL]
    dom = etree.HTML(first_page_html)
    if dom is None:  # lxml yields None for an empty document
        return urls
    options = dom.xpath(PAGER_OPTIONS_XPATH)
    for option in islice(options, 1, MAX_PAGES):
        value = option.get('value')
        if not value:
            # An option without a target cannot be fetched; skip it.
            continue
        # Absolute values pass through unchanged; relative ones are resolved
        # against the start page so they can be fetched.
        urls.append(urljoin(START_URL, value))
    return urls


def require_three_groups(pattern):
    """Raise ``ValueError`` unless *pattern* can yield three fields per match."""
    if pattern.groups < 3:
        raise ValueError(
            f"pattern {pattern.pattern!r} has {pattern.groups} capture "
            "group(s); at least 3 are needed"
        )


def main():
    """Crawl the list pages and print the collected results."""
    # Fail before any network traffic if a row pattern cannot deliver the
    # three fields that are read from each match below.
    require_three_groups(ARTICLE_PATTERN)
    require_three_groups(CJRH_PATTERN)

    first_response = fetch(START_URL)
    page_urls = collect_page_urls(first_response.content.decode('utf-8'))

    encodings = []
    status_codes = []
    cjrh_articles = []
    # NOTE(review): nothing ever appends to this list, so it always prints
    # as []; presumably it was meant to hold the March 2024 articles.
    march_articles = []
    articles = []
    march_titles = set()

    for index, url in enumerate(page_urls):
        # The start page was already downloaded to read the pager; reuse it.
        response = first_response if index == 0 else fetch(url)
        encodings.append(response.encoding)
        status_codes.append(response.status_code)

        if url == page_urls[0]:
            print(response.text)

        page_text = response.content.decode('utf-8')

        # Every article on the page, stored with the third group first.
        for match in ARTICLE_PATTERN.findall(page_text):
            articles.append([match[2], match[0], match[1]])

        # Articles selected by the industry-education-integration pattern.
        for match in CJRH_PATTERN.findall(page_text):
            cjrh_articles.append([match[0], match[1], match[2]])

        # Second field of every collected row whose first field equals one of
        # this page's March 2024 matches.
        march_dates = set(MARCH_PATTERN.findall(page_text))
        march_titles.update(
            row[1] for row in articles if row[0] in march_dates
        )

    print(encodings)
    print(status_codes)
    print(cjrh_articles)
    print(march_articles)
    print(articles)
    print(march_titles)


if __name__ == "__main__":
    main()