parent
f43c9a904a
commit
fbe6f30f3c
@ -0,0 +1,68 @@
|
|||||||
|
import requests
|
||||||
|
from lxml import etree
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Scrape the paginated news listing that starts at START_URL and report:
#   * the encoding and HTTP status reported for every fetched page,
#   * every article whose title contains KEYWORD,
#   * every article whose date starts with MONTH_PREFIX, plus their titles,
#   * every article found on the fetched pages.

START_URL = 'https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen.html'
MAX_PAGES = 20          # the first page plus at most 19 pages from the pager
REQUEST_TIMEOUT = 10    # seconds; without it a stalled server blocks forever
KEYWORD = "产教融合"
MONTH_PREFIX = "2024-03"

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0"}

# Captures (title, link, date) from an anchor carrying title/href attributes
# that is followed by a <span> on the same line. Compiled once, used per page.
ARTICLE_PATTERN = re.compile(
    r'<a.*?title="(.*?)".*?href="(.*?)".*?<span.*?>(.*?)</span>')


def parse_articles(html):
    """Return one ``[date, title, link]`` row per article anchor in *html*.

    Rows appear in document order; an empty list is returned when nothing
    matches.
    """
    return [[date, title, link]
            for title, link, date in ARTICLE_PATTERN.findall(html)]


def filter_by_keyword(articles, keyword=KEYWORD):
    """Return ``[title, link, date]`` rows whose title contains *keyword*.

    *articles* are ``[date, title, link]`` rows as built by
    ``parse_articles``. Filtering parsed titles (rather than running a second
    regex with the keyword embedded in it) prevents a match from starting in
    one anchor and ending in a later anchor's title on the same line.
    """
    return [[title, link, date]
            for date, title, link in articles
            if keyword in title]


def filter_by_month(articles, month_prefix=MONTH_PREFIX):
    """Return the ``[date, title, link]`` rows whose date starts with *month_prefix*."""
    return [row for row in articles
            if row[0].strip().startswith(month_prefix)]


def collect_page_urls(html, first_url=START_URL, max_pages=MAX_PAGES):
    """Return the page URLs to visit: *first_url* plus the pager's entries.

    The first ``<option>`` of the pager is skipped because *first_url* is
    already in the list; at most ``max_pages - 1`` further options are used.
    Options without a ``value`` attribute are ignored so that ``None`` is
    never handed to ``requests.get``.
    """
    urls = [first_url]
    dom = etree.HTML(html)
    if dom is None:  # lxml yields None when it cannot build a tree
        return urls
    options = dom.xpath('//div[@class="pages_con"]/span/select/option')
    for option in options[1:max_pages]:
        value = option.get('value')
        if value:
            urls.append(value)
    return urls


def main():
    """Fetch the listing pages and print the collected results."""
    first_response = requests.get(
        START_URL, headers=header, timeout=REQUEST_TIMEOUT)
    url_lists = collect_page_urls(first_response.content.decode('utf-8'))

    encoding_lists = []
    status_lists = []
    all_20 = []

    for index, url in enumerate(url_lists):
        # The first page was already downloaded to read the pager; reuse it.
        if index == 0:
            res = first_response
        else:
            res = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)

        encoding_lists.append(res.encoding)
        status_lists.append(res.status_code)

        if index == 0:
            print(res.text)

        # Publication date, title and link of every article on this page.
        all_20.extend(parse_articles(res.content.decode('utf-8')))

    # Articles whose title contains the keyword: title, link, date.
    allCjrh = filter_by_keyword(all_20)

    # Articles published in the target month, and their distinct titles.
    allMarch = filter_by_month(all_20)
    march_titles = {row[1] for row in allMarch}

    print(encoding_lists)
    print(status_lists)
    print(allCjrh)
    print(allMarch)
    print(all_20)
    print(march_titles)


if __name__ == "__main__":
    main()
|
Loading…
Reference in new issue