You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
50 lines
1.5 KiB
50 lines
1.5 KiB
import re
|
|
import requests
|
|
|
|
# Build the URLs of the campus-news listing pages.
# The first page has no numeric suffix; every later page N is "-N.html".
url_first = 'https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen'
url_last = '.html'

# Number of listing pages to crawl (pages 1..PAGE_COUNT inclusive).
PAGE_COUNT = 20

urls = [url_first + url_last]
urls.extend(
    f'{url_first}-{page}{url_last}' for page in range(2, PAGE_COUNT + 1)
)

# 2. Print the URLs of the 20 pages
# for url in urls:
#     print(url)
|
|
# Browser-like request headers passed to requests.get().
# The User-Agent mirrors a desktop Edge 124 browser; the product token must be
# spelled "Chrome/" and the engine comment "(KHTML, like Gecko)" for the
# string to look like a genuine browser UA.
head = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0'
    ),
}

# res = requests.get(urls[0], headers=head)
# res.encoding = 'utf-8'
# 3. Print the source of the first page
# print(res.text)

# Print the response status code and encoding
# for url in urls:
#     response = requests.get(url, headers=head)
#     print(response.status_code)
#     print(response.encoding)
|
|
# Download every listing page and keep its decoded HTML, in the same order
# as `urls`.
# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10

texts = []
for url in urls:
    response = requests.get(url, headers=head, timeout=REQUEST_TIMEOUT)
    # When the server declares no charset, requests falls back to ISO-8859-1
    # for text responses, which garbles Chinese text; use the encoding
    # detected from the body instead. Declared charsets are left untouched.
    if (response.encoding or '').lower() == 'iso-8859-1':
        response.encoding = response.apparent_encoding
    texts.append(response.text)
|
|
# Pattern with three capture groups per list item: the link (href), the
# title attribute, and the text of the element whose class ends in
# "fr timee" (the publish time).
my_re = '<li><a href="(.*?)" title="(.*?)".*?fr timee">(.*?)</span>'

# 4. Extract link, title and publish time from every page.
# re.S lets "." match newlines, so the lazy gaps may span several lines.
# arr holds one list per page; each list contains (link, title, time) tuples.
arr = [re.findall(my_re, page_html, re.S) for page_html in texts]
# for result in arr:
#     print(result)
#     print(len(result))

# 5. Get the title, publish time and link of the articles whose title
#    contains "产教融合" (industry-education integration)
# str = '产教融合'
# for text in arr:
#     for i in text:
#         if str in i[1]:
#             print(i)
|
|
|
|
# 6. Find the articles published in March 2024: list them and report how
#    many there are.
month = '2024-03'

# arr holds one list per page; each entry is a tuple whose third field
# (index 2) is the publish time, so a substring test selects the month.
march_articles = []
for page_articles in arr:
    for article in page_articles:
        if month in article[2]:
            print(article)
            march_articles.append(article)

# The task also asks for the number of matching articles, which printing the
# individual entries alone does not report.
print(len(march_articles))