You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

54 lines
2.0 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import re
import requests
# 1. Open the Jiangxi Modern Polytechnic College website
#    (http://www.jxxdxy.edu.cn/) and click "现代要闻" on the home page to
#    reach the "现代要闻" (campus news) column.
# 2. Build the URL list for all 20 pages of the column.
#    Page 1 has no numeric suffix; pages 2-20 insert "-<page number>" before
#    the ".html" extension.
url_head = 'https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen'
url_tail = '.html'
url_list = [url_head + url_tail] + [
    url_head + '-' + str(page) + url_tail for page in range(2, 21)
]
for url in url_list:
    print(url)
# 3. Fetch every page of the column with requests.get and a custom
#    User-Agent header, then print the first page's source followed by each
#    page's status code and response encoding.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0"}
# Seconds to wait for the server before giving up; without a timeout
# requests.get can block forever on a stalled connection.
REQUEST_TIMEOUT = 10

texts = []      # decoded HTML source of each page, in url_list order
page_info = []  # (status code, encoding detected by requests) for each page
for url in url_list:
    # Each page is downloaded exactly once and reused for both the
    # diagnostics below and the regex extraction in step 4.
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Record the encoding requests detected before overriding it.
    page_info.append((response.status_code, response.encoding))
    # Decode every page as UTF-8 so the Chinese titles matched in the later
    # steps are not garbled when requests guesses a different charset.
    # NOTE(review): assumes the site serves UTF-8 - confirm.
    response.encoding = "utf-8"
    texts.append(response.text)

# Source of page 1, then status code and detected encoding of every page.
print(texts[0])
for status_code, encoding in page_info:
    print(status_code)
    print(encoding)

# 4. Use a regular expression to extract the link, title and publish time of
#    every article on all 20 pages. Each match is a 3-tuple:
#    (href value, title value, text captured before the closing </span>).
my_re = '<li><a href="(.*?)" title="(.*?)".*?fr timee">(.*?)</span>'
# re.S lets ".*?" span line breaks between the <a> tag and the time <span>.
article_pattern = re.compile(my_re, re.S)
arr = []  # one list of match tuples per page
for text in texts:
    result = article_pattern.findall(text)
    arr.append(result)
    print(result)
    print(len(result))
# 5. Print the title, publish time and link of every article whose title
#    contains "产教融合" (industry-education integration).
# The search term is named `keyword` so it does not shadow the builtin `str`.
keyword = "产教融合"
for page_articles in arr:
    for article in page_articles:
        # article[1] is the second regex group, i.e. the title attribute.
        if keyword in article[1]:
            print(article)
# 6. Report the articles published in March 2024: print each matching
#    article, then the number of matches.
month = "2024-03"
march_articles = []
for page_articles in arr:
    for article in page_articles:
        # article[2] is the third regex group, i.e. the publish-time text.
        if month in article[2]:
            march_articles.append(article)
            print(article)
# Number of articles whose publish time contains "2024-03".
print(len(march_articles))