|
|
|
@ -0,0 +1,54 @@
|
|
|
|
|
import re
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
|
|
# 1. Open the Jiangxi Modern Vocational and Technical College website
#    (http://www.jxxdxy.edu.cn/) and click "现代要闻" (headline news) on the
#    home page; the resulting page shows the "现代要闻" column.
# 2. Build the URL of every page of that column (20 pages in total) using a
#    Python list and a loop.
url_head = 'https://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen'
url_tail = '.html'

# Page 1 has no numeric suffix; pages 2 through 20 insert "-N" before the
# extension.
url_list = [url_head + url_tail]
url_list.extend(
    '{}-{}{}'.format(url_head, page_no, url_tail) for page_no in range(2, 21)
)

# Echo the generated URLs, one per line.
for url in url_list:
    print(url)
|
|
|
|
|
# Request headers passed to every requests.get call in this script: a
# desktop-browser (Edge/Chromium on Windows) User-Agent string.
headers = {
    "User-Agent": " ".join((
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36",
        "(KHTML, like Gecko)",
        "Chrome/124.0.0.0",
        "Safari/537.36",
        "Edg/124.0.0.0",
    )),
}
|
|
|
|
|
|
|
|
|
|
# 3. Use requests.get with a custom request header to fetch every page of the
#    "现代要闻" column, printing each response's encoding and status code and
#    the HTML source of page 1.  This part fetches and prints page 1.
# The timeout (seconds) stops the script from blocking indefinitely when the
# server accepts the connection but never answers.
res = requests.get(url_list[0], headers=headers, timeout=10)
# Decode the body as UTF-8 rather than with the encoding requests inferred
# from the response headers.
res.encoding = "utf-8"
print(res.text)
|
|
|
|
|
# Download each listing page exactly once.  For every page, print the HTTP
# status code and the encoding requests detected, then keep the HTML in
# `texts` for the regex extraction that follows.
texts = []
for url in url_list:
    # The timeout (seconds) prevents one stalled page from hanging the run.
    response = requests.get(url, headers=headers, timeout=10)
    print(response.status_code)
    print(response.encoding)
    # Decode as UTF-8, the same way the page-1 request above is decoded, so
    # the Chinese titles survive for the keyword filter later on.
    # NOTE(review): assumes every listing page is UTF-8, like page 1 - confirm.
    response.encoding = "utf-8"
    texts.append(response.text)
|
|
|
|
|
# 4. Use a regular expression to extract the publish time, title and link of
#    every article in the "现代要闻" column (all 20 pages).
# Capture groups, in order: the href value, the title value, and the text
# that sits between 'fr timee">' and '</span>'.
my_re = '<li><a href="(.*?)" title="(.*?)".*?fr timee">(.*?)</span>'
# re.S lets "." match newlines, so one entry may span several lines of HTML.
entry_pattern = re.compile(my_re, re.S)

# arr holds one list of (href, title, time-text) tuples per downloaded page.
arr = []
for page_html in texts:
    page_entries = entry_pattern.findall(page_html)
    arr.append(page_entries)
    # Show the entries found on this page and how many there were.
    print(page_entries)
    print(len(page_entries))
|
|
|
|
|
|
|
|
|
|
# 5. Print the record of every article whose title contains "产教融合".
# Each record is a tuple produced by the extraction regex; index 1 is the
# title capture group.
# The search term lives in `keyword` rather than `str`, so the builtin str()
# used when the page URLs are built is not shadowed if the script is re-run in
# the same interpreter session.
keyword = "产教融合"
for page_entries in arr:
    for record in page_entries:
        if keyword in record[1]:
            print(record)
|
|
|
|
|
|
|
|
|
|
# 6. Report the articles published in March 2024: print each matching record
#    (which includes the title), then the number of matching articles.
# Index 2 of a record is the publish-time capture group of the extraction
# regex.
# NOTE(review): assumes the site renders dates as "YYYY-MM-..." - confirm.
month = "2024-03"
march_count = 0
for page_entries in arr:
    for record in page_entries:
        if month in record[2]:
            print(record)
            march_count += 1
# The step asks for the article count as well as the titles.
print(march_count)
|