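# (Residue from the repository web viewer, not part of the program; kept as comments so the module parses.)
# You can not select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
#
# 61 lines
# 1.8 KiB
#
# 7 months ago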
import requests
import re
# HTTP request headers shared by the fetch helpers below (qw and code).
# The User-Agent value is a desktop Edge/Chrome browser string.
head = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0'
}
# Build the URLs of all news listing pages.
def website(pages=20):
    """Return the URLs of the first ``pages`` news listing pages.

    Listing pages are numbered from 1, so the default produces
    ``news-list-xiaoyuanyaowen-1.html`` through
    ``news-list-xiaoyuanyaowen-20.html``.

    Args:
        pages: Number of listing pages to generate URLs for. Defaults to
            20, the value that was previously hard-coded.

    Returns:
        A list of URL strings in ascending page order (empty when
        ``pages`` is 0 or negative).
    """
    base = "http://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen-"
    return [base + str(page) + ".html" for page in range(1, pages + 1)]
# Print the response encoding, the status code and the HTML source of page 1.
def qw(urlList):
    """Fetch the first URL in ``urlList`` and print diagnostics about it.

    Prints, in order: the response encoding (after forcing it to UTF-8),
    the HTTP status code, and the decoded page source.

    Args:
        urlList: Sequence of page URLs; only the first element is fetched.

    Raises:
        IndexError: If ``urlList`` is empty.
    """
    # The headers must be passed by keyword: the second positional
    # argument of requests.get() is ``params``, which would put the
    # User-Agent into the query string instead of the request headers.
    response = requests.get(urlList[0], headers=head)
    response.encoding = "utf-8"
    print(response.encoding)
    print(response.status_code)
    print(response.text)
# Download the HTML source of every listing page.
def code(urlList):
    """Fetch every URL in ``urlList`` and return the page sources.

    Each response is decoded as UTF-8 before its text is stored.

    Args:
        urlList: Iterable of page URLs to download.

    Returns:
        A list of HTML strings, one per URL, in the same order as
        ``urlList``.
    """
    htmlList = []
    for url in urlList:
        # Pass the headers by keyword; the second positional argument of
        # requests.get() is ``params`` (query string), not ``headers``.
        response = requests.get(url, headers=head)
        response.encoding = "utf-8"
        htmlList.append(response.text)
    return htmlList
# Extract the link, title and publication date of every article.
def method(htmlList):
    """Extract ``(href, title, date)`` for every article in the given pages.

    The matches found on each page are printed as they are extracted and
    are also accumulated into the returned list (the previous version
    returned the builtin ``list`` type instead of any data).

    Args:
        htmlList: Iterable of HTML page sources.

    Returns:
        A flat list of ``(href, title, date)`` string tuples, in page
        order and then document order.
    """
    pattern = re.compile(
        '<li>.*?<a href="(.*?)" title="(.*?)">.*?<span class="fr timee">(.*?)</span>'
    )
    records = []
    for html in htmlList:
        matches = pattern.findall(html)
        print(matches)
        records.extend(matches)
    return records
# Titles, publication dates and links of articles whose title contains "产教融合".
def contain(htmlList, keyword="产教融合"):
    """Extract the articles whose title contains ``keyword``.

    Every article is matched first and the keyword test is applied to the
    captured title afterwards. Embedding the keyword inside the lazy regex
    groups allowed the ``href``/``title`` groups to stretch across earlier
    list items on the same line and report the wrong link.

    The filtered matches of each page are printed as they are found and
    are also accumulated into the returned list (the previous version
    returned the builtin ``list`` type instead of any data).

    Args:
        htmlList: Iterable of HTML page sources.
        keyword: Substring the article title must contain. Defaults to
            "产教融合", the value that was previously hard-coded.

    Returns:
        A flat list of ``(href, title, date)`` string tuples for the
        matching articles, in page order and then document order.
    """
    pattern = re.compile(
        '<li>.*?<a href="(.*?)" title="(.*?)">.*?<span class="fr timee">(.*?)</span>'
    )
    records = []
    for html in htmlList:
        matches = [item for item in pattern.findall(html) if keyword in item[1]]
        print(matches)
        records.extend(matches)
    return records
# Count and list the titles of the articles published in March 2024.
def time(htmlList, month="2024-03"):
    """Collect the titles of articles whose date starts with ``month``.

    Each list item's title and date are captured together and the date is
    checked afterwards. With the date prefix embedded in the DOTALL regex,
    the lazy ``.*?`` between the title and the date could skip over
    non-matching items, so the title of an earlier article was reported
    for a later article's date.

    The collected titles and their count are printed once all pages have
    been processed.

    Args:
        htmlList: Iterable of HTML page sources.
        month: Date prefix to select, compared against the start of the
            text inside ``<span class="fr timee">``. Defaults to
            "2024-03", the value that was previously hard-coded.

    Returns:
        The list of matching titles, in page order and then document
        order (earlier versions only printed it).
    """
    pattern = re.compile(
        '<li>.*?title="(.*?)">.*?<span class="fr timee">(.*?)</span></a></li>',
        re.S,
    )
    titles = []
    for html in htmlList:
        for title, date in pattern.findall(html):
            if date.startswith(month):
                titles.append(title)
    print(titles)
    print(len(titles))
    return titles