# You cannot select more than 25 topics.
# Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
# 61 lines
# 1.8 KiB
# 61 lines
# 1.8 KiB
import re

import requests
# Dictionary handed to requests.get() by the fetch helpers in this file.
# It carries a desktop-browser User-Agent string (Edge 124 on Windows 10).
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0',
}
# Build the URLs of all news-list pages.
def website(pages=20):
    """Return the URLs of the news-list pages on www.jxxdxy.edu.cn.

    Args:
        pages: Number of list pages to generate, numbered from 1.
            Defaults to 20, the count that used to be hard-coded here.

    Returns:
        A list of page URLs in ascending page order; empty when
        ``pages`` is 0 or negative.
    """
    base = "http://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen-"
    return [base + str(page) + ".html" for page in range(1, pages + 1)]
# Print the response encoding, status code and HTML source of the first page.
def qw(urlList):
    """Fetch the first URL and print its encoding, status code and body.

    Args:
        urlList: Sequence of page URLs; only ``urlList[0]`` is requested.

    Raises:
        IndexError: If ``urlList`` is empty.
    """
    # The headers must be passed by keyword: the second positional parameter
    # of requests.get() is ``params``, so passing ``head`` positionally would
    # put the User-Agent into the query string instead of the request headers.
    response = requests.get(urlList[0], headers=head)
    # Force UTF-8 decoding of the body before reading ``.text``.
    response.encoding = "utf-8"
    print(response.encoding)
    print(response.status_code)
    print(response.text)
# Download the HTML source of every page.
def code(urlList):
    """Fetch every URL and return the decoded HTML bodies.

    Args:
        urlList: Iterable of page URLs to request.

    Returns:
        A list with the response text of each URL, in input order.
    """
    htmlList = []
    for url in urlList:
        # The headers must be passed by keyword: the second positional
        # parameter of requests.get() is ``params`` (query string), not
        # ``headers``.
        response = requests.get(url, headers=head)
        # Force UTF-8 decoding of the body before reading ``.text``.
        response.encoding = "utf-8"
        htmlList.append(response.text)
    return htmlList
# Extract the link, title and publication time of every article.
def method(htmlList):
    """Extract (href, title, time) triples from each page's HTML.

    Each page's matches are printed as they are found and also collected.
    Previously the function returned the builtin ``list`` type instead of
    the collected results.

    Args:
        htmlList: Iterable of HTML source strings, one per list page.

    Returns:
        A list with one entry per page; each entry is the list of
        ``(href, title, time)`` tuples matched on that page.
    """
    # NOTE(review): no re.S flag is used, so '.' does not match newlines and
    # a single match cannot span several lines -- confirm this is intended.
    obtain = '<li>.*?<a href="(.*?)" title="(.*?)">.*?<span class="fr timee">(.*?)</span>'
    results = []
    for html in htmlList:
        answer = re.findall(obtain, html)
        print(answer)
        results.append(answer)
    return results
# Titles, publication times and links of articles whose title contains "产教融合".
def contain(htmlList):
    """Extract (href, title, time) triples for titles containing "产教融合".

    Each page's matches are printed as they are found and also collected.
    Previously the function returned the builtin ``list`` type instead of
    the collected results.

    Args:
        htmlList: Iterable of HTML source strings, one per list page.

    Returns:
        A list with one entry per page; each entry is the list of
        ``(href, title, time)`` tuples matched on that page.
    """
    # The attribute captures use [^"]* rather than .*? so they cannot run
    # past the closing quote.  With .*?, a non-matching item followed by a
    # matching one on the same line made the regex backtrack across both
    # items and report the first item's href glued to later markup.
    obtain = '<li>.*?<a href="([^"]*)" title="([^"]*产教融合[^"]*)">.*?<span class="fr timee">(.*?)</span>'
    results = []
    for html in htmlList:
        answer = re.findall(obtain, html)
        print(answer)
        results.append(answer)
    return results
# Count the articles published in March 2024 and list all their titles.
def time(htmlList, month="2024-03"):
    """Collect the titles of articles whose timestamp starts with ``month``.

    The collected titles and their count are printed, and the titles are
    also returned.

    Args:
        htmlList: Iterable of HTML source strings, one per list page.
        month: Prefix the text of the ``fr timee`` span must start with.
            Defaults to ``"2024-03"``, the value that used to be hard-coded.

    Returns:
        A list of matching article titles, in page order.
    """
    # Lazy "any character" run that refuses to step over a closing </li>.
    # With a plain .*? under re.S, a match that began in an item from another
    # month kept scanning into the following items until it met a matching
    # date, which reported the wrong title and swallowed several items.
    within_item = r'(?:(?!</li>).)*?'
    obtain = re.compile(
        r'<li>' + within_item + r'title="([^"]*)">' + within_item
        + r'<span class="fr timee">' + re.escape(month) + within_item
        + r'</span></a></li>',
        re.S,
    )
    titles = []
    for html in htmlList:
        titles.extend(obtain.findall(html))
    # NOTE(review): the indentation of the source was lost; the two prints
    # are assumed to run once, after all pages have been scanned.
    print(titles)
    print(len(titles))
    return titles