parent
670e91b488
commit
be71ba177f
@ -0,0 +1,60 @@
|
||||
import requests
|
||||
import re
|
||||
# Browser-like request headers used for the page fetches, so the site is
# asked for its pages with a regular desktop User-Agent.
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
}
|
||||
def website(pages=20):
    """Build the URLs of the campus-news list pages.

    Args:
        pages: Number of list pages to generate, numbered from 1.
            Defaults to 20, the count that used to be hard-coded.

    Returns:
        A list of URL strings in ascending page order; empty when
        ``pages`` is 0 or negative.
    """
    base = "http://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen-"
    return [base + str(page) + ".html" for page in range(1, pages + 1)]
|
||||
|
||||
def qw(urlList):
    """Fetch the first list page and print its encoding, status code and HTML.

    Args:
        urlList: Sequence of page URLs; only the first element is requested.

    Raises:
        IndexError: If ``urlList`` is empty.
        requests.RequestException: If the request fails or times out.
    """
    # The headers must be passed by keyword: the second positional parameter
    # of requests.get is ``params``, which would put the dict into the URL
    # query string instead of sending it as HTTP headers.
    # The timeout keeps an unresponsive server from blocking forever.
    response = requests.get(urlList[0], headers=head, timeout=10)
    # Force UTF-8 decoding of the body before reading ``text``.
    response.encoding = "utf-8"
    print(response.encoding)
    print(response.status_code)
    print(response.text)
|
||||
|
||||
def code(urlList):
    """Download every page in ``urlList`` and return the HTML sources.

    Args:
        urlList: Iterable of page URLs to request, in order.

    Returns:
        A list of page sources (str), one per URL, decoded as UTF-8 and in
        the same order as ``urlList``.

    Raises:
        requests.RequestException: If any request fails or times out.
    """
    htmlList = []
    for url in urlList:
        # Pass the headers by keyword; positionally the dict would be taken
        # as ``params`` and end up in the query string, not in the headers.
        # The timeout keeps one dead page from hanging the whole crawl.
        response = requests.get(url, headers=head, timeout=10)
        response.encoding = "utf-8"
        htmlList.append(response.text)
    return htmlList
|
||||
|
||||
def method(htmlList):
    """Extract the link, title and publish date of the articles on each page.

    Each page's matches are printed as they are found.

    Args:
        htmlList: Iterable of page sources (str).

    Returns:
        A list with one entry per page; each entry is the list of
        ``(href, title, date)`` tuples matched on that page.
    """
    # No DOTALL flag: ``.`` does not cross newlines, so an article is only
    # matched when its <li>, link and date span sit on the same line.
    obtain = re.compile(
        '<li>.*?<a href="(.*?)" title="(.*?)">.*?<span class="fr timee">(.*?)</span>'
    )
    results = []
    for html in htmlList:
        answer = obtain.findall(html)
        print(answer)
        results.append(answer)
    return results
|
||||
|
||||
def contain(htmlList):
    """Extract link, title and date of articles whose title contains "产教融合".

    Each page's matches are printed as they are found.

    Args:
        htmlList: Iterable of page sources (str).

    Returns:
        A list with one entry per page; each entry is the list of
        ``(href, title, date)`` tuples whose title contains the keyword.
    """
    # No DOTALL flag: ``.`` does not cross newlines, so the search for the
    # keyword stays within a single line of the page source.
    obtain = re.compile(
        '<li>.*?<a href="(.*?)" title="(.*?产教融合.*?)">.*?<span class="fr timee">(.*?)</span>'
    )
    results = []
    for html in htmlList:
        answer = obtain.findall(html)
        print(answer)
        results.append(answer)
    return results
|
||||
|
||||
def time(htmlList, month="2024-03"):
    """Collect the titles of articles published in a given month.

    The collected titles and their count are printed once all pages have
    been scanned.

    Args:
        htmlList: Iterable of page sources (str).
        month: Date prefix the publish date must start with, e.g.
            ``"2024-03"``. Defaults to March 2024.

    Returns:
        The list of matching article titles, in page order.
    """
    # Split each page into individual <li>...</li> items first. Running the
    # title/date pattern over the whole page with DOTALL lets the lazy
    # ``.*?`` run from one item's title across later items until it meets a
    # matching date, which reports the title of the wrong article.
    item_pattern = re.compile("<li>.*?</li>", re.S)
    obtain = re.compile(
        '<li>.*?title="(.*?)">.*?<span class="fr timee">'
        + re.escape(month)
        + ".*?</span></a></li>",
        re.S,
    )
    titles = []
    for html in htmlList:
        for item in item_pattern.findall(html):
            found = obtain.search(item)
            if found:
                titles.append(found.group(1))
    print(titles)
    print(len(titles))
    return titles
|
Loading…
Reference in new issue