"""Scrape the campus-news list pages of www.jxxdxy.edu.cn.

The helpers build the list-page URLs, download the pages and pull
(title, date, url) records out of the ``<li>`` entries of each page.

NOTE(review): the regular expressions of the original script were garbled
(the HTML tags inside the patterns were lost).  The parsing below assumes
each article is an ``<li>`` element holding an ``<a href="..." title="...">``
link and a ``YYYY-MM-DD`` publication date -- confirm against the live page.
"""
import re
from html import unescape

head = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0'
}

# One list entry; DOTALL because an entry may span several source lines.
_ITEM_RE = re.compile(r"<li\b[^>]*>(.*?)</li>", re.S | re.I)
_ANCHOR_RE = re.compile(r"<a\b([^>]*)>", re.I)
_HREF_RE = re.compile(r'(?:^|\s)href\s*=\s*"([^"]*)"', re.I)
_TITLE_RE = re.compile(r'(?:^|\s)title\s*=\s*"([^"]*)"', re.I)
_TAG_RE = re.compile(r"<[^>]+>")
_DATE_RE = re.compile(r"\d{4}-\d{2}-\d{2}")


def _fetch(url):
    """Download *url* with the browser-like headers and return the response."""
    # Imported lazily so the parsing helpers stay importable without the
    # third-party dependency installed.
    import requests

    # ``headers=`` must be a keyword: the second positional argument of
    # ``requests.get`` is ``params`` (the query string), not the headers.
    response = requests.get(url, headers=head, timeout=10)
    response.encoding = "utf-8"
    return response


def _find_titled_link(item):
    """Return ``(title, url)`` of the first link in *item* having both, else None."""
    for attrs in _ANCHOR_RE.findall(item):
        title = _TITLE_RE.search(attrs)
        href = _HREF_RE.search(attrs)
        if title and href:
            return unescape(title.group(1)).strip(), unescape(href.group(1))
    return None


def _parse_articles(page):
    """Return the ``(title, date, url)`` tuples found in one list page.

    Every ``<li>`` is parsed on its own so that a title can never be paired
    with the date of a neighbouring entry.  Entries without a titled link or
    without a date (e.g. navigation menus) are skipped.
    """
    articles = []
    for item in _ITEM_RE.findall(page):
        link = _find_titled_link(item)
        # Look for the date in the visible text only, not in attributes.
        dates = _DATE_RE.findall(_TAG_RE.sub(" ", item))
        if link is None or not dates:
            continue
        title, url = link
        # The date is expected after the link text, so take the last match.
        articles.append((title, dates[-1], url))
    return articles


# Build the URLs of all list pages
def website(pages=20):
    """Return the URLs of list pages 1..*pages* (20 by default)."""
    return [
        "http://www.jxxdxy.edu.cn/news-list-xiaoyuanyaowen-" + str(number) + ".html"
        for number in range(1, pages + 1)
    ]


# Response encoding, status code and page source of page 1
def qw(urlList):
    """Print the encoding, status code and HTML source of the first URL."""
    response = _fetch(urlList[0])
    print(response.encoding)
    print(response.status_code)
    print(response.text)


# Download the source of every page
def code(urlList):
    """Return the HTML source of every URL in *urlList*, in order."""
    return [_fetch(url).text for url in urlList]


# Extract title, publication date and link
def method(htmlList):
    """Print and return the articles of every page.

    Returns a list with one entry per page; each entry is a list of
    ``(title, date, url)`` tuples.
    """
    pages = []
    for page in htmlList:
        answer = _parse_articles(page)
        print(answer)
        pages.append(answer)
    return pages


# Title, publication date and link of the articles whose title contains "产教融合"
def contain(htmlList, keyword="产教融合"):
    """Print and return, per page, the articles whose title contains *keyword*.

    Returns a list with one entry per page; each entry is a list of
    ``(title, date, url)`` tuples.
    """
    pages = []
    for page in htmlList:
        answer = [article for article in _parse_articles(page) if keyword in article[0]]
        print(answer)
        pages.append(answer)
    return pages


# Number and titles of the articles published in March 2024
def time(htmlList, month="2024-03"):
    """Print the titles published in *month* and how many there are.

    *month* is a ``YYYY-MM`` prefix of the publication date.  The list of
    titles is also returned.
    """
    titles = [
        title
        for page in htmlList
        for title, date, _url in _parse_articles(page)
        if date.startswith(month)
    ]
    print(titles)
    print(len(titles))
    return titles