From 05ad0a4c7d021e587c4707ec3f139e63f22294c8 Mon Sep 17 00:00:00 2001
From: pseyg6lzf <1986224603@qq.com>
Date: Sat, 27 Apr 2024 19:27:29 +0800
Subject: [PATCH] ADD file via upload

---
 新建 文本文档.txt | 130 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 新建 文本文档.txt

diff --git a/新建 文本文档.txt b/新建 文本文档.txt
new file mode 100644
index 0000000..ede0bcd
--- /dev/null
+++ b/新建 文本文档.txt
@@ -0,0 +1,130 @@
+import requests
+from bs4 import BeautifulSoup
+from urllib.robotparser import RobotFileParser
+from matplotlib import pyplot as plt
+import numpy as np
+
+def can_fetch(urlrobots, url):
+    # Parse the site's robots.txt and ask whether a generic crawler ('*') may fetch the URL
+    rp = RobotFileParser()
+    rp.set_url(urlrobots + "/robots.txt")
+    rp.read()
+    return rp.can_fetch('*', url)
+
+def check_robots(url):
+    if can_fetch(url, url):
+        response = requests.get(url)
+        if response.status_code == 200:
+            print('robots.txt allows crawling this site')
+            return True
+    else:
+        print('robots.txt does not allow crawling this site')
+    return False
+
+def get_pictures(url, path):
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0'}
+    resp = requests.get(url, headers=headers)
+    print(resp.status_code)  # check the request status; 200 means the download succeeded
+    with open('img/' + path, 'wb') as f:  # write the image bytes to disk; 'wb' means binary mode
+        f.write(resp.content)
+
+def get_pictures_urls(text):
+    # Scan the raw HTML for every 'img src="' marker and collect the URL that follows it
+    st = 'img src="'
+    m = len(st)
+    i = 0
+    n = len(text)
+    urls = []  # extracted image URLs
+    while i < n:
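
The hunk above breaks off inside get_pictures_urls; the rest of the 130-line file is not shown in this patch. As a rough sketch of how that string scan is commonly finished, and how it could be wired to the check_robots and get_pictures helpers defined in the patch, assuming a hypothetical target URL (https://example.com stands in for whatever page the full script actually crawls) and an img/ output directory:

import os
import requests
from urllib.parse import urljoin

def get_pictures_urls(text):
    # Sketch: return every URL that follows 'img src="' in the page source
    st = 'img src="'
    m = len(st)
    i = 0
    n = len(text)
    urls = []
    while i < n:
        pos = text.find(st, i)           # next occurrence of the marker
        if pos == -1:
            break
        end = text.find('"', pos + m)    # closing quote of the src attribute
        if end == -1:
            break
        urls.append(text[pos + m:end])
        i = end + 1
    return urls

if __name__ == '__main__':
    target = 'https://example.com'  # hypothetical page; the patch does not show the real target
    if check_robots(target):
        os.makedirs('img', exist_ok=True)  # get_pictures() writes into img/, so make sure it exists
        html = requests.get(target, headers={'User-Agent': 'Mozilla/5.0'}).text
        for k, src in enumerate(get_pictures_urls(html)):
            get_pictures(urljoin(target, src), f'{k}.jpg')  # urljoin resolves relative src values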
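
The file also imports BeautifulSoup, matplotlib and numpy, none of which appear in the visible lines, so the missing portion presumably uses them for parsing and plotting. If the image URLs were extracted with BeautifulSoup instead of a raw string scan, a minimal equivalent (same assumptions as the sketch above) would be:

from bs4 import BeautifulSoup

def get_pictures_urls_bs4(text):
    # Parse the HTML and pull the src attribute of every <img> tag
    soup = BeautifulSoup(text, 'html.parser')
    return [img['src'] for img in soup.find_all('img') if img.get('src')]

Unlike the manual scan, this also picks up single-quoted or unquoted src attributes and ignores 'img src=' text that does not sit inside a real tag.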