"""Image-scraping helpers: robots.txt permission checks and picture download.

NOTE(review): recovered from a whitespace-mangled, truncated git diff — the
patch declared 130 lines but the visible text ends mid-statement inside
``get_pictures_urls``.  That function's body below is a best-effort
reconstruction of the evident scan pattern; confirm it against the original
file.  ``BeautifulSoup``, ``matplotlib`` and ``numpy`` are imported by the
original file, presumably for the lost tail of the script, so they are kept.
"""
import os

import requests
from bs4 import BeautifulSoup
from urllib.robotparser import RobotFileParser
from matplotlib import pyplot as plt
import numpy as np


def can_fetch(urlrobots, url):
    """Return True if ``urlrobots``/robots.txt allows agent '*' to fetch url.

    ``urlrobots`` is the site root (no trailing slash); its ``/robots.txt``
    is downloaded and parsed on every call.
    """
    rp = RobotFileParser()
    rp.set_url(urlrobots + "/robots.txt")
    rp.read()
    return rp.can_fetch('*', url)


def check_robots(url):
    """Check robots.txt permission for ``url`` and that it answers HTTP 200.

    Returns True only when robots.txt allows access AND a GET of the page
    returns status 200; prints a diagnostic message either way.

    Bugs fixed versus the original:
    - when robots.txt disallowed access the function fell through and
      returned ``None`` instead of ``False`` (and printed nothing);
    - dead local ``flag = 1`` removed.
    """
    if not can_fetch(url, url):
        # Robots rules forbid fetching this URL.
        print("Robots协议不允许访问该网站")
        return False
    response = requests.get(url)
    if response.status_code == 200:
        print('Robots协议允许访问该网站')
        return True
    # Robots allowed the fetch but the server did not answer 200
    # (original printed the same denial message here; kept for compatibility).
    print("Robots协议不允许访问该网站")
    return False


def get_pictures(url, path):
    """Download the image at ``url`` and save it as ``img/<path>``.

    Bug fixed: the original crashed with FileNotFoundError when the ``img``
    directory did not exist; it is now created on demand.  The local
    previously named ``re`` (shadowing the stdlib regex module) is renamed.
    """
    # Browser-like User-Agent so the site does not reject the default one.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0'}
    resp = requests.get(url, headers=headers)
    print(resp.status_code)  # 200 means the request succeeded
    os.makedirs('img', exist_ok=True)
    with open('img/' + path, 'wb') as f:  # 'wb': image bytes are binary data
        f.write(resp.content)


def get_pictures_urls(text):
    """Extract the ``src`` value of every ``img src="..."`` marker in text.

    NOTE(review): the original body was truncated at ``while i`` in the
    source diff; this is a reconstruction of the evident character-scan
    pattern (locate each ``img src="`` marker, collect up to the closing
    quote) — verify against the original before relying on it.
    """
    st = 'img src="'
    m = len(st)
    i = 0
    n = len(text)
    urls = []  # collected image URLs
    while i < n:
        if text[i:i + m] == st:
            # Attribute value runs from just past the marker to the next quote.
            end = text.find('"', i + m)
            if end == -1:
                break  # unterminated attribute: nothing more to collect
            urls.append(text[i + m:end])
            i = end + 1
        else:
            i += 1
    return urls