# Development Software: PyCharm
# Original Project: pythonProject
# Name: PicturescratchRevolved.py
# Author: Caesar Ren, Wong Tat Chun
# Creation Time: 2022/5/17 19:30
"""Download Baidu image-search results for a keyword and optionally analyse them.

Run as a script: it asks for a keyword and a page count, saves every image
URL found on each result page as ``<keyword><number>.jpg`` below
``BASE_DIR/<keyword>/``, prints per-file / per-page / overall sizes, and can
finally print a table with the byte size and pixel dimensions of each file.

The third-party packages (``requests``, ``Pillow``, ``pandas``) are imported
inside the functions that need them, so importing this module performs no
I/O and the pure helpers stay usable when those packages are missing.
"""

import os
import re
from urllib.parse import quote

# Root folder that receives one sub-folder per search keyword.
BASE_DIR = "E:\\Pythonprojects\\Spider\\PictureDownload"

# Seconds to wait for any single HTTP response before giving up.
REQUEST_TIMEOUT = 30

# Extracts the image URLs embedded in a result page.
_OBJ_URL_RE = re.compile(r'"objURL":"(.*?)"', re.S)

# Number that the next downloaded picture will get.  It is module-level
# because run() creates one PicSpider per page and the numbering has to
# continue across pages.
j = 1


def size_format(size):
    """Return *size* (a byte count) as a human-readable string.

    Values below 1024 are rendered as ``'<n>Bytes'``; larger values as
    ``'<x.x>KB|MB|GB|TB (<n> Bytes)'`` using powers of 1024.
    """
    if size < 1024:
        return '%i' % size + 'Bytes'
    for exponent, unit in enumerate(("KB", "MB", "GB"), start=1):
        if size < 1024 ** (exponent + 1):
            return f"{size / 1024 ** exponent:.1f}{unit} ({size} Bytes)"
    return f"{size / 1024 ** 4:.1f}TB ({size} Bytes)"


def getimgsize(word, i, path=BASE_DIR):
    """Return ``(width, height)`` in pixels of picture number *i* of *word*.

    The file is looked up at ``<path>/<word>/<word><i>.jpg``.  Raises
    ``OSError`` if the file is missing or Pillow cannot read it as an image.
    """
    from PIL import Image

    file_path = os.path.join(path, word, f"{word}{i}.jpg")
    # The context manager closes the file handle Pillow keeps open.
    with Image.open(file_path) as img:
        return img.width, img.height


def _fetch_bytes(url):
    """Return the body served at *url*, or ``None`` if the request failed."""
    import requests

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Reject HTTP error pages instead of saving them as ".jpg" files.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"{url}下载失败,已跳过:{exc}")
        return None
    return response.content


class PicSpider:
    """Fetches one search-result page and downloads the images it lists."""

    def __init__(self, word, i, base_dir=BASE_DIR):
        """Prepare the spider for the result page starting at offset *i*.

        *word* is the search keyword (also the file-name prefix), *i* the
        ``pn`` offset of the page (0, 20, 40, ...), and *base_dir* the
        folder below which ``<word>/`` is created if necessary.
        """
        self.word = word
        # Destination folder, kept with a trailing separator.
        self.path = os.path.join(base_dir, word, "")
        # 1-based page number; integer division keeps it an int.
        self.page = i // 20 + 1
        self.req = ""
        self.imgurls = []
        # Also creates missing parent folders.
        os.makedirs(self.path, exist_ok=True)

    def requests_get(self, url):
        """Fetch *url* and keep the response text in ``self.req``."""
        import requests

        req = requests.get(url, timeout=REQUEST_TIMEOUT)
        req.encoding = "utf-8"
        self.req = req.text

    def get_imgurl(self):
        """Collect every ``objURL`` value of ``self.req`` in ``self.imgurls``."""
        self.imgurls = _OBJ_URL_RE.findall(self.req)

    def download(self):
        """Save every URL in ``self.imgurls`` to disk.

        Files are numbered with the module-level counter ``j``, which only
        advances for files that were actually written.  URLs that cannot be
        fetched are reported and skipped.  Returns the number of bytes
        written for this page.
        """
        global j
        pagesize = 0
        for imgurl in self.imgurls:
            # Fetch before opening the file so a failure leaves no empty file.
            content = _fetch_bytes(imgurl)
            if content is None:
                continue
            path = self.path + self.word + str(j)
            with open(path + ".jpg", "wb") as f:
                f.write(content)
            size = os.path.getsize(path + ".jpg")
            pagesize += size
            print("%s.jpg下载成功," % path, "大小为%s" % size_format(size), sep='')
            j += 1
        print("第{}页下载结束!".format(self.page),
              f"总大小为{size_format(pagesize)}", sep='')
        return pagesize


def run(word, n, base_dir=BASE_DIR):
    """Download *n* result pages for *word* and print the overall size.

    Paging uses the ``pn`` query parameter: the first page is 0 and each
    following page is 20 further.  Returns the total number of bytes written.
    """
    totalsize = 0
    for i in range(0, 20 * n, 20):
        # The keyword is percent-encoded so characters such as '&' or '#'
        # cannot break the query string.
        url = "https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={0}&pn={1}&gsm=50&ct=&ic=0&lm=-1&width=0&height=0".format(
            quote(word, safe=""), i)
        spider = PicSpider(word, i, base_dir)
        spider.requests_get(url)
        spider.get_imgurl()
        # Sum what was really written rather than assuming a fixed number
        # of images per page.
        totalsize += spider.download()
    print(f'下载文件总大小为{size_format(totalsize)}')
    return totalsize


def analyze(word, count, base_dir=BASE_DIR):
    """Return a DataFrame describing pictures 1..*count* of *word*.

    The frame is indexed by ``Number`` and has the columns ``Size`` (bytes),
    ``Width`` and ``Height`` (pixels).  Width and height are ``None`` for
    files Pillow cannot read as an image.
    """
    import pandas as pd

    numbers, sizes, widths, heights = [], [], [], []
    for number in range(1, count + 1):
        file_path = os.path.join(base_dir, word, f"{word}{number}.jpg")
        try:
            width, height = getimgsize(word, number, base_dir)
        except OSError:
            # A downloaded file is not necessarily a valid image.
            width = height = None
        numbers.append(number)
        sizes.append(os.path.getsize(file_path))
        widths.append(width)
        heights.append(height)
    table = pd.DataFrame({
        'Number': numbers,
        'Size': sizes,
        'Width': widths,
        'Height': heights,
    })
    table.set_index('Number', inplace=True)
    return table


def main():
    """Prompt for keyword and page count, download, then offer the analysis."""
    word = input("请输入关键词:")
    # NOTE(security): this used to be eval(input(...)), which executes
    # whatever the user types; int() accepts only a whole number.
    n = int(input("请输入下载页数:"))
    run(word, n)

    x = input("是否进行文件分析?是请输入1,否请输入0:")
    if x == '1':
        # j - 1 is the number of pictures actually saved in this process.
        print(analyze(word, j - 1))


if __name__ == "__main__":
    main()