|
|
# Development Software: PyCharm
|
|
|
# Original Project: pythonProject
|
|
|
# Name: PicturescratchRevolved.py
|
|
|
# Author: Caesar Ren, Wong Tat Chun
|
|
|
# Creation Time: 2022/5/17 19:30
|
|
|
|
|
|
import requests
|
|
|
import os
|
|
|
import re
|
|
|
from PIL import Image
|
|
|
import pandas as pd
|
|
|
|
|
|
# Keyword of the images to crawl.
word = input("请输入关键词:")

# Number of result pages to download. Parsed with int() rather than eval():
# the value is raw user input and is only ever used as an integer page count,
# so evaluating it as an arbitrary expression is neither needed nor safe.
n = int(input("请输入下载页数:"))

# Running 1-based counter used to number the downloaded image files.
j = 1

pagesize = 0
|
|
|
|
|
|
|
|
|
def size_format(size):
    """Render a byte count as a human-readable string.

    Values below 1024 are shown as a whole number followed by ``Bytes``.
    Larger values are shown with one decimal place in KB, MB, GB or TB
    (powers of 1024), followed by the raw byte count in parentheses.

    Args:
        size: Number of bytes.

    Returns:
        The formatted string, or ``None`` when ``size`` matches none of the
        ranges (only possible for values that fail every comparison, such
        as NaN).
    """
    step = 1024
    if size < step:
        return "%iBytes" % size

    # KB, MB and GB each cover the half-open range [1024**k, 1024**(k+1)).
    for exponent, unit in enumerate(("KB", "MB", "GB"), start=1):
        lower = step ** exponent
        if lower <= size < lower * step:
            return "%.1f" % float(size / lower) + f"{unit} ({size} Bytes)"

    # Everything from 1024**4 upwards is reported in TB.
    tera = step ** 4
    if tera <= size:
        return "%.1f" % float(size / tera) + f"TB ({size} Bytes)"
    return None
|
|
|
|
|
|
|
|
|
def getimgsize(word, i, path="E:\\Pythonprojects\\Spider\\PictureDownload"):
    """Return the dimensions of a previously downloaded image.

    The file is looked up inside the keyword's sub-folder of ``path`` and is
    named after the keyword followed by the image number and ``.jpg``.

    Args:
        word: Search keyword; names both the sub-folder and the file prefix.
        i: Number of the image within the keyword's folder.
        path: Root download directory.

    Returns:
        A ``(width, height)`` tuple as reported by PIL.
    """
    file_path = path + "\\" + word + "\\" + word + f"{i}.jpg"

    # Image.open keeps the underlying file handle open; the context manager
    # releases it so analysing many images does not leak descriptors.
    with Image.open(file_path) as img0:
        return img0.width, img0.height
|
|
|
|
|
|
|
|
|
class PicSpider:
    """Download every image referenced by one search-result page.

    Typical use is ``requests_get(url)`` -> ``get_imgurl()`` -> ``download()``.

    Attributes set by the methods:
        word: Search keyword, used as the file-name prefix.
        path: Folder the images are written to (ends with a backslash).
        page: 1-based page number derived from the ``pn`` offset.
        req: Text of the fetched result page (set by ``requests_get``).
        imgurls: Image URLs found in ``req`` (set by ``get_imgurl``).
    """

    def __init__(self, word, i):
        """Prepare the storage folder for ``word``.

        Args:
            word: Search keyword; also the name of the storage sub-folder.
            i: The ``pn`` offset of the page, a multiple of 20 starting at 0.
        """
        # Kept on the instance so download() does not rely on a module-level
        # ``word`` variable happening to exist.
        self.word = word
        # Storage folder.
        self.path = "E:\\Pythonprojects\\Spider\\PictureDownload\\" + word + "\\"
        # Page number; integer division so it is reported as "1", not "1.0".
        self.page = i // 20 + 1
        # Create the folder, including missing parents, if it does not exist.
        os.makedirs(self.path, exist_ok=True)

    def requests_get(self, url):
        """Fetch ``url`` and store the response text in ``self.req``."""
        req = requests.get(url, timeout=30)
        req.encoding = "utf-8"
        self.req = req.text

    def get_imgurl(self):
        """Collect every ``"objURL"`` value from ``self.req`` into ``self.imgurls``."""
        self.imgurls = re.findall('"objURL":"(.*?)"', self.req, re.S)

    def download(self):
        """Save each image in ``self.imgurls`` and report the sizes.

        Files are numbered with the module-level counter ``j``, which is
        advanced once per file actually written, so numbering stays
        contiguous across pages even when some requests fail.
        """
        global j
        pagesize = 0
        for imgurl in self.imgurls:
            path = self.path + self.word + str(j)
            # Fetch before opening the target file so a failed request does
            # not leave an empty .jpg behind; a timeout keeps one dead host
            # from hanging the whole crawl.
            try:
                r = requests.get(imgurl, timeout=30)
            except requests.RequestException as exc:
                print(f"{imgurl}下载失败,已跳过:{exc}")
                continue
            # Write the file.
            with open(path + ".jpg", "wb") as f:
                f.write(r.content)
            # Read back the size on disk.
            size = os.path.getsize(path + ".jpg")
            pagesize += size
            acrtsize = size_format(size)
            print("%s.jpg下载成功," % path, "大小为%s" % acrtsize, sep='')
            j += 1
        pagesize0 = size_format(pagesize)
        print("第{}页下载结束!".format(self.page), f"总大小为{pagesize0}", sep='')
|
|
|
|
|
|
|
|
|
# Paging is driven by the pn query parameter: the first page is 0 and each
# following page advances it by 20.
def run(word, n):
    """Download ``n`` result pages for ``word`` and print the total size.

    Args:
        word: Search keyword.
        n: Number of result pages to download.
    """
    from urllib.parse import quote

    totalsize = 0
    # Remember where numbering starts so the total covers exactly the files
    # written during this call, however many images each page yields.
    first_number = j
    # Percent-encode the keyword so characters such as '&', '#' or spaces
    # cannot corrupt the query string.
    keyword = quote(word, safe="")

    for i in range(0, 20 * n, 20):
        url = "https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={0}&pn={1}&gsm=50&ct=&ic=0&lm=-1&width=0&height=0".format(
            keyword, i)
        spider = PicSpider(word, i)
        spider.requests_get(url)
        spider.get_imgurl()
        spider.download()

    prefix = "E:\\Pythonprojects\\Spider\\PictureDownload\\" + word + "\\" + word
    # j now points one past the last file that was written.
    for number in range(first_number, j):
        totalsize += os.path.getsize(prefix + f"{number}.jpg")

    print(f'下载文件总大小为{size_format(totalsize)}')
|
|
|
|
|
|
|
|
|
run(word, n)

x = input("是否进行文件分析?是请输入1,否请输入0:")

if x == '1':
    numberlist = []
    sizelist = []
    widthlist = []
    heightlist = []

    path = "E:\\Pythonprojects\\Spider\\PictureDownload\\" + word + "\\" + word
    # j is one past the number of the last image written, so range(1, j)
    # visits exactly the files that exist instead of assuming a fixed
    # number of images per page.
    for number in range(1, j):
        size = os.path.getsize(path + f"{number}.jpg")
        imgsize = getimgsize(word, number)
        sizelist.append(size)
        widthlist.append(imgsize[0])
        heightlist.append(imgsize[1])
        numberlist.append(number)

    # Named ``table`` rather than ``dict`` to avoid shadowing the builtin.
    table = {
        'Number': numberlist,
        'Size': sizelist,
        'Width': widthlist,
        'Height': heightlist,
    }
    df = pd.DataFrame(table)
    df.set_index('Number', inplace=True)
    print(df)
|