# spiderofbaidupic/PicturescratchRevolved.py

# Development Software: PyCharm
# Original Project:     pythonProject
# Name:                 PicturescratchRevolved.py
# Author:               Caesar Ren, Wong Tat Chun
# Creation Time:        2022/5/17 19:30

import requests
import os
import re
from PIL import Image
import pandas as pd

# word is the search keyword, i.e. the name of the pictures to crawl
word = input("请输入关键词：")
# Number of result pages to download. Parsed with int() instead of eval():
# eval() would execute any expression typed at the prompt and could also
# return a non-integer, which range() later rejects.
n = int(input("请输入下载页数："))
# j is the running 1-based index used to number the downloaded pictures
j = 1

# Module-level size accumulator, initialised to zero
pagesize = 0


def size_format(size):
	"""Render a byte count as a human-readable string.

	Counts below 1024 are shown as whole bytes (e.g. ``'512Bytes'``). Larger
	counts are scaled to KB, MB, GB or TB with one decimal place and followed
	by the exact byte count in parentheses (e.g. ``'1.5KB (1536 Bytes)'``).
	"""
	if size < 1024:
		return '%i' % size + 'Bytes'
	# Bounded units: each one covers 1024**power <= size < 1024**(power + 1).
	for power, unit in enumerate(('KB', 'MB', 'GB'), start=1):
		floor = 1024 ** power
		if floor <= size < floor * 1024:
			return '%.1f' % float(size / floor) + f'{unit} ({size} Bytes)'
	# Everything from 1024**4 upwards is reported in terabytes.
	if size >= 1024 ** 4:
		return '%.1f' % float(size / 1024 ** 4) + f'TB ({size} Bytes)'


def getimgsize(word, i, path="E:\\Pythonprojects\\Spider\\PictureDownload"):
	"""Return the pixel dimensions of one downloaded picture.

	Args:
		word: Keyword; used as both the sub-folder name and the file-name prefix.
		i: Index of the picture; the file read is ``<word><i>.jpg`` inside the
			``<word>`` sub-folder of *path*.
		path: Root download folder.

	Returns:
		A ``(width, height)`` tuple in pixels.
	"""
	file_path = path + "\\" + word + "\\" + word + f"{i}.jpg"
	# The with-block closes the image file as soon as the size has been read,
	# instead of leaving the handle open for every picture inspected.
	with Image.open(file_path) as img0:
		return img0.width, img0.height


class PicSpider:
	"""Fetch one image-search result page and save the pictures it lists.

	Typical use is ``requests_get(url)``, then ``get_imgurl()``, then
	``download()``. Saved files are numbered with the module-level counter ``j``.
	"""

	def __init__(self, word, i, ):
		"""Prepare the download folder for *word* and record the page number.

		Args:
			word: Search keyword; names the sub-folder and prefixes every file name.
			i: The ``pn`` offset of the page (a multiple of 20).
		"""
		# Keep the keyword on the instance so download() does not rely on a global
		self.word = word
		# Storage location
		self.path = "E:\\Pythonprojects\\Spider\\PictureDownload\\" + word + "\\"
		# 1-based page number; floor division keeps it an int so it prints as "1", not "1.0"
		self.page = i // 20 + 1
		# Create the folder, including any missing parent folders, if it does not exist yet
		os.makedirs(self.path, exist_ok=True)

	# Issue the HTTP request
	def requests_get(self, url):
		"""Fetch *url* and store the response text, decoded as UTF-8, in ``self.req``."""
		req = requests.get(url, timeout=30)
		req.encoding = "utf-8"
		self.req = req.text

	# Find the picture links with a regular expression
	def get_imgurl(self):
		"""Collect every ``objURL`` value found in ``self.req`` into ``self.imgurls``."""
		self.imgurls = re.findall(r'"objURL":"(.*?)"', self.req, re.S)

	# Download the pictures to local disk
	def download(self):
		"""Save every URL in ``self.imgurls`` and print per-file and per-page sizes.

		Each picture is written as ``<word><j>.jpg`` in ``self.path`` and the
		module-level counter ``j`` is advanced once per saved file.
		"""
		global j
		page_bytes = 0
		for imgurl in self.imgurls:
			path = self.path + self.word + str(j)
			# Fetch before opening the file so a failed request does not leave an
			# empty file behind; the timeout matches the one used in requests_get.
			r = requests.get(imgurl, timeout=30)
			# Write the file
			with open(path + ".jpg", "wb") as f:
				f.write(r.content)
			# Read back the size on disk
			size = os.path.getsize(path + ".jpg")
			page_bytes += size
			acrtsize = size_format(size)
			print("%s.jpg下载成功，" % path, "大小为%s" % acrtsize, sep='')
			j += 1
		pagesize0 = size_format(page_bytes)
		print("第{}页下载结束！".format(self.page), f"总大小为{pagesize0}", sep='')


# Paging is driven by the pn parameter: the first page is pn=0 and each following page adds 20
def run(word, n):
	"""Download *n* result pages for *word* and print the total size of the saved files.

	Args:
		word: Search keyword.
		n: Number of result pages to download.
	"""
	from urllib.parse import quote

	# Percent-encode the keyword so characters such as '&', '#' or spaces
	# cannot corrupt the query string.
	query = quote(word)
	# Remember where the shared picture counter stood so that only the files
	# written by this call are totalled.
	first_index = j
	for i in range(0, 20 * n, 20):
		url = "https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={0}&pn={1}&gsm=50&ct=&ic=0&lm=-1&width=0&height=0".format(
			query, i)
		spider = PicSpider(word, i)
		spider.requests_get(url)
		spider.get_imgurl()
		spider.download()
	prefix = "E:\\Pythonprojects\\Spider\\PictureDownload\\" + word + "\\" + word
	# download() advances j once per saved picture, so this range covers exactly
	# the files written above rather than assuming a fixed 60 pictures per page.
	totalsize = sum(
		os.path.getsize(prefix + f"{index}.jpg") for index in range(first_index, j)
	)
	print(f'下载文件总大小为{size_format(totalsize)}')


run(word, n)

x = input("是否进行文件分析？是请输入1，否请输入0：")
if x == '1':
	prefix = "E:\\Pythonprojects\\Spider\\PictureDownload\\" + word + "\\" + word
	numberlist = []
	sizelist = []
	widthlist = []
	heightlist = []
	# j started at 1 and download() advances it once per saved picture, so
	# range(1, j) visits exactly the files that exist instead of assuming 60 per page.
	for number in range(1, j):
		sizelist.append(os.path.getsize(prefix + f"{number}.jpg"))
		try:
			width, height = getimgsize(word, number)
		except OSError:
			# The response body is saved unchecked, so a file may not be a decodable
			# image; keep its row and leave the dimensions empty rather than abort.
			width, height = None, None
		widthlist.append(width)
		heightlist.append(height)
		numberlist.append(number)
	# Named "columns" so the builtin dict is not shadowed
	columns = {
		'Number': numberlist,
		'Size': sizelist,
		'Width': widthlist,
		'Height': heightlist,
	}
	df = pd.DataFrame(columns)
	df.set_index('Number', inplace=True)
	print(df)