ADD file via upload

3 years ago · 4eb1729f1d
parent abdce9f1fe
commit 4eb1729f1d
1 changed files with 124 additions and 0 deletions
--- a/PicturescratchRevolved.py
+++ b/PicturescratchRevolved.py
@@ -0,0 +1,124 @@
+# Development Software: PyCharm
+# Original Project:     pythonProject
+# Name:                 PicturescratchRevolved.py
+# Author:               Caesar Ren, Wong Tat Chun
+# Creation Time:        2022/5/17 19:30
+
+import requests
+import os
+import re
+from PIL import Image
+import pandas as pd
+
+# word is the search keyword whose images will be scraped
+word = input("请输入关键词：")
+# Number of result pages to download. Parsed with int() instead of eval():
+# eval() would execute any expression typed at the prompt, and the value is
+# only ever used as an integer page count.
+n = int(input("请输入下载页数："))
+# j is the sequence number given to the next downloaded image
+j = 1
+
+# Module-level size accumulator (download() keeps its own local of the same name)
+pagesize = 0
+
+
+def size_format(size):
+	"""Render a byte count as a human-readable string.
+
+	Values below 1024 are shown as a plain byte count. Larger values are
+	scaled to KB, MB, GB or TB (powers of 1024, one decimal place) and are
+	followed by the exact byte count in parentheses.
+	"""
+	if size < 1024:
+		return '%i' % size + 'Bytes'
+	units = ('KB', 'MB', 'GB', 'TB')
+	for exponent, unit in enumerate(units, start=1):
+		floor = 1024 ** exponent
+		# TB is the largest unit, so it has no upper bound.
+		unbounded = exponent == len(units)
+		if floor <= size and (unbounded or size < floor * 1024):
+			return '%.1f' % float(size / floor) + f'{unit} ({size} Bytes)'
+
+
+def getimgsize(word, i, path="E:\\Pythonprojects\\Spider\\PictureDownload"):
+	"""Return the pixel dimensions of one downloaded image.
+
+	The file is expected in the folder named after *word* under *path*, with
+	the file name made of *word*, the number *i* and the ``.jpg`` extension.
+
+	Args:
+		word: Search keyword; used as both the folder name and the file prefix.
+		i: Sequence number of the image.
+		path: Base download directory.
+
+	Returns:
+		A ``(width, height)`` tuple.
+	"""
+	file_path = path + "\\" + word + "\\" + word + f"{i}.jpg"
+	# Image.open holds the file open; the with-block releases the handle as
+	# soon as the dimensions have been read.
+	with Image.open(file_path) as img0:
+		return img0.width, img0.height
+
+
+class PicSpider:
+	"""Fetch one search-result page and save every image it links to.
+
+	Intended call order: ``requests_get(url)``, then ``get_imgurl()``, then
+	``download()``.
+	"""
+
+	def __init__(self, word, i, ):
+		"""Remember the keyword and make sure its download folder exists.
+
+		Args:
+			word: Search keyword; names the folder and prefixes every file.
+			i: Result offset of the page (0, 20, 40, ...).
+		"""
+		# Kept on the instance so download() does not depend on a global
+		self.word = word
+		# Folder the images are stored in
+		self.path = "E:\\Pythonprojects\\Spider\\PictureDownload\\" + word + "\\"
+		# 1-based page number; floor division keeps it an int ("1", not "1.0")
+		self.page = i // 20 + 1
+		# Create the folder, including any missing parent folders
+		os.makedirs(self.path, exist_ok=True)
+
+	# Issue the HTTP request for the result page
+	def requests_get(self, url):
+		"""Download *url* and keep its text, decoded as UTF-8, in ``self.req``."""
+		req = requests.get(url, timeout=30)
+		req.encoding = "utf-8"
+		self.req = req.text
+
+	# Find the image links with a regular expression
+	def get_imgurl(self):
+		"""Collect every ``"objURL"`` value from ``self.req`` into ``self.imgurls``."""
+		imgurls = re.findall(r'"objURL":"(.*?)"', self.req, re.S)
+		self.imgurls = imgurls
+
+	# Save the images locally
+	def download(self):
+		"""Save every URL in ``self.imgurls`` to disk and print the sizes.
+
+		Files are numbered with the module-level counter ``j``, so numbering
+		continues from one page to the next.
+		"""
+		pagesize = 0
+		global j
+		for imgurl in self.imgurls:
+			path = self.path + self.word + str(j)
+			# Fetch before opening the file so a failed or hung request does
+			# not leave an empty .jpg behind
+			r = requests.get(imgurl, timeout=30)
+			with open(path + ".jpg", "wb") as f:
+				f.write(r.content)
+			# Read the size back from disk
+			size = os.path.getsize(path + ".jpg")
+			pagesize += size
+			acrtsize = size_format(size)
+			print("%s.jpg下载成功，" % path, "大小为%s" % acrtsize, sep='')
+			j += 1
+		pagesize0 = size_format(pagesize)
+		print("第{}页下载结束！".format(self.page), f"总大小为{pagesize0}", sep='')
+
+
+# Paging is driven by the pn parameter: the first page is 0 and each page adds 20
+def run(word, n):
+	"""Download *n* result pages for *word* and print the total size on disk.
+
+	Args:
+		word: Search keyword.
+		n: Number of result pages to fetch.
+	"""
+	from urllib.parse import quote
+
+	# Remember where numbering starts so only files written by this call are
+	# summed; download() advances the module-level counter j per file.
+	first_index = j
+	# Percent-encode the keyword so characters such as '&' or '#' cannot
+	# break the query string
+	encoded_word = quote(word, safe='')
+	for i in range(0, 20 * n, 20):
+		url = "https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={0}&pn={1}&gsm=50&ct=&ic=0&lm=-1&width=0&height=0".format(
+			encoded_word, i)
+		spider = PicSpider(word, i)
+		spider.requests_get(url)
+		spider.get_imgurl()
+		spider.download()
+	# Sum the files that were actually written instead of assuming a fixed
+	# number of images per page
+	path = "E:\\Pythonprojects\\Spider\\PictureDownload\\" + word + "\\" + word
+	totalsize = sum(os.path.getsize(path + f"{k}.jpg") for k in range(first_index, j))
+	print(f'下载文件总大小为{size_format(totalsize)}')
+
+
+run(word, n)
+
+# Optional analysis: tabulate file size and pixel dimensions of every image
+x = input("是否进行文件分析？是请输入1，否请输入0：")
+if x == '1':
+	# Common prefix of every image file; the number and ".jpg" are appended
+	path = "E:\\Pythonprojects\\Spider\\PictureDownload\\" + word + "\\" + word
+	# download() leaves j one past the last file written, so the files are
+	# numbered 1 .. j-1 (no fixed images-per-page assumption)
+	numberlist = list(range(1, j))
+	sizelist = []
+	widthlist = []
+	heightlist = []
+	for number in numberlist:
+		sizelist.append(os.path.getsize(path + f"{number}.jpg"))
+		width, height = getimgsize(word, number)
+		widthlist.append(width)
+		heightlist.append(height)
+	# Built as a literal so the builtin name dict is not shadowed
+	df = pd.DataFrame({
+		'Number': numberlist,
+		'Size': sizelist,
+		'Width': widthlist,
+		'Height': heightlist,
+	})
+	df.set_index('Number', inplace=True)
+	print(df)