ADD file via upload

master
hnu202110040108 3 years ago
parent abdce9f1fe
commit 4eb1729f1d

@ -0,0 +1,124 @@
# Development Software: PyCharm
# Original Project: pythonProject
# Name: PicturescratchRevolved.py
# Author: Caesar Ren, Wong Tat Chun
# Creation Time: 2022/5/17 19:30
import requests
import os
import re
from PIL import Image
import pandas as pd
# word is the search keyword of the images to crawl; it also names the
# download sub-folder and prefixes every saved file name
word = input("请输入关键词:")
# SECURITY: this used to be eval(input(...)), which executes any expression
# typed at the prompt. A page count only needs to be an integer, so parse it
# with int() instead (non-numeric input now raises ValueError).
n = int(input("请输入下载页数:"))
# j numbers the downloaded images: it is the index the next saved file gets
j = 1
# Not read at module level in the visible code (download() keeps its own
# local of the same name); kept in case other code refers to it
pagesize = 0
def size_format(size):
    """Return a human-readable string for a byte count.

    Sizes below 1024 are rendered as ``"<n>Bytes"``. Larger sizes are scaled
    to KB, MB, GB or TB (powers of 1024) with one decimal place, followed by
    the exact byte count in parentheses, e.g. ``"1.5KB (1536 Bytes)"``.
    Everything from 1024**4 upwards is expressed in TB.

    :param size: number of bytes
    :return: the formatted size string
    """
    if size < 1024:
        # int() truncates exactly like the '%i' conversion used previously
        return f'{int(size)}Bytes'
    # Try each unit in ascending order; the first one whose upper bound
    # exceeds the size is the right scale.
    for power, unit in enumerate(('KB', 'MB', 'GB'), start=1):
        if size < 1024 ** (power + 1):
            return f'{size / 1024 ** power:.1f}{unit} ({size} Bytes)'
    # Explicit final case so the function never falls through to None
    return f'{size / 1024 ** 4:.1f}TB ({size} Bytes)'
def getimgsize(word, i, path="E:\\Pythonprojects\\Spider\\PictureDownload"):
    """Return the pixel dimensions of one downloaded image.

    The file that is opened lives in the ``word`` sub-folder of ``path`` and
    is named ``word`` followed by ``i`` and the ``.jpg`` extension (joined
    with Windows-style backslashes).

    :param word: search keyword used as folder name and file-name prefix
    :param i: number of the image
    :param path: root download folder
    :return: ``(width, height)`` tuple
    """
    file_path = path + "\\" + word + "\\" + word + f"{i}.jpg"
    # Use the image as a context manager so the underlying file handle is
    # released as soon as the dimensions have been read.
    with Image.open(file_path) as img:
        return img.width, img.height
class PicSpider:
    """Download every image listed on one page of search results.

    Typical use: construct, then call ``requests_get(url)``,
    ``get_imgurl()`` and ``download()`` in that order. Saved files are
    numbered with the module-level counter ``j``, which is shared by all
    instances so numbering continues across pages.
    """

    def __init__(self, word, i):
        """Prepare the target folder for ``word`` and work out the page number.

        :param word: search keyword; names the folder and prefixes file names
        :param i: result offset of the page (0 for the first page, then
            steps of 20)
        """
        # Keep the keyword on the instance so download() does not depend on
        # a module-level variable that merely happens to be called `word`
        self.word = word
        # Folder the images are stored in
        self.path = "E:\\Pythonprojects\\Spider\\PictureDownload\\" + word + "\\"
        # 1-based page number; integer division keeps it an int ("1", not "1.0")
        self.page = i // 20 + 1
        # Create the folder, including missing parents, if it does not exist
        os.makedirs(self.path, exist_ok=True)

    def requests_get(self, url):
        """Fetch ``url`` and keep the response body, decoded as UTF-8, in ``self.req``."""
        req = requests.get(url, timeout=30)
        req.encoding = "utf-8"
        self.req = req.text

    def get_imgurl(self):
        """Collect every ``"objURL":"..."`` value of ``self.req`` into ``self.imgurls``."""
        self.imgurls = re.findall('"objURL":"(.*?)"', self.req, re.S)

    def download(self):
        """Save each URL in ``self.imgurls`` as a numbered ``.jpg`` file.

        Prints one line per saved file and a summary with the page total.
        A URL whose request fails is reported and skipped; it does not
        consume a file number.
        """
        global j
        pagesize = 0
        for imgurl in self.imgurls:
            path = self.path + self.word + str(j)
            # Fetch before opening the file so a failed request does not
            # leave an empty .jpg behind
            try:
                r = requests.get(imgurl, timeout=30)
            except requests.RequestException as err:
                print(f"{imgurl}下载失败,已跳过: {err}")
                continue
            # Write the image bytes to disk
            with open(path + ".jpg", "wb") as f:
                f.write(r.content)
            # The size on disk equals the number of bytes just written
            size = len(r.content)
            pagesize += size
            acrtsize = size_format(size)
            print("%s.jpg下载成功" % path, "大小为%s" % acrtsize, sep='')
            j += 1
        pagesize0 = size_format(pagesize)
        print("{}页下载结束!".format(self.page), f"总大小为{pagesize0}", sep='')
# Paging is done through the pn query parameter: the first page is 0 and
# each following page advances it by 20
def run(word, n):
    """Download ``n`` result pages for ``word`` and print the total size.

    :param word: search keyword
    :param n: number of result pages to download
    """
    from urllib.parse import quote

    # Remember where numbering starts so the total covers exactly the files
    # saved by this call, however many images each page really contained
    first = j
    # Percent-encode the keyword so characters such as '&', '#' or spaces
    # cannot break the query string
    keyword = quote(word, safe="")
    for offset in range(0, 20 * n, 20):
        url = "https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={0}&pn={1}&gsm=50&ct=&ic=0&lm=-1&width=0&height=0".format(
            keyword, offset)
        spider = PicSpider(word, offset)
        spider.requests_get(url)
        spider.get_imgurl()
        spider.download()
    prefix = "E:\\Pythonprojects\\Spider\\PictureDownload\\" + word + "\\" + word
    # j now points one past the last file that was saved
    totalsize = sum(os.path.getsize(f"{prefix}{number}.jpg") for number in range(first, j))
    print(f'下载文件总大小为{size_format(totalsize)}')
# Download the requested pages, then optionally tabulate the saved files
run(word, n)
x = input("是否进行文件分析是请输入1否请输入0")
if x == '1':
    numberlist = []
    sizelist = []
    widthlist = []
    heightlist = []
    path = "E:\\Pythonprojects\\Spider\\PictureDownload\\" + word + "\\" + word
    # Files are numbered 1 .. j-1 by the download step; iterating over that
    # range avoids assuming a fixed number of images per page
    for number in range(1, j):
        size = os.path.getsize(path + f"{number}.jpg")
        width, height = getimgsize(word, number)
        numberlist.append(number)
        sizelist.append(size)
        widthlist.append(width)
        heightlist.append(height)
    # Named `table` rather than `dict` so the builtin is not shadowed
    table = {
        'Number': numberlist,
        'Size': sizelist,
        'Width': widthlist,
        'Height': heightlist,
    }
    df = pd.DataFrame(table)
    df.set_index('Number', inplace=True)
    print(df)
Loading…
Cancel
Save