You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
spiderofbaidupic/PicturescratchRevolved.py

125 lines
3.6 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# Development Software: PyCharm
# Original Project: pythonProject
# Name: PicturescratchRevolved.py
# Author: Caesar Ren, Wong Tat Chun
# Creation Time: 2022/5/17 19:30
import requests
import os
import re
from PIL import Image
import pandas as pd
# `word` is the search keyword; it also names the download folder and files.
word = input("请输入关键词:")
# Parse the page count with int() instead of eval(): eval() would execute
# whatever expression is typed at the prompt, and a non-integer result would
# only fail later, inside range().
n = int(input("请输入下载页数:"))
# `j` is the 1-based number given to the next downloaded picture.
j = 1
# Module-level total; PicSpider.download() keeps its own local `pagesize`,
# so this value is never updated by it.
pagesize = 0
def size_format(size):
    """Render a byte count as a human-readable string.

    Values below 1024 are shown as whole bytes, e.g. ``'512Bytes'``.
    Larger values are scaled to KB, MB, GB or TB with one decimal place and
    followed by the exact byte count, e.g. ``'1.5KB (1536 Bytes)'``.
    Everything from ``1024 ** 4`` upwards is reported in TB.
    """
    if size < 1024:
        return '%iBytes' % size
    # Bounded units: 1024 ** power <= size < 1024 ** (power + 1).
    for power, unit in enumerate(('KB', 'MB', 'GB'), start=1):
        if 1024 ** power <= size < 1024 ** (power + 1):
            scaled = float(size / 1024 ** power)
            return f'{scaled:.1f}{unit} ({size} Bytes)'
    # TB is the largest unit and has no upper bound.
    if 1024 ** 4 <= size:
        scaled = float(size / 1024 ** 4)
        return f'{scaled:.1f}TB ({size} Bytes)'
def getimgsize(word, i, path="E:\\Pythonprojects\\Spider\\PictureDownload"):
    """Return the dimensions of a previously downloaded picture.

    Args:
        word: Search keyword; names both the sub-folder and the file prefix.
        i: Number of the picture; the file opened is ``<word><i>.jpg``.
        path: Root download folder that contains the ``<word>`` sub-folder.

    Returns:
        A ``(width, height)`` tuple read from the image.
    """
    file_path = path + "\\" + word + "\\" + word + f"{i}.jpg"
    # Open the image as a context manager so its file handle is closed right
    # away instead of staying open until garbage collection; this matters
    # when the function is called once per downloaded file.
    with Image.open(file_path) as img:
        return img.width, img.height
class PicSpider:
    """Download the pictures of one Baidu image-search result page.

    Call order: ``requests_get(url)`` -> ``get_imgurl()`` -> ``download()``.
    """

    def __init__(self, word, i, path="E:\\Pythonprojects\\Spider\\PictureDownload"):
        """Prepare the target folder for one result page.

        Args:
            word: Search keyword; names the sub-folder and the file prefix.
            i: Result offset of the page (0, 20, 40, ...).
            path: Root download folder; the files go to ``<path>\\<word>\\``.
        """
        self.word = word
        # Folder the pictures are written to.
        self.path = path + "\\" + word + "\\"
        # Integer division keeps the page number an int, so the summary
        # prints "1页" rather than "1.0页".
        self.page = i // 20 + 1
        # Create the folder, including missing parents; no error if it exists.
        os.makedirs(self.path, exist_ok=True)

    def requests_get(self, url):
        """Fetch the result page at ``url`` and keep its text in ``self.req``."""
        response = requests.get(url, timeout=30)
        response.encoding = "utf-8"
        self.req = response.text

    def get_imgurl(self):
        """Collect every ``"objURL"`` value of the fetched page in ``self.imgurls``."""
        self.imgurls = re.findall(r'"objURL":"(.*?)"', self.req, re.S)

    def download(self):
        """Save every URL in ``self.imgurls`` as ``<word><j>.jpg`` and print sizes.

        The module-level counter ``j`` numbers the files and is advanced once
        per picture that was actually saved, so numbering stays contiguous
        across pages. A picture whose request fails is reported and skipped
        instead of aborting the whole page.
        """
        global j
        pagesize = 0
        for imgurl in self.imgurls:
            # Fetch before opening the file so a failed request does not
            # leave an empty .jpg behind; the timeout matches requests_get().
            try:
                response = requests.get(imgurl, timeout=30)
                # An HTTP error page must not be stored as a picture.
                response.raise_for_status()
            except requests.RequestException as exc:
                print(f"{imgurl}下载失败,已跳过: {exc}")
                continue
            path = self.path + self.word + str(j)
            with open(path + ".jpg", "wb") as f:
                f.write(response.content)
            # Measure only after the file is closed, i.e. fully flushed.
            size = os.path.getsize(path + ".jpg")
            pagesize += size
            acrtsize = size_format(size)
            print("%s.jpg下载成功" % path, "大小为%s" % acrtsize, sep='')
            j += 1
        pagesize0 = size_format(pagesize)
        print("{}页下载结束!".format(self.page), f"总大小为{pagesize0}", sep='')
# Paging uses the `pn` query parameter: the first page is 0, each next page adds 20.
def run(word, n):
    """Download ``n`` result pages for ``word`` and print the combined size.

    Args:
        word: Search keyword.
        n: Number of result pages to download.
    """
    # Imported here so the function does not rely on the file's import block.
    from urllib.parse import quote

    # Number the first picture of this run will receive. The module-level
    # counter `j` is advanced by PicSpider.download() for every saved file.
    first = j
    for offset in range(0, 20 * n, 20):
        # Percent-encode the keyword so spaces, '&', '#' and similar
        # characters cannot corrupt the query string.
        url = "https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={0}&pn={1}&gsm=50&ct=&ic=0&lm=-1&width=0&height=0".format(
            quote(word, safe=''), offset)
        spider = PicSpider(word, offset)
        spider.requests_get(url)
        spider.get_imgurl()
        spider.download()
    # Sum the files this run actually wrote (numbers first .. j - 1) instead
    # of assuming every page delivered exactly 60 pictures.
    prefix = "E:\\Pythonprojects\\Spider\\PictureDownload\\" + word + "\\" + word
    totalsize = sum(os.path.getsize(prefix + f"{number}.jpg") for number in range(first, j))
    print(f'下载文件总大小为{size_format(totalsize)}')
# Download the requested pages, then optionally print a per-file table.
run(word, n)
x = input("是否进行文件分析是请输入1否请输入0")
if x == '1':
    numberlist = []
    sizelist = []
    widthlist = []
    heightlist = []
    # Common prefix of every downloaded file; the number and ".jpg" follow.
    prefix = "E:\\Pythonprojects\\Spider\\PictureDownload\\" + word + "\\" + word
    # `j` is one past the last picture number written by download(), so this
    # visits exactly the files that exist rather than assuming 60 per page.
    for number in range(1, j):
        width, height = getimgsize(word, number)
        numberlist.append(number)
        sizelist.append(os.path.getsize(prefix + f"{number}.jpg"))
        widthlist.append(width)
        heightlist.append(height)
    # Named `table` so the builtin `dict` is not shadowed.
    table = {
        'Number': numberlist,
        'Size': sizelist,
        'Width': widthlist,
        'Height': heightlist,
    }
    df = pd.DataFrame(table)
    df.set_index('Number', inplace=True)
    print(df)