parent
ec7562054c
commit
a106905f57
@ -0,0 +1,56 @@
|
||||
# Development Software: PyCharm
|
||||
# Original Project: pythonProject
|
||||
# Name: RegularExpression.py
|
||||
# Author: Caesar Ren
|
||||
# Creation Time: 2022/5/28 20:46
|
||||
|
||||
import requests
|
||||
import os
|
||||
import re
|
||||
|
||||
|
||||
class PicSpider:
    """Fetch a search-result page and extract fields from it with regexes.

    Typical use: call ``requests_get(url)`` to download the page text, then
    ``get_imgurl()`` or ``get_imgurl2()`` to fill ``self.imgurls``, then
    ``get_h()`` to pull the trailing ``h=...`` part out of each entry.

    Attributes:
        path: Directory (with trailing separator) created for this keyword.
        page: Integer page number derived from the result offset ``i``.
        req: Text of the most recently downloaded page ("" before any fetch).
        imgurls: Matches from the most recent ``get_imgurl*`` call.
    """

    # Original hard-coded download root; still used when no base_dir is given
    # so existing callers get exactly the same path as before.
    DEFAULT_ROOT = "E:\\Pythonprojects\\Spider\\PictureDownload\\"

    # Compiled once at class creation instead of on every call. re.S lets
    # "." also match newlines, as in the original inline patterns.
    _HOVER_URL_RE = re.compile(r'"hoverURL":"(.*?)"', re.S)
    _PAGE_TITLE_RE = re.compile(r'"fromPageTitle":"(.*?)",', re.S)
    _HEIGHT_RE = re.compile(r'h=(.*)', re.S)

    def __init__(self, word, i, base_dir=None):
        """Prepare the storage directory and page number for one keyword.

        Args:
            word: Search keyword; also used as the sub-directory name.
            i: Result offset used to derive the page number.
            base_dir: Optional root directory for downloads. When omitted,
                the original hard-coded Windows path is used unchanged.
        """
        # Set the storage path.
        if base_dir is None:
            self.path = self.DEFAULT_ROOT + word + "\\"
        else:
            self.path = os.path.join(base_dir, word) + os.sep
        # Page number. Floor division keeps it an integer; the previous "/"
        # produced a float such as 1.15. Presumably 20 results per page.
        self.page = i // 20 + 1
        # Start empty so the extractors work (and find nothing) even when
        # they are called before a page has been fetched.
        self.req = ""
        self.imgurls = []
        # Create the folder if it does not exist. makedirs also creates
        # missing parents and tolerates the directory already being there.
        os.makedirs(self.path, exist_ok=True)

    def requests_get(self, url):
        """Download ``url`` (30 s timeout) and keep its UTF-8 text in ``self.req``."""
        response = requests.get(url, timeout=30)
        response.encoding = "utf-8"
        self.req = response.text

    def get_imgurl(self):
        """Find every ``"hoverURL":"..."`` value in the fetched page.

        Stores the matches in ``self.imgurls``, prints them and returns them.
        """
        self.imgurls = self._HOVER_URL_RE.findall(self.req)
        print(self.imgurls)
        return self.imgurls

    def get_imgurl2(self):
        """Find every ``"fromPageTitle":"...",`` value in the fetched page.

        Overwrites ``self.imgurls`` with the matches, prints them and returns
        them.
        """
        self.imgurls = self._PAGE_TITLE_RE.findall(self.req)
        print(self.imgurls)
        return self.imgurls

    def get_h(self):
        """Extract what follows ``h=`` in each entry of ``self.imgurls``.

        The pattern is greedy, so each capture runs from the first ``h=`` to
        the end of the entry. Every entry contributes one ``re.findall``
        result list (empty when the entry contains no ``h=``).

        Prints the collected lists and their count, and returns the lists.
        """
        heights = [self._HEIGHT_RE.findall(entry) for entry in self.imgurls]
        print(heights)
        print(len(heights))
        return heights
|
||||
|
||||
|
||||
# Demo run: request one search-result page for the keyword, then print what
# each extraction step finds in it.
word = '使徒'
# Used both as the "pn" query parameter and as the offset given to PicSpider.
i = 3
url = (
    "https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={0}&pn={1}&gsm=50&ct=&ic=0&lm=-1&width=0&height=0"
).format(word, i)
Run = PicSpider(word, i)
Run.requests_get(url)
# Order matters: get_imgurl2 overwrites the matches stored by get_imgurl, so
# get_h works on the "fromPageTitle" values.
for step in (Run.get_imgurl, Run.get_imgurl2, Run.get_h):
    step()
|
Loading…
Reference in new issue