parent
ec7562054c
commit
a106905f57
@ -0,0 +1,56 @@
|
|||||||
|
# Development Software: PyCharm
|
||||||
|
# Original Project: pythonProject
|
||||||
|
# Name: RegularExpression.py
|
||||||
|
# Author: Caesar Ren
|
||||||
|
# Creation Time: 2022/5/28 20:46
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
class PicSpider:
    """Download a search-result page and extract fields from it with regexes.

    Typical use is ``requests_get(url)`` followed by one of the ``get_*``
    extractors, which read the page text cached on the instance.

    Attributes:
        path: Directory (with trailing separator) created for this keyword.
        page: 1-based page number derived from the result offset ``i``.
        req: Text of the most recently fetched page ("" until a fetch is made).
        imgurls: Result of the most recent ``get_imgurl``/``get_imgurl2`` call.
    """

    # Download root used when the caller does not supply ``base_dir``.
    DEFAULT_BASE_DIR = "E:\\Pythonprojects\\Spider\\PictureDownload\\"

    def __init__(self, word: str, i: int, *, base_dir=None):
        """Prepare the storage directory for ``word`` and compute the page number.

        Args:
            word: Search keyword; also used as the sub-directory name.
            i: Result offset used to derive the page number.
            base_dir: Root directory for downloads. Defaults to
                ``DEFAULT_BASE_DIR`` so existing two-argument callers
                keep working.
        """
        if base_dir is None:
            base_dir = self.DEFAULT_BASE_DIR

        # Storage path. The trailing "" component keeps a trailing separator,
        # so ``self.path + filename`` still works for callers.
        self.path = os.path.join(base_dir, word, "")

        # Page number. Floor division keeps it an int; plain ``/`` produced a
        # float such as 3.25. Presumably 20 results per page -- TODO confirm.
        self.page = i // 20 + 1

        # Initialize the state the extractors read, so calling them before a
        # fetch yields empty results instead of AttributeError.
        self.req = ""
        self.imgurls = []

        # Create the folder if it does not exist. makedirs also creates
        # missing parents and is a no-op when the directory is already there.
        os.makedirs(self.path, exist_ok=True)

    def requests_get(self, url: str) -> str:
        """Send a GET request to ``url`` and cache the body, decoded as UTF-8.

        Returns:
            The page text, which is also stored in ``self.req``.
        """
        response = requests.get(url, timeout=30)
        response.encoding = "utf-8"
        self.req = response.text
        return self.req

    def get_imgurl(self) -> list:
        """Find every ``"hoverURL"`` value in the cached page text.

        Returns:
            The list of matches, also stored in ``self.imgurls`` and printed.
        """
        self.imgurls = re.findall(r'"hoverURL":"(.*?)"', self.req, re.S)
        print(self.imgurls)
        return self.imgurls

    def get_imgurl2(self) -> list:
        """Find every ``"fromPageTitle"`` value in the cached page text.

        Note that this overwrites ``self.imgurls`` with the titles.

        Returns:
            The list of matches, also stored in ``self.imgurls`` and printed.
        """
        self.imgurls = re.findall(r'"fromPageTitle":"(.*?)",', self.req, re.S)
        print(self.imgurls)
        return self.imgurls

    def get_h(self) -> list:
        """Extract whatever follows ``h=`` in each entry of ``self.imgurls``.

        Returns:
            One list per entry (the raw ``re.findall`` result, empty when the
            entry has no ``h=``). The list and its length are also printed.
        """
        pattern = re.compile(r'h=(.*)', re.S)
        heights = [pattern.findall(entry) for entry in self.imgurls]
        print(heights)
        print(len(heights))
        return heights
|
||||||
|
|
||||||
|
|
||||||
|
# Search keyword and result offset; both are substituted into the query URL
# and handed to the spider.
word = '使徒'
i = 3

# {0} is the keyword and {1} the `pn` offset of the image-search URL.
url = (
    "https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={0}&pn={1}&gsm=50&ct=&ic=0&lm=-1&width=0&height=0"
).format(word, i)

# Fetch the page once, then run each extractor against the cached text.
Run = PicSpider(word, i)
Run.requests_get(url)
Run.get_imgurl()
Run.get_imgurl2()  # replaces Run.imgurls with the page titles
Run.get_h()        # therefore scans the titles, not the hover URLs
|
Loading…
Reference in new issue