You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
57 lines
1.4 KiB
57 lines
1.4 KiB
3 years ago
|
# Development Software: PyCharm
|
||
|
# Original Project: pythonProject
|
||
|
# Name: RegularExpression.py
|
||
|
# Author: Caesar Ren
|
||
|
# Creation Time: 2022/5/28 20:46
|
||
|
|
||
|
import requests
|
||
|
import os
|
||
|
import re
|
||
|
|
||
|
|
||
|
class PicSpider:
    """Download a search-result page and extract fields from it with regexes.

    Typical use: construct the spider, call ``requests_get(url)`` to fetch the
    page text, then call one of the ``get_*`` extractors, which read the text
    stored in ``self.req``.

    Attributes:
        path: Output folder for ``word`` (always ends with a path separator).
        page: 1-based page number derived from the result offset ``i``.
        req: Text of the last fetched page ("" until ``requests_get`` runs).
        imgurls: Result of the most recent ``get_imgurl``/``get_imgurl2`` call.
    """

    # Output root used when the caller does not pass ``base_dir``.
    DEFAULT_BASE_DIR = "E:\\Pythonprojects\\Spider\\PictureDownload\\"
    # Offset-to-page divisor; presumably the number of results per page -- confirm.
    RESULTS_PER_PAGE = 20

    def __init__(self, word, i, base_dir=None):
        """Prepare the output folder and page number for one search.

        Args:
            word: Search keyword; also used as the output sub-folder name.
            i: Result offset used to compute the page number.
            base_dir: Root folder under which ``word``'s folder is created.
                Defaults to ``DEFAULT_BASE_DIR``.
        """
        if base_dir is None:
            base_dir = self.DEFAULT_BASE_DIR
        # Storage path; the trailing "" makes join() append a path separator.
        self.path = os.path.join(base_dir, word, "")
        # Page number: floor division keeps it an int (``/`` produced a float).
        self.page = i // self.RESULTS_PER_PAGE + 1
        # Initialise state so the extractors work even before any fetch.
        self.req = ""
        self.imgurls = []
        # Create the folder (and any missing parents); no error if it exists.
        os.makedirs(self.path, exist_ok=True)

    def requests_get(self, url):
        """Send a GET request to ``url`` and keep the body, decoded as UTF-8.

        Returns:
            The response text, which is also stored in ``self.req``.
        """
        response = requests.get(url, timeout=30)
        response.encoding = "utf-8"
        self.req = response.text
        return self.req

    def _find_all(self, pattern):
        """Store, print and return every match of ``pattern`` in ``self.req``."""
        # re.S lets "." span newlines, matching the original extraction rules.
        self.imgurls = re.findall(pattern, self.req, re.S)
        print(self.imgurls)
        return self.imgurls

    def get_imgurl(self):
        """Find the image links (the ``"hoverURL"`` values) with a regex."""
        return self._find_all(r'"hoverURL":"(.*?)"')

    def get_imgurl2(self):
        """Find the ``"fromPageTitle"`` values; note this overwrites ``imgurls``."""
        return self._find_all(r'"fromPageTitle":"(.*?)",')

    def get_h(self):
        """Extract whatever follows ``h=`` in each entry of ``self.imgurls``.

        Returns:
            One list per entry (the ``re.findall`` result, empty when the
            entry has no ``h=``). The list and its length are also printed.
        """
        heights = [re.findall(r'h=(.*)', entry, re.S) for entry in self.imgurls]
        print(heights)
        print(len(heights))
        return heights
|
||
|
|
||
|
|
||
|
# Search keyword and result offset; the offset is sent as the ``pn`` parameter.
word, i = '使徒', 3

# The literal is split only for readability; the pieces join to the same URL.
url = (
    "https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8"
    "&word={0}&pn={1}&gsm=50&ct=&ic=0&lm=-1&width=0&height=0"
).format(word, i)

# Fetch the page once, then run each extractor over the stored text.
# get_imgurl2() replaces the list from get_imgurl(), so get_h() reads the
# "fromPageTitle" values.
Run = PicSpider(word, i)
Run.requests_get(url)
Run.get_imgurl()
Run.get_imgurl2()
Run.get_h()
|