parent
f9b1197f3c
commit
d81717feb7
@ -0,0 +1,76 @@
|
||||
import os
|
||||
import datetime
|
||||
import time
|
||||
from asyncio import futures
|
||||
|
||||
import requests
|
||||
import re
|
||||
|
||||
# NOTE(review): no live code in this file appears to read this value - confirm it is still needed.
Max_Workers = 24 # Maximum number of threads
|
||||
|
||||
# Open the target website
|
||||
def work(index):
    """Fetch the HTML of one listing page of pic.netbian.com.

    Args:
        index: Page number; it is interpolated into ``index_<index>.html``.

    Returns:
        The response body as text, as decoded by ``requests``.

    Raises:
        requests.RequestException: On a network failure, or when the server
            does not answer within the timeout.
    """
    url = 'https://pic.netbian.com/index_' + str(index) + '.html'
    # Without an explicit timeout requests waits forever on a stalled server.
    req = requests.get(url, timeout=30)
    return req.text
|
||||
|
||||
|
||||
# Use a regular expression to match the image page links
|
||||
def matchPicUrl(html):
    """Collect the ``/tupian...`` link targets found in a page.

    Args:
        html: HTML text to scan.

    Returns:
        A list holding the captured ``href`` value of every
        ``<a href="/tupian..."`` occurrence, in the order they appear.
    """
    # The non-greedy group stops at the first closing quote of the attribute.
    link_finder = re.compile(r'<a href="(/tupian.*?)"')
    return link_finder.findall(html)
|
||||
|
||||
|
||||
# Match the image's download address
|
||||
def openPicurl(picurl1, filePath, index):
    """Open an image detail page and download the first image it references.

    Args:
        picurl1: Site-relative path of the detail page.
        filePath: Directory the image is saved into.
        index: Sequence number used in the saved file name and in messages.

    Raises:
        requests.RequestException: On a network failure, or when the server
            does not answer within the timeout.
    """
    # Strip any leading slash so the joined URL never contains "//" after
    # the host, whether or not the caller's path starts with "/".
    url = "https://pic.netbian.com/" + picurl1.lstrip("/")
    req = requests.get(url, timeout=30)
    # Only the first matching link is needed.
    result = re.findall(r'src="(/uploads.*?\.jpg)"', req.text)
    if not result:
        # No image link on the page: skip it instead of failing on result[0].
        print("第{}张图片未找到下载地址,已跳过".format(index))
        return
    download(result[0], index, filePath)
|
||||
|
||||
|
||||
# Download the image
|
||||
def download(picurl, index, filePath):
    """Download one image and store it as ``<filePath>/第<index>张图.jpg``.

    Args:
        picurl: Site-relative path of the image; appended to the host as-is.
        index: Sequence number used in the file name and in messages.
        filePath: Existing directory to write the image into.

    Raises:
        requests.RequestException: On a network failure, or when the server
            does not answer within the timeout.
        OSError: If the target file cannot be written.
    """
    url = "https://pic.netbian.com" + picurl
    req = requests.get(url, timeout=30)
    if not req.ok:
        # Do not save an HTTP error page under a .jpg name.
        print("第{}张图片下载失败(HTTP {})".format(index, req.status_code))
        return
    # The context manager closes the file even if the write fails.
    with open("{}/第{}张图.jpg".format(filePath, index), "wb") as fb:
        fb.write(req.content)
    print("第{}张图片下载完成".format(index))
|
||||
|
||||
|
||||
def main():
    """Interactively scrape listing pages and save their images.

    Asks for an upper page number ``n`` and walks pages 2..n inclusive. For
    each page it fetches the listing, extracts the detail-page links, creates
    a timestamped folder under ``d:\\spiderFile`` and downloads the images
    one after another. The total elapsed time is printed at the end.

    Raises:
        ValueError: If the entered page count is not an integer.
    """
    print("声明:仅供参考学习!!!!")
    print("最大页面数为 1000")
    # int() rather than eval(): eval would execute any expression typed at
    # the prompt, while only a plain page number is expected here.
    n = int(input("请输入爬取页面数="))
    start = time.time()
    # NOTE(review): iteration starts at page 2; presumably the first page
    # lives at a different URL than index_<n>.html - confirm.
    for i in range(2, n + 1):
        # Fetch the listing page and extract the detail-page links.
        html = work(i)
        picUrls_1 = matchPicUrl(html)
        timeStr = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        # Escaped backslashes: the former '\s' was an invalid escape sequence.
        filepath = 'd:\\spiderFile\\' + timeStr
        # exist_ok avoids a crash when two pages are handled within the same
        # second and therefore map to the same timestamped folder.
        os.makedirs(filepath, exist_ok=True)
        print("文件夹创建成功")
        # Open every detail page and download its image; numbering is 1-based.
        for index, picurl in enumerate(picUrls_1, start=1):
            openPicurl(picurl, filepath, index)
        print("图片保存在" + filepath)
    end = time.time()
    print("下载完成!")
    print("一共耗时: {} s".format(end - start))
|
||||
# Start the scraper only when this file is executed as a script, so that
# importing the module does not trigger the interactive prompt.
if __name__ == "__main__":
    main()
|
||||
|
Loading…
Reference in new issue