Compare commits

...

1 Commit

Author SHA1 Message Date
psjyva5l8 d81717feb7 ADD file via upload
3 years ago

@ -0,0 +1,76 @@
import os
import datetime
import time
# NOTE(review): the commented-out thread-pool code in main() uses
# futures.ThreadPoolExecutor, which lives in concurrent.futures, not in
# asyncio — this line was probably meant to be `from concurrent import futures`.
from asyncio import futures
import requests
import re
Max_Workers = 24 # maximum worker-thread count (unused while the ThreadPoolExecutor code in main() is commented out)
# Fetch one listing page from the target site.
def work(index):
    """Return the HTML text of listing page *index* on pic.netbian.com.

    index: page number used to build the index_<n>.html URL.
    """
    url = 'https://pic.netbian.com/index_' + str(index) + '.html'
    # A timeout keeps the scraper from hanging forever on a stalled connection.
    req = requests.get(url, timeout=10)
    return req.text
# Pull every picture-detail-page link out of a listing page.
def matchPicUrl(html):
    """Return all site-relative ``/tupian...`` hrefs found in *html*."""
    detail_link = re.compile(r'<a href="(/tupian.*?)"')
    return detail_link.findall(html)
# Resolve a detail-page link to the actual image URL and download it.
def openPicurl(picurl1, filePath, index):
    """Open detail page *picurl1*, find its first /uploads/*.jpg image
    and save it via download().

    picurl1:  site-relative detail-page path (starts with "/tupian...").
    filePath: directory the image is written into.
    index:    1-based number used in the saved file's name.
    """
    # picurl1 already starts with "/", so don't append another slash to the host.
    url = "https://pic.netbian.com" + picurl1
    req = requests.get(url, timeout=10)
    regexp = r'src="(/uploads.*?\.jpg)"'
    result = re.findall(regexp, req.text)
    # A detail page without a matching image would crash result[0] with an
    # IndexError — skip it instead.
    if not result:
        return
    # Only the first match is needed: it is the picture shown on the page.
    download(result[0], index, filePath)
# Save one image to disk.
def download(picurl, index, filePath):
    """Download image *picurl* (site-relative path) into directory
    *filePath*, naming the file after *index*.
    """
    url = "https://pic.netbian.com" + picurl
    req = requests.get(url, timeout=10)
    # `with` guarantees the file handle is closed even if the write raises.
    with open("{}/第{}张图.jpg".format(filePath, index), "wb") as fb:
        fb.write(req.content)
    print("{}张图片下载完成".format(index))
def main():
    """Interactively scrape listing pages from pic.netbian.com and save
    every picture found into a timestamped folder under d:\\spiderFile.
    """
    print("声明:仅供参考学习!!!!")
    print("最大页面数为 1000")
    # int() instead of eval(): eval() would execute arbitrary user input.
    n = int(input("请输入爬取页面数="))
    start = time.time()
    # NOTE(review): paging starts at 2 — presumably page 1 of the site is
    # index.html rather than index_1.html; confirm against the site.
    for i in range(2, n + 1):
        # Fetch one listing page and collect its detail-page links.
        html = work(i)
        picUrls_1 = matchPicUrl(html)
        timeStr = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d-%H-%M-%S')
        # Doubled backslash: '\s' in the original was an invalid escape
        # sequence (same runtime value, but a SyntaxWarning on modern Python).
        filepath = 'd:\\spiderFile\\' + timeStr
        # exist_ok avoids a crash when two pages are processed within the
        # same second and map to the same timestamped folder.
        os.makedirs(filepath, exist_ok=True)
        print("文件夹创建成功")
        # Download every picture linked from this listing page.
        for index in range(len(picUrls_1)):
            openPicurl(picUrls_1[index], filepath, index + 1)
        print("图片保存在" + filepath)
    end = time.time()
    print("下载完成!")
    print("一共耗时: {} s".format(end - start))
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Loading…
Cancel
Save