Compare commits
1 Commit
Author | SHA1 | Date |
---|---|---|
psjyva5l8 | d81717feb7 | 3 years ago |
@@ -0,0 +1,76 @@
import os
import datetime
import time
import re

import requests
from concurrent import futures  # needed by the (commented-out) thread-pool variant below

Max_Workers = 24  # maximum number of worker threads


# Fetch one listing page of the target site
def work(index):
    url = 'https://pic.netbian.com/index_' + str(index) + '.html'
    req = requests.get(url)
    return req.text


# Use a regular expression to match the image detail-page links
def matchPicUrl(html):
    # regexp = r'src="(/uploads.*?\.jpg)"'
    regexp = r'<a href="(/tupian.*?)"'
    pattern = re.compile(regexp)
    result = re.findall(pattern, html)
    return result


# Resolve the image's actual download address from its detail page
def openPicurl(picurl1, filePath, index):
    url = "https://pic.netbian.com" + picurl1
    req = requests.get(url)
    regexp = r'src="(/uploads.*?\.jpg)"'
    pattern = re.compile(regexp)
    # Only the first matched link is needed
    result = re.findall(pattern, req.text)
    download(result[0], index, filePath)


# Download one image
def download(picurl, index, filePath):
    url = "https://pic.netbian.com" + picurl
    req = requests.get(url)
    fb = open("{}/第{}张图.jpg".format(filePath, index), "wb")
    fb.write(req.content)
    fb.flush()
    fb.close()
    print("第{}张图片下载完成".format(index))


def main():
    print("声明:仅供参考学习!!!!")
    print("最大页面数为 1000")
    n = int(input("请输入爬取页面数="))
    start = time.time()
    # Listing pages start at index_2.html; the first page uses a different URL
    for i in range(2, n + 1):
        # Fetch the listing page content
        html = work(i)

        picUrls_1 = matchPicUrl(html)
        timeStr = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d-%H-%M-%S')
        filepath = 'd:\\spiderFile\\' + timeStr
        # Create the nested directories
        os.makedirs(filepath, exist_ok=True)
        print("文件夹创建成功")
        # Open each image address and download the image
        # workers = min(Max_Workers, len(picUrls_1))
        # with futures.ThreadPoolExecutor(workers) as executor:
        #     res = executor.map(download())
        for index in range(len(picUrls_1)):
            openPicurl(picUrls_1[index], filepath, index + 1)
        print("图片保存在" + filepath)
    end = time.time()
    print("下载完成!")
    print("一共耗时: {} s".format(end - start))


main()
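The commented-out `workers` / `futures.ThreadPoolExecutor` lines in `main()` hint at parallel downloads but are never wired up (`executor.map(download())` calls `download` with no arguments). A minimal sketch of how that idea could be completed, reusing the existing `openPicurl` helper and the `Max_Workers` cap; the helper name `download_page_concurrently` is illustrative and not part of the commit:

```python
from concurrent import futures

def download_page_concurrently(picUrls_1, filepath):
    # Cap the pool size at Max_Workers or the number of images, whichever is smaller
    workers = min(Max_Workers, len(picUrls_1))
    with futures.ThreadPoolExecutor(workers) as executor:
        # Submit one task per image; index + 1 keeps the original file numbering
        tasks = [
            executor.submit(openPicurl, picurl, filepath, index + 1)
            for index, picurl in enumerate(picUrls_1)
        ]
        for task in futures.as_completed(tasks):
            task.result()  # re-raise any exception from the worker thread
```

With a helper like this, the inner `for index in range(len(picUrls_1))` loop in `main()` could be replaced by a single call per page.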