Compare commits
1 Commit
Author | SHA1 | Date |
---|---|---|
psjyva5l8 | d81717feb7 | 3 years ago |
@@ -0,0 +1,76 @@
import os
import datetime
import time
import re

import requests
from concurrent import futures  # needed by the (commented-out) thread-pool variant below

Max_Workers = 24  # maximum number of worker threads


# Fetch one listing page of the target site
def work(index):
    url = 'https://pic.netbian.com/index_' + str(index) + '.html'
    req = requests.get(url)
    return req.text


# Use a regular expression to match the image detail-page links
def matchPicUrl(html):
    # regexp = r'src="(/uploads.*?\.jpg)"'
    regexp = r'<a href="(/tupian.*?)"'
    pattern = re.compile(regexp)
    result = re.findall(pattern, html)
    return result


# Resolve the image's actual download address from its detail page
def openPicurl(picurl1, filePath, index):
    url = "https://pic.netbian.com" + picurl1
    req = requests.get(url)
    regexp = r'src="(/uploads.*?\.jpg)"'
    pattern = re.compile(regexp)
    # Only the first matched link is needed
    result = re.findall(pattern, req.text)
    download(result[0], index, filePath)


# Download one image
def download(picurl, index, filePath):
    url = "https://pic.netbian.com" + picurl
    req = requests.get(url)
    fb = open("{}/第{}张图.jpg".format(filePath, index), "wb")
    fb.write(req.content)
    fb.flush()
    fb.close()
    print("第{}张图片下载完成".format(index))


def main():
    print("声明:仅供参考学习!!!!")
    print("最大页面数为 1000")
    n = int(input("请输入爬取页面数="))
    start = time.time()
    # Listing pages start at index_2.html; the first page uses a different URL
    for i in range(2, n + 1):
        # Fetch the listing page content
        html = work(i)

        picUrls_1 = matchPicUrl(html)
        timeStr = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d-%H-%M-%S')
        filepath = 'd:\\spiderFile\\' + timeStr
        # Create the nested directories
        os.makedirs(filepath, exist_ok=True)
        print("文件夹创建成功")
        # Open each image address and download the image
        # workers = min(Max_Workers, len(picUrls_1))
        # with futures.ThreadPoolExecutor(workers) as executor:
        #     res = executor.map(download())
        for index in range(len(picUrls_1)):
            openPicurl(picUrls_1[index], filepath, index + 1)
        print("图片保存在" + filepath)
    end = time.time()
    print("下载完成!")
    print("一共耗时: {} s".format(end - start))


main()
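The commented-out `workers` / `futures.ThreadPoolExecutor` lines in `main()` hint at parallel downloads but are never wired up (`executor.map(download())` calls `download` with no arguments). A minimal sketch of how that idea could be completed, reusing the existing `openPicurl` helper and the `Max_Workers` cap; the helper name `download_page_concurrently` is illustrative and not part of the commit:

```python
from concurrent import futures

def download_page_concurrently(picUrls_1, filepath):
    # Cap the pool size at Max_Workers or the number of images, whichever is smaller
    workers = min(Max_Workers, len(picUrls_1))
    with futures.ThreadPoolExecutor(workers) as executor:
        # Submit one task per image; index + 1 keeps the original file numbering
        tasks = [
            executor.submit(openPicurl, picurl, filepath, index + 1)
            for index, picurl in enumerate(picUrls_1)
        ]
        for task in futures.as_completed(tasks):
            task.result()  # re-raise any exception from the worker thread
```

With a helper like this, the inner `for index in range(len(picUrls_1))` loop in `main()` could be replaced by a single call per page.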