ADD file via upload

3 years ago · d81717feb7
parent f9b1197f3c
commit d81717feb7
1 changed files with 76 additions and 0 deletions
--- a/spider2.py
+++ b/spider2.py
@ -0,0 +1,76 @@
+import os
+import datetime
+import time
+from asyncio import futures
+
+import requests
+import re
+
+Max_Workers = 24  # maximum number of threads
+
+def work(index, timeout=10):
+    """Fetch the HTML of one listing page of the target site.
+
+    Args:
+        index: Page number substituted into ``index_<index>.html``.
+        timeout: Seconds to wait for the server before giving up.
+
+    Returns:
+        The response body decoded as text.
+
+    Raises:
+        requests.exceptions.Timeout: If the server does not answer in time.
+    """
+    url = 'https://pic.netbian.com/index_' + str(index) + '.html'
+    # Without an explicit timeout a stalled connection blocks forever.
+    req = requests.get(url, timeout=timeout)
+    return req.text
+
+
+def matchPicUrl(html):
+    """Return every picture detail-page link found in *html*.
+
+    A link is the ``href`` value of an ``<a>`` tag whose target starts with
+    ``/tupian``. The result is a list of the captured paths, empty when
+    nothing matches.
+    """
+    return re.compile(r'<a href="(/tupian.*?)"').findall(html)
+
+
+def openPicurl(picurl1, filePath, index):
+    """Open a picture's detail page and download the first image it links.
+
+    Args:
+        picurl1: Site-relative path of the detail page.
+        filePath: Directory the image is saved into.
+        index: Number used in the saved file name and progress messages.
+    """
+    # Strip the leading slash so the joined URL has no "//" after the host.
+    url = "https://pic.netbian.com/" + picurl1.lstrip("/")
+    req = requests.get(url, timeout=10)
+    # Only the first matching link is needed.
+    found = re.search(r'src="(/uploads.*?\.jpg)"', req.text)
+    if found is None:
+        # Indexing an empty match list here used to raise IndexError.
+        print("第{}张图片未找到下载地址，已跳过".format(index))
+        return
+    download(found.group(1), index, filePath)
+
+
+def download(picurl, index, filePath):
+    """Download one image and save it as a numbered .jpg inside *filePath*.
+
+    Args:
+        picurl: Site-relative path of the image, appended to the host name.
+        index: Number used in the file name and the progress message.
+        filePath: Existing directory that receives the file.
+
+    Raises:
+        requests.exceptions.HTTPError: If the server answers with an error
+            status code.
+    """
+    url = "https://pic.netbian.com" + picurl
+    req = requests.get(url, timeout=10)
+    # Fail before creating the file so an error page is never saved as a .jpg.
+    req.raise_for_status()
+    # The context manager closes the file even if the write fails.
+    with open("{}/第{}张图.jpg".format(filePath, index), "wb") as fb:
+        fb.write(req.content)
+    print("第{}张图片下载完成".format(index))
+
+
+def main():
+    """Prompt for a page count, then download the pictures of pages 2..n.
+
+    Each listing page gets its own timestamped folder inside the
+    ``spiderFile`` directory on drive D. The total elapsed time is printed
+    once all pages are processed.
+    """
+    print("声明：仅供参考学习！！！！")
+    print("最大页面数为 1000")
+    # int() replaces the original eval(): eval would execute whatever the
+    # user typed as Python code.
+    n = int(input("请输入爬取页面数="))
+    start = time.time()
+    # NOTE(review): the loop starts at page 2, so entering n fetches n - 1
+    # pages; presumably page 1 lives at a different URL -- confirm.
+    for i in range(2, n + 1):
+        # Fetch the listing page and collect its detail-page links.
+        html = work(i)
+        picUrls_1 = matchPicUrl(html)
+        timeStr = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
+        # Backslashes are escaped; a bare '\s' is an invalid escape sequence.
+        filepath = 'd:\\spiderFile\\' + timeStr
+        # exist_ok avoids a crash when two pages are handled in the same
+        # second and therefore map to the same folder name.
+        os.makedirs(filepath, exist_ok=True)
+        print("文件夹创建成功")
+        # Visit each detail page and download its picture, numbering from 1.
+        for number, picUrl in enumerate(picUrls_1, start=1):
+            try:
+                openPicurl(picUrl, filepath, number)
+            except (requests.RequestException, IndexError) as err:
+                # One broken picture should not abort the whole crawl.
+                print("第{}张图片下载失败: {}".format(number, err))
+        print("图片保存在"+filepath)
+    end = time.time()
+    print("下载完成！")
+    print("一共耗时: {} s".format(end - start))
+# Run the crawler only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
+