You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
146 lines
5.6 KiB
146 lines
5.6 KiB
import os
|
|
import sys
|
|
import time
|
|
import json
|
|
import requests
|
|
import random
|
|
import urllib
|
|
import http.client
|
|
import csv
|
|
from bs4 import BeautifulSoup
|
|
try:
|
|
import ip as ipmanage
|
|
except:
|
|
from download import ip as ipmanage
|
|
|
|
class Downloader(object):
    """HTTP download helper built on ``requests`` with optional proxy support.

    Provides plain GET/POST helpers, a GET helper that routes through a
    randomly chosen proxy from ``self.ip_pool``, and a GET helper that routes
    through proxies listed in ``zhilian_ip.json``.
    """

    def __init__(self):
        # Proxy entries used by proxy_get(); each entry is indexed with the
        # keys 'ip' and 'port'. Filled by init_ip_pool().
        self.ip_pool = []
        # Headers sent with every request issued by this class.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
        }

    @staticmethod
    def _extract_body(response, kind):
        """Return the payload of a successful *response* in the requested form.

        ``"img"`` and ``"json"`` yield the raw bytes, ``"html"`` yields the
        bytes decoded as UTF-8 (undecodable bytes dropped), ``"text"`` yields
        ``response.text``. Any other *kind* yields ``None``.
        """
        if kind in ("img", "json"):
            return response.content
        if kind == "html":
            return response.content.decode("utf-8", "ignore")
        if kind == "text":
            return response.text
        return None

    @staticmethod
    def _zhilian_ip_json_path():
        """Return the location of ``zhilian_ip.json`` under the project root."""
        # NOTE(review): the backslash separator is Windows-specific; it is kept
        # byte-identical because other modules may build the same path string.
        return sys.path[0].split("quant-on-volume")[0] + "quant-on-volume\\zhilian_ip.json"

    def _load_zhilian_ips(self):
        """Read and return the proxy list stored in ``zhilian_ip.json``."""
        with open(self._zhilian_ip_json_path(), 'r', encoding="utf-8") as f:
            return json.load(f)

    # Initialize the IP pool
    def init_ip_pool(self, ip_num=100):
        """Populate ``self.ip_pool`` from the local cache or by crawling.

        If ``ip_pool.json`` next to the script exists and was modified less
        than an hour ago, its JSON content becomes the pool. Otherwise an
        ``ipmanage.IpManage`` is created with ``max_ip_num=ip_num``, its
        ``craw_ips()`` is run and its ``ip_pool`` is adopted.
        """
        # NOTE(review): Windows-style separator kept as in the original.
        cache_path = sys.path[0] + "\\ip_pool.json"
        try:
            cache_age = time.time() - os.path.getmtime(cache_path)
        except OSError:
            # Cache file missing or unreadable: treat as expired.
            cache_age = None

        if cache_age is not None and cache_age < 3600:
            # Pool has not expired: use the local cache.
            with open(cache_path, "r", encoding="utf8") as f:
                self.ip_pool = json.load(f)
        else:
            ip_manage = ipmanage.IpManage(max_ip_num=ip_num)
            ip_manage.craw_ips()
            self.ip_pool = ip_manage.ip_pool

    # GET request
    def requests_get(self, url, type, data=None):
        """Issue a GET request and return the body in the form named by *type*.

        *type* is one of ``"img"``, ``"html"``, ``"text"`` or ``"json"`` (see
        ``_extract_body``). Returns the string ``"0"`` when the status code is
        not 200, and ``None`` for an unrecognised *type*.
        """
        response = requests.get(url=url, headers=self.headers, data=data)
        if response.status_code != 200:
            print("Request Falied For Code: %s" % response.status_code)
            return "0"
        return self._extract_body(response, type)

    # POST request
    def requests_post(self, url, type, data=None):
        """Issue a POST request and return the body in the form named by *type*.

        Accepts the same *type* values as ``requests_get``. Returns ``None``
        when the status code is not 200 or *type* is unrecognised.
        """
        response = requests.post(url=url, headers=self.headers, data=data)
        if response.status_code != 200:
            print("Request Falied For Code: %s" % response.status_code)
            # NOTE(review): unlike requests_get/proxy_get this returns None,
            # not "0"; kept because callers may test the result for falsiness.
            return None
        return self._extract_body(response, type)

    def download_netease_csv(self, url, filepath):
        """Download *url* to *filepath* using ``urllib`` over HTTP/1.0."""
        # Imported here so the submodule is guaranteed to be loaded; a bare
        # ``import urllib`` does not make ``urllib.request`` available.
        import urllib.request

        # Force HTTP/1.0 for the transfer.
        # NOTE(review): this patches http.client.HTTPConnection process-wide
        # and is never reverted, so later connections are affected too.
        http.client.HTTPConnection._http_vsn = 10
        http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0'
        urllib.request.urlretrieve(url, filepath)

    # GET request through a proxy
    def proxy_get(self, url, type, data=None):
        """Issue a GET request through a random proxy from ``self.ip_pool``.

        Accepts the same *type* values as ``requests_get``. Returns ``"0"``
        when the pool is empty or the status code is not 200, and ``None`` for
        an unrecognised *type*.
        """
        if not self.ip_pool:
            # Nothing to pick from; report failure the same way as a bad status.
            print("Request Falied: proxy ip pool is empty")
            return "0"

        ip = random.choice(self.ip_pool)
        # Only the "http" scheme is mapped, so https URLs are not proxied.
        proxy = {
            "http": "http://%s:%s" % (ip['ip'], ip['port'])
        }
        print(proxy)
        response = requests.get(url, proxies=proxy, headers=self.headers, data=data)
        if response.status_code != 200:
            print("Request Falied For Code: %s" % response.status_code)
            return "0"
        return self._extract_body(response, type)

    # GET request using IPs obtained from the proxy-IP platform
    def zhilian_ip_proxy_get(self, url):
        """Fetch *url* through proxies from ``zhilian_ip.json`` until one works.

        Picks a random proxy per attempt (1 second timeout) and returns the raw
        response bytes of the first attempt that answers with status 200. A
        503 triggers ``init_zhilian_ip()`` and a reload of the proxy list.
        Retries indefinitely on any other failure.

        Raises:
            ValueError: if the proxy list is empty.
        """
        ips = self._load_zhilian_ips()
        while True:
            if not ips:
                raise ValueError("zhilian ip list is empty")
            ip = random.choice(ips)  # random proxy for this attempt
            address = "http://%s:%s" % (ip['IP'], ip['Port'])
            try:
                proxy = {
                    "http": address,
                    "https": address,
                }
                response = requests.get(url, proxies=proxy, headers=self.headers, timeout=1)
                if response.status_code == 200:
                    return response.content
                elif response.status_code == 503:
                    # Refresh the stored proxies and pick up the new list;
                    # otherwise the loop would keep using the stale one.
                    self.init_zhilian_ip()
                    ips = self._load_zhilian_ips()
                else:
                    print(address, ":FAILED-1")
                    continue
            except Exception:
                # Best-effort retry on any error, while still letting
                # KeyboardInterrupt/SystemExit stop the loop.
                print(address, ":FAILED-2")

    # Initialize the Zhilian proxy IPs
    def init_zhilian_ip(self):
        """Fetch a fresh proxy list from the API and store it in ``zhilian_ip.json``.

        The ``data`` member of the JSON response is written to the file.
        """
        url = "http://t.11jsq.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=1&fa=0&fetch_key=&qty=100&time=1&pro=&city=&port=1&format=json&ss=5&css=&dt=1&specialTxt=3&specialJson=&usertype=16"
        response = requests.get(url)
        ips = json.loads(response.content)['data']
        with open(self._zhilian_ip_json_path(), "w") as f:
            json.dump(ips, f)