# (Repository web-viewer residue, not part of the program:)
# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
#
# 146 lines
# 5.6 KiB

import os
import sys
import time
import json
import requests
import random
import urllib
import http.client
import csv
from bs4 import BeautifulSoup
# The proxy-pool helper module is importable either directly (when this file's
# directory is on sys.path) or through the `download` package; try both.
try:
    import ip as ipmanage
except ImportError:
    from download import ip as ipmanage
class Downloader(object):
    """HTTP helper that issues GET/POST requests, optionally through a proxy.

    Attributes:
        ip_pool: list of proxy records (dicts with ``'ip'`` and ``'port'``
            keys, as read by ``proxy_get``); filled by ``init_ip_pool``.
        headers: request headers sent with every call (a fixed User-Agent).
    """

    # Maximum age, in seconds, of the on-disk proxy cache before re-crawling.
    IP_POOL_TTL_SECONDS = 3600

    # Default endpoint queried by ``init_zhilian_ip`` for a fresh proxy list.
    ZHILIAN_API_URL = "http://t.11jsq.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=1&fa=0&fetch_key=&qty=100&time=1&pro=&city=&port=1&format=json&ss=5&css=&dt=1&specialTxt=3&specialJson=&usertype=16"

    def __init__(self):
        self.ip_pool = []
        self.headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
        }

    @staticmethod
    def _read_body(response, kind, failure=None):
        """Extract the payload of *response* in the representation *kind*.

        Args:
            response: object exposing ``status_code``, ``content`` and ``text``.
            kind: ``"img"`` or ``"json"`` (raw bytes), ``"html"`` (bytes decoded
                as UTF-8, undecodable bytes dropped) or ``"text"``
                (``response.text``).
            failure: value returned when the status code is not 200.

        Returns:
            The payload, ``None`` for an unrecognised *kind*, or *failure*
            (after printing the status code) for a non-200 response.
        """
        if response.status_code != 200:
            print("Request Falied For Code: %s" % response.status_code)
            return failure
        if kind in ("img", "json"):
            return response.content
        if kind == "html":
            return response.content.decode("utf-8", "ignore")
        if kind == "text":
            return response.text
        return None

    @staticmethod
    def _zhilian_ip_path():
        """Return the path of the cached proxy list under ``quant-on-volume``."""
        # NOTE(review): the backslash separator is kept as-is because other
        # code may read the same file by this exact path — confirm before
        # switching to os.path.join.
        return sys.path[0].split("quant-on-volume")[0] + "quant-on-volume\\zhilian_ip.json"

    def _load_zhilian_ips(self):
        """Read the cached proxy list from disk and return it."""
        with open(self._zhilian_ip_path(), 'r', encoding="utf-8") as f:
            return json.load(f)

    # Initialise the IP pool
    def init_ip_pool(self, ip_num=100):
        """Fill ``self.ip_pool`` from the local cache or by crawling.

        The cache file ``ip_pool.json`` next to ``sys.path[0]`` is used when it
        exists and is younger than ``IP_POOL_TTL_SECONDS``; otherwise
        ``ipmanage.IpManage`` is asked to crawl up to *ip_num* proxies.
        """
        # NOTE(review): path separator kept byte-identical; the cache writer is
        # not visible from this file.
        cache_path = sys.path[0] + "\\ip_pool.json"
        try:
            cache_age = time.time() - os.path.getmtime(cache_path)
        except OSError:
            # Missing or unreadable cache file: fall through to crawling.
            cache_age = None

        if cache_age is not None and cache_age < self.IP_POOL_TTL_SECONDS:
            # Cache still fresh: load it (and close the handle promptly).
            with open(cache_path, "r", encoding="utf8") as f:
                self.ip_pool = json.load(f)
        else:
            ip_manage = ipmanage.IpManage(max_ip_num=ip_num)
            ip_manage.craw_ips()
            self.ip_pool = ip_manage.ip_pool

    # GET request
    def requests_get(self, url, type, data=None):
        """GET *url* and return its body as *type*, or ``"0"`` on a non-200 status.

        See ``_read_body`` for the accepted *type* values.
        """
        response = requests.get(url=url, headers=self.headers, data=data)
        return self._read_body(response, type, failure="0")

    # POST request
    def requests_post(self, url, type, data=None):
        """POST *data* to *url* and return the body as *type*.

        Returns ``None`` on a non-200 status (unlike ``requests_get``, which
        returns ``"0"``); that difference is preserved for existing callers.
        """
        response = requests.post(url=url, headers=self.headers, data=data)
        return self._read_body(response, type, failure=None)

    def download_netease_csv(self, url, filepath):
        """Download *url* to *filepath* using HTTP/1.0.

        The HTTP version is forced on ``http.client.HTTPConnection`` only for
        the duration of the download and restored afterwards, so other
        connections in the process are not permanently downgraded.
        """
        # `import urllib` alone does not guarantee the `request` submodule is
        # loaded, so import it explicitly here.
        import urllib.request

        connection_cls = http.client.HTTPConnection
        saved_version = (connection_cls._http_vsn, connection_cls._http_vsn_str)
        # NOTE(review): presumably HTTP/1.0 is forced to avoid chunked
        # transfer encoding from the server — confirm.
        connection_cls._http_vsn = 10
        connection_cls._http_vsn_str = 'HTTP/1.0'
        try:
            urllib.request.urlretrieve(url, filepath)
        finally:
            connection_cls._http_vsn, connection_cls._http_vsn_str = saved_version

    # GET request through a proxy from the pool
    def proxy_get(self, url, type, data=None):
        """GET *url* through a randomly chosen proxy from ``self.ip_pool``.

        Returns the body as *type* (see ``_read_body``) or ``"0"`` on a non-200
        status.

        Raises:
            ValueError: if ``self.ip_pool`` is empty.
        """
        if not self.ip_pool:
            raise ValueError("ip_pool is empty; call init_ip_pool() first")
        ip = random.choice(self.ip_pool)
        # Only the "http" scheme is proxied here, as in the original code.
        proxy = {
            "http": "http://%s:%s" % (ip['ip'], ip['port'])
        }
        print(proxy)
        response = requests.get(url, proxies=proxy, headers=self.headers, data=data)
        return self._read_body(response, type, failure="0")

    # GET request through IPs supplied by the proxy-IP platform
    def zhilian_ip_proxy_get(self, url):
        """GET *url* through random cached proxies until one answers 200.

        Proxies are read from the cached list on disk. A 503 response triggers
        a refresh of that list via ``init_zhilian_ip``, and the refreshed
        proxies are used for subsequent attempts. The loop retries
        indefinitely; each failed attempt is reported on stdout.

        Returns:
            The raw response body (bytes) of the first 200 response.

        Raises:
            ValueError: if the cached proxy list is empty.
        """
        ips = self._load_zhilian_ips()
        if not ips:
            raise ValueError("zhilian proxy list is empty; call init_zhilian_ip() first")

        while True:
            ip = random.choice(ips)  # random proxy
            address = "http://%s:%s" % (ip['IP'], ip['Port'])
            try:
                response = requests.get(
                    url,
                    proxies={"http": address, "https": address},
                    headers=self.headers,
                    timeout=1,
                )
                if response.status_code == 200:
                    return response.content
                if response.status_code == 503:
                    # Refresh the list and actually switch to it; keep the old
                    # one if the refresh came back empty.
                    refreshed = self.init_zhilian_ip()
                    if refreshed:
                        ips = refreshed
                else:
                    print(address, ":FAILED-1")
            except Exception:
                # Best-effort retry on any request/refresh error, while still
                # letting KeyboardInterrupt/SystemExit stop the loop.
                print(address, ":FAILED-2")

    # Initialise the proxy-platform IP list
    def init_zhilian_ip(self, url=None):
        """Fetch a fresh proxy list and store it in the on-disk cache.

        Args:
            url: API endpoint to query; defaults to ``ZHILIAN_API_URL``.

        Returns:
            The value of the ``'data'`` field of the JSON response, which is
            also what gets written to the cache file.
        """
        response = requests.get(url or self.ZHILIAN_API_URL)
        ips = json.loads(response.content)['data']
        with open(self._zhilian_ip_path(), "w", encoding="utf-8") as f:
            json.dump(ips, f)
        return ips