# quant-on-volume/craw_data/dayline/netease_dayline.py

import sys
import json

import pandas as pd
from download import download
import math
import threading
import os
import random
import time
from tqdm import tqdm

class NeteaseDayline(object):
    """Download per-stock daily-line CSV files from NetEase Finance.

    Settings are read from ``const.json``: ``stock_list_file`` (CSV listing
    the stocks) and ``day_line_file_prefix['netease']`` (prefix prepended to
    ``<code>.csv`` when saving).  Stock codes are split into contiguous
    blocks, one worker thread per block.
    """

    def __init__(self, end_date='99999999', thread_num=1, timeout=10):
        """Load configuration and create the downloader.

        Args:
            end_date: Value sent as the ``end`` query parameter of the
                download URL.
            thread_num: Number of worker threads used by ``craw_by_threads``.
            timeout: Seconds to wait in ``join`` for each worker thread.

        Raises:
            OSError: If ``const.json`` cannot be opened.
            KeyError: If an expected key is missing from ``const.json``.
        """
        # sys.path[0] is the directory of the running script; stripping the
        # package sub-path yields the directory expected to hold const.json.
        const_path = sys.path[0].replace("\\craw_data\\dayline", "")
        with open(const_path + "\\const.json", "r", encoding='utf8') as f:
            consts = json.load(f)

        self.stock_list_file = consts['stock_list_file']         # CSV file listing all stocks
        self.save_path_prefix = consts['day_line_file_prefix']['netease']     # prefix of saved daily-line files
        self.end_date = end_date                                  # end date sent to the server
        self.thread_num = thread_num                              # number of worker threads
        self.timeout = timeout                                    # join timeout per thread (seconds)

        self.downloader = download.Downloader()                   # downloader instance

    def _read_stock_list(self):
        """Read the stock-list CSV, tolerating malformed rows.

        ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0;
        ``on_bad_lines`` is tried first and the legacy keyword is used only
        when the installed pandas does not accept it.
        """
        try:
            return pd.read_csv(self.stock_list_file, encoding="gbk", on_bad_lines="warn")
        except TypeError:
            return pd.read_csv(self.stock_list_file, encoding="gbk", error_bad_lines=False)

    # Controller entry point
    def entrance(self):
        """Download every stock in the list, retrying until ``is_complete``.

        Returns:
            ``False`` when the stock list cannot be read; otherwise ``None``
            once ``is_complete`` reports success.
        """
        try:
            df = self._read_stock_list()
        except Exception:
            print("ERROR Opening File: %s" % self.stock_list_file)
            return False

        # The first character of each code is a marker added for CSV
        # readability and is dropped here.
        codes = [code[1:] for code in df['股票代码']]
        self.craw_by_threads(codes)
        print("\n\n\n\nALL THREADS FINISHED")
        # NOTE(review): this loops forever if some code can never be
        # downloaded; consider bounding the number of retry rounds.
        while not self.is_complete(codes):
            pass

    @staticmethod
    def _split_ranges(total, parts):
        """Return ``(start, end)`` pairs splitting ``range(total)`` into at most ``parts`` chunks."""
        if total <= 0:
            return []
        step = math.ceil(total / max(1, parts))
        return [(start, min(start + step, total)) for start in range(0, total, step)]

    # Multi-threaded crawl
    def craw_by_threads(self, codes):
        """Download ``codes`` using up to ``self.thread_num`` daemon threads.

        Each thread is joined for at most ``self.timeout`` seconds.
        NOTE(review): a thread that outlives the timeout keeps running in the
        background, so a later retry round may overlap with it.
        """
        ranges = self._split_ranges(len(codes), self.thread_num)
        threads = [
            threading.Thread(
                target=self.craw_block,
                args=(start, end, codes, thread_id),
                daemon=True,
            )
            for thread_id, (start, end) in enumerate(ranges)
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join(timeout=self.timeout)

    @staticmethod
    def _log(log_fh, message):
        """Write one line to the log file, if one could be opened."""
        if log_fh is not None:
            log_fh.write(message + "\n")

    # Crawl one block of codes
    def craw_block(self, start, end, codes, thread_id):
        """Download ``codes[start:end]`` sequentially, logging progress.

        A failure on one code is logged and skipped.  If the log file cannot
        be opened, the error is printed and downloading proceeds unlogged.
        """
        time_start = time.perf_counter()
        block = codes[start:end] if end > 0 else codes[start:]
        # Per-thread log file
        log_file = "log\\netease_dayline\\thread_%s_%s.txt" % (thread_id, int(time.time()))
        try:
            # Status lines contain non-ASCII text, so fix the encoding.
            f = open(log_file, "w", encoding='utf8')
        except OSError:
            print("ERROR OPENING FILE: %s" % log_file)
            f = None
        try:
            for i, code in enumerate(tqdm(block)):
                status = "线程%s: %s下载中 [%s / %s] [%s / %s]" % (thread_id, code, i+1, len(block), time.perf_counter(), time_start)
                self._log(f, status)
                try:
                    # Perform the download
                    filepath = self.save_path_prefix + code + ".csv"
                    url = self.handle_netease_url(code)
                    self.downloader.download_netease_csv(url=url, filepath=filepath)
                except Exception:
                    error = "线程 %s 下载 %s 时出现错误" % (thread_id, code)
                    self._log(f, error)
                    continue
                # Random pause of up to two seconds between successful downloads.
                time.sleep(random.random()*2)
        finally:
            if f is not None:
                f.close()

    # Build the NetEase Finance daily-line download URL
    def handle_netease_url(self, code):
        """Return the download URL for ``code`` up to ``self.end_date``.

        Codes starting with ``0`` or ``3`` get the prefix ``1``, codes
        starting with ``6`` get ``0``, anything else gets no prefix
        (presumably the prefix selects the exchange — verify against the
        NetEase API).
        """
        code = str(code)
        if code.startswith(("0", "3")):
            netease_prefix = "1"
        elif code.startswith("6"):
            netease_prefix = "0"
        else:
            netease_prefix = ""
        return 'http://quotes.money.163.com/service/chddata.html?code=%s&end=%s&fields=TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;TURNOVER;VOTURNOVER;VATURNOVER;TCAP;MCAP' % (netease_prefix + code, self.end_date)

    # Verify that everything was downloaded; fetch whatever is missing
    def is_complete(self, codes):
        """Check the save directory and re-download missing codes.

        Returns:
            ``True`` if the file count matches or no code is missing;
            ``False`` after starting another download round for the
            missing codes.
        """
        print("\n正在校验文件是否完整")
        filelist = os.listdir(self.save_path_prefix)
        print("总共应下载 %s 个文件， 实际下载 %s 个" % (len(codes), len(filelist)))
        if len(filelist) == len(codes):
            return True
        # The first six characters of each file name are taken as the code.
        downloaded = {name[0:6] for name in filelist}
        need_to_download = [code for code in codes if code not in downloaded]
        if not need_to_download:
            # Unrelated extra files make the counts differ; nothing is
            # actually missing, so do not loop forever.
            return True
        # NOTE(review): retries always use 4 threads, overriding the
        # configured thread count.
        self.thread_num = 4
        self.craw_by_threads(need_to_download)
        return False

if __name__ == "__main__":
    # Script settings: number of worker threads and the per-thread join
    # timeout in seconds.  end_date is not passed, so the constructor
    # default applies.
    thread_num, timeout = 4, 10

    netease_dayline = NeteaseDayline(thread_num=thread_num, timeout=timeout)
    netease_dayline.entrance()