# quant-on-volume/analyze_data/hly_vol.py

import json
import math
import os
import sys
import threading
import time
from collections import deque
from functools import reduce

import pandas as pd
from tqdm import tqdm

class HlyScore(object):

    def __init__(self, serial_day_arr=[7], expectation=[5,10,15,20]):
        const_path = sys.path[0].replace("\\analyze_data", "")
        f = open(const_path + "\\const.json", "r", encoding="utf8")
        self.consts = json.loads(f.read())
        self.dayline_file_prefix = self.consts['day_line_file_prefix']['netease_clean']
        self.with_my_result = self.consts['path']['result']['hly']

        self.serial_day_arr = serial_day_arr    # 连续计算的天数
        self.expectation = expectation          # ma 数组
        self.res_matrix = {}
        self.res_matrix_reduce = {}

    # Compute the daily score for a single stock
    def calculate_score_one(self,code):
        column_name = 'hly_score'
        try:
            df = pd.read_csv("%s%s.csv" % (self.dayline_file_prefix, code), encoding="gbk", error_bad_lines=False)
        except:
            print("ERROR WHILE OPENING %s.csv" % code)
        df = df[::-1]
        df[column_name] = ""
        last_row = pd.Series([])
        for index,row in df.iterrows():
            if last_row.empty: score = 0
            else: score = self.get_score(last_row,row)
            df.loc[index, column_name] = score
            last_row = row
        df[::-1].to_csv("%s%s.csv" % (self.with_my_result, code), encoding="gbk", index=None)

    def calculate_score_all(self):
        """Run ``calculate_score_one`` for every entry of the day-line folder.

        The first six characters of each directory entry are used as the
        stock code. A progress bar is shown and the elapsed wall-clock time is
        printed at the end.
        """
        started_at = time.time()
        entries = os.listdir(self.dayline_file_prefix)
        for entry in tqdm(entries):
            self.calculate_score_one(entry[0:6])
        elapsed = time.time() - started_at  # total run time in seconds
        print('time cost: %s Seconds' % elapsed)

    # Compute the rolling score sums for a single stock
    def calculate_sum_one(self, code):
        colmun_name = "sum_%s"
        try:
            df = pd.read_csv("%s%s.csv" % (self.with_my_result, code), encoding="gbk", error_bad_lines=False)
        except:
            print("ERROR WHILE OPENING %s.csv" % code)
        sum_arr = {}
        for sum in self.serial_day_arr:
            df[colmun_name % sum] = ""
            sum_arr[sum] = deque([])

        df = df[::-1]
        for index,row in df.iterrows():
            for day in sum_arr:
                sum_arr[day].append(row['hly_score'])
                if len(sum_arr[day]) < int(day):
                    df.loc[index,colmun_name % day] = 0
                else:
                    df.loc[index,colmun_name % day] = reduce(lambda x,y:x+y,sum_arr[day])
                    sum_arr[day].popleft()
        df[::-1].to_csv("%s%s.csv" % (self.with_my_result, code), encoding="gbk",index=None)

    def calculate_sum_all(self):
        """Run ``calculate_sum_one`` for every entry of the day-line folder.

        Stock codes are the first six characters of each entry name in
        ``self.dayline_file_prefix``. Shows a progress bar and prints the
        elapsed wall-clock time when finished.
        """
        started_at = time.time()
        entries = os.listdir(self.dayline_file_prefix)
        for entry in tqdm(entries):
            self.calculate_sum_one(entry[0:6])
        elapsed = time.time() - started_at  # total run time in seconds
        print('time cost: %s Seconds' % elapsed)

    def get_score(self, yesterday, today):
        """Score one day against the previous one.

        The sign follows the closing price ('收盘价'): positive when it rose,
        negative when it fell, and 0 otherwise. The magnitude is 2 when the
        volume ('成交量') grew compared with ``yesterday`` and 1 when it did not.

        Args:
            yesterday: Mapping-like row of the previous day.
            today: Mapping-like row of the current day.

        Returns:
            One of ``2, 1, 0, -1, -2``.
        """
        previous_close = yesterday['收盘价']
        current_close = today['收盘价']
        if previous_close < current_close:
            direction = 1
        elif previous_close > current_close:
            direction = -1
        else:
            return 0
        weight = 2 if yesterday['成交量'] < today['成交量'] else 1
        return direction * weight

    def analyze_one(self, code):
        try:
            df = pd.read_csv("%s%s.csv" % (self.with_my_result, code), encoding="gbk", error_bad_lines=False)
        except:
            print("ERROR WHILE OPENING %s.csv" % code)
        colmun_name = "MA_%s"
        df = df[::-1]
        ma_arr = {}
        for sum in self.expectation:
            df[colmun_name % sum] = ""
            ma_arr[sum] = deque([])

        for index,row in df.iterrows():
            for ma in ma_arr:
                ma_arr[ma].append(row['收盘价'])
                if len(ma_arr[ma]) < int(ma):
                    df.loc[index,colmun_name % ma] = 0
                else:
                    df.loc[index,colmun_name % ma] = self.get_ma(ma_arr[ma])
                    ma_arr[ma].popleft()
        df[::-1].to_csv("%s%s.csv" % (self.with_my_result, code), encoding="gbk",index=None)

    def analyze_all(self):
        """Run ``analyze_one`` for every entry of the day-line folder.

        Stock codes are the first six characters of each entry name in
        ``self.dayline_file_prefix``. Shows a progress bar and prints the
        elapsed wall-clock time when finished.
        """
        started_at = time.time()
        entries = os.listdir(self.dayline_file_prefix)
        for entry in tqdm(entries):
            self.analyze_one(entry[0:6])
        elapsed = time.time() - started_at
        print('time cost: %s Seconds' % elapsed)

    def get_ma(self, deq):
        res = 0
        for i in deq:
            res += i
        return res / len(deq)

    def transform_js(self):
        """Export every result CSV as a ``module.exports = [...]`` JS file.

        Each entry of ``self.with_my_result`` is read as a GBK CSV, columns
        whose name starts with ``"ma"`` are dropped, and every remaining row
        is written as the ``str()`` of its dict to
        ``<with_my_result>js\\<first six chars of the entry>.js``.

        Entries that cannot be read as CSV are reported and skipped. The read
        happens before the output file is opened, so a failure no longer
        leaves an open handle or exports the rows of the previous file.
        """
        codelist = os.listdir(self.with_my_result)
        # Every written item is followed by a newline and a 16-space indent,
        # matching the layout of the existing exported files.
        separator = "\n" + " " * 16
        for i in tqdm(range(len(codelist))):
            file = codelist[i]
            try:
                # on_bad_lines="warn" replaces the removed
                # error_bad_lines=False (requires pandas >= 1.3).
                df = pd.read_csv(self.with_my_result + file, encoding="gbk", on_bad_lines="warn")
            except (OSError, ValueError):
                print("ERROR OPENING %s"%file)
                continue

            ma_columns = [col for col in df.columns if col[0:2] == "ma"]
            df = df.drop(columns=ma_columns)

            with open("%sjs\\%s.js" % (self.with_my_result, file[0:6]), "w", encoding="utf8") as f:
                f.write("module.exports = [" + separator)
                for _, row in df.iterrows():
                    f.write("%s," % (row.to_dict(),) + separator)
                f.write("]")

    def count_one(self, code):
        try:
            df = pd.read_csv("%s%s.csv" % (self.with_my_result, code), encoding="gbk", error_bad_lines=False)
        except:
            print("ERROR OPENING %s.csv " % code)
            return
        # 1. 读取 表头
        sum_arr = []
        ma_arr = []
        for col in df.columns:
            if col[0:3] == "sum": sum_arr.append(col)
            elif col[0:2] == "MA": ma_arr.append(col)
        for index,row in df[::-1].iterrows():
            # 2. 判断改天的每个打分，对应每个展望值是否呈上升趋势，True，更新矩阵
            # 2.1 遍历打分
            for daysum in sum_arr:
                # 2.2 判断改日打分是否存在
                # !!!!!!!!!!!!!!!!!!!!!!!!
                # 这里在计算时默认天数不足为 0 ，所以打分一定存在，不作判断，可能会影响结果

                # 2.3 遍历 MA
                for ma in ma_arr:
                    # 2.4 判断 ma 日后的 ma 值是否大于当日的 ma
                    # 2.4.1 判断 ma 日后的数据是否还没有
                    if index - int(ma[3:]) < 0:
                        continue
                    if df.loc[index-int(ma[3:]), ma] > row[ma]:
                        self.res_plus_plus(daysum, ma)
                    else:
                        self.res_reduce_reduce(daysum, ma)

    # Shared handling of the counter increments
    def res_plus_plus(self, sum_key, ma_key, score_key):
        """Increment the counter at ``res_matrix[sum_key][ma_key]["_<score_key>"]``.

        Missing intermediate dictionaries and the counter itself are created
        on first use.
        """
        label = "_%s" % score_key
        per_ma = self.res_matrix.setdefault(sum_key, {})
        counters = per_ma.setdefault(ma_key, {})
        counters[label] = counters.get(label, 0) + 1

    def res_reduce_reduce(self, sum_key, ma_key, score_key):
        """Increment the counter at ``res_matrix_reduce[sum_key][ma_key]["_<score_key>"]``.

        Missing intermediate dictionaries and the counter itself are created
        on first use.
        """
        label = "_%s" % score_key
        per_ma = self.res_matrix_reduce.setdefault(sum_key, {})
        counters = per_ma.setdefault(ma_key, {})
        counters[label] = counters.get(label, 0) + 1

    def count(self):
        """Run ``count_one`` for every stock and dump both counter matrices.

        Stock codes are the first six characters of each entry name in
        ``self.dayline_file_prefix``. After the loop the elapsed time and both
        matrices are printed, then written as JSON files into ``sys.path[0]``.
        """
        started_at = time.time()
        entries = os.listdir(self.dayline_file_prefix)
        for entry in tqdm(entries):
            self.count_one(entry[0:6])
        elapsed = time.time() - started_at
        print('time cost: %s Seconds' % elapsed)
        print(self.res_matrix)
        print(self.res_matrix_reduce)

        # (path template, matrix) pairs, written in this order.
        dumps = (
            ("%s\\hly_count_res_plus.json", self.res_matrix),
            ("%s\\hly_count_res_reduce.json", self.res_matrix_reduce),
        )
        for template, matrix in dumps:
            out = open(template % sys.path[0], "w", encoding="utf-8")
            out.write(json.dumps(matrix, ensure_ascii=False))
            out.close()

    # Combine all steps into a single pass per file to reduce I/O
    def do_all_job_one(self, code):
        # 1. 打开文件
        try:
            df = pd.read_csv("%s%s.csv" % (self.dayline_file_prefix, code), encoding="gbk",  error_bad_lines=False, index_col=0)
        except:
            print("ERROR OPENING %s.csv" % code)
            return
        # 2. 计算打分
        column_name = 'hly_score'
        df = df[::-1]
        df[column_name] = ""
        last_row = pd.Series([])
        for index,row in df.iterrows():
            if last_row.empty: score = 0
            else: score = self.get_score(last_row,row)
            df.loc[index, column_name] = score
            last_row = row

        # 3. 计算加分和, 同时计算 MA
        column_name_sum = "sum_%s"
        column_name_ma = "ma_%s"

        sum_arr = {}    # 用来保存每个天数队列
        ma_arr = {}

        # 3.1 遍历所有天数和，新建列， 置为空白
        for day_sum in self.serial_day_arr:
            df[column_name_sum % day_sum] = ""
            sum_arr[day_sum] = deque([])
        for sum_ma in self.expectation:
            df[column_name_ma % sum_ma] = ""
            ma_arr[sum_ma] = deque([])


        for index,row in df.iterrows():
            # 3.2 对每个 "和" 的天进行计算
            for day in sum_arr:
                sum_arr[day].append(row['hly_score'])
                if len(sum_arr[day]) < int(day):
                    df.loc[index,column_name_sum % day] = 0
                else:
                    df.loc[index,column_name_sum % day] = reduce(lambda x,y:x+y,sum_arr[day])
                    sum_arr[day].popleft()

            # 3.3 对 每个 MA 的天进行计算
            for ma in ma_arr:
                ma_arr[ma].append(row["收盘价"])
                if len(ma_arr[ma]) < int(ma):
                    df.loc[index, column_name_ma % ma] = 0
                else:
                    df.loc[index,column_name_ma % ma] = self.get_ma(ma_arr[ma])
                    ma_arr[ma].popleft()

        # 4. 进行统计

        # 4.1 读取表头
        sum_arr = []
        ma_arr = []
        for col in df.columns:
            if col[0:3] == "sum": sum_arr.append(col)
            elif col[0:2] == "ma": ma_arr.append(col)
        # print(sum_arr)
        # print(ma_arr)
        for index, row in df.iterrows():
            for daysum in sum_arr:
                for ma in ma_arr:
                    # 4.2 判断 ma 日后的 ma 值是否大于当日的 ma
                    # 4.2.1 判断 ma 日后的数据是否还没有
                    if (index-int(ma[3:])) < 0:
                        continue
                    if df.loc[index-int(ma[3:]), ma] > row[ma]:
                        self.res_plus_plus(daysum, ma, row[daysum])
                    else:
                        self.res_reduce_reduce(daysum, ma, row[daysum])

        # 5. 全部处理完毕，保存
        # f = open("%s\\tmp\\%s.json" % (sys.path[0],code), "w", encoding="utf-8")
        # f.write(json.dumps(self.res_matrix, ensure_ascii=False))
        # f.close()
        # print(self.res_matrix)
        # print(self.res_matrix_reduce)
        df[::-1].to_csv("%s%s.csv"%(self.with_my_result, code),encoding="gbk",index=None)

    def yes_run_me(self):
        """Run ``do_all_job_one`` for every stock and dump both counter matrices.

        Stock codes are the first six characters of each entry name in
        ``self.dayline_file_prefix``. After the loop the elapsed time and both
        matrices are printed, then written as JSON files into ``sys.path[0]``.
        """
        started_at = time.time()
        entries = os.listdir(self.dayline_file_prefix)
        for entry in tqdm(entries):
            self.do_all_job_one(entry[0:6])
        elapsed = time.time() - started_at
        print('time cost: %s Seconds' % elapsed)
        print(self.res_matrix)
        print(self.res_matrix_reduce)

        # (path template, matrix) pairs, written in this order.
        dumps = (
            ("%s\\hly_count_res_plus.json", self.res_matrix),
            ("%s\\hly_count_res_reduce.json", self.res_matrix_reduce),
        )
        for template, matrix in dumps:
            out = open(template % sys.path[0], "w", encoding="utf-8")
            out.write(json.dumps(matrix, ensure_ascii=False))
            out.close()

    def run_by_thread(self, thread_num):
        time_start = time.time()
        file_list = os.listdir(self.dayline_file_prefix)
        file_count = len(file_list)
        offset = file_count / thread_num
        offset = math.ceil(offset)
        threads = []
        for i in range(thread_num):
            start = i * offset
            end = (i+1) * offset if (i+1) * offset < file_count else -1
            thread = threading.Thread(target=self.do_block, args=(start, end))
            threads.append(thread)
        for t in threads:
            t.setDaemon(True)
            t.start()
        t.join()
        time_end = time.time()
        time_c= time_end - time_start
        print('time cost: %s Seconds' % time_c)
        print(self.res_matrix)
        print(self.res_matrix_reduce)
        f = open("%s\\hly_count_res_plus.json" % sys.path[0], "w", encoding="utf-8")
        f.write(json.dumps(self.res_matrix, ensure_ascii=False))
        f.close()

        f = open("%s\\hly_count_res_reduce.json" % sys.path[0], "w", encoding="utf-8")
        f.write(json.dumps(self.res_matrix_reduce, ensure_ascii=False))
        f.close()

    def do_block(self, start, end):
        """Run ``do_all_job_one`` on the slice ``[start:end]`` of the listing.

        The day-line folder is listed again on every call and the first six
        characters of each selected entry name are used as the stock code.

        Args:
            start: Slice start into the directory listing.
            end: Slice end into the directory listing.
        """
        selected = os.listdir(self.dayline_file_prefix)[start:end]
        for entry in tqdm(selected):
            self.do_all_job_one(entry[0:6])

if __name__ == "__main__":
    # Both the score-sum windows and the moving-average windows cover every
    # length from 5 to 20 days inclusive.
    serial_day_arr = list(range(5, 21))
    expectation = list(range(5, 21))
    # Alternative, smaller configuration:
    # expectation = [5,10,15,20,30]
    # serial_day_arr = [6,7,8,9,10]
    hly = HlyScore(serial_day_arr=serial_day_arr, expectation=expectation)
    # Pipeline steps; uncomment the one(s) to run.
    # hly.calculate_score_all()
    # # hly.calculate_sum_one('000002')
    # hly.calculate_sum_all()
    # hly.analyze_all()
    # hly.transform_js()
    # hly.count()
    # hly.yes_run_me()
    # hly.run_by_thread(8)