You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
59 lines
2.0 KiB
59 lines
2.0 KiB
1 year ago
|
import os
|
||
|
import sys
|
||
|
import json
|
||
|
import pandas as pd
|
||
|
import time
|
||
|
from tqdm import tqdm
|
||
|
|
||
|
class CountInvalidData(object):
    """Count invalid rows in the per-stock day-line CSV files.

    A row is invalid when any of its cells is missing ("None", "NaN" or a
    parsed NaN) or is zero in a column whose name does not start with '涨跌'.
    Rows are scanned from the top of each file and scanning stops at the
    first row dated before ``end_date``.

    Attributes set by ``__init__``:
        consts: dict parsed from ``const.json``.
        end_date: cutoff date, a compact ``"YYYYMMDD"`` string by default.
        day_line_file_prefix: string prepended to ``<code>.csv`` to build
            each file path; taken from
            ``consts["day_line_file_prefix"]["netease"]``.
        COUNT_INVALID_DATA: running total of invalid rows.
        COUNT_INVALID_CODE: running total of files with >= 1 invalid row.
    """

    def __init__(self, end_date="20140101"):
        """Load ``const.json`` and reset the counters.

        Args:
            end_date: rows dated before this value are not inspected.
        """
        # The directory holding const.json is derived from the script
        # directory (sys.path[0]) by stripping the "\clean_data" component.
        # NOTE: these literals use Windows path separators.
        const_path = sys.path[0].replace("\\clean_data", "")
        # Context manager so the handle is closed even if parsing fails.
        with open(const_path + "\\const.json", "r", encoding="utf8") as f:
            self.consts = json.load(f)
        self.end_date = end_date
        self.day_line_file_prefix = self.consts["day_line_file_prefix"]["netease"]
        self.COUNT_INVALID_DATA = 0
        self.COUNT_INVALID_CODE = 0

    @staticmethod
    def _read_day_line(path):
        """Read one GBK-encoded day-line CSV, skipping malformed lines."""
        try:
            # pandas >= 1.3 spelling; "warn" matches the old
            # error_bad_lines=False default of warning and skipping.
            return pd.read_csv(path, encoding="gbk", on_bad_lines="warn")
        except TypeError:
            # Older pandas does not know on_bad_lines.
            return pd.read_csv(path, encoding="gbk", error_bad_lines=False)

    @staticmethod
    def _is_invalid_value(column, value):
        """Return True when a single cell marks its row as invalid."""
        if isinstance(value, str):
            return value in ("None", "NaN")
        # read_csv turns "NaN" (and, in recent pandas, "None") into a float
        # NaN, which never compares equal to those strings.
        if pd.isna(value):
            return True
        # Zero is legitimate only in the '涨跌...' columns.
        return bool(value == 0) and not str(column).startswith('涨跌')

    def handle_one(self, code):
        """Scan the CSV of one stock code and update the counters.

        Args:
            code: stock code; the file read is
                ``day_line_file_prefix + code + ".csv"``.

        A file that cannot be read is reported on stdout and skipped.
        """
        path = "%s%s.csv" % (self.day_line_file_prefix, code)
        try:
            df = self._read_day_line(path)
        except Exception:
            # Best effort: one unreadable file must not abort the whole run,
            # but Ctrl-C / SystemExit are no longer swallowed.
            print("ERROR While Opening Code %s" % code)
            return
        if df.empty:
            return

        # Compare real dates rather than raw cell values: "2014-06-03" sorts
        # before "20140101" as a string, and an integer column cannot be
        # compared with a string at all.
        cutoff = pd.to_datetime(str(self.end_date))
        dates = pd.to_datetime(df['日期'].astype(str), errors="coerce")

        has_invalid = False
        for date, (_, row) in zip(dates, df.iterrows()):
            # Unparseable dates become NaT, which never compares as earlier,
            # so such rows are still inspected.
            if date < cutoff:
                break
            if any(self._is_invalid_value(col, val) for col, val in row.items()):
                has_invalid = True
                self.COUNT_INVALID_DATA += 1

        if has_invalid:
            self.COUNT_INVALID_CODE += 1

    def handle_all(self):
        """Scan every file under ``day_line_file_prefix`` and print totals."""
        time_start = time.time()
        for file_name in tqdm(os.listdir(self.day_line_file_prefix)):
            # The first six characters of the file name are the stock code.
            self.handle_one(file_name[0:6])
        print("无效的数据数量:", self.COUNT_INVALID_DATA)
        print("有无效数据的股票数量:", self.COUNT_INVALID_CODE)
        elapsed = time.time() - time_start  # wall-clock run time in seconds
        print('time cost:', elapsed, 's')
|
||
|
|
||
|
|
||
|
# Script entry point: scan every day-line file with the default cutoff date.
if __name__ == "__main__":
    CountInvalidData().handle_all()
|