|
|
import os
|
|
|
import sys
|
|
|
sys.path.append(os.getcwd())
|
|
|
import time
|
|
|
import logging
|
|
|
import random
|
|
|
import re
|
|
|
import torch
|
|
|
import numpy as np
|
|
|
import pandas as pd
|
|
|
import datetime
|
|
|
from dateutil.relativedelta import relativedelta
|
|
|
|
|
|
def create_logger(log_path):
    """
    Create a logger that writes to both a log file and the console.

    Safe to call repeatedly: ``logging.getLogger(__name__)`` returns the
    same logger object every time, and handlers are only attached on the
    first call.  (The original guard ``handler not in logger.handlers``
    compared freshly-created handler objects by identity, so it was always
    True and every call stacked two more handlers, duplicating output.)

    :param log_path: path of the log file to append to
    :return: the configured module-level logger
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    # Attach handlers only once per process to avoid duplicated log lines.
    if not logger.handlers:
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

        # Handler that writes to the log file.
        file_handler = logging.FileHandler(filename=log_path)
        file_handler.setFormatter(formatter)
        file_handler.setLevel(logging.INFO)
        logger.addHandler(file_handler)

        # Handler that writes to the console.
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        console.setFormatter(formatter)
        logger.addHandler(console)

    return logger
|
|
|
|
|
|
def get_file_name(fname):
    """
    Return the base file name of *fname*, without its directory part and
    without anything after the first dot (so "dir/a.tar.gz" -> "a").
    """
    base = os.path.basename(fname)
    return base.split(".")[0]
|
|
|
|
|
|
def get_file_size(fname):
    """
    Return the size of the file at *fname* in megabytes,
    rounded to two decimal places.
    """
    size_mb = os.path.getsize(fname) / float(1024 * 1024)
    return round(size_mb, 2)
|
|
|
|
|
|
def set_seed(seed):
    """
    Seed the Python, NumPy and PyTorch random number generators
    (including all CUDA devices when available) for reproducibility.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
|
|
|
|
|
|
|
|
|
def reduce_mem(df):
    """
    Downcast numeric columns of *df* in place to the smallest dtype that
    can hold their value range, to reduce memory usage.

    Integer columns try int8 -> int16 -> int32 -> int64; float columns try
    float16 -> float32 and fall back to float64.  Columns whose min or max
    is null are left untouched.  Prints a summary and returns the frame.
    """
    t0 = time.time()
    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    mem_before = df.memory_usage().sum() / 1024**2

    for name in df.columns:
        dtype = df[name].dtypes
        if dtype not in numeric_dtypes:
            continue
        lo = df[name].min()
        hi = df[name].max()
        # All-NaN (or otherwise null-bounded) columns cannot be downcast safely.
        if pd.isnull(lo) or pd.isnull(hi):
            continue
        if str(dtype)[:3] == 'int':
            # Pick the narrowest integer type whose range strictly contains [lo, hi].
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                bounds = np.iinfo(candidate)
                if lo > bounds.min and hi < bounds.max:
                    df[name] = df[name].astype(candidate)
                    break
        else:
            # Pick the narrowest float type; fall back to float64.
            for candidate in (np.float16, np.float32):
                bounds = np.finfo(candidate)
                if lo > bounds.min and hi < bounds.max:
                    df[name] = df[name].astype(candidate)
                    break
            else:
                df[name] = df[name].astype(np.float64)

    mem_after = df.memory_usage().sum() / 1024**2
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(mem_after,
                                                                                                   100*(mem_before-mem_after)/mem_before,
                                                                                                   (time.time()-t0)/60))
    return df
|
|
|
|
|
|
def is_number(s):
    """
    Return True if *s* can be interpreted as a number.

    Tries ``float(s)`` first, then ``unicodedata.numeric`` for single
    numeric characters such as '½' or '三'.  Returns False for anything
    else, including non-string inputs like None (the original only caught
    ValueError, so ``float(None)`` raised an uncaught TypeError).

    :param s: value to test
    :return: bool
    """
    try:
        float(s)
        return True
    except (TypeError, ValueError):
        pass

    try:
        import unicodedata
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass

    return False
|
|
|
|
|
|
|
|
|
def merge_dict(dict1, dict2):
    """
    Merge two dicts into a new dict; on key collisions the value
    from *dict2* wins.  Neither input is modified.
    """
    merged = dict(dict1)
    merged.update(dict2)
    return merged
|
|
|
|
|
|
def random_dict_order(dict_data):
    """
    Return a new dict with the same key/value pairs as *dict_data*
    but in randomly shuffled insertion order.

    The original used ``random.sample(dict_data.keys(), ...)``, which
    raises TypeError on Python 3.11+ (sampling from a non-sequence was
    removed); shuffling an explicit list is equivalent and portable.

    :param dict_data: dict to reorder
    :return: new dict with shuffled key order
    """
    keys = list(dict_data)
    random.shuffle(keys)
    return {key: dict_data[key] for key in keys}
|
|
|
|
|
|
|
|
|
def get_before_date(n):
    """
    Return the date *n* days before today (local time),
    formatted as 'YYYY-MM-DD'.
    """
    target = datetime.datetime.now() - datetime.timedelta(days=n)
    return target.strftime('%Y-%m-%d')
|
|
|
|
|
|
def get_user(myobject_path):
    """
    Return the unique ids of users active in the last few months.

    Window size is chosen from the month of the latest activity record,
    to account for school-holiday activity patterns:
      - months 5, 6, 11, 12 -> last 3 months of active users
      - months 1, 7         -> last 4 months
      - months 2, 3, 4, 8, 9, 10 -> last 5 months

    Pass the path of myshixun.csv for training-activity users, or
    mysubject.csv for practical-course users.  The file is tab-separated
    and must contain "created_at" and "user_id" columns.

    :param myobject_path: path to the tab-separated activity CSV
    :return: numpy array of unique user_id values in the window
    """
    activity = pd.read_csv(myobject_path, sep='\t', encoding='utf-8')
    activity["created_at"] = pd.to_datetime(activity["created_at"])

    # Hoist the latest timestamp: the original recomputed
    # max(activate["created_at"]) with the Python builtin up to four
    # times (O(n) each); Series.max() is vectorized and computed once.
    latest = activity["created_at"].max()

    if latest.month in (2, 3, 4, 8, 9, 10):
        months = 5
    elif latest.month in (1, 7):
        months = 4
    else:
        months = 3

    # pd.DateOffset(months=k) has relativedelta month semantics, so this
    # matches the original `relativedelta(months=+k)` subtraction.
    recent = activity[activity["created_at"] >= latest - pd.DateOffset(months=months)]

    return recent["user_id"].unique()
|
|
|
|
|
|
# Remove digits, whitespace and common ASCII/CJK punctuation from text.
def extract_word(wordlist):
    """
    Stringify *wordlist*, strip characters matched by the cleanup
    pattern, and return the cleaned text wrapped in a one-element list.
    """
    text = str(wordlist)
    # NOTE(review): the trailing `[SEP]—` sits outside the character
    # classes, so the second alternative matches a punctuation char
    # followed by literally S/E/P and an em dash — possibly intended to
    # strip a "[SEP]" token; kept byte-identical to preserve behavior.
    pattern = r"[0-9\s\.\!\/_,$%^*(\"\']|[——!,。?、:?;;《》“”~@#¥%……&*()][SEP]—"
    cleaned = re.sub(pattern, '', text)
    return [cleaned]
|
|
|
|
|
|
# Characters/tokens stripped by cut_stop (stop characters).
# Fix: the original had `'◁''·'` (implicit string concatenation), which
# produced a single two-character element '◁·'; since cut_stop tests
# single characters, '◁' was never actually filtered.
stop_list = ['「', '?', '」', '‖', '【', '】', '~', ')', '-', '|', ' ', '|', '•', '〕', '〔', '˘', '︶', ':', '『', '』', ']',
             '[', ']', '[', '°', '〖', '〗', '☺️', '?',
             '·', '×', '>', "`・∧・´", 'з', '∠',
             '丨', '×', '▷', '▶', '◀', '◁', '·', "[SEP]"]
|
|
|
|
|
|
def cut_stop(s):
    """
    Concatenate the elements of *s* (characters of a string, or items of
    a list of strings), skipping any element found in stop_list.
    """
    return ''.join(item for item in s if item not in stop_list)
|
|
|
|
|
|
def finalcut(word_list):
    """
    Clean *word_list* with extract_word, then remove stop-list
    entries via cut_stop, and return the resulting string.
    """
    return cut_stop(extract_word(word_list))
|