import os
import sys

sys.path.append(os.getcwd())

import time
import logging
import random
import re
import unicodedata

import torch
import numpy as np
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta


def create_logger(log_path):
    """Return this module's logger, writing to ``log_path`` and the console.

    The function is safe to call repeatedly: a file handler for the same
    path and a console handler are attached at most once, so log lines are
    not duplicated and file descriptors are not leaked.

    Args:
        log_path: Path of the log file to append to.

    Returns:
        The configured ``logging.Logger`` (level ``INFO``).
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    # FileHandler stores its target as an absolute path in ``baseFilename``.
    target = os.path.abspath(os.fspath(log_path))
    has_file_handler = any(
        isinstance(handler, logging.FileHandler)
        and handler.baseFilename == target
        for handler in logger.handlers
    )
    if not has_file_handler:
        file_handler = logging.FileHandler(filename=log_path)
        file_handler.setFormatter(formatter)
        file_handler.setLevel(logging.INFO)
        logger.addHandler(file_handler)

    # FileHandler subclasses StreamHandler, so match the exact type to find
    # a previously attached console handler.
    has_console_handler = any(
        type(handler) is logging.StreamHandler for handler in logger.handlers
    )
    if not has_console_handler:
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        console.setFormatter(formatter)
        logger.addHandler(console)

    return logger


def get_file_name(fname):
    """Return the base name of ``fname`` up to (not including) its first dot.

    For example ``/tmp/data.tar.gz`` yields ``data``.
    """
    return os.path.basename(fname).split(".")[0]


def get_file_size(fname):
    """Return the size of ``fname`` in MiB, rounded to two decimals."""
    size_in_bytes = os.path.getsize(fname)
    return round(size_in_bytes / float(1024 * 1024), 2)


def set_seed(seed):
    """Seed the ``random``, NumPy and PyTorch random number generators.

    CUDA generators are seeded as well when CUDA is available.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def reduce_mem(df):
    """Downcast numeric columns of ``df`` to the narrowest fitting dtype.

    Columns are converted in place on ``df`` and the same object is
    returned. Columns whose min or max is null are left untouched. A
    summary line is printed to stdout.

    NOTE(review): float columns whose values fit the float16/float32 range
    are downcast even though that can lose precision; this keeps the
    original behaviour.

    Args:
        df: The ``pandas.DataFrame`` to shrink.

    Returns:
        ``df`` with downcast numeric columns.
    """
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isnull(c_min) or pd.isnull(c_max):
            continue

        if str(col_type)[:3] == 'int':
            # First integer width whose open interval contains the data.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                bounds = np.iinfo(candidate)
                if c_min > bounds.min and c_max < bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                bounds = np.finfo(candidate)
                if c_min > bounds.min and c_max < bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    # Avoid ZeroDivisionError if the frame reports zero bytes.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,
                                                                                                           reduction,
                                                                                                           (time.time()-starttime)/60))
    return df


def is_number(s):
    """Return True if ``s`` can be interpreted as a number.

    Accepts anything ``float()`` accepts (including ``"nan"``/``"inf"``)
    and single Unicode numeric characters such as ``"五"`` or ``"½"``.
    Inputs of unsupported types (e.g. ``None``) yield ``False`` instead of
    raising.
    """
    try:
        float(s)
        return True
    except (TypeError, ValueError):
        pass

    try:
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass

    return False


def merge_dict(dict1, dict2):
    """Return a new dict with ``dict1``'s items overridden by ``dict2``'s."""
    return {**dict1, **dict2}


def random_dict_order(dict_data):
    """Return a copy of ``dict_data`` with its keys in random order.

    Uses the global ``random`` state, so results are reproducible after
    ``random.seed``.
    """
    # random.sample requires a sequence (dict views raise TypeError on
    # Python 3.11+), so materialise the keys first.
    shuffled_keys = random.sample(list(dict_data), len(dict_data))
    return {key: dict_data[key] for key in shuffled_keys}


def get_before_date(n):
    """Return the local date ``n`` days before today as ``YYYY-MM-DD``."""
    target = datetime.datetime.now() - datetime.timedelta(days=n)
    return target.strftime('%Y-%m-%d')


def get_user(myobject_path):
    """Return the unique ``user_id`` values that were recently active.

    The look-back window is measured back from the latest ``created_at``
    in the file, and its length depends on that timestamp's month to
    compensate for lower activity around school holidays:

    * months 5, 6, 11, 12   -> last 3 months
    * months 1, 7           -> last 4 months
    * months 2, 3, 4, 8, 9, 10 -> last 5 months

    Args:
        myobject_path: Path to a tab-separated UTF-8 CSV with at least the
            columns ``user_id`` and ``created_at`` (the original note names
            ``myshixun.csv`` for trainings and ``mysubjecy.csv`` for
            practice courses).

    Returns:
        Array of unique user ids inside the window; empty when the file
        has no valid ``created_at`` value.
    """
    activate = pd.read_csv(myobject_path, sep='\t', encoding='utf-8')
    activate["created_at"] = pd.to_datetime(activate["created_at"])

    latest = activate["created_at"].max()
    if pd.isnull(latest):
        # No rows (or no parseable timestamps): nobody is active.
        return activate["user_id"].iloc[:0].unique()

    if latest.month in (2, 3, 4, 8, 9, 10):
        months_back = 5
    elif latest.month in (1, 7):
        months_back = 4
    else:
        months_back = 3

    cutoff = latest - relativedelta(months=+months_back)
    activities = activate[activate["created_at"] >= cutoff]
    return activities["user_id"].unique()


# Characters/markers stripped by extract_word: ASCII digits, whitespace and
# punctuation; common Chinese punctuation; and the literal "[SEP]" marker.
_NOISE_PATTERN = re.compile(
    r"[0-9\s\.\!\/_,$%^*(\"\']|[——!,。?、:?;;《》“”~@#¥%……&*()]|\[SEP\]"
)


# Remove special characters, stop words and similar noise.
def extract_word(wordlist):
    """Strip digits, whitespace, punctuation and ``[SEP]`` from the input.

    Args:
        wordlist: Any object; it is converted with ``str()`` before
            cleaning.

    Returns:
        A one-element list containing the cleaned string.
    """
    cleaned = _NOISE_PATTERN.sub('', str(wordlist))
    return [cleaned]


# Tokens dropped by cut_stop. Duplicates are harmless and kept as-is.
stop_list = ['「', '?', '」', '‖', '【', '】', '~', ')', '-', '|', ' ', '|', '•', '〕', '〔', '˘', '︶', ':', '『', '』', ']',
             '[', ']', '[', '°', '〖', '〗', '☺️', '?', '·', '×', '>', "`・∧・´", 'з', '∠', '丨', '×', '▷', '▶', '◀',
             '◁', '·', "[SEP]"]


def cut_stop(s):
    """Concatenate the items of ``s`` that are not in ``stop_list``.

    Args:
        s: Iterable of string tokens (iterating a plain string filters it
            character by character).

    Returns:
        The surviving tokens joined into one string.
    """
    return "".join(token for token in s if token not in stop_list)


def finalcut(word_list):
    """Clean ``word_list`` with ``extract_word`` and then ``cut_stop``.

    NOTE(review): ``extract_word`` returns a one-element list, so
    ``cut_stop`` compares the whole cleaned string against ``stop_list``
    rather than filtering individual characters. Confirm this is intended.

    Returns:
        The cleaned string, or ``""`` if it equals a stop token.
    """
    cleaned_tokens = extract_word(word_list)
    return cut_stop(cleaned_tokens)