# EduCoder_Study_RS/start_all_train.py

import argparse
import os
from config import samples_mode
from config import samples_mode_flag
from config import logger


def del_output_file(parent_path):
    """
    Delete the data and output files left behind by a previous run.

    Every entry directly inside ``parent_path`` that is not a directory and
    whose name does not contain ``'.py'`` is removed. Sub-directories are
    skipped and never descended into.

    Args:
        parent_path: Directory to clean. A trailing path separator is
            optional because paths are built with ``os.path.join``.

    Raises:
        OSError: If ``parent_path`` itself cannot be listed. A failure to
            delete an individual file is logged and does not abort the loop.
    """
    for file_name in os.listdir(parent_path):
        # Substring test kept on purpose: besides '*.py' sources it also
        # preserves names such as '*.pyc', exactly as before.
        if '.py' in file_name:
            continue

        file_path = os.path.join(parent_path, file_name)
        if os.path.isdir(file_path):
            continue

        logger.info('删除文件: ' + file_path)
        try:
            # os.remove instead of shelling out to `rm`: names containing
            # spaces or shell metacharacters are handled correctly.
            os.remove(file_path)
        except OSError as err:
            # Best effort, like the former `rm` call: report and carry on.
            logger.warning('删除文件失败: ' + file_path + ' (' + str(err) + ')')


# Every pipeline stage is executed once per domain, in this order.
_DOMAINS = ('shixun', 'subject')

# Recall (matching) stages, in execution order.
_RECALL_STAGES = (
    'build_keywords',
    'faiss_word2vec',
    'hnsw_faiss',
    'item_embedding',
    'item_merge_emb',
    'Item2Vec',
    'cold_start_recall',
    'item_embedding_recall',
    'itemcf_recall',
    'youtubednn_recall_train',
    'youtube_usercf_recall',
    'dssm_recall_train',
    'dssm_usercf_recall',
    'fm_recall_train',
    'mind_recall_train',
    'pinsage_recall_train',
)

# Ranking stages, in execution order.
_RANKING_STAGES = (
    # Feature engineering for ranking.
    'bert_embedding',
    'rank_features_engineering',
    # Ranking model training.
    'xdeepfm_ranker_train',
    'difm_ranker_train',
    'bst_ranker_train',
)


def _confirmed(question, silent):
    """Return True if the user answers y/Y to *question*; always True when *silent*."""
    if silent:
        return True
    return input(question + '\n').strip() in ('y', 'Y')


def _stage_scripts(package, stages):
    """Return the script paths of *stages* under ./<package>/<domain>/, domain-interleaved."""
    return [
        './' + package + '/' + domain + '/' + stage + '.py'
        for stage in stages
        for domain in _DOMAINS
    ]


def _run_scripts(scripts):
    """Run each script with ``python`` in order, logging any non-zero exit status."""
    for script in scripts:
        command = 'python ' + script
        status = os.system(command)
        if status != 0:
            # The pipeline has always continued past failing steps; keep that
            # best-effort behaviour but make the failure visible in the log.
            logger.error('脚本执行失败: ' + command + ', 返回值: ' + str(status))


def _clean_previous_outputs():
    """Create the per-mode output directories if missing and delete their old files."""
    parent_path_list = ['./data/' + samples_mode_flag + '/']
    for root in ('./data/cold_start_data/', './results/', './features/', './models/'):
        for domain in _DOMAINS:
            parent_path_list.append(root + domain + '/' + samples_mode_flag + '/')

    for parent_path in parent_path_list:
        # makedirs also creates missing intermediate directories, where a
        # plain os.mkdir would raise FileNotFoundError.
        os.makedirs(parent_path, exist_ok=True)
        del_output_file(parent_path)


def main():
    """Ask for confirmation (unless --silent) and run the whole training pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--silent', action='store_true',
        help='nohup python start_all_train.py --silent >./logs/all_model_train.log &')
    args = parser.parse_args()

    silent_mode = args.silent

    # Name of the data mode shown in the confirmation prompt.
    mode_name = '全量数据' if samples_mode_flag == 'full' else '增量数据'

    if not _confirmed('确认要开始重新训练' + mode_name + '的模型吗？y/n', silent_mode):
        return

    if _confirmed('是否删除之前的数据和所有输出？y/n', silent_mode):
        _clean_previous_outputs()

    # Train the recall models.
    _run_scripts(['./data_process.py'])
    _run_scripts(_stage_scripts('matching', _RECALL_STAGES))

    # Computing every offline recall channel on the full data set takes too
    # long, so the multi-channel recall results are merged only for
    # incremental data.
    # NOTE(review): samples_mode is presumably truthy for incremental runs —
    # confirm against config.
    if samples_mode:
        _run_scripts(_stage_scripts('matching', ('multi_recall_combine',)))

    # Ranking feature engineering, then ranking model training.
    _run_scripts(_stage_scripts('ranking', _RANKING_STAGES))


if __name__ == '__main__':
    main()