import os
import time
import math
import json
import joblib
import random
import argparse

import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

from tqdm import tqdm
from functools import partial
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

from opt import adam, warmup_cosine, warmup_linear, warmup_constant
from datasets import rocstories
from analysis import rocstories as rocstories_analysis
from text_utils import TextEncoder
from utils import encode_dataset, flatten, iter_data, find_trainable_variables, convert_gradient_to_tensor, shape_list, ResultLogger, assign_to_gpu, average_grads, make_path

def gelu(x):
    # tanh approximation of the Gaussian Error Linear Unit
    return 0.5*x*(1+tf.tanh(math.sqrt(2/math.pi)*(x+0.044715*tf.pow(x, 3))))

def swish(x):
    return x*tf.nn.sigmoid(x)

opt_fns = {
    'adam':adam,
}

act_fns = {
    'relu':tf.nn.relu,
    'swish':swish,
    'gelu':gelu
}

lr_schedules = {
    'warmup_cosine':warmup_cosine,
    'warmup_linear':warmup_linear,
    'warmup_constant':warmup_constant,
}

def _norm(x, g=None, b=None, e=1e-5, axis=[1]):
    u = tf.reduce_mean(x, axis=axis, keepdims=True)
    s = tf.reduce_mean(tf.square(x-u), axis=axis, keepdims=True)
    x = (x - u) * tf.rsqrt(s + e)
    if g is not None and b is not None:
        x = x*g + b
    return x

def norm(x, scope, axis=[-1]):
    # layer normalization with learned gain g and bias b
    with tf.variable_scope(scope):
        n_state = shape_list(x)[-1]
        g = tf.get_variable("g", [n_state], initializer=tf.constant_initializer(1))
        b = tf.get_variable("b", [n_state], initializer=tf.constant_initializer(0))
        return _norm(x, g, b, axis=axis)

def dropout(x, pdrop, train):
    if train and pdrop > 0:
        x = tf.nn.dropout(x, 1-pdrop)
    return x

def mask_attn_weights(w):
    # causal mask: position i may only attend to positions <= i
    n = shape_list(w)[-1]
    b = tf.matrix_band_part(tf.ones([n, n]), -1, 0)
    b = tf.reshape(b, [1, 1, n, n])
    w = w*b + -1e9*(1-b)
    return w

def _attn(q, k, v, train=False, scale=False):
    w = tf.matmul(q, k)
    if scale:
        n_state = shape_list(v)[-1]
        w = w*tf.rsqrt(tf.cast(n_state, tf.float32))
    w = mask_attn_weights(w)
    w = tf.nn.softmax(w)
    w = dropout(w, attn_pdrop, train)
    a = tf.matmul(w, v)
    return a

def split_states(x, n):
    x_shape = shape_list(x)
    m = x_shape[-1]
    new_x_shape = x_shape[:-1]+[n, m//n]
    return tf.reshape(x, new_x_shape)

def merge_states(x):
    x_shape = shape_list(x)
    new_x_shape = x_shape[:-2]+[np.prod(x_shape[-2:])]
    return tf.reshape(x, new_x_shape)

def split_heads(x, n, k=False):
    # keys are transposed to [..., head_dim, seq] so q @ k needs no transpose
    if k:
        return tf.transpose(split_states(x, n), [0, 2, 3, 1])
    else:
        return tf.transpose(split_states(x, n), [0, 2, 1, 3])

def merge_heads(x):
    return merge_states(tf.transpose(x, [0, 2, 1, 3]))

def conv1d(x, scope, nf, rf, w_init=tf.random_normal_initializer(stddev=0.02), b_init=tf.constant_initializer(0), pad='VALID', train=False):
    with tf.variable_scope(scope):
        nx = shape_list(x)[-1]
        w = tf.get_variable("w", [rf, nx, nf], initializer=w_init)
        b = tf.get_variable("b", [nf], initializer=b_init)
        if rf == 1: #faster 1x1 conv
            c = tf.reshape(tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf]))+b, shape_list(x)[:-1]+[nf])
        else: #was used to train LM
            c = tf.nn.conv1d(x, w, stride=1, padding=pad)+b
        return c

def attn(x, scope, n_state, n_head, train=False, scale=False):
    assert n_state%n_head==0
    with tf.variable_scope(scope):
        c = conv1d(x, 'c_attn', n_state*3, 1, train=train)
        q, k, v = tf.split(c, 3, 2)
        q = split_heads(q, n_head)
        k = split_heads(k, n_head, k=True)
        v = split_heads(v, n_head)
        a = _attn(q, k, v, train=train, scale=scale)
        a = merge_heads(a)
        a = conv1d(a, 'c_proj', n_state, 1, train=train)
        a = dropout(a, resid_pdrop, train)
        return a
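# Shape sketch for the attention stack above (illustrative, with the default
# n_embd=768, n_head=12): attn() projects x of shape [batch, n_ctx, 768] to
# 3*768 channels, then split_heads yields
#   q, v: [batch, 12, n_ctx, 64]    k: [batch, 12, 64, n_ctx]
# so _attn can compute softmax(mask(q @ k * d_head**-0.5)) @ v per head;
# merge_heads and the 'c_proj' 1x1 conv map the result back to
# [batch, n_ctx, 768].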
def mlp(x, scope, n_state, train=False):
    with tf.variable_scope(scope):
        nx = shape_list(x)[-1]
        act = act_fns[afn]
        h = act(conv1d(x, 'c_fc', n_state, 1, train=train))
        h2 = conv1d(h, 'c_proj', nx, 1, train=train)
        h2 = dropout(h2, resid_pdrop, train)
        return h2

def block(x, scope, train=False, scale=False):
    # transformer block: attention and MLP sub-layers, each followed by a
    # residual connection and layer norm (post-norm, as in the original model)
    with tf.variable_scope(scope):
        nx = shape_list(x)[-1]
        a = attn(x, 'attn', nx, n_head, train=train, scale=scale)
        n = norm(x+a, 'ln_1')
        m = mlp(n, 'mlp', nx*4, train=train)
        h = norm(n+m, 'ln_2')
        return h

def embed(X, we):
    # X[..., 0] holds token ids and X[..., 1] position ids; summing over
    # that axis adds token and position embeddings
    we = convert_gradient_to_tensor(we)
    e = tf.gather(we, X)
    h = tf.reduce_sum(e, 2)
    return h

def clf(x, ny, w_init=tf.random_normal_initializer(stddev=0.02), b_init=tf.constant_initializer(0), train=False):
    with tf.variable_scope('clf'):
        nx = shape_list(x)[-1]
        w = tf.get_variable("w", [nx, ny], initializer=w_init)
        b = tf.get_variable("b", [ny], initializer=b_init)
        return tf.matmul(x, w)+b

def model(X, M, Y, train=False, reuse=False):
    with tf.variable_scope('model', reuse=reuse):
        we = tf.get_variable("we", [n_vocab+n_special+n_ctx, n_embd], initializer=tf.random_normal_initializer(stddev=0.02))
        we = dropout(we, embd_pdrop, train)

        X = tf.reshape(X, [-1, n_ctx, 2])
        M = tf.reshape(M, [-1, n_ctx])

        h = embed(X, we)
        for layer in range(n_layer):
            h = block(h, 'h%d'%layer, train=train, scale=True)

        # auxiliary language-modeling loss: predict the next token at every
        # position, masked by M to the real (unpadded) part of each sequence
        lm_h = tf.reshape(h[:, :-1], [-1, n_embd])
        lm_logits = tf.matmul(lm_h, we, transpose_b=True)
        lm_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=lm_logits, labels=tf.reshape(X[:, 1:, 0], [-1]))
        lm_losses = tf.reshape(lm_losses, [shape_list(X)[0], shape_list(X)[1]-1])
        lm_losses = tf.reduce_sum(lm_losses*M[:, 1:], 1)/tf.reduce_sum(M[:, 1:], 1)

        # classification head: pool the hidden state at the _classify_ token
        clf_h = tf.reshape(h, [-1, n_embd])
        pool_idx = tf.cast(tf.argmax(tf.cast(tf.equal(X[:, :, 0], clf_token), tf.float32), 1), tf.int32)
        clf_h = tf.gather(clf_h, tf.range(shape_list(X)[0], dtype=tf.int32)*n_ctx+pool_idx)

        clf_h = tf.reshape(clf_h, [-1, 2, n_embd])
        if train and clf_pdrop > 0:
            shape = shape_list(clf_h)
            shape[1] = 1
            clf_h = tf.nn.dropout(clf_h, 1-clf_pdrop, shape)
        clf_h = tf.reshape(clf_h, [-1, n_embd])
        clf_logits = clf(clf_h, 1, train=train)
        clf_logits = tf.reshape(clf_logits, [-1, 2])

        clf_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=clf_logits, labels=Y)
        return clf_logits, clf_losses, lm_losses

def mgpu_train(*xs):
    gpu_ops = []
    gpu_grads = []
    xs = (tf.split(x, n_gpu, 0) for x in xs)
    for i, xs in enumerate(zip(*xs)):
        do_reuse = True if i > 0 else None
        with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse):
            clf_logits, clf_losses, lm_losses = model(*xs, train=True, reuse=do_reuse)
            if lm_coef > 0:
                train_loss = tf.reduce_mean(clf_losses) + lm_coef*tf.reduce_mean(lm_losses)
            else:
                train_loss = tf.reduce_mean(clf_losses)
            params = find_trainable_variables("model")
            grads = tf.gradients(train_loss, params)
            grads = list(zip(grads, params))
            gpu_grads.append(grads)
            gpu_ops.append([clf_logits, clf_losses, lm_losses])
    ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
    grads = average_grads(gpu_grads)
    grads = [g for g, p in grads]
    train = opt_fns[opt](params, grads, lr, partial(lr_schedules[lr_schedule], warmup=lr_warmup), n_updates_total, l2=l2, max_grad_norm=max_grad_norm, vector_l2=vector_l2, b1=b1, b2=b2, e=e)
    return [train]+ops
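# Reading of the code above (not separate documentation): mgpu_train is plain
# data parallelism. The feed is split across n_gpu towers that share the
# 'model' variables (reuse is enabled after tower 0); with the default
# lm_coef=0.5 each tower's loss is
#   mean(clf_losses) + 0.5 * mean(lm_losses)
# and average_grads averages the per-tower gradients before the single Adam
# update built by opt_fns['adam'].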
def mgpu_predict(*xs):
    gpu_ops = []
    xs = (tf.split(x, n_gpu, 0) for x in xs)
    for i, xs in enumerate(zip(*xs)):
        with tf.device(assign_to_gpu(i, "/gpu:0")), tf.variable_scope(tf.get_variable_scope(), reuse=True):
            clf_logits, clf_losses, lm_losses = model(*xs, train=False, reuse=True)
            gpu_ops.append([clf_logits, clf_losses, lm_losses])
    ops = [tf.concat(op, 0) for op in zip(*gpu_ops)]
    return ops

def transform_roc(X1, X2, X3):
    # pack each (context, ending) pair as [_start_] x1 [_delimiter_] x2/x3
    # [_classify_], with a parallel mask of ones over the real tokens;
    # channel 1 carries position ids
    n_batch = len(X1)
    xmb = np.zeros((n_batch, 2, n_ctx, 2), dtype=np.int32)
    mmb = np.zeros((n_batch, 2, n_ctx), dtype=np.float32)
    start = encoder['_start_']
    delimiter = encoder['_delimiter_']
    for i, (x1, x2, x3) in enumerate(zip(X1, X2, X3)):
        x12 = [start]+x1[:max_len]+[delimiter]+x2[:max_len]+[clf_token]
        x13 = [start]+x1[:max_len]+[delimiter]+x3[:max_len]+[clf_token]
        l12 = len(x12)
        l13 = len(x13)
        xmb[i, 0, :l12, 0] = x12
        xmb[i, 1, :l13, 0] = x13
        mmb[i, 0, :l12] = 1
        mmb[i, 1, :l13] = 1
    xmb[:, :, :, 1] = np.arange(n_vocab+n_special, n_vocab+n_special+n_ctx)
    return xmb, mmb

def iter_apply(Xs, Ms, Ys):
    # size-weighted evaluation: each batch result is scaled by its size so
    # the smaller trailing batch does not skew the summed loss
    fns = [lambda x:np.concatenate(x, 0), lambda x:float(np.sum(x))]
    results = []
    for xmb, mmb, ymb in iter_data(Xs, Ms, Ys, n_batch=n_batch_train, truncate=False, verbose=True):
        n = len(xmb)
        if n == n_batch_train:
            res = sess.run([eval_mgpu_logits, eval_mgpu_clf_loss], {X_train:xmb, M_train:mmb, Y_train:ymb})
        else:
            res = sess.run([eval_logits, eval_clf_loss], {X:xmb, M:mmb, Y:ymb})
        res = [r*n for r in res]
        results.append(res)
    results = zip(*results)
    return [fn(res) for res, fn in zip(results, fns)]

def iter_predict(Xs, Ms):
    logits = []
    for xmb, mmb in iter_data(Xs, Ms, n_batch=n_batch_train, truncate=False, verbose=True):
        n = len(xmb)
        if n == n_batch_train:
            logits.append(sess.run(eval_mgpu_logits, {X_train:xmb, M_train:mmb}))
        else:
            logits.append(sess.run(eval_logits, {X:xmb, M:mmb}))
    logits = np.concatenate(logits, 0)
    return logits

def save(path):
    ps = sess.run(params)
    joblib.dump(ps, make_path(path))
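# Worked example for iter_apply above (hypothetical numbers): with
# n_batch_train=32 and 81 evaluation examples, the batches have sizes 32, 32
# and 17; iter_apply returns 32*c1 + 32*c2 + 17*c3 for the per-batch mean
# losses c_i, and log() below divides by 81, giving the correctly
# size-weighted average loss.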
def log():
    global best_score
    tr_logits, tr_cost = iter_apply(trX[:n_valid], trM[:n_valid], trY[:n_valid])
    va_logits, va_cost = iter_apply(vaX, vaM, vaY)
    tr_cost = tr_cost/len(trY[:n_valid])
    va_cost = va_cost/n_valid
    tr_acc = accuracy_score(trY[:n_valid], np.argmax(tr_logits, 1))*100.
    va_acc = accuracy_score(vaY, np.argmax(va_logits, 1))*100.
    logger.log(n_epochs=n_epochs, n_updates=n_updates, tr_cost=tr_cost, va_cost=va_cost, tr_acc=tr_acc, va_acc=va_acc)
    print('%d %d %.3f %.3f %.2f %.2f'%(n_epochs, n_updates, tr_cost, va_cost, tr_acc, va_acc))
    if submit:
        score = va_acc
        if score > best_score:
            best_score = score
            save(os.path.join(save_dir, desc, 'best_params.jl'))

argmax = lambda x:np.argmax(x, 1)

pred_fns = {
    'rocstories':argmax,
}

filenames = {
    'rocstories':'ROCStories.tsv',
}

label_decoders = {
    'rocstories':None,
}

def predict():
    filename = filenames[dataset]
    pred_fn = pred_fns[dataset]
    label_decoder = label_decoders[dataset]
    predictions = pred_fn(iter_predict(teX, teM))
    if label_decoder is not None:
        predictions = [label_decoder[prediction] for prediction in predictions]
    path = os.path.join(submission_dir, filename)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w') as f:
        f.write('{}\t{}\n'.format('index', 'prediction'))
        for i, prediction in enumerate(predictions):
            f.write('{}\t{}\n'.format(i, prediction))

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--desc', type=str)
    parser.add_argument('--dataset', type=str)
    parser.add_argument('--log_dir', type=str, default='log/')
    parser.add_argument('--save_dir', type=str, default='save/')
    parser.add_argument('--data_dir', type=str, default='data/')
    parser.add_argument('--submission_dir', type=str, default='submission/')
    parser.add_argument('--submit', action='store_true')
    parser.add_argument('--analysis', action='store_true')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--n_iter', type=int, default=3)
    parser.add_argument('--n_batch', type=int, default=8)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--lr', type=float, default=6.25e-5)
    parser.add_argument('--lr_warmup', type=float, default=0.002)
    parser.add_argument('--n_ctx', type=int, default=512)
    parser.add_argument('--n_embd', type=int, default=768)
    parser.add_argument('--n_head', type=int, default=12)
    parser.add_argument('--n_layer', type=int, default=12)
    parser.add_argument('--embd_pdrop', type=float, default=0.1)
    parser.add_argument('--attn_pdrop', type=float, default=0.1)
    parser.add_argument('--resid_pdrop', type=float, default=0.1)
    parser.add_argument('--clf_pdrop', type=float, default=0.1)
    parser.add_argument('--l2', type=float, default=0.01)
    parser.add_argument('--vector_l2', action='store_true')
    parser.add_argument('--n_gpu', type=int, default=4)
    parser.add_argument('--opt', type=str, default='adam')
    parser.add_argument('--afn', type=str, default='gelu')
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--encoder_path', type=str, default='model/encoder_bpe_40000.json')
    parser.add_argument('--bpe_path', type=str, default='model/vocab_40000.bpe')
    parser.add_argument('--n_transfer', type=int, default=12)
    parser.add_argument('--lm_coef', type=float, default=0.5)
    parser.add_argument('--b1', type=float, default=0.9)
    parser.add_argument('--b2', type=float, default=0.999)
    parser.add_argument('--e', type=float, default=1e-8)

    args = parser.parse_args()
    print(args)
    # expose all hyperparameters as module-level globals; the model functions
    # above (attn, mlp, model, ...) read them directly
    globals().update(args.__dict__)
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
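    # Vocabulary layout assumed by the rest of the script (inferred from the
    # code, not separate documentation): rows [0, n_vocab) of the embedding
    # matrix are BPE tokens, the next n_special rows are _start_, _delimiter_
    # and _classify_, and the final n_ctx rows are learned position
    # embeddings, which is why transform_roc fills channel 1 of xmb with
    # np.arange(n_vocab+n_special, n_vocab+n_special+n_ctx).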
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3) = encode_dataset(rocstories(data_dir), encoder=text_encoder)
    n_y = 2
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx//2-2
    # shrink n_ctx to the longest packed example (+3 special tokens) if that
    # is shorter than the configured context size
    n_ctx = min(max(
        [len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(trX1, trX2, trX3)]
        +[len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]
        +[len(x1[:max_len])+max(len(x2[:max_len]), len(x3[:max_len])) for x1, x2, x3 in zip(teX1, teX2, teX3)]
    )+3, n_ctx)
    trX, trM = transform_roc(trX1, trX2, trX3)
    vaX, vaM = transform_roc(vaX1, vaX2, vaX3)
    if submit:
        teX, teM = transform_roc(teX1, teX2, teX3)

    n_train = len(trY)
    n_valid = len(vaY)
    n_batch_train = n_batch*n_gpu
    n_updates_total = (n_train//n_batch_train)*n_iter

    # tf.disable_v2_behavior() at the top of the file already disables eager
    # execution, so placeholders can be built directly
    X_train = tf.placeholder(tf.int32, [n_batch_train, 2, n_ctx, 2])
    M_train = tf.placeholder(tf.float32, [n_batch_train, 2, n_ctx])
    X = tf.placeholder(tf.int32, [None, 2, n_ctx, 2])
    M = tf.placeholder(tf.float32, [None, 2, n_ctx])

    Y_train = tf.placeholder(tf.int32, [n_batch_train])
    Y = tf.placeholder(tf.int32, [None])

    train, logits, clf_losses, lm_losses = mgpu_train(X_train, M_train, Y_train)
    clf_loss = tf.reduce_mean(clf_losses)

    params = find_trainable_variables('model')
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    sess.run(tf.global_variables_initializer())

    # load the pretrained LM weights, stored as 10 flat .npy shards plus a
    # JSON file of parameter shapes
    shapes = json.load(open('model/params_shapes.json'))
    offsets = np.cumsum([np.prod(shape) for shape in shapes])
    init_params = [np.load('model/params_{}.npy'.format(n)) for n in range(10)]
    init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
    init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
    init_params[0] = init_params[0][:n_ctx]
    # build the finetuning embedding matrix: [token embeddings, freshly
    # initialized special-token embeddings, position embeddings]
    init_params[0] = np.concatenate([init_params[1], (np.random.randn(n_special, n_embd)*0.02).astype(np.float32), init_params[0]], 0)
    del init_params[1]
    if n_transfer == -1:
        n_transfer = 0
    else:
        n_transfer = 1+n_transfer*12
    sess.run([p.assign(ip) for p, ip in zip(params[:n_transfer], init_params[:n_transfer])])

    eval_mgpu_logits, eval_mgpu_clf_losses, eval_mgpu_lm_losses = mgpu_predict(X_train, M_train, Y_train)
    eval_logits, eval_clf_losses, eval_lm_losses = model(X, M, Y, train=False, reuse=True)
    eval_clf_loss = tf.reduce_mean(eval_clf_losses)
    eval_mgpu_clf_loss = tf.reduce_mean(eval_mgpu_clf_losses)

    n_updates = 0
    n_epochs = 0
    if dataset != 'stsb':
        trYt = trY
    if submit:
        save(os.path.join(save_dir, desc, 'best_params.jl'))
    best_score = 0
    for i in range(n_iter):
        for xmb, mmb, ymb in iter_data(*shuffle(trX, trM, trYt, random_state=np.random), n_batch=n_batch_train, truncate=True, verbose=True):
            cost, _ = sess.run([clf_loss, train], {X_train:xmb, M_train:mmb, Y_train:ymb})
            n_updates += 1
            if n_updates in [1000, 2000, 4000, 8000, 16000, 32000] and n_epochs == 0:
                log()
        n_epochs += 1
        log()
    if submit:
        sess.run([p.assign(ip) for p, ip in zip(params, joblib.load(os.path.join(save_dir, desc, 'best_params.jl')))])
        predict()
        if analysis:
            rocstories_analysis(data_dir, os.path.join(submission_dir, 'ROCStories.tsv'), os.path.join(log_dir, 'rocstories.jsonl'))
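# Example invocation (paths are illustrative; assumes the pretrained weight
# shards params_{0..9}.npy and params_shapes.json are under model/ and the
# ROCStories data under data/):
#   python train.py --dataset rocstories --desc rocstories --submit --analysis --data_dir data/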