diff --git a/utils/misc.py b/utils/misc.py
new file mode 100644
index 0000000..90e3c19
--- /dev/null
+++ b/utils/misc.py
@@ -0,0 +1,255 @@
+import os
+from math import ceil
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.autograd import Variable
+
+
+def check_mkdir(dir_name):
+    """Create directory ``dir_name`` unless the path already exists.
+
+    Missing parent directories are created as well, and a directory that
+    shows up concurrently (e.g. created by another process between the
+    check and the creation) is not treated as an error.
+
+    Args:
+        dir_name: path of the directory to create.
+
+    Raises:
+        OSError: if creation failed and the path still does not exist
+            afterwards (e.g. permission denied).
+    """
+    try:
+        os.makedirs(dir_name)
+    except OSError:
+        # An already existing path is fine (as before); anything else is a
+        # genuine failure and must propagate.
+        if not os.path.exists(dir_name):
+            raise
+
+
+def initialize_weights(*models):
+    """Re-initialise, in place, the parameters of every given model.
+
+    Conv2d and Linear layers receive Kaiming-normal weights and zeroed
+    biases (when a bias exists); BatchNorm2d layers receive unit weights and
+    zeroed biases. Any other module type is left untouched.
+
+    Args:
+        *models: modules whose sub-modules (via ``.modules()``) are visited.
+    """
+    all_layers = (layer for net in models for layer in net.modules())
+    for layer in all_layers:
+        if isinstance(layer, (nn.Conv2d, nn.Linear)):
+            nn.init.kaiming_normal(layer.weight)
+            if layer.bias is not None:
+                layer.bias.data.zero_()
+            continue
+        if isinstance(layer, nn.BatchNorm2d):
+            layer.weight.data.fill_(1)
+            layer.bias.data.zero_()
+
+
+def get_upsampling_weight(in_channels, out_channels, kernel_size):
+    """Build a bilinear-interpolation kernel as a float tensor.
+
+    The result has shape (in_channels, out_channels, kernel_size,
+    kernel_size). Entry [i, i] holds the 2D bilinear filter; every other
+    channel pair stays zero.
+
+    Args:
+        in_channels: size of the first weight dimension.
+        out_channels: size of the second weight dimension.
+        kernel_size: spatial size of the (square) kernel.
+
+    Returns:
+        A ``torch.FloatTensor`` holding the kernel.
+    """
+    factor = (kernel_size + 1) // 2
+    # Odd kernels are centred on a pixel, even kernels between two pixels.
+    center = factor - 1 if kernel_size % 2 == 1 else factor - 0.5
+    rows, cols = np.ogrid[:kernel_size, :kernel_size]
+    row_profile = 1 - abs(rows - center) / factor
+    col_profile = 1 - abs(cols - center) / factor
+    bilinear = row_profile * col_profile
+    kernel = np.zeros((in_channels, out_channels, kernel_size, kernel_size), dtype=np.float64)
+    in_idx = list(range(in_channels))
+    out_idx = list(range(out_channels))
+    # Paired fancy indexing: only the (i, i) channel pairs get the filter.
+    kernel[in_idx, out_idx, :, :] = bilinear
+    return torch.from_numpy(kernel).float()
+
+
+class CrossEntropyLoss2d(nn.Module):
+    """Cross-entropy loss built from log-softmax followed by ``nn.NLLLoss2d``.
+
+    Args:
+        weight: forwarded to ``nn.NLLLoss2d`` (per-class rescaling weights).
+        size_average: forwarded to ``nn.NLLLoss2d``.
+        ignore_index: forwarded to ``nn.NLLLoss2d``; defaults to 255.
+    """
+
+    def __init__(self, weight=None, size_average=True, ignore_index=255):
+        super(CrossEntropyLoss2d, self).__init__()
+        self.nll_loss = nn.NLLLoss2d(weight, size_average, ignore_index)
+
+    def forward(self, inputs, targets):
+        """Return the loss of raw scores ``inputs`` against ``targets``.
+
+        ``inputs`` is assumed to carry the classes on dimension 1, which is
+        the layout ``nn.NLLLoss2d`` consumes.
+        """
+        # Name the class axis explicitly instead of relying on the
+        # deprecated implicit softmax dimension.
+        return self.nll_loss(F.log_softmax(inputs, dim=1), targets)
+
+
+class FocalLoss2d(nn.Module):
+    """Focal loss: log-probabilities scaled by ``(1 - p) ** gamma``.
+
+    The scaled log-probabilities are reduced with ``nn.NLLLoss2d``.
+
+    Args:
+        gamma: focusing exponent applied to ``(1 - p)``.
+        weight: forwarded to ``nn.NLLLoss2d``.
+        size_average: forwarded to ``nn.NLLLoss2d``.
+        ignore_index: forwarded to ``nn.NLLLoss2d``; defaults to 255.
+    """
+
+    def __init__(self, gamma=2, weight=None, size_average=True, ignore_index=255):
+        super(FocalLoss2d, self).__init__()
+        self.gamma = gamma
+        self.nll_loss = nn.NLLLoss2d(weight, size_average, ignore_index)
+
+    def forward(self, inputs, targets):
+        """Return the focal loss of raw scores ``inputs`` against ``targets``.
+
+        ``inputs`` is assumed to carry the classes on dimension 1.
+        """
+        # Explicit class axis (the implicit softmax dim is deprecated); the
+        # probabilities are derived from the same log-softmax so both terms
+        # come from a single normalisation.
+        log_prob = F.log_softmax(inputs, dim=1)
+        prob = log_prob.exp()
+        return self.nll_loss((1 - prob) ** self.gamma * log_prob, targets)
+
+
+def _fast_hist(label_pred, label_true, num_classes):
+    """Return the (num_classes, num_classes) confusion matrix of two label arrays.
+
+    Rows index the true label and columns the predicted label. Positions
+    whose true label lies outside ``[0, num_classes)`` are skipped.
+    """
+    valid = (label_true >= 0) & (label_true < num_classes)
+    true_ids = label_true[valid].astype(int)
+    pred_ids = label_pred[valid]
+    # Encode each (true, pred) pair as one flat bin index.
+    flat_index = num_classes * true_ids + pred_ids
+    counts = np.bincount(flat_index, minlength=num_classes ** 2)
+    return counts.reshape(num_classes, num_classes)
+
+
+def evaluate(predictions, gts, num_classes):
+    """Compute segmentation metrics over paired prediction / ground-truth maps.
+
+    Args:
+        predictions: iterable of predicted label arrays.
+        gts: iterable of ground-truth label arrays, paired with
+            ``predictions`` by position.
+        num_classes: number of classes in the confusion matrix.
+
+    Returns:
+        Tuple ``(acc, acc_cls, mean_iu, fwavacc)``: overall accuracy, mean
+        per-class accuracy, mean intersection-over-union and
+        frequency-weighted intersection-over-union. Classes whose ratio is
+        NaN are skipped by the ``nanmean`` reductions.
+    """
+    confusion = np.zeros((num_classes, num_classes))
+    for pred_map, gt_map in zip(predictions, gts):
+        confusion += _fast_hist(pred_map.flatten(), gt_map.flatten(), num_classes)
+    # rows index the ground truth, columns index the prediction
+    true_pos = np.diag(confusion)
+    gt_totals = confusion.sum(axis=1)
+    pred_totals = confusion.sum(axis=0)
+    total = confusion.sum()
+
+    acc = true_pos.sum() / total
+    acc_cls = np.nanmean(true_pos / gt_totals)
+    iu = true_pos / (gt_totals + pred_totals - true_pos)
+    mean_iu = np.nanmean(iu)
+    freq = gt_totals / total
+    seen = freq > 0
+    fwavacc = (freq[seen] * iu[seen]).sum()
+    return acc, acc_cls, mean_iu, fwavacc
+
+
+class AverageMeter(object):
+    """Keep the latest value plus the running weighted sum, count and mean."""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        """Set every statistic back to zero."""
+        self.val = self.avg = self.sum = self.count = 0
+
+    def update(self, val, n=1):
+        """Record ``val`` as observed ``n`` times and refresh the mean."""
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        # Mean over everything seen since the last reset.
+        self.avg = self.sum / self.count
+
+
+class PolyLR(object):
+    """Polynomial learning-rate decay applied to an optimizer's param groups.
+
+    Each call to ``step`` sets every group's ``lr`` to
+    ``init_lr * (1 - curr_iter / max_iter) ** lr_decay``.
+
+    ``step`` does not advance ``curr_iter``; the caller is expected to update
+    that attribute between calls.
+
+    Args:
+        optimizer: object exposing ``param_groups``, a list of dicts with an
+            ``'lr'`` entry; the initial rates are captured at construction.
+        curr_iter: iteration index used by the next ``step`` call.
+        max_iter: iteration at which the rate reaches zero.
+        lr_decay: exponent of the polynomial.
+    """
+
+    def __init__(self, optimizer, curr_iter, max_iter, lr_decay):
+        self.max_iter = float(max_iter)
+        self.init_lr_groups = [group['lr'] for group in optimizer.param_groups]
+        self.param_groups = optimizer.param_groups
+        self.curr_iter = curr_iter
+        self.lr_decay = lr_decay
+
+    def step(self):
+        """Write the decayed learning rate into every param group."""
+        # Clamp at zero: past max_iter the base would turn negative, and a
+        # negative base raised to a fractional power is not a real number.
+        remaining = max(1 - self.curr_iter / self.max_iter, 0.0)
+        scale = remaining ** self.lr_decay
+        for init_lr, group in zip(self.init_lr_groups, self.param_groups):
+            group['lr'] = init_lr * scale
+
+
+# Experimental: just an attempt; not recommended for use.
+class Conv2dDeformable(nn.Module):
+    """Wrap an ``nn.Conv2d`` with a learned per-pixel sampling offset.
+
+    A 3x3 convolution predicts, for every input channel and pixel, a (w, h)
+    offset. The input is re-sampled with ``F.grid_sample`` at the base grid
+    plus that offset and the result is passed through ``regular_filter``.
+
+    Args:
+        regular_filter: the ``nn.Conv2d`` applied to the re-sampled input.
+        cuda: when true, the cached base sampling grid is moved to the GPU.
+    """
+
+    def __init__(self, regular_filter, cuda=True):
+        super(Conv2dDeformable, self).__init__()
+        assert isinstance(regular_filter, nn.Conv2d)
+        self.regular_filter = regular_filter
+        self.offset_filter = nn.Conv2d(regular_filter.in_channels, 2 * regular_filter.in_channels, kernel_size=3,
+                                       padding=1, bias=False)
+        # Near-zero initial offsets.
+        self.offset_filter.weight.data.normal_(0, 0.0005)
+        self.input_shape = None
+        self.grid_w = None
+        self.grid_h = None
+        # Kept as ``use_cuda``: assigning the flag to ``self.cuda`` would
+        # shadow ``nn.Module.cuda()`` and make ``module.cuda()`` uncallable.
+        self.use_cuda = cuda
+
+    def forward(self, x):
+        """Re-sample ``x`` at the learned offsets, then apply the wrapped conv."""
+        x_shape = x.size()  # (b, c, h, w)
+        offset = self.offset_filter(x)  # (b, 2*c, h, w)
+        offset_w, offset_h = torch.split(offset, self.regular_filter.in_channels, 1)  # (b, c, h, w)
+        offset_w = offset_w.contiguous().view(-1, int(x_shape[2]), int(x_shape[3]))  # (b*c, h, w)
+        offset_h = offset_h.contiguous().view(-1, int(x_shape[2]), int(x_shape[3]))  # (b*c, h, w)
+        # The base grid is rebuilt only when the input shape changes.
+        if not self.input_shape or self.input_shape != x_shape:
+            self.input_shape = x_shape
+            grid_w, grid_h = np.meshgrid(np.linspace(-1, 1, x_shape[3]), np.linspace(-1, 1, x_shape[2]))  # (h, w)
+            grid_w = torch.Tensor(grid_w)
+            grid_h = torch.Tensor(grid_h)
+            if self.use_cuda:
+                grid_w = grid_w.cuda()
+                grid_h = grid_h.cuda()
+            # NOTE(review): wrapping the grids in nn.Parameter registers them
+            # on the module at forward time — confirm this is intended.
+            self.grid_w = nn.Parameter(grid_w)
+            self.grid_h = nn.Parameter(grid_h)
+        offset_w = offset_w + self.grid_w  # (b*c, h, w)
+        offset_h = offset_h + self.grid_h  # (b*c, h, w)
+        x = x.contiguous().view(-1, int(x_shape[2]), int(x_shape[3])).unsqueeze(1)  # (b*c, 1, h, w)
+        # grid_sample reads the last grid dimension as (x, y), i.e. (width,
+        # height), so the width coordinate has to come first.
+        x = F.grid_sample(x, torch.stack((offset_w, offset_h), 3))  # (b*c, 1, h, w)
+        x = x.contiguous().view(-1, int(x_shape[1]), int(x_shape[2]), int(x_shape[3]))  # (b, c, h, w)
+        x = self.regular_filter(x)
+        return x
+
+
+def sliced_forward(single_forward):
+    """Decorate a forward method with multi-scale, sliding-window inference.
+
+    The wrapped method is called on crops of at most ``self.crop_size`` per
+    side. For every scale in ``self.scales`` the input is resized; if the
+    resized input is larger than the crop it is tiled with a stride of
+    ``ceil(crop_size * stride_rate)`` and overlapping tiles are averaged,
+    otherwise it is zero-padded up to the crop size. Per-scale results are
+    resized back to the input resolution and summed.
+
+    The decorated object must provide ``training``, ``use_aux`` (only read in
+    training mode), ``num_classes``, ``scales``, ``crop_size`` and
+    ``stride_rate``. ``single_forward`` is assumed to return maps with
+    ``num_classes`` channels and the same spatial size as its input, since
+    its results are written into buffers of exactly that shape.
+
+    Args:
+        single_forward: method ``(self, x) -> outputs``; when
+            ``self.training and self.use_aux`` it must return
+            ``(outputs, aux_outputs)`` instead.
+
+    Returns:
+        The wrapping method. In training-with-aux mode it returns
+        ``(outputs_sum / len(scales), aux_sum)``; otherwise the summed
+        outputs over all scales.
+    """
+    from functools import wraps
+
+    def _pad(x, crop_size):
+        # Zero-pad bottom/right so both spatial sides reach crop_size.
+        h, w = x.size()[2:]
+        pad_h = max(crop_size - h, 0)
+        pad_w = max(crop_size - w, 0)
+        x = F.pad(x, (0, pad_w, 0, pad_h))
+        return x, pad_h, pad_w
+
+    def _run(self, x_in, with_aux):
+        # Call the wrapped forward and always hand back a list of heads.
+        if with_aux:
+            main_out, aux_out = single_forward(self, x_in)
+            return [main_out, aux_out]
+        return [single_forward(self, x_in)]
+
+    def _forward_one_scale(self, scaled_x, with_aux):
+        # Predict one resized input; returns one map per head at its size.
+        batch_size = scaled_x.size(0)
+        scaled_h, scaled_w = scaled_x.size()[2:]
+        crop = self.crop_size
+
+        if max(scaled_h, scaled_w) <= crop:
+            padded, _, _ = _pad(scaled_x, crop)
+            # Crop by explicit size: a ``[: -pad]`` slice is empty when the
+            # pad happens to be 0.
+            return [head[:, :, :scaled_h, :scaled_w] for head in _run(self, padded, with_aux)]
+
+        num_heads = 2 if with_aux else 1
+        # Buffers follow the input's type/device instead of assuming CUDA.
+        sums = [Variable(scaled_x.data.new(batch_size, self.num_classes, scaled_h, scaled_w).zero_())
+                for _ in range(num_heads)]
+        count = scaled_x.data.new(scaled_h, scaled_w).zero_()
+        stride = int(ceil(crop * self.stride_rate))
+        # At least one step per axis, even when that side is shorter than the crop.
+        h_step_num = max(int(ceil((scaled_h - crop) / float(stride))) + 1, 1)
+        w_step_num = max(int(ceil((scaled_w - crop) / float(stride))) + 1, 1)
+        for yy in range(h_step_num):
+            for xx in range(w_step_num):
+                sy, sx = yy * stride, xx * stride
+                ey, ex = min(sy + crop, scaled_h), min(sx + crop, scaled_w)
+                tile, _, _ = _pad(scaled_x[:, :, sy: ey, sx: ex], crop)
+                heads = _run(self, tile, with_aux)
+                for acc, head in zip(sums, heads):
+                    # Accumulate (not overwrite) so that dividing by the
+                    # overlap count below yields a true average.
+                    acc[:, :, sy: ey, sx: ex] += head[:, :, : ey - sy, : ex - sx]
+                count[sy: ey, sx: ex] += 1
+        count = Variable(count)
+        return [acc / count for acc in sums]
+
+    @wraps(single_forward)
+    def wrapper(self, x):
+        batch_size, _, ori_h, ori_w = x.size()
+        with_aux = bool(self.training and self.use_aux)
+        num_heads = 2 if with_aux else 1
+        totals = [Variable(x.data.new(batch_size, self.num_classes, ori_h, ori_w).zero_())
+                  for _ in range(num_heads)]
+        for s in self.scales:
+            new_size = (int(ori_h * s), int(ori_w * s))
+            scaled_x = F.upsample(x, size=new_size, mode='bilinear')
+            heads = _forward_one_scale(self, scaled_x, with_aux)
+            for idx, head in enumerate(heads):
+                # Bring every scale back to the input resolution before summing.
+                if new_size != (ori_h, ori_w):
+                    head = F.upsample(head, size=(ori_h, ori_w), mode='bilinear')
+                totals[idx] = totals[idx] + head
+        if with_aux:
+            # NOTE(review): only the main output is averaged over the scales;
+            # the aux sum (and the eval-mode sum below) is returned
+            # un-averaged, as before — confirm this is intended.
+            return totals[0] / len(self.scales), totals[1]
+        return totals[0]
+
+    return wrapper