diff --git a/src/PaddleClas/ppcls/__init__.py b/src/PaddleClas/ppcls/__init__.py new file mode 100644 index 0000000..d6cdb6f --- /dev/null +++ b/src/PaddleClas/ppcls/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import optimizer + +from .arch import * +from .optimizer import * +from .data import * +from .utils import * diff --git a/src/PaddleClas/ppcls/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..d088393 Binary files /dev/null and b/src/PaddleClas/ppcls/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/__init__.py b/src/PaddleClas/ppcls/arch/__init__.py new file mode 100644 index 0000000..2d5e29d --- /dev/null +++ b/src/PaddleClas/ppcls/arch/__init__.py @@ -0,0 +1,134 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import copy +import importlib + +import paddle.nn as nn +from paddle.jit import to_static +from paddle.static import InputSpec + +from . 
import backbone, gears
+from .backbone import *
+from .gears import build_gear
+from .utils import *
+from ppcls.arch.backbone.base.theseus_layer import TheseusLayer
+from ppcls.utils import logger
+from ppcls.utils.save_load import load_dygraph_pretrain
+from ppcls.arch.slim import prune_model, quantize_model
+
+__all__ = ["build_model", "RecModel", "DistillationModel"]
+
+
+def build_model(config):
+    arch_config = copy.deepcopy(config["Arch"])
+    model_type = arch_config.pop("name")
+    mod = importlib.import_module(__name__)
+    arch = getattr(mod, model_type)(**arch_config)
+    if isinstance(arch, TheseusLayer):
+        prune_model(config, arch)
+        quantize_model(config, arch)
+    return arch
+
+
+def apply_to_static(config, model):
+    support_to_static = config['Global'].get('to_static', False)
+
+    if support_to_static:
+        specs = None
+        if 'image_shape' in config['Global']:
+            specs = [InputSpec([None] + config['Global']['image_shape'])]
+        model = to_static(model, input_spec=specs)
+        logger.info("Successfully applied @to_static with specs: {}".format(
+            specs))
+    return model
+
+
+class RecModel(TheseusLayer):
+    def __init__(self, **config):
+        super().__init__()
+        backbone_config = config["Backbone"]
+        backbone_name = backbone_config.pop("name")
+        self.backbone = eval(backbone_name)(**backbone_config)
+        if "BackboneStopLayer" in config:
+            backbone_stop_layer = config["BackboneStopLayer"]["name"]
+            self.backbone.stop_after(backbone_stop_layer)
+
+        if "Neck" in config:
+            self.neck = build_gear(config["Neck"])
+        else:
+            self.neck = None
+
+        if "Head" in config:
+            self.head = build_gear(config["Head"])
+        else:
+            self.head = None
+
+    def forward(self, x, label=None):
+        out = dict()
+        x = self.backbone(x)
+        out["backbone"] = x
+        if self.neck is not None:
+            x = self.neck(x)
+        out["neck"] = x
+        out["features"] = x
+        if self.head is not None:
+            y = self.head(x, label)
+            out["logits"] = y
+        return out
+
+
+class DistillationModel(nn.Layer):
+    def __init__(self,
+                 models=None,
+                 pretrained_list=None,
+                 freeze_params_list=None,
+                 **kwargs):
+        super().__init__()
+        assert isinstance(models, list)
+        self.model_list = []
+        self.model_name_list = []
+        if pretrained_list is not None:
+            assert len(pretrained_list) == len(models)
+
+        if freeze_params_list is None:
+            freeze_params_list = [False] * len(models)
+        assert len(freeze_params_list) == len(models)
+        for idx, model_config in enumerate(models):
+            assert len(model_config) == 1
+            key = list(model_config.keys())[0]
+            model_config = model_config[key]
+            model_name = model_config.pop("name")
+            model = eval(model_name)(**model_config)
+
+            if freeze_params_list[idx]:
+                for param in model.parameters():
+                    param.trainable = False
+            self.model_list.append(self.add_sublayer(key, model))
+            self.model_name_list.append(key)
+
+        if pretrained_list is not None:
+            for idx, pretrained in enumerate(pretrained_list):
+                if pretrained is not None:
+                    # load weights into the sub-model itself, not its name string
+                    load_dygraph_pretrain(
+                        self.model_list[idx], path=pretrained)
+
+    def forward(self, x, label=None):
+        result_dict = dict()
+        for idx, model_name in enumerate(self.model_name_list):
+            if label is None:
+                result_dict[model_name] = self.model_list[idx](x)
+            else:
+                result_dict[model_name] = self.model_list[idx](x, label)
+        return result_dict
diff --git a/src/PaddleClas/ppcls/arch/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/arch/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000..8585526
Binary files /dev/null and b/src/PaddleClas/ppcls/arch/__pycache__/__init__.cpython-39.pyc differ
diff --git a/src/PaddleClas/ppcls/arch/__pycache__/utils.cpython-39.pyc b/src/PaddleClas/ppcls/arch/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000..e1b3292
Binary files /dev/null and b/src/PaddleClas/ppcls/arch/__pycache__/utils.cpython-39.pyc differ
diff --git a/src/PaddleClas/ppcls/arch/backbone/__init__.py b/src/PaddleClas/ppcls/arch/backbone/__init__.py
new file mode 100644
index 0000000..1bd23a9
--- /dev/null
+++ b/src/PaddleClas/ppcls/arch/backbone/__init__.py
@@ -0,0 +1,83 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import inspect
+
+from ppcls.arch.backbone.legendary_models.mobilenet_v1 import MobileNetV1_x0_25, MobileNetV1_x0_5, MobileNetV1_x0_75, MobileNetV1
+from ppcls.arch.backbone.legendary_models.mobilenet_v3 import MobileNetV3_small_x0_35, MobileNetV3_small_x0_5, MobileNetV3_small_x0_75, MobileNetV3_small_x1_0, MobileNetV3_small_x1_25, MobileNetV3_large_x0_35, MobileNetV3_large_x0_5, MobileNetV3_large_x0_75, MobileNetV3_large_x1_0, MobileNetV3_large_x1_25
+from ppcls.arch.backbone.legendary_models.resnet import ResNet18, ResNet18_vd, ResNet34, ResNet34_vd, ResNet50, ResNet50_vd, ResNet101, ResNet101_vd, ResNet152, ResNet152_vd, ResNet200_vd
+from ppcls.arch.backbone.legendary_models.vgg import VGG11, VGG13, VGG16, VGG19
+from ppcls.arch.backbone.legendary_models.inception_v3 import InceptionV3
+from ppcls.arch.backbone.legendary_models.hrnet import HRNet_W18_C, HRNet_W30_C, HRNet_W32_C, HRNet_W40_C, HRNet_W44_C, HRNet_W48_C, HRNet_W60_C, HRNet_W64_C, SE_HRNet_W64_C
+from ppcls.arch.backbone.legendary_models.pp_lcnet import PPLCNet_x0_25, PPLCNet_x0_35, PPLCNet_x0_5, PPLCNet_x0_75, PPLCNet_x1_0, PPLCNet_x1_5, PPLCNet_x2_0, PPLCNet_x2_5
+from ppcls.arch.backbone.legendary_models.esnet import ESNet_x0_25, ESNet_x0_5, ESNet_x0_75, ESNet_x1_0
+
+from ppcls.arch.backbone.model_zoo.resnet_vc import ResNet50_vc
+from ppcls.arch.backbone.model_zoo.resnext import ResNeXt50_32x4d, ResNeXt50_64x4d, ResNeXt101_32x4d, ResNeXt101_64x4d, ResNeXt152_32x4d, ResNeXt152_64x4d
+from ppcls.arch.backbone.model_zoo.resnext_vd import ResNeXt50_vd_32x4d, ResNeXt50_vd_64x4d, ResNeXt101_vd_32x4d, ResNeXt101_vd_64x4d, ResNeXt152_vd_32x4d, ResNeXt152_vd_64x4d
+from ppcls.arch.backbone.model_zoo.res2net import Res2Net50_26w_4s, Res2Net50_14w_8s
+from ppcls.arch.backbone.model_zoo.res2net_vd import Res2Net50_vd_26w_4s, Res2Net101_vd_26w_4s, Res2Net200_vd_26w_4s
+from ppcls.arch.backbone.model_zoo.se_resnet_vd import SE_ResNet18_vd, SE_ResNet34_vd, SE_ResNet50_vd
+from ppcls.arch.backbone.model_zoo.se_resnext_vd import SE_ResNeXt50_vd_32x4d, SENet154_vd
+from ppcls.arch.backbone.model_zoo.se_resnext import SE_ResNeXt50_32x4d, SE_ResNeXt101_32x4d, SE_ResNeXt152_64x4d
+from ppcls.arch.backbone.model_zoo.dpn import DPN68, DPN92, DPN98, DPN107, DPN131
+from ppcls.arch.backbone.model_zoo.densenet import DenseNet121, DenseNet161, DenseNet169, DenseNet201, DenseNet264
+from
ppcls.arch.backbone.model_zoo.efficientnet import EfficientNetB0, EfficientNetB1, EfficientNetB2, EfficientNetB3, EfficientNetB4, EfficientNetB5, EfficientNetB6, EfficientNetB7, EfficientNetB0_small +from ppcls.arch.backbone.model_zoo.resnest import ResNeSt50_fast_1s1x64d, ResNeSt50, ResNeSt101 +from ppcls.arch.backbone.model_zoo.googlenet import GoogLeNet +from ppcls.arch.backbone.model_zoo.mobilenet_v2 import MobileNetV2_x0_25, MobileNetV2_x0_5, MobileNetV2_x0_75, MobileNetV2, MobileNetV2_x1_5, MobileNetV2_x2_0 +from ppcls.arch.backbone.model_zoo.shufflenet_v2 import ShuffleNetV2_x0_25, ShuffleNetV2_x0_33, ShuffleNetV2_x0_5, ShuffleNetV2_x1_0, ShuffleNetV2_x1_5, ShuffleNetV2_x2_0, ShuffleNetV2_swish +from ppcls.arch.backbone.model_zoo.ghostnet import GhostNet_x0_5, GhostNet_x1_0, GhostNet_x1_3 +from ppcls.arch.backbone.model_zoo.alexnet import AlexNet +from ppcls.arch.backbone.model_zoo.inception_v4 import InceptionV4 +from ppcls.arch.backbone.model_zoo.xception import Xception41, Xception65, Xception71 +from ppcls.arch.backbone.model_zoo.xception_deeplab import Xception41_deeplab, Xception65_deeplab +from ppcls.arch.backbone.model_zoo.resnext101_wsl import ResNeXt101_32x8d_wsl, ResNeXt101_32x16d_wsl, ResNeXt101_32x32d_wsl, ResNeXt101_32x48d_wsl +from ppcls.arch.backbone.model_zoo.squeezenet import SqueezeNet1_0, SqueezeNet1_1 +from ppcls.arch.backbone.model_zoo.darknet import DarkNet53 +from ppcls.arch.backbone.model_zoo.regnet import RegNetX_200MF, RegNetX_4GF, RegNetX_32GF, RegNetY_200MF, RegNetY_4GF, RegNetY_32GF +from ppcls.arch.backbone.model_zoo.vision_transformer import ViT_small_patch16_224, ViT_base_patch16_224, ViT_base_patch16_384, ViT_base_patch32_384, ViT_large_patch16_224, ViT_large_patch16_384, ViT_large_patch32_384 +from ppcls.arch.backbone.model_zoo.distilled_vision_transformer import DeiT_tiny_patch16_224, DeiT_small_patch16_224, DeiT_base_patch16_224, DeiT_tiny_distilled_patch16_224, DeiT_small_distilled_patch16_224, DeiT_base_distilled_patch16_224, DeiT_base_patch16_384, DeiT_base_distilled_patch16_384 +from ppcls.arch.backbone.model_zoo.swin_transformer import SwinTransformer_tiny_patch4_window7_224, SwinTransformer_small_patch4_window7_224, SwinTransformer_base_patch4_window7_224, SwinTransformer_base_patch4_window12_384, SwinTransformer_large_patch4_window7_224, SwinTransformer_large_patch4_window12_384 +from ppcls.arch.backbone.model_zoo.mixnet import MixNet_S, MixNet_M, MixNet_L +from ppcls.arch.backbone.model_zoo.rexnet import ReXNet_1_0, ReXNet_1_3, ReXNet_1_5, ReXNet_2_0, ReXNet_3_0 +from ppcls.arch.backbone.model_zoo.gvt import pcpvt_small, pcpvt_base, pcpvt_large, alt_gvt_small, alt_gvt_base, alt_gvt_large +from ppcls.arch.backbone.model_zoo.levit import LeViT_128S, LeViT_128, LeViT_192, LeViT_256, LeViT_384 +from ppcls.arch.backbone.model_zoo.dla import DLA34, DLA46_c, DLA46x_c, DLA60, DLA60x, DLA60x_c, DLA102, DLA102x, DLA102x2, DLA169 +from ppcls.arch.backbone.model_zoo.rednet import RedNet26, RedNet38, RedNet50, RedNet101, RedNet152 +from ppcls.arch.backbone.model_zoo.tnt import TNT_small +from ppcls.arch.backbone.model_zoo.hardnet import HarDNet68, HarDNet85, HarDNet39_ds, HarDNet68_ds +from ppcls.arch.backbone.model_zoo.cspnet import CSPDarkNet53 +from ppcls.arch.backbone.model_zoo.pvt_v2 import PVT_V2_B0, PVT_V2_B1, PVT_V2_B2_Linear, PVT_V2_B2, PVT_V2_B3, PVT_V2_B4, PVT_V2_B5 +from ppcls.arch.backbone.model_zoo.repvgg import RepVGG_A0, RepVGG_A1, RepVGG_A2, RepVGG_B0, RepVGG_B1, RepVGG_B2, RepVGG_B1g2, RepVGG_B1g4, RepVGG_B2g4, RepVGG_B3g4 +from 
ppcls.arch.backbone.variant_models.resnet_variant import ResNet50_last_stage_stride1 +from ppcls.arch.backbone.variant_models.vgg_variant import VGG19Sigmoid +from ppcls.arch.backbone.variant_models.pp_lcnet_variant import PPLCNet_x2_5_Tanh + + +# help whl get all the models' api (class type) and components' api (func type) +def get_apis(): + current_func = sys._getframe().f_code.co_name + current_module = sys.modules[__name__] + api = [] + for _, obj in inspect.getmembers(current_module, + inspect.isclass) + inspect.getmembers( + current_module, inspect.isfunction): + api.append(obj.__name__) + api.remove(current_func) + return api + + +__all__ = get_apis() diff --git a/src/PaddleClas/ppcls/arch/backbone/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..ed10327 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/base/__init__.py b/src/PaddleClas/ppcls/arch/backbone/base/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/PaddleClas/ppcls/arch/backbone/base/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/base/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..c69b585 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/base/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/base/__pycache__/theseus_layer.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/base/__pycache__/theseus_layer.cpython-39.pyc new file mode 100644 index 0000000..fd599a5 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/base/__pycache__/theseus_layer.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/base/theseus_layer.py b/src/PaddleClas/ppcls/arch/backbone/base/theseus_layer.py new file mode 100644 index 0000000..908d944 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/base/theseus_layer.py @@ -0,0 +1,301 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
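+
+# Usage sketch (illustrative, not part of this module's API): TheseusLayer
+# lets a backbone expose intermediate outputs and swap sub-layers by name
+# pattern. Assuming `net` is any model built on TheseusLayer:
+#
+#     net.update_res(["blocks[0]", "blocks[2]"])  # hypothetical patterns
+#     out = net(x)  # dict: {"output": ..., "blocks[0]": ..., "blocks[2]": ...}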
+
+from typing import Tuple, List, Dict, Union, Callable, Any
+
+from paddle import nn
+from ppcls.utils import logger
+
+
+class Identity(nn.Layer):
+    def __init__(self):
+        super(Identity, self).__init__()
+
+    def forward(self, inputs):
+        return inputs
+
+
+class TheseusLayer(nn.Layer):
+    def __init__(self, *args, **kwargs):
+        super(TheseusLayer, self).__init__()
+        self.res_dict = {}
+        self.res_name = self.full_name()
+        self.pruner = None
+        self.quanter = None
+
+    def _return_dict_hook(self, layer, input, output):
+        res_dict = {"output": output}
+        # 'list' is needed to avoid the error raised by popping self.res_dict
+        for res_key in list(self.res_dict):
+            # clear the res_dict because the forward process may change according to input
+            res_dict[res_key] = self.res_dict.pop(res_key)
+        return res_dict
+
+    def init_res(self,
+                 stages_pattern,
+                 return_patterns=None,
+                 return_stages=None):
+        if return_patterns and return_stages:
+            msg = "'return_patterns' will be ignored when 'return_stages' is set."
+            logger.warning(msg)
+            return_stages = None
+
+        if return_stages is True:
+            return_patterns = stages_pattern
+        # return_stages is int or bool
+        if type(return_stages) is int:
+            return_stages = [return_stages]
+        if isinstance(return_stages, list):
+            if max(return_stages) >= len(stages_pattern) or min(
+                    return_stages) < 0:
+                msg = f"'return_stages' contains illegal value(s), which have been ignored. The stages' pattern list is {stages_pattern}."
+                logger.warning(msg)
+                return_stages = [
+                    val for val in return_stages
+                    if val >= 0 and val < len(stages_pattern)
+                ]
+            return_patterns = [stages_pattern[i] for i in return_stages]
+
+        if return_patterns:
+            self.update_res(return_patterns)
+
+    def replace_sub(self, *args, **kwargs) -> None:
+        msg = "The function 'replace_sub()' is deprecated, please use 'upgrade_sublayer()' instead."
+        logger.error(DeprecationWarning(msg))
+        raise DeprecationWarning(msg)
+
+    def upgrade_sublayer(self,
+                         layer_name_pattern: Union[str, List[str]],
+                         handle_func: Callable[[nn.Layer, str], nn.Layer]
+                         ) -> List[str]:
+        """Use 'handle_func' to modify the sub-layer(s) specified by 'layer_name_pattern'.
+
+        Args:
+            layer_name_pattern (Union[str, List[str]]): The name of the layer(s) to be modified by 'handle_func'.
+            handle_func (Callable[[nn.Layer, str], nn.Layer]): The function used to modify the target layer. It is called with the matched layer (nn.Layer) and the pattern (str) it matched, and must return the processed layer.
+
+        Returns:
+            List[str]: The patterns in 'layer_name_pattern' that were matched and handled successfully.
+
+        Examples:
+
+            from paddle import nn
+            import paddleclas
+
+            def rep_func(layer: nn.Layer, pattern: str):
+                new_layer = nn.Conv2D(
+                    in_channels=layer._in_channels,
+                    out_channels=layer._out_channels,
+                    kernel_size=5,
+                    padding=2
+                )
+                return new_layer
+
+            net = paddleclas.MobileNetV1()
+            res = net.upgrade_sublayer(layer_name_pattern=["blocks[11].depthwise_conv.conv", "blocks[12].depthwise_conv.conv"], handle_func=rep_func)
+            print(res)
+            # ['blocks[11].depthwise_conv.conv', 'blocks[12].depthwise_conv.conv']
+        """
+
+        if not isinstance(layer_name_pattern, list):
+            layer_name_pattern = [layer_name_pattern]
+
+        hit_layer_pattern_list = []
+        for pattern in layer_name_pattern:
+            # parse pattern to find target layer and its parent
+            layer_list = parse_pattern_str(pattern=pattern, parent_layer=self)
+            if not layer_list:
+                continue
+            sub_layer_parent = layer_list[-2]["layer"] if len(
+                layer_list) > 1 else self
+
+            sub_layer = layer_list[-1]["layer"]
+            sub_layer_name = layer_list[-1]["name"]
+            sub_layer_index = layer_list[-1]["index"]
+
+            new_sub_layer = handle_func(sub_layer, pattern)
+
+            if sub_layer_index:
+                getattr(sub_layer_parent,
+                        sub_layer_name)[sub_layer_index] = new_sub_layer
+            else:
+                setattr(sub_layer_parent, sub_layer_name, new_sub_layer)
+
+            hit_layer_pattern_list.append(pattern)
+        return hit_layer_pattern_list
+
+    def stop_after(self, stop_layer_name: str) -> bool:
+        """Stop forward and backward after 'stop_layer_name'.
+
+        Args:
+            stop_layer_name (str): The name of the layer after which forward and backward computation stops.
+
+        Returns:
+            bool: 'True' if successful, 'False' otherwise.
+        """
+
+        layer_list = parse_pattern_str(stop_layer_name, self)
+        if not layer_list:
+            return False
+
+        parent_layer = self
+        for layer_dict in layer_list:
+            name, index = layer_dict["name"], layer_dict["index"]
+            if not set_identity(parent_layer, name, index):
+                msg = f"Failed to set the layers after stop_layer_name('{stop_layer_name}') to Identity. The error layer's name is '{name}'."
+                logger.warning(msg)
+                return False
+            parent_layer = layer_dict["layer"]
+
+        return True
+
+    def update_res(
+            self,
+            return_patterns: Union[str, List[str]]) -> List[str]:
+        """Update the result(s) to be returned.
+
+        Args:
+            return_patterns (Union[str, List[str]]): The pattern(s) of the layer(s) whose output should be returned.
+
+        Returns:
+            List[str]: The patterns that were matched and set successfully.
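+
+        Examples (a minimal sketch; the layer patterns are hypothetical):
+
+            net = paddleclas.MobileNetV1()
+            net.update_res(return_patterns=["blocks[0]", "blocks[2]"])
+            # net(x) now returns a dict holding the final output under
+            # "output" and each matched pattern's intermediate output.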
+ """ + + # clear res_dict that could have been set + self.res_dict = {} + + class Handler(object): + def __init__(self, res_dict): + # res_dict is a reference + self.res_dict = res_dict + + def __call__(self, layer, pattern): + layer.res_dict = self.res_dict + layer.res_name = pattern + if hasattr(layer, "hook_remove_helper"): + layer.hook_remove_helper.remove() + layer.hook_remove_helper = layer.register_forward_post_hook( + save_sub_res_hook) + return layer + + handle_func = Handler(self.res_dict) + + hit_layer_pattern_list = self.upgrade_sublayer( + return_patterns, handle_func=handle_func) + + if hasattr(self, "hook_remove_helper"): + self.hook_remove_helper.remove() + self.hook_remove_helper = self.register_forward_post_hook( + self._return_dict_hook) + + return hit_layer_pattern_list + + +def save_sub_res_hook(layer, input, output): + layer.res_dict[layer.res_name] = output + + +def set_identity(parent_layer: nn.Layer, + layer_name: str, + layer_index: str=None) -> bool: + """set the layer specified by layer_name and layer_index to Indentity. + + Args: + parent_layer (nn.Layer): The parent layer of target layer specified by layer_name and layer_index. + layer_name (str): The name of target layer to be set to Indentity. + layer_index (str, optional): The index of target layer to be set to Indentity in parent_layer. Defaults to None. + + Returns: + bool: True if successfully, False otherwise. + """ + + stop_after = False + for sub_layer_name in parent_layer._sub_layers: + if stop_after: + parent_layer._sub_layers[sub_layer_name] = Identity() + continue + if sub_layer_name == layer_name: + stop_after = True + + if layer_index and stop_after: + stop_after = False + for sub_layer_index in parent_layer._sub_layers[ + layer_name]._sub_layers: + if stop_after: + parent_layer._sub_layers[layer_name][ + sub_layer_index] = Identity() + continue + if layer_index == sub_layer_index: + stop_after = True + + return stop_after + + +def parse_pattern_str(pattern: str, parent_layer: nn.Layer) -> Union[ + None, List[Dict[str, Union[nn.Layer, str, None]]]]: + """parse the string type pattern. + + Args: + pattern (str): The pattern to discribe layer. + parent_layer (nn.Layer): The root layer relative to the pattern. + + Returns: + Union[None, List[Dict[str, Union[nn.Layer, str, None]]]]: None if failed. If successfully, the members are layers parsed in order: + [ + {"layer": first layer, "name": first layer's name parsed, "index": first layer's index parsed if exist}, + {"layer": second layer, "name": second layer's name parsed, "index": second layer's index parsed if exist}, + ... + ] + """ + + pattern_list = pattern.split(".") + if not pattern_list: + msg = f"The pattern('{pattern}') is illegal. Please check and retry." + logger.warning(msg) + return None + + layer_list = [] + while len(pattern_list) > 0: + if '[' in pattern_list[0]: + target_layer_name = pattern_list[0].split('[')[0] + target_layer_index = pattern_list[0].split('[')[1].split(']')[0] + else: + target_layer_name = pattern_list[0] + target_layer_index = None + + target_layer = getattr(parent_layer, target_layer_name, None) + + if target_layer is None: + msg = f"Not found layer named('{target_layer_name}') specifed in pattern('{pattern}')." + logger.warning(msg) + return None + + if target_layer_index and target_layer: + if int(target_layer_index) < 0 or int(target_layer_index) >= len( + target_layer): + msg = f"Not found layer by index('{target_layer_index}') specifed in pattern('{pattern}'). 
+                logger.warning(msg)
+                return None
+
+            target_layer = target_layer[target_layer_index]
+
+        layer_list.append({
+            "layer": target_layer,
+            "name": target_layer_name,
+            "index": target_layer_index
+        })
+
+        pattern_list = pattern_list[1:]
+        parent_layer = target_layer
+    return layer_list
diff --git a/src/PaddleClas/ppcls/arch/backbone/legendary_models/__init__.py b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__init__.py
new file mode 100644
index 0000000..1f837da
--- /dev/null
+++ b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__init__.py
@@ -0,0 +1,6 @@
+from .resnet import ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, ResNet18_vd, ResNet34_vd, ResNet50_vd, ResNet101_vd, ResNet152_vd
+from .hrnet import HRNet_W18_C, HRNet_W30_C, HRNet_W32_C, HRNet_W40_C, HRNet_W44_C, HRNet_W48_C, HRNet_W64_C
+from .mobilenet_v1 import MobileNetV1_x0_25, MobileNetV1_x0_5, MobileNetV1_x0_75, MobileNetV1
+from .mobilenet_v3 import MobileNetV3_small_x0_35, MobileNetV3_small_x0_5, MobileNetV3_small_x0_75, MobileNetV3_small_x1_0, MobileNetV3_small_x1_25, MobileNetV3_large_x0_35, MobileNetV3_large_x0_5, MobileNetV3_large_x0_75, MobileNetV3_large_x1_0, MobileNetV3_large_x1_25
+from .inception_v3 import InceptionV3
+from .vgg import VGG11, VGG13, VGG16, VGG19
diff --git a/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000..118b131
Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/__init__.cpython-39.pyc differ
diff --git a/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/esnet.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/esnet.cpython-39.pyc
new file mode 100644
index 0000000..135d838
Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/esnet.cpython-39.pyc differ
diff --git a/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/hrnet.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/hrnet.cpython-39.pyc
new file mode 100644
index 0000000..d1ee9ed
Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/hrnet.cpython-39.pyc differ
diff --git a/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/inception_v3.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/inception_v3.cpython-39.pyc
new file mode 100644
index 0000000..a330df1
Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/inception_v3.cpython-39.pyc differ
diff --git a/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/mobilenet_v1.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/mobilenet_v1.cpython-39.pyc
new file mode 100644
index 0000000..b5ec613
Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/mobilenet_v1.cpython-39.pyc differ
diff --git a/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/mobilenet_v3.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/mobilenet_v3.cpython-39.pyc
new file mode 100644
index 0000000..54ad9f4
Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/mobilenet_v3.cpython-39.pyc differ
diff --git
a/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/pp_lcnet.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/pp_lcnet.cpython-39.pyc new file mode 100644 index 0000000..8b2be9d Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/pp_lcnet.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/resnet.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/resnet.cpython-39.pyc new file mode 100644 index 0000000..3e2cd63 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/resnet.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/vgg.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/vgg.cpython-39.pyc new file mode 100644 index 0000000..0739199 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/legendary_models/__pycache__/vgg.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/legendary_models/esnet.py b/src/PaddleClas/ppcls/arch/backbone/legendary_models/esnet.py new file mode 100644 index 0000000..e05e0ce --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/legendary_models/esnet.py @@ -0,0 +1,369 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
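+
+# Quick usage sketch (illustrative): the ESNet_* factories at the bottom of
+# this file build the network, e.g.:
+#
+#     import paddle
+#     model = ESNet_x1_0(pretrained=False)
+#     logits = model(paddle.rand([1, 3, 224, 224]))  # [1, 1000] with default class_num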
+ +from __future__ import absolute_import, division, print_function +import math +import paddle +from paddle import ParamAttr, reshape, transpose, concat, split +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D +from paddle.nn.initializer import KaimingNormal +from paddle.regularizer import L2Decay + +from ppcls.arch.backbone.base.theseus_layer import TheseusLayer +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "ESNet_x0_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ESNet_x0_25_pretrained.pdparams", + "ESNet_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ESNet_x0_5_pretrained.pdparams", + "ESNet_x0_75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ESNet_x0_75_pretrained.pdparams", + "ESNet_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ESNet_x1_0_pretrained.pdparams", +} + +MODEL_STAGES_PATTERN = {"ESNet": ["blocks[2]", "blocks[9]", "blocks[12]"]} + +__all__ = list(MODEL_URLS.keys()) + + +def channel_shuffle(x, groups): + batch_size, num_channels, height, width = x.shape[0:4] + channels_per_group = num_channels // groups + x = reshape( + x=x, shape=[batch_size, groups, channels_per_group, height, width]) + x = transpose(x=x, perm=[0, 2, 1, 3, 4]) + x = reshape(x=x, shape=[batch_size, num_channels, height, width]) + return x + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + if_act=True): + super().__init__() + self.conv = Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + self.bn = BatchNorm( + out_channels, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.if_act = if_act + self.hardswish = nn.Hardswish() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.if_act: + x = self.hardswish(x) + return x + + +class SEModule(TheseusLayer): + def __init__(self, channel, reduction=4): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.relu = nn.ReLU() + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + self.hardsigmoid = nn.Hardsigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = paddle.multiply(x=identity, y=x) + return x + + +class ESBlock1(TheseusLayer): + def __init__(self, in_channels, out_channels): + super().__init__() + self.pw_1_1 = ConvBNLayer( + in_channels=in_channels // 2, + out_channels=out_channels // 2, + kernel_size=1, + stride=1) + self.dw_1 = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=3, + stride=1, + groups=out_channels // 2, + if_act=False) + self.se = 
SEModule(out_channels) + + self.pw_1_2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1) + + def forward(self, x): + x1, x2 = split( + x, num_or_sections=[x.shape[1] // 2, x.shape[1] // 2], axis=1) + x2 = self.pw_1_1(x2) + x3 = self.dw_1(x2) + x3 = concat([x2, x3], axis=1) + x3 = self.se(x3) + x3 = self.pw_1_2(x3) + x = concat([x1, x3], axis=1) + return channel_shuffle(x, 2) + + +class ESBlock2(TheseusLayer): + def __init__(self, in_channels, out_channels): + super().__init__() + + # branch1 + self.dw_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=2, + groups=in_channels, + if_act=False) + self.pw_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1) + # branch2 + self.pw_2_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels // 2, + kernel_size=1) + self.dw_2 = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=3, + stride=2, + groups=out_channels // 2, + if_act=False) + self.se = SEModule(out_channels // 2) + self.pw_2_2 = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=1) + self.concat_dw = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + groups=out_channels) + self.concat_pw = ConvBNLayer( + in_channels=out_channels, out_channels=out_channels, kernel_size=1) + + def forward(self, x): + x1 = self.dw_1(x) + x1 = self.pw_1(x1) + x2 = self.pw_2_1(x) + x2 = self.dw_2(x2) + x2 = self.se(x2) + x2 = self.pw_2_2(x2) + x = concat([x1, x2], axis=1) + x = self.concat_dw(x) + x = self.concat_pw(x) + return x + + +class ESNet(TheseusLayer): + def __init__(self, + stages_pattern, + class_num=1000, + scale=1.0, + dropout_prob=0.2, + class_expand=1280, + return_patterns=None, + return_stages=None): + super().__init__() + self.scale = scale + self.class_num = class_num + self.class_expand = class_expand + stage_repeats = [3, 7, 3] + stage_out_channels = [ + -1, 24, make_divisible(116 * scale), make_divisible(232 * scale), + make_divisible(464 * scale), 1024 + ] + + self.conv1 = ConvBNLayer( + in_channels=3, + out_channels=stage_out_channels[1], + kernel_size=3, + stride=2) + self.max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1) + + block_list = [] + for stage_id, num_repeat in enumerate(stage_repeats): + for i in range(num_repeat): + if i == 0: + block = ESBlock2( + in_channels=stage_out_channels[stage_id + 1], + out_channels=stage_out_channels[stage_id + 2]) + else: + block = ESBlock1( + in_channels=stage_out_channels[stage_id + 2], + out_channels=stage_out_channels[stage_id + 2]) + block_list.append(block) + self.blocks = nn.Sequential(*block_list) + + self.conv2 = ConvBNLayer( + in_channels=stage_out_channels[-2], + out_channels=stage_out_channels[-1], + kernel_size=1) + + self.avg_pool = AdaptiveAvgPool2D(1) + + self.last_conv = Conv2D( + in_channels=stage_out_channels[-1], + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + self.hardswish = nn.Hardswish() + self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer") + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + self.fc = Linear(self.class_expand, self.class_num) + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + x = self.conv1(x) + x = self.max_pool(x) + x = self.blocks(x) + x = self.conv2(x) + 
x = self.avg_pool(x) + x = self.last_conv(x) + x = self.hardswish(x) + x = self.dropout(x) + x = self.flatten(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ESNet_x0_25(pretrained=False, use_ssld=False, **kwargs): + """ + ESNet_x0_25 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ESNet_x0_25` model depends on args. + """ + model = ESNet( + scale=0.25, stages_pattern=MODEL_STAGES_PATTERN["ESNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ESNet_x0_25"], use_ssld) + return model + + +def ESNet_x0_5(pretrained=False, use_ssld=False, **kwargs): + """ + ESNet_x0_5 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ESNet_x0_5` model depends on args. + """ + model = ESNet( + scale=0.5, stages_pattern=MODEL_STAGES_PATTERN["ESNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ESNet_x0_5"], use_ssld) + return model + + +def ESNet_x0_75(pretrained=False, use_ssld=False, **kwargs): + """ + ESNet_x0_75 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ESNet_x0_75` model depends on args. + """ + model = ESNet( + scale=0.75, stages_pattern=MODEL_STAGES_PATTERN["ESNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ESNet_x0_75"], use_ssld) + return model + + +def ESNet_x1_0(pretrained=False, use_ssld=False, **kwargs): + """ + ESNet_x1_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ESNet_x1_0` model depends on args. + """ + model = ESNet( + scale=1.0, stages_pattern=MODEL_STAGES_PATTERN["ESNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ESNet_x1_0"], use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/legendary_models/hrnet.py b/src/PaddleClas/ppcls/arch/backbone/legendary_models/hrnet.py new file mode 100644 index 0000000..c3f7759 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/legendary_models/hrnet.py @@ -0,0 +1,794 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn +from paddle import ParamAttr +from paddle.nn.functional import upsample +from paddle.nn.initializer import Uniform + +from ppcls.arch.backbone.base.theseus_layer import TheseusLayer, Identity +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "HRNet_W18_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W18_C_pretrained.pdparams", + "HRNet_W30_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W30_C_pretrained.pdparams", + "HRNet_W32_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W32_C_pretrained.pdparams", + "HRNet_W40_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W40_C_pretrained.pdparams", + "HRNet_W44_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W44_C_pretrained.pdparams", + "HRNet_W48_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W48_C_pretrained.pdparams", + "HRNet_W64_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W64_C_pretrained.pdparams" +} + +MODEL_STAGES_PATTERN = {"HRNet": ["st4"]} + +__all__ = list(MODEL_URLS.keys()) + + +def _create_act(act): + if act == "hardswish": + return nn.Hardswish() + elif act == "relu": + return nn.ReLU() + elif act is None: + return Identity() + else: + raise RuntimeError( + "The activation function is not supported: {}".format(act)) + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act="relu"): + super().__init__() + + self.conv = nn.Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + bias_attr=False) + self.bn = nn.BatchNorm(num_filters, act=None) + self.act = _create_act(act) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + return x + + +class BottleneckBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + has_se, + stride=1, + downsample=False): + super().__init__() + + self.has_se = has_se + self.downsample = downsample + + self.conv1 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act="relu") + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu") + self.conv3 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None) + + if self.downsample: + self.conv_down = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + act=None) + + if self.has_se: + self.se = SELayer( + num_channels=num_filters * 4, + num_filters=num_filters * 4, + reduction_ratio=16) + self.relu = nn.ReLU() + + def forward(self, x, 
res_dict=None): + residual = x + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + if self.downsample: + residual = self.conv_down(residual) + if self.has_se: + x = self.se(x) + x = paddle.add(x=residual, y=x) + x = self.relu(x) + return x + + +class BasicBlock(nn.Layer): + def __init__(self, num_channels, num_filters, has_se=False): + super().__init__() + + self.has_se = has_se + + self.conv1 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=1, + act="relu") + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=1, + act=None) + + if self.has_se: + self.se = SELayer( + num_channels=num_filters, + num_filters=num_filters, + reduction_ratio=16) + self.relu = nn.ReLU() + + def forward(self, x): + residual = x + x = self.conv1(x) + x = self.conv2(x) + + if self.has_se: + x = self.se(x) + + x = paddle.add(x=residual, y=x) + x = self.relu(x) + return x + + +class SELayer(TheseusLayer): + def __init__(self, num_channels, num_filters, reduction_ratio): + super().__init__() + + self.avg_pool = nn.AdaptiveAvgPool2D(1) + + self._num_channels = num_channels + + med_ch = int(num_channels / reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.fc_squeeze = nn.Linear( + num_channels, + med_ch, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + self.relu = nn.ReLU() + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.fc_excitation = nn.Linear( + med_ch, + num_filters, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + self.sigmoid = nn.Sigmoid() + + def forward(self, x, res_dict=None): + residual = x + x = self.avg_pool(x) + x = paddle.squeeze(x, axis=[2, 3]) + x = self.fc_squeeze(x) + x = self.relu(x) + x = self.fc_excitation(x) + x = self.sigmoid(x) + x = paddle.unsqueeze(x, axis=[2, 3]) + x = residual * x + return x + + +class Stage(TheseusLayer): + def __init__(self, num_modules, num_filters, has_se=False): + super().__init__() + + self._num_modules = num_modules + + self.stage_func_list = nn.LayerList() + for i in range(num_modules): + self.stage_func_list.append( + HighResolutionModule( + num_filters=num_filters, has_se=has_se)) + + def forward(self, x, res_dict=None): + x = x + for idx in range(self._num_modules): + x = self.stage_func_list[idx](x) + return x + + +class HighResolutionModule(TheseusLayer): + def __init__(self, num_filters, has_se=False): + super().__init__() + + self.basic_block_list = nn.LayerList() + + for i in range(len(num_filters)): + self.basic_block_list.append( + nn.Sequential(* [ + BasicBlock( + num_channels=num_filters[i], + num_filters=num_filters[i], + has_se=has_se) for j in range(4) + ])) + + self.fuse_func = FuseLayers( + in_channels=num_filters, out_channels=num_filters) + + def forward(self, x, res_dict=None): + out = [] + for idx, xi in enumerate(x): + basic_block_list = self.basic_block_list[idx] + for basic_block_func in basic_block_list: + xi = basic_block_func(xi) + out.append(xi) + out = self.fuse_func(out) + return out + + +class FuseLayers(TheseusLayer): + def __init__(self, in_channels, out_channels): + super().__init__() + + self._actual_ch = len(in_channels) + self._in_channels = in_channels + + self.residual_func_list = nn.LayerList() + self.relu = nn.ReLU() + for i in range(len(in_channels)): + for j in range(len(in_channels)): + if j > i: + self.residual_func_list.append( + ConvBNLayer( + num_channels=in_channels[j], + num_filters=out_channels[i], + filter_size=1, + stride=1, + act=None)) + elif j < i: + 
pre_num_filters = in_channels[j] + for k in range(i - j): + if k == i - j - 1: + self.residual_func_list.append( + ConvBNLayer( + num_channels=pre_num_filters, + num_filters=out_channels[i], + filter_size=3, + stride=2, + act=None)) + pre_num_filters = out_channels[i] + else: + self.residual_func_list.append( + ConvBNLayer( + num_channels=pre_num_filters, + num_filters=out_channels[j], + filter_size=3, + stride=2, + act="relu")) + pre_num_filters = out_channels[j] + + def forward(self, x, res_dict=None): + out = [] + residual_func_idx = 0 + for i in range(len(self._in_channels)): + residual = x[i] + for j in range(len(self._in_channels)): + if j > i: + xj = self.residual_func_list[residual_func_idx](x[j]) + residual_func_idx += 1 + + xj = upsample(xj, scale_factor=2**(j - i), mode="nearest") + residual = paddle.add(x=residual, y=xj) + elif j < i: + xj = x[j] + for k in range(i - j): + xj = self.residual_func_list[residual_func_idx](xj) + residual_func_idx += 1 + + residual = paddle.add(x=residual, y=xj) + + residual = self.relu(residual) + out.append(residual) + + return out + + +class LastClsOut(TheseusLayer): + def __init__(self, + num_channel_list, + has_se, + num_filters_list=[32, 64, 128, 256]): + super().__init__() + + self.func_list = nn.LayerList() + for idx in range(len(num_channel_list)): + self.func_list.append( + BottleneckBlock( + num_channels=num_channel_list[idx], + num_filters=num_filters_list[idx], + has_se=has_se, + downsample=True)) + + def forward(self, x, res_dict=None): + out = [] + for idx, xi in enumerate(x): + xi = self.func_list[idx](xi) + out.append(xi) + return out + + +class HRNet(TheseusLayer): + """ + HRNet + Args: + width: int=18. Base channel number of HRNet. + has_se: bool=False. If 'True', add se module to HRNet. + class_num: int=1000. Output num of last fc layer. + Returns: + model: nn.Layer. Specific HRNet model depends on args. 
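+        Example (a minimal sketch; roughly what HRNet_W18_C(pretrained=False) below does):
+            model = HRNet(stages_pattern=MODEL_STAGES_PATTERN["HRNet"], width=18)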
+ """ + + def __init__(self, + stages_pattern, + width=18, + has_se=False, + class_num=1000, + return_patterns=None, + return_stages=None): + super().__init__() + + self.width = width + self.has_se = has_se + self._class_num = class_num + + channels_2 = [self.width, self.width * 2] + channels_3 = [self.width, self.width * 2, self.width * 4] + channels_4 = [ + self.width, self.width * 2, self.width * 4, self.width * 8 + ] + + self.conv_layer1_1 = ConvBNLayer( + num_channels=3, + num_filters=64, + filter_size=3, + stride=2, + act="relu") + + self.conv_layer1_2 = ConvBNLayer( + num_channels=64, + num_filters=64, + filter_size=3, + stride=2, + act="relu") + + self.layer1 = nn.Sequential(* [ + BottleneckBlock( + num_channels=64 if i == 0 else 256, + num_filters=64, + has_se=has_se, + stride=1, + downsample=True if i == 0 else False) for i in range(4) + ]) + + self.conv_tr1_1 = ConvBNLayer( + num_channels=256, num_filters=width, filter_size=3) + self.conv_tr1_2 = ConvBNLayer( + num_channels=256, num_filters=width * 2, filter_size=3, stride=2) + + self.st2 = Stage( + num_modules=1, num_filters=channels_2, has_se=self.has_se) + + self.conv_tr2 = ConvBNLayer( + num_channels=width * 2, + num_filters=width * 4, + filter_size=3, + stride=2) + self.st3 = Stage( + num_modules=4, num_filters=channels_3, has_se=self.has_se) + + self.conv_tr3 = ConvBNLayer( + num_channels=width * 4, + num_filters=width * 8, + filter_size=3, + stride=2) + + self.st4 = Stage( + num_modules=3, num_filters=channels_4, has_se=self.has_se) + + # classification + num_filters_list = [32, 64, 128, 256] + self.last_cls = LastClsOut( + num_channel_list=channels_4, + has_se=self.has_se, + num_filters_list=num_filters_list) + + last_num_filters = [256, 512, 1024] + self.cls_head_conv_list = nn.LayerList() + for idx in range(3): + self.cls_head_conv_list.append( + ConvBNLayer( + num_channels=num_filters_list[idx] * 4, + num_filters=last_num_filters[idx], + filter_size=3, + stride=2)) + + self.conv_last = ConvBNLayer( + num_channels=1024, num_filters=2048, filter_size=1, stride=1) + + self.avg_pool = nn.AdaptiveAvgPool2D(1) + + stdv = 1.0 / math.sqrt(2048 * 1.0) + + self.fc = nn.Linear( + 2048, + class_num, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + x = self.conv_layer1_1(x) + x = self.conv_layer1_2(x) + + x = self.layer1(x) + + tr1_1 = self.conv_tr1_1(x) + tr1_2 = self.conv_tr1_2(x) + x = self.st2([tr1_1, tr1_2]) + + tr2 = self.conv_tr2(x[-1]) + x.append(tr2) + x = self.st3(x) + + tr3 = self.conv_tr3(x[-1]) + x.append(tr3) + x = self.st4(x) + + x = self.last_cls(x) + + y = x[0] + for idx in range(3): + y = paddle.add(x[idx + 1], self.cls_head_conv_list[idx](y)) + + y = self.conv_last(y) + y = self.avg_pool(y) + y = paddle.reshape(y, shape=[-1, y.shape[1]]) + y = self.fc(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def HRNet_W18_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W18_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. 
+ If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W18_C` model depends on args. + """ + model = HRNet( + width=18, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W18_C"], use_ssld) + return model + + +def HRNet_W30_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W30_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W30_C` model depends on args. + """ + model = HRNet( + width=30, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W30_C"], use_ssld) + return model + + +def HRNet_W32_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W32_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W32_C` model depends on args. + """ + model = HRNet( + width=32, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W32_C"], use_ssld) + return model + + +def HRNet_W40_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W40_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W40_C` model depends on args. + """ + model = HRNet( + width=40, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W40_C"], use_ssld) + return model + + +def HRNet_W44_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W44_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W44_C` model depends on args. + """ + model = HRNet( + width=44, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W44_C"], use_ssld) + return model + + +def HRNet_W48_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W48_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W48_C` model depends on args. + """ + model = HRNet( + width=48, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W48_C"], use_ssld) + return model + + +def HRNet_W60_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W60_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. 
+ use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W60_C` model depends on args. + """ + model = HRNet( + width=60, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W60_C"], use_ssld) + return model + + +def HRNet_W64_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W64_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W64_C` model depends on args. + """ + model = HRNet( + width=64, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W64_C"], use_ssld) + return model + + +def SE_HRNet_W18_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W18_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W18_C` model depends on args. + """ + model = HRNet( + width=18, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W18_C"], use_ssld) + return model + + +def SE_HRNet_W30_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W30_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W30_C` model depends on args. + """ + model = HRNet( + width=30, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W30_C"], use_ssld) + return model + + +def SE_HRNet_W32_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W32_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W32_C` model depends on args. + """ + model = HRNet( + width=32, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W32_C"], use_ssld) + return model + + +def SE_HRNet_W40_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W40_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W40_C` model depends on args. + """ + model = HRNet( + width=40, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W40_C"], use_ssld) + return model + + +def SE_HRNet_W44_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W44_C + Args: + pretrained: bool=False or str. 
If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W44_C` model depends on args. + """ + model = HRNet( + width=44, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W44_C"], use_ssld) + return model + + +def SE_HRNet_W48_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W48_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W48_C` model depends on args. + """ + model = HRNet( + width=48, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W48_C"], use_ssld) + return model + + +def SE_HRNet_W60_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W60_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W60_C` model depends on args. + """ + model = HRNet( + width=60, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W60_C"], use_ssld) + return model + + +def SE_HRNet_W64_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W64_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W64_C` model depends on args. + """ + model = HRNet( + width=64, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W64_C"], use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/legendary_models/inception_v3.py b/src/PaddleClas/ppcls/arch/backbone/legendary_models/inception_v3.py new file mode 100644 index 0000000..5575f8c --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/legendary_models/inception_v3.py @@ -0,0 +1,557 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
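
The HRNet_WXX_C and SE_HRNet_WXX_C factories above are identical except for the `width` value (plus `has_se=True` for the SE variants); the architecture itself lives in the shared `HRNet` class and weight loading in the common `_load_pretrained` helper. A minimal usage sketch, assuming paddlepaddle is installed and the PaddleClas source root is on PYTHONPATH; the module path and the `class_num` keyword are assumptions inferred from the sibling backbone files in this diff, not verified API:

    import paddle
    # assumed module path; the hrnet.py diff header is outside this excerpt
    from ppcls.arch.backbone.legendary_models.hrnet import HRNet_W18_C

    model = HRNet_W18_C(pretrained=False, class_num=1000)  # random init, 1000-way head
    x = paddle.rand([1, 3, 224, 224])                      # dummy NCHW batch
    logits = model(x)                                      # plain forward pass
    print(logits.shape)                                    # expected: [1, 1000]
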
+ +from __future__ import absolute_import, division, print_function +import math +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +from ppcls.arch.backbone.base.theseus_layer import TheseusLayer +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "InceptionV3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/InceptionV3_pretrained.pdparams" +} + +MODEL_STAGES_PATTERN = { + "InceptionV3": [ + "inception_block_list[2]", "inception_block_list[3]", + "inception_block_list[7]", "inception_block_list[8]", + "inception_block_list[10]" + ] +} + +__all__ = MODEL_URLS.keys() +''' +InceptionV3 config: dict. + key: inception blocks of InceptionV3. + values: conv num in different blocks. +''' +NET_CONFIG = { + "inception_a": [[192, 256, 288], [32, 64, 64]], + "inception_b": [288], + "inception_c": [[768, 768, 768, 768], [128, 160, 160, 192]], + "inception_d": [768], + "inception_e": [1280, 2048] +} + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + padding=0, + groups=1, + act="relu"): + super().__init__() + self.act = act + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + bias_attr=False) + self.bn = BatchNorm(num_filters) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.act: + x = self.relu(x) + return x + + +class InceptionStem(TheseusLayer): + def __init__(self): + super().__init__() + self.conv_1a_3x3 = ConvBNLayer( + num_channels=3, + num_filters=32, + filter_size=3, + stride=2, + act="relu") + self.conv_2a_3x3 = ConvBNLayer( + num_channels=32, + num_filters=32, + filter_size=3, + stride=1, + act="relu") + self.conv_2b_3x3 = ConvBNLayer( + num_channels=32, + num_filters=64, + filter_size=3, + padding=1, + act="relu") + + self.max_pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + self.conv_3b_1x1 = ConvBNLayer( + num_channels=64, num_filters=80, filter_size=1, act="relu") + self.conv_4a_3x3 = ConvBNLayer( + num_channels=80, num_filters=192, filter_size=3, act="relu") + + def forward(self, x): + x = self.conv_1a_3x3(x) + x = self.conv_2a_3x3(x) + x = self.conv_2b_3x3(x) + x = self.max_pool(x) + x = self.conv_3b_1x1(x) + x = self.conv_4a_3x3(x) + x = self.max_pool(x) + return x + + +class InceptionA(TheseusLayer): + def __init__(self, num_channels, pool_features): + super().__init__() + self.branch1x1 = ConvBNLayer( + num_channels=num_channels, + num_filters=64, + filter_size=1, + act="relu") + self.branch5x5_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=48, + filter_size=1, + act="relu") + self.branch5x5_2 = ConvBNLayer( + num_channels=48, + num_filters=64, + filter_size=5, + padding=2, + act="relu") + + self.branch3x3dbl_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=64, + filter_size=1, + act="relu") + self.branch3x3dbl_2 = ConvBNLayer( + num_channels=64, + num_filters=96, + filter_size=3, + padding=1, + act="relu") + self.branch3x3dbl_3 = ConvBNLayer( + num_channels=96, + num_filters=96, + filter_size=3, + padding=1, + act="relu") + self.branch_pool = AvgPool2D( + kernel_size=3, stride=1, padding=1, exclusive=False) + self.branch_pool_conv = ConvBNLayer( + 
num_channels=num_channels, + num_filters=pool_features, + filter_size=1, + act="relu") + + def forward(self, x): + branch1x1 = self.branch1x1(x) + branch5x5 = self.branch5x5_1(x) + branch5x5 = self.branch5x5_2(branch5x5) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + branch_pool = self.branch_pool(x) + branch_pool = self.branch_pool_conv(branch_pool) + x = paddle.concat( + [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1) + return x + + +class InceptionB(TheseusLayer): + def __init__(self, num_channels): + super().__init__() + self.branch3x3 = ConvBNLayer( + num_channels=num_channels, + num_filters=384, + filter_size=3, + stride=2, + act="relu") + self.branch3x3dbl_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=64, + filter_size=1, + act="relu") + self.branch3x3dbl_2 = ConvBNLayer( + num_channels=64, + num_filters=96, + filter_size=3, + padding=1, + act="relu") + self.branch3x3dbl_3 = ConvBNLayer( + num_channels=96, + num_filters=96, + filter_size=3, + stride=2, + act="relu") + self.branch_pool = MaxPool2D(kernel_size=3, stride=2) + + def forward(self, x): + branch3x3 = self.branch3x3(x) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + branch_pool = self.branch_pool(x) + + x = paddle.concat([branch3x3, branch3x3dbl, branch_pool], axis=1) + + return x + + +class InceptionC(TheseusLayer): + def __init__(self, num_channels, channels_7x7): + super().__init__() + self.branch1x1 = ConvBNLayer( + num_channels=num_channels, + num_filters=192, + filter_size=1, + act="relu") + + self.branch7x7_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=channels_7x7, + filter_size=1, + stride=1, + act="relu") + self.branch7x7_2 = ConvBNLayer( + num_channels=channels_7x7, + num_filters=channels_7x7, + filter_size=(1, 7), + stride=1, + padding=(0, 3), + act="relu") + self.branch7x7_3 = ConvBNLayer( + num_channels=channels_7x7, + num_filters=192, + filter_size=(7, 1), + stride=1, + padding=(3, 0), + act="relu") + + self.branch7x7dbl_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=channels_7x7, + filter_size=1, + act="relu") + self.branch7x7dbl_2 = ConvBNLayer( + num_channels=channels_7x7, + num_filters=channels_7x7, + filter_size=(7, 1), + padding=(3, 0), + act="relu") + self.branch7x7dbl_3 = ConvBNLayer( + num_channels=channels_7x7, + num_filters=channels_7x7, + filter_size=(1, 7), + padding=(0, 3), + act="relu") + self.branch7x7dbl_4 = ConvBNLayer( + num_channels=channels_7x7, + num_filters=channels_7x7, + filter_size=(7, 1), + padding=(3, 0), + act="relu") + self.branch7x7dbl_5 = ConvBNLayer( + num_channels=channels_7x7, + num_filters=192, + filter_size=(1, 7), + padding=(0, 3), + act="relu") + + self.branch_pool = AvgPool2D( + kernel_size=3, stride=1, padding=1, exclusive=False) + self.branch_pool_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=192, + filter_size=1, + act="relu") + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch7x7 = self.branch7x7_1(x) + branch7x7 = self.branch7x7_2(branch7x7) + branch7x7 = self.branch7x7_3(branch7x7) + + branch7x7dbl = self.branch7x7dbl_1(x) + branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) + + branch_pool = self.branch_pool(x) + branch_pool = 
self.branch_pool_conv(branch_pool) + + x = paddle.concat( + [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1) + + return x + + +class InceptionD(TheseusLayer): + def __init__(self, num_channels): + super().__init__() + self.branch3x3_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=192, + filter_size=1, + act="relu") + self.branch3x3_2 = ConvBNLayer( + num_channels=192, + num_filters=320, + filter_size=3, + stride=2, + act="relu") + self.branch7x7x3_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=192, + filter_size=1, + act="relu") + self.branch7x7x3_2 = ConvBNLayer( + num_channels=192, + num_filters=192, + filter_size=(1, 7), + padding=(0, 3), + act="relu") + self.branch7x7x3_3 = ConvBNLayer( + num_channels=192, + num_filters=192, + filter_size=(7, 1), + padding=(3, 0), + act="relu") + self.branch7x7x3_4 = ConvBNLayer( + num_channels=192, + num_filters=192, + filter_size=3, + stride=2, + act="relu") + self.branch_pool = MaxPool2D(kernel_size=3, stride=2) + + def forward(self, x): + branch3x3 = self.branch3x3_1(x) + branch3x3 = self.branch3x3_2(branch3x3) + + branch7x7x3 = self.branch7x7x3_1(x) + branch7x7x3 = self.branch7x7x3_2(branch7x7x3) + branch7x7x3 = self.branch7x7x3_3(branch7x7x3) + branch7x7x3 = self.branch7x7x3_4(branch7x7x3) + + branch_pool = self.branch_pool(x) + + x = paddle.concat([branch3x3, branch7x7x3, branch_pool], axis=1) + return x + + +class InceptionE(TheseusLayer): + def __init__(self, num_channels): + super().__init__() + self.branch1x1 = ConvBNLayer( + num_channels=num_channels, + num_filters=320, + filter_size=1, + act="relu") + self.branch3x3_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=384, + filter_size=1, + act="relu") + self.branch3x3_2a = ConvBNLayer( + num_channels=384, + num_filters=384, + filter_size=(1, 3), + padding=(0, 1), + act="relu") + self.branch3x3_2b = ConvBNLayer( + num_channels=384, + num_filters=384, + filter_size=(3, 1), + padding=(1, 0), + act="relu") + + self.branch3x3dbl_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=448, + filter_size=1, + act="relu") + self.branch3x3dbl_2 = ConvBNLayer( + num_channels=448, + num_filters=384, + filter_size=3, + padding=1, + act="relu") + self.branch3x3dbl_3a = ConvBNLayer( + num_channels=384, + num_filters=384, + filter_size=(1, 3), + padding=(0, 1), + act="relu") + self.branch3x3dbl_3b = ConvBNLayer( + num_channels=384, + num_filters=384, + filter_size=(3, 1), + padding=(1, 0), + act="relu") + self.branch_pool = AvgPool2D( + kernel_size=3, stride=1, padding=1, exclusive=False) + self.branch_pool_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=192, + filter_size=1, + act="relu") + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch3x3 = self.branch3x3_1(x) + branch3x3 = [ + self.branch3x3_2a(branch3x3), + self.branch3x3_2b(branch3x3), + ] + branch3x3 = paddle.concat(branch3x3, axis=1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = [ + self.branch3x3dbl_3a(branch3x3dbl), + self.branch3x3dbl_3b(branch3x3dbl), + ] + branch3x3dbl = paddle.concat(branch3x3dbl, axis=1) + + branch_pool = self.branch_pool(x) + branch_pool = self.branch_pool_conv(branch_pool) + + x = paddle.concat( + [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) + return x + + +class Inception_V3(TheseusLayer): + """ + Inception_V3 + Args: + config: dict. config of Inception_V3. + class_num: int=1000. The number of classes. + pretrained: (True or False) or path of pretrained_model. 
Whether to load the pretrained model. + Returns: + model: nn.Layer. Specific Inception_V3 model depends on args. + """ + + def __init__(self, + config, + stages_pattern, + class_num=1000, + return_patterns=None, + return_stages=None): + super().__init__() + + self.inception_a_list = config["inception_a"] + self.inception_c_list = config["inception_c"] + self.inception_b_list = config["inception_b"] + self.inception_d_list = config["inception_d"] + self.inception_e_list = config["inception_e"] + + self.inception_stem = InceptionStem() + + self.inception_block_list = nn.LayerList() + for i in range(len(self.inception_a_list[0])): + inception_a = InceptionA(self.inception_a_list[0][i], + self.inception_a_list[1][i]) + self.inception_block_list.append(inception_a) + + for i in range(len(self.inception_b_list)): + inception_b = InceptionB(self.inception_b_list[i]) + self.inception_block_list.append(inception_b) + + for i in range(len(self.inception_c_list[0])): + inception_c = InceptionC(self.inception_c_list[0][i], + self.inception_c_list[1][i]) + self.inception_block_list.append(inception_c) + + for i in range(len(self.inception_d_list)): + inception_d = InceptionD(self.inception_d_list[i]) + self.inception_block_list.append(inception_d) + + for i in range(len(self.inception_e_list)): + inception_e = InceptionE(self.inception_e_list[i]) + self.inception_block_list.append(inception_e) + + self.avg_pool = AdaptiveAvgPool2D(1) + self.dropout = Dropout(p=0.2, mode="downscale_in_infer") + stdv = 1.0 / math.sqrt(2048 * 1.0) + self.fc = Linear( + 2048, + class_num, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr()) + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + x = self.inception_stem(x) + for inception_block in self.inception_block_list: + x = inception_block(x) + x = self.avg_pool(x) + x = paddle.reshape(x, shape=[-1, 2048]) + x = self.dropout(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def InceptionV3(pretrained=False, use_ssld=False, **kwargs): + """ + InceptionV3 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `InceptionV3` model depends on args. + """ + model = Inception_V3( + NET_CONFIG, + stages_pattern=MODEL_STAGES_PATTERN["InceptionV3"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["InceptionV3"], use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/legendary_models/mobilenet_v1.py b/src/PaddleClas/ppcls/arch/backbone/legendary_models/mobilenet_v1.py new file mode 100644 index 0000000..9767d69 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/legendary_models/mobilenet_v1.py @@ -0,0 +1,257 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function + +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear, ReLU, Flatten +from paddle.nn import AdaptiveAvgPool2D +from paddle.nn.initializer import KaimingNormal + +from ppcls.arch.backbone.base.theseus_layer import TheseusLayer +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "MobileNetV1_x0_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_x0_25_pretrained.pdparams", + "MobileNetV1_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_x0_5_pretrained.pdparams", + "MobileNetV1_x0_75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_x0_75_pretrained.pdparams", + "MobileNetV1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_pretrained.pdparams" +} + +MODEL_STAGES_PATTERN = { + "MobileNetV1": ["blocks[0]", "blocks[2]", "blocks[4]", "blocks[10]"] +} + +__all__ = MODEL_URLS.keys() + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + num_groups=1): + super().__init__() + + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + self.bn = BatchNorm(num_filters) + self.relu = ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class DepthwiseSeparable(TheseusLayer): + def __init__(self, num_channels, num_filters1, num_filters2, num_groups, + stride, scale): + super().__init__() + + self.depthwise_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=int(num_filters1 * scale), + filter_size=3, + stride=stride, + padding=1, + num_groups=int(num_groups * scale)) + + self.pointwise_conv = ConvBNLayer( + num_channels=int(num_filters1 * scale), + filter_size=1, + num_filters=int(num_filters2 * scale), + stride=1, + padding=0) + + def forward(self, x): + x = self.depthwise_conv(x) + x = self.pointwise_conv(x) + return x + + +class MobileNet(TheseusLayer): + """ + MobileNet + Args: + scale: float=1.0. The coefficient that controls the size of network parameters. + class_num: int=1000. The number of classes. + Returns: + model: nn.Layer. Specific MobileNet model depends on args. 
+ """ + + def __init__(self, + stages_pattern, + scale=1.0, + class_num=1000, + return_patterns=None, + return_stages=None): + super().__init__() + self.scale = scale + + self.conv = ConvBNLayer( + num_channels=3, + filter_size=3, + num_filters=int(32 * scale), + stride=2, + padding=1) + + #num_channels, num_filters1, num_filters2, num_groups, stride + self.cfg = [[int(32 * scale), 32, 64, 32, 1], + [int(64 * scale), 64, 128, 64, 2], + [int(128 * scale), 128, 128, 128, 1], + [int(128 * scale), 128, 256, 128, 2], + [int(256 * scale), 256, 256, 256, 1], + [int(256 * scale), 256, 512, 256, 2], + [int(512 * scale), 512, 512, 512, 1], + [int(512 * scale), 512, 512, 512, 1], + [int(512 * scale), 512, 512, 512, 1], + [int(512 * scale), 512, 512, 512, 1], + [int(512 * scale), 512, 512, 512, 1], + [int(512 * scale), 512, 1024, 512, 2], + [int(1024 * scale), 1024, 1024, 1024, 1]] + + self.blocks = nn.Sequential(* [ + DepthwiseSeparable( + num_channels=params[0], + num_filters1=params[1], + num_filters2=params[2], + num_groups=params[3], + stride=params[4], + scale=scale) for params in self.cfg + ]) + + self.avg_pool = AdaptiveAvgPool2D(1) + self.flatten = Flatten(start_axis=1, stop_axis=-1) + + self.fc = Linear( + int(1024 * scale), + class_num, + weight_attr=ParamAttr(initializer=KaimingNormal())) + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + x = self.conv(x) + x = self.blocks(x) + x = self.avg_pool(x) + x = self.flatten(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def MobileNetV1_x0_25(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV1_x0_25 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV1_x0_25` model depends on args. + """ + model = MobileNet( + scale=0.25, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV1"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1_x0_25"], + use_ssld) + return model + + +def MobileNetV1_x0_5(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV1_x0_5 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV1_x0_5` model depends on args. + """ + model = MobileNet( + scale=0.5, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV1"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1_x0_5"], + use_ssld) + return model + + +def MobileNetV1_x0_75(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV1_x0_75 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. 
Specific `MobileNetV1_x0_75` model depends on args. + """ + model = MobileNet( + scale=0.75, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV1"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1_x0_75"], + use_ssld) + return model + + +def MobileNetV1(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV1 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV1` model depends on args. + """ + model = MobileNet( + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV1"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1"], use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/legendary_models/mobilenet_v3.py b/src/PaddleClas/ppcls/arch/backbone/legendary_models/mobilenet_v3.py new file mode 100644 index 0000000..836c54c --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/legendary_models/mobilenet_v3.py @@ -0,0 +1,586 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +from paddle import ParamAttr +from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear +from paddle.regularizer import L2Decay +from ppcls.arch.backbone.base.theseus_layer import TheseusLayer +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "MobileNetV3_small_x0_35": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x0_35_pretrained.pdparams", + "MobileNetV3_small_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x0_5_pretrained.pdparams", + "MobileNetV3_small_x0_75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x0_75_pretrained.pdparams", + "MobileNetV3_small_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x1_0_pretrained.pdparams", + "MobileNetV3_small_x1_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x1_25_pretrained.pdparams", + "MobileNetV3_large_x0_35": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x0_35_pretrained.pdparams", + "MobileNetV3_large_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x0_5_pretrained.pdparams", + "MobileNetV3_large_x0_75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x0_75_pretrained.pdparams", + "MobileNetV3_large_x1_0": + 
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x1_0_pretrained.pdparams", + "MobileNetV3_large_x1_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x1_25_pretrained.pdparams", +} + +MODEL_STAGES_PATTERN = { + "MobileNetV3_small": + ["blocks[0]", "blocks[2]", "blocks[7]", "blocks[10]"], + "MobileNetV3_large": + ["blocks[0]", "blocks[2]", "blocks[5]", "blocks[11]", "blocks[14]"] +} + +__all__ = MODEL_URLS.keys() + +# "large", "small" is just for MobinetV3_large, MobileNetV3_small respectively. +# The type of "large" or "small" config is a list. Each element(list) represents a depthwise block, which is composed of k, exp, se, act, s. +# k: kernel_size +# exp: middle channel number in depthwise block +# c: output channel number in depthwise block +# se: whether to use SE block +# act: which activation to use +# s: stride in depthwise block +NET_CONFIG = { + "large": [ + # k, exp, c, se, act, s + [3, 16, 16, False, "relu", 1], + [3, 64, 24, False, "relu", 2], + [3, 72, 24, False, "relu", 1], + [5, 72, 40, True, "relu", 2], + [5, 120, 40, True, "relu", 1], + [5, 120, 40, True, "relu", 1], + [3, 240, 80, False, "hardswish", 2], + [3, 200, 80, False, "hardswish", 1], + [3, 184, 80, False, "hardswish", 1], + [3, 184, 80, False, "hardswish", 1], + [3, 480, 112, True, "hardswish", 1], + [3, 672, 112, True, "hardswish", 1], + [5, 672, 160, True, "hardswish", 2], + [5, 960, 160, True, "hardswish", 1], + [5, 960, 160, True, "hardswish", 1], + ], + "small": [ + # k, exp, c, se, act, s + [3, 16, 16, True, "relu", 2], + [3, 72, 24, False, "relu", 2], + [3, 88, 24, False, "relu", 1], + [5, 96, 40, True, "hardswish", 2], + [5, 240, 40, True, "hardswish", 1], + [5, 240, 40, True, "hardswish", 1], + [5, 120, 48, True, "hardswish", 1], + [5, 144, 48, True, "hardswish", 1], + [5, 288, 96, True, "hardswish", 2], + [5, 576, 96, True, "hardswish", 1], + [5, 576, 96, True, "hardswish", 1], + ] +} +# first conv output channel number in MobileNetV3 +STEM_CONV_NUMBER = 16 +# last second conv output channel for "small" +LAST_SECOND_CONV_SMALL = 576 +# last second conv output channel for "large" +LAST_SECOND_CONV_LARGE = 960 +# last conv output channel number for "large" and "small" +LAST_CONV = 1280 + + +def _make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def _create_act(act): + if act == "hardswish": + return nn.Hardswish() + elif act == "relu": + return nn.ReLU() + elif act is None: + return None + else: + raise RuntimeError( + "The activation function is not supported: {}".format(act)) + + +class MobileNetV3(TheseusLayer): + """ + MobileNetV3 + Args: + config: list. MobileNetV3 depthwise blocks config. + scale: float=1.0. The coefficient that controls the size of network parameters. + class_num: int=1000. The number of classes. + inplanes: int=16. The output channel number of first convolution layer. + class_squeeze: int=960. The output channel number of penultimate convolution layer. + class_expand: int=1280. The output channel number of last convolution layer. + dropout_prob: float=0.2. Probability of setting units to zero. + Returns: + model: nn.Layer. Specific MobileNetV3 model depends on args. 
+ """ + + def __init__(self, + config, + stages_pattern, + scale=1.0, + class_num=1000, + inplanes=STEM_CONV_NUMBER, + class_squeeze=LAST_SECOND_CONV_LARGE, + class_expand=LAST_CONV, + dropout_prob=0.2, + return_patterns=None, + return_stages=None): + super().__init__() + + self.cfg = config + self.scale = scale + self.inplanes = inplanes + self.class_squeeze = class_squeeze + self.class_expand = class_expand + self.class_num = class_num + + self.conv = ConvBNLayer( + in_c=3, + out_c=_make_divisible(self.inplanes * self.scale), + filter_size=3, + stride=2, + padding=1, + num_groups=1, + if_act=True, + act="hardswish") + + self.blocks = nn.Sequential(* [ + ResidualUnit( + in_c=_make_divisible(self.inplanes * self.scale if i == 0 else + self.cfg[i - 1][2] * self.scale), + mid_c=_make_divisible(self.scale * exp), + out_c=_make_divisible(self.scale * c), + filter_size=k, + stride=s, + use_se=se, + act=act) for i, (k, exp, c, se, act, s) in enumerate(self.cfg) + ]) + + self.last_second_conv = ConvBNLayer( + in_c=_make_divisible(self.cfg[-1][2] * self.scale), + out_c=_make_divisible(self.scale * self.class_squeeze), + filter_size=1, + stride=1, + padding=0, + num_groups=1, + if_act=True, + act="hardswish") + + self.avg_pool = AdaptiveAvgPool2D(1) + + self.last_conv = Conv2D( + in_channels=_make_divisible(self.scale * self.class_squeeze), + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + + self.hardswish = nn.Hardswish() + if dropout_prob is not None: + self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer") + else: + self.dropout = None + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + + self.fc = Linear(self.class_expand, class_num) + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + x = self.conv(x) + x = self.blocks(x) + x = self.last_second_conv(x) + x = self.avg_pool(x) + x = self.last_conv(x) + x = self.hardswish(x) + if self.dropout is not None: + x = self.dropout(x) + x = self.flatten(x) + x = self.fc(x) + + return x + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + in_c, + out_c, + filter_size, + stride, + padding, + num_groups=1, + if_act=True, + act=None): + super().__init__() + + self.conv = Conv2D( + in_channels=in_c, + out_channels=out_c, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + bias_attr=False) + self.bn = BatchNorm( + num_channels=out_c, + act=None, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.if_act = if_act + self.act = _create_act(act) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.if_act: + x = self.act(x) + return x + + +class ResidualUnit(TheseusLayer): + def __init__(self, + in_c, + mid_c, + out_c, + filter_size, + stride, + use_se, + act=None): + super().__init__() + self.if_shortcut = stride == 1 and in_c == out_c + self.if_se = use_se + + self.expand_conv = ConvBNLayer( + in_c=in_c, + out_c=mid_c, + filter_size=1, + stride=1, + padding=0, + if_act=True, + act=act) + self.bottleneck_conv = ConvBNLayer( + in_c=mid_c, + out_c=mid_c, + filter_size=filter_size, + stride=stride, + padding=int((filter_size - 1) // 2), + num_groups=mid_c, + if_act=True, + act=act) + if self.if_se: + self.mid_se = SEModule(mid_c) + self.linear_conv = ConvBNLayer( + in_c=mid_c, + out_c=out_c, + filter_size=1, + stride=1, + padding=0, + if_act=False, + act=None) + + def forward(self, x): + identity = x + x = 
self.expand_conv(x) + x = self.bottleneck_conv(x) + if self.if_se: + x = self.mid_se(x) + x = self.linear_conv(x) + if self.if_shortcut: + x = paddle.add(identity, x) + return x + + +# nn.Hardsigmoid does not expose the "slope" and "offset" arguments accepted by nn.functional.hardsigmoid, hence this wrapper. +class Hardsigmoid(TheseusLayer): + def __init__(self, slope=0.2, offset=0.5): + super().__init__() + self.slope = slope + self.offset = offset + + def forward(self, x): + return nn.functional.hardsigmoid( + x, slope=self.slope, offset=self.offset) + + +class SEModule(TheseusLayer): + def __init__(self, channel, reduction=4): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.relu = nn.ReLU() + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + self.hardsigmoid = Hardsigmoid(slope=0.2, offset=0.5) + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + return paddle.multiply(x=identity, y=x) + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def MobileNetV3_small_x0_35(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_small_x0_35 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_small_x0_35` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["small"], + scale=0.35, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + class_squeeze=LAST_SECOND_CONV_SMALL, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x0_35"], + use_ssld) + return model + + +def MobileNetV3_small_x0_5(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_small_x0_5 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_small_x0_5` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["small"], + scale=0.5, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + class_squeeze=LAST_SECOND_CONV_SMALL, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x0_5"], + use_ssld) + return model + + +def MobileNetV3_small_x0_75(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_small_x0_75 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_small_x0_75` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["small"], + scale=0.75, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + class_squeeze=LAST_SECOND_CONV_SMALL, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x0_75"], + use_ssld) + return model + + +def MobileNetV3_small_x1_0(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_small_x1_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_small_x1_0` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["small"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + class_squeeze=LAST_SECOND_CONV_SMALL, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x1_0"], + use_ssld) + return model + + +def MobileNetV3_small_x1_25(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_small_x1_25 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_small_x1_25` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["small"], + scale=1.25, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + class_squeeze=LAST_SECOND_CONV_SMALL, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x1_25"], + use_ssld) + return model + + +def MobileNetV3_large_x0_35(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_large_x0_35 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x0_35` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["large"], + scale=0.35, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"], + class_squeeze=LAST_SECOND_CONV_LARGE, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x0_35"], + use_ssld) + return model + + +def MobileNetV3_large_x0_5(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_large_x0_5 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x0_5` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["large"], + scale=0.5, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"], + class_squeeze=LAST_SECOND_CONV_LARGE, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x0_5"], + use_ssld) + return model + + +def MobileNetV3_large_x0_75(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_large_x0_75 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+ Returns: + model: nn.Layer. Specific `MobileNetV3_large_x0_75` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["large"], + scale=0.75, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"], + class_squeeze=LAST_SECOND_CONV_LARGE, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x0_75"], + use_ssld) + return model + + +def MobileNetV3_large_x1_0(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_large_x1_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x1_0` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["large"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"], + class_squeeze=LAST_SECOND_CONV_LARGE, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x1_0"], + use_ssld) + return model + + +def MobileNetV3_large_x1_25(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_large_x1_25 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x1_25` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["large"], + scale=1.25, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"], + class_squeeze=LAST_SECOND_CONV_LARGE, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x1_25"], + use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/legendary_models/pp_lcnet.py b/src/PaddleClas/ppcls/arch/backbone/legendary_models/pp_lcnet.py new file mode 100644 index 0000000..4017462 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/legendary_models/pp_lcnet.py @@ -0,0 +1,419 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
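
Both mobilenet_v3.py above and pp_lcnet.py below derive per-layer widths by snapping `channels * scale` to a multiple of 8, with a guard that never lets rounding remove more than 10% of the target width (`_make_divisible` / `make_divisible`). A self-contained sketch of that rounding rule, copied from the definition above, showing what the MobileNetV3 stem width (STEM_CONV_NUMBER = 16) becomes at each published scale:

    def _make_divisible(v, divisor=8, min_value=None):
        if min_value is None:
            min_value = divisor
        # round to the nearest multiple of `divisor`, but never below `min_value`
        new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
        # if rounding dropped the width by more than 10%, step back up one multiple
        if new_v < 0.9 * v:
            new_v += divisor
        return new_v

    for scale in (0.35, 0.5, 0.75, 1.0, 1.25):
        print(scale, _make_divisible(16 * scale))
    # stem widths: 8, 8, 16, 16, 24
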
+ +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +from paddle import ParamAttr +from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear +from paddle.regularizer import L2Decay +from paddle.nn.initializer import KaimingNormal +from ppcls.arch.backbone.base.theseus_layer import TheseusLayer +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "PPLCNet_x0_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_25_pretrained.pdparams", + "PPLCNet_x0_35": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_35_pretrained.pdparams", + "PPLCNet_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_5_pretrained.pdparams", + "PPLCNet_x0_75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_75_pretrained.pdparams", + "PPLCNet_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x1_0_pretrained.pdparams", + "PPLCNet_x1_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x1_5_pretrained.pdparams", + "PPLCNet_x2_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x2_0_pretrained.pdparams", + "PPLCNet_x2_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x2_5_pretrained.pdparams" +} + +MODEL_STAGES_PATTERN = { + "PPLCNet": ["blocks2", "blocks3", "blocks4", "blocks5", "blocks6"] +} + +__all__ = list(MODEL_URLS.keys()) + +# Each element(list) represents a depthwise block, which is composed of k, in_c, out_c, s, use_se. +# k: kernel_size +# in_c: input channel number in depthwise block +# out_c: output channel number in depthwise block +# s: stride in depthwise block +# use_se: whether to use SE block + +NET_CONFIG = { + "blocks2": + #k, in_c, out_c, s, use_se + [[3, 16, 32, 1, False]], + "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]], + "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]], + "blocks5": [[3, 128, 256, 2, False], [5, 256, 256, 1, False], + [5, 256, 256, 1, False], [5, 256, 256, 1, False], + [5, 256, 256, 1, False], [5, 256, 256, 1, False]], + "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]] +} + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + num_groups=1): + super().__init__() + + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=num_groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + self.bn = BatchNorm( + num_filters, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.hardswish = nn.Hardswish() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.hardswish(x) + return x + + +class DepthwiseSeparable(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + dw_size=3, + use_se=False): + super().__init__() + self.use_se = use_se + self.dw_conv = ConvBNLayer( + num_channels=num_channels, + 
num_filters=num_channels, + filter_size=dw_size, + stride=stride, + num_groups=num_channels) + if use_se: + self.se = SEModule(num_channels) + self.pw_conv = ConvBNLayer( + num_channels=num_channels, + filter_size=1, + num_filters=num_filters, + stride=1) + + def forward(self, x): + x = self.dw_conv(x) + if self.use_se: + x = self.se(x) + x = self.pw_conv(x) + return x + + +class SEModule(TheseusLayer): + def __init__(self, channel, reduction=4): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.relu = nn.ReLU() + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + self.hardsigmoid = nn.Hardsigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = paddle.multiply(x=identity, y=x) + return x + + +class PPLCNet(TheseusLayer): + def __init__(self, + stages_pattern, + scale=1.0, + class_num=1000, + dropout_prob=0.2, + class_expand=1280, + return_patterns=None, + return_stages=None): + super().__init__() + self.scale = scale + self.class_expand = class_expand + + self.conv1 = ConvBNLayer( + num_channels=3, + filter_size=3, + num_filters=make_divisible(16 * scale), + stride=2) + + self.blocks2 = nn.Sequential(* [ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks2"]) + ]) + + self.blocks3 = nn.Sequential(* [ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks3"]) + ]) + + self.blocks4 = nn.Sequential(* [ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks4"]) + ]) + + self.blocks5 = nn.Sequential(* [ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks5"]) + ]) + + self.blocks6 = nn.Sequential(* [ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks6"]) + ]) + + self.avg_pool = AdaptiveAvgPool2D(1) + + self.last_conv = Conv2D( + in_channels=make_divisible(NET_CONFIG["blocks6"][-1][2] * scale), + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + + self.hardswish = nn.Hardswish() + self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer") + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + + self.fc = Linear(self.class_expand, class_num) + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + x = self.conv1(x) + + x = self.blocks2(x) + x = self.blocks3(x) + x = self.blocks4(x) + x = self.blocks5(x) + x = self.blocks6(x) + + x = self.avg_pool(x) + x = self.last_conv(x) + x = self.hardswish(x) + x = self.dropout(x) + x = self.flatten(x) + x = 
self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def PPLCNet_x0_25(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x0_25 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x0_25` model depends on args. + """ + model = PPLCNet( + scale=0.25, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x0_25"], use_ssld) + return model + + +def PPLCNet_x0_35(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x0_35 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x0_35` model depends on args. + """ + model = PPLCNet( + scale=0.35, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x0_35"], use_ssld) + return model + + +def PPLCNet_x0_5(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x0_5 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x0_5` model depends on args. + """ + model = PPLCNet( + scale=0.5, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x0_5"], use_ssld) + return model + + +def PPLCNet_x0_75(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x0_75 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x0_75` model depends on args. + """ + model = PPLCNet( + scale=0.75, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x0_75"], use_ssld) + return model + + +def PPLCNet_x1_0(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x1_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x1_0` model depends on args. + """ + model = PPLCNet( + scale=1.0, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x1_0"], use_ssld) + return model + + +def PPLCNet_x1_5(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x1_5 + Args: + pretrained: bool=False or str. 
If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x1_5` model depends on args. + """ + model = PPLCNet( + scale=1.5, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x1_5"], use_ssld) + return model + + +def PPLCNet_x2_0(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x2_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x2_0` model depends on args. + """ + model = PPLCNet( + scale=2.0, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x2_0"], use_ssld) + return model + + +def PPLCNet_x2_5(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x2_5 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x2_5` model depends on args. + """ + model = PPLCNet( + scale=2.5, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x2_5"], use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/legendary_models/resnet.py b/src/PaddleClas/ppcls/arch/backbone/legendary_models/resnet.py new file mode 100644 index 0000000..74c5c5f --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/legendary_models/resnet.py @@ -0,0 +1,591 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
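
Every backbone file in this diff repeats the same `_load_pretrained` dispatch, so the `pretrained` argument is effectively tri-state: `False` keeps random initialization, `True` downloads the checkpoint registered in MODEL_URLS (with `use_ssld` selecting the SSLD-distilled variant where available), and a string is treated as a local checkpoint path; any other type raises RuntimeError. A usage sketch of the three forms, assuming the PaddleClas source root is on PYTHONPATH; the local path below is illustrative only:

    from ppcls.arch.backbone.legendary_models.pp_lcnet import PPLCNet_x1_0

    m1 = PPLCNet_x1_0()                                # pretrained=False: random init
    m2 = PPLCNet_x1_0(pretrained=True)                 # fetch MODEL_URLS["PPLCNet_x1_0"]
    m3 = PPLCNet_x1_0(pretrained=True, use_ssld=True)  # prefer the SSLD-distilled weights
    m4 = PPLCNet_x1_0(pretrained="./PPLCNet_x1_0_pretrained.pdparams")  # local file (hypothetical path)
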
+ +from __future__ import absolute_import, division, print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math + +from ppcls.arch.backbone.base.theseus_layer import TheseusLayer +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "ResNet18": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_pretrained.pdparams", + "ResNet18_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_vd_pretrained.pdparams", + "ResNet34": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_pretrained.pdparams", + "ResNet34_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_vd_pretrained.pdparams", + "ResNet50": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_pretrained.pdparams", + "ResNet50_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_vd_pretrained.pdparams", + "ResNet101": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_pretrained.pdparams", + "ResNet101_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_vd_pretrained.pdparams", + "ResNet152": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_pretrained.pdparams", + "ResNet152_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_vd_pretrained.pdparams", + "ResNet200_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet200_vd_pretrained.pdparams", +} + +MODEL_STAGES_PATTERN = { + "ResNet18": ["blocks[1]", "blocks[3]", "blocks[5]", "blocks[7]"], + "ResNet34": ["blocks[2]", "blocks[6]", "blocks[12]", "blocks[15]"], + "ResNet50": ["blocks[2]", "blocks[6]", "blocks[12]", "blocks[15]"], + "ResNet101": ["blocks[2]", "blocks[6]", "blocks[29]", "blocks[32]"], + "ResNet152": ["blocks[2]", "blocks[10]", "blocks[46]", "blocks[49]"], + "ResNet200": ["blocks[2]", "blocks[14]", "blocks[62]", "blocks[65]"] +} + +__all__ = MODEL_URLS.keys() +''' +ResNet config: dict. + key: depth of ResNet. + values: config's dict of specific model. + keys: + block_type: Two different blocks in ResNet, BasicBlock and BottleneckBlock are optional. + block_depth: The number of blocks in different stages in ResNet. + num_channels: The number of channels to enter the next stage. 
+''' +NET_CONFIG = { + "18": { + "block_type": "BasicBlock", + "block_depth": [2, 2, 2, 2], + "num_channels": [64, 64, 128, 256] + }, + "34": { + "block_type": "BasicBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 64, 128, 256] + }, + "50": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 256, 512, 1024] + }, + "101": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 23, 3], + "num_channels": [64, 256, 512, 1024] + }, + "152": { + "block_type": "BottleneckBlock", + "block_depth": [3, 8, 36, 3], + "num_channels": [64, 256, 512, 1024] + }, + "200": { + "block_type": "BottleneckBlock", + "block_depth": [3, 12, 48, 3], + "num_channels": [64, 256, 512, 1024] + }, +} + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + self.is_vd_mode = is_vd_mode + self.act = act + self.avg_pool = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=False, + data_format=data_format) + self.bn = BatchNorm( + num_filters, + param_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult), + data_layout=data_format) + self.relu = nn.ReLU() + + def forward(self, x): + if self.is_vd_mode: + x = self.avg_pool(x) + x = self.conv(x) + x = self.bn(x) + if self.act: + x = self.relu(x) + return x + + +class BottleneckBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None, + lr_mult=lr_mult, + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride if if_first else 1, + is_vd_mode=False if if_first else True, + lr_mult=lr_mult, + data_format=data_format) + self.relu = nn.ReLU() + self.shortcut = shortcut + + def forward(self, x): + identity = x + x = self.conv0(x) + x = self.conv1(x) + x = self.conv2(x) + + if self.shortcut: + short = identity + else: + short = self.short(identity) + x = paddle.add(x=x, y=short) + x = self.relu(x) + return x + + +class BasicBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + + self.stride = stride + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + act=None, + lr_mult=lr_mult, + data_format=data_format) + if not shortcut: + self.short = ConvBNLayer( + 
num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=stride if if_first else 1, + is_vd_mode=False if if_first else True, + lr_mult=lr_mult, + data_format=data_format) + self.shortcut = shortcut + self.relu = nn.ReLU() + + def forward(self, x): + identity = x + x = self.conv0(x) + x = self.conv1(x) + if self.shortcut: + short = identity + else: + short = self.short(identity) + x = paddle.add(x=x, y=short) + x = self.relu(x) + return x + + +class ResNet(TheseusLayer): + """ + ResNet + Args: + config: dict. config of ResNet. + version: str="vb". Different version of ResNet, version vd can perform better. + class_num: int=1000. The number of classes. + lr_mult_list: list. Control the learning rate of different stages. + Returns: + model: nn.Layer. Specific ResNet model depends on args. + """ + + def __init__(self, + config, + stages_pattern, + version="vb", + class_num=1000, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + data_format="NCHW", + input_image_channel=3, + return_patterns=None, + return_stages=None): + super().__init__() + + self.cfg = config + self.lr_mult_list = lr_mult_list + self.is_vd_mode = version == "vd" + self.class_num = class_num + self.num_filters = [64, 128, 256, 512] + self.block_depth = self.cfg["block_depth"] + self.block_type = self.cfg["block_type"] + self.num_channels = self.cfg["num_channels"] + self.channels_mult = 1 if self.num_channels[-1] == 256 else 4 + + assert isinstance(self.lr_mult_list, ( + list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + assert len(self.lr_mult_list + ) == 5, "lr_mult_list length should be 5 but got {}".format( + len(self.lr_mult_list)) + + self.stem_cfg = { + #num_channels, num_filters, filter_size, stride + "vb": [[input_image_channel, 64, 7, 2]], + "vd": + [[input_image_channel, 32, 3, 2], [32, 32, 3, 1], [32, 64, 3, 1]] + } + + self.stem = nn.Sequential(* [ + ConvBNLayer( + num_channels=in_c, + num_filters=out_c, + filter_size=k, + stride=s, + act="relu", + lr_mult=self.lr_mult_list[0], + data_format=data_format) + for in_c, out_c, k, s in self.stem_cfg[version] + ]) + + self.max_pool = MaxPool2D( + kernel_size=3, stride=2, padding=1, data_format=data_format) + block_list = [] + for block_idx in range(len(self.block_depth)): + shortcut = False + for i in range(self.block_depth[block_idx]): + block_list.append(globals()[self.block_type]( + num_channels=self.num_channels[block_idx] if i == 0 else + self.num_filters[block_idx] * self.channels_mult, + num_filters=self.num_filters[block_idx], + stride=2 if i == 0 and block_idx != 0 else 1, + shortcut=shortcut, + if_first=block_idx == i == 0 if version == "vd" else True, + lr_mult=self.lr_mult_list[block_idx + 1], + data_format=data_format)) + shortcut = True + self.blocks = nn.Sequential(*block_list) + + self.avg_pool = AdaptiveAvgPool2D(1, data_format=data_format) + self.flatten = nn.Flatten() + self.avg_pool_channels = self.num_channels[-1] * 2 + stdv = 1.0 / math.sqrt(self.avg_pool_channels * 1.0) + self.fc = Linear( + self.avg_pool_channels, + self.class_num, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + + self.data_format = data_format + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + with paddle.static.amp.fp16_guard(): + if self.data_format == "NHWC": + x = paddle.transpose(x, [0, 2, 3, 1]) + x.stop_gradient = True + x = self.stem(x) + x = self.max_pool(x) + x = self.blocks(x) + x = self.avg_pool(x) 
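+            # pooled features are [N, C, 1, 1]; flatten to [N, C] for the fc head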
+ x = self.flatten(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNet18(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet18 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["18"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet18"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18"], use_ssld) + return model + + +def ResNet18_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet18_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["18"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet18"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18_vd"], use_ssld) + return model + + +def ResNet34(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["34"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet34"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34"], use_ssld) + return model + + +def ResNet34_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["34"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet34"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34_vd"], use_ssld) + return model + + +def ResNet50(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet50` model depends on args. 
+ """ + model = ResNet( + config=NET_CONFIG["50"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet50"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50"], use_ssld) + return model + + +def ResNet50_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet50_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["50"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet50"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50_vd"], use_ssld) + return model + + +def ResNet101(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["101"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet101"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101"], use_ssld) + return model + + +def ResNet101_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["101"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet101"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101_vd"], use_ssld) + return model + + +def ResNet152(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["152"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet152"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152"], use_ssld) + return model + + +def ResNet152_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["152"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet152"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152_vd"], use_ssld) + return model + + +def ResNet200_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet200_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. 
+ use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet200_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["200"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet200"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet200_vd"], use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/legendary_models/vgg.py b/src/PaddleClas/ppcls/arch/backbone/legendary_models/vgg.py new file mode 100644 index 0000000..74d5cfa --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/legendary_models/vgg.py @@ -0,0 +1,259 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function + +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import MaxPool2D + +from ppcls.arch.backbone.base.theseus_layer import TheseusLayer +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "VGG11": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG11_pretrained.pdparams", + "VGG13": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG13_pretrained.pdparams", + "VGG16": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG16_pretrained.pdparams", + "VGG19": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG19_pretrained.pdparams", +} + +MODEL_STAGES_PATTERN = { + "VGG": [ + "conv_block_1", "conv_block_2", "conv_block_3", "conv_block_4", + "conv_block_5" + ] +} + +__all__ = MODEL_URLS.keys() + +# VGG config +# key: VGG network depth +# value: conv num in different blocks +NET_CONFIG = { + 11: [1, 1, 2, 2, 2], + 13: [2, 2, 2, 2, 2], + 16: [2, 2, 3, 3, 3], + 19: [2, 2, 4, 4, 4] +} + + +class ConvBlock(TheseusLayer): + def __init__(self, input_channels, output_channels, groups): + super().__init__() + + self.groups = groups + self.conv1 = Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + if groups == 2 or groups == 3 or groups == 4: + self.conv2 = Conv2D( + in_channels=output_channels, + out_channels=output_channels, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + if groups == 3 or groups == 4: + self.conv3 = Conv2D( + in_channels=output_channels, + out_channels=output_channels, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + if groups == 4: + self.conv4 = Conv2D( + in_channels=output_channels, + out_channels=output_channels, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + + self.max_pool = MaxPool2D(kernel_size=2, stride=2, padding=0) + self.relu = nn.ReLU() + + def forward(self, inputs): + x = self.conv1(inputs) + x = self.relu(x) + if self.groups == 2 or self.groups == 3 or self.groups == 4: + x = self.conv2(x) + x = 
self.relu(x) + if self.groups == 3 or self.groups == 4: + x = self.conv3(x) + x = self.relu(x) + if self.groups == 4: + x = self.conv4(x) + x = self.relu(x) + x = self.max_pool(x) + return x + + +class VGGNet(TheseusLayer): + """ + VGGNet + Args: + config: list. VGGNet config. + stop_grad_layers: int=0. The parameters in blocks which index larger than `stop_grad_layers`, will be set `param.trainable=False` + class_num: int=1000. The number of classes. + Returns: + model: nn.Layer. Specific VGG model depends on args. + """ + + def __init__(self, + config, + stages_pattern, + stop_grad_layers=0, + class_num=1000, + return_patterns=None, + return_stages=None): + super().__init__() + + self.stop_grad_layers = stop_grad_layers + + self.conv_block_1 = ConvBlock(3, 64, config[0]) + self.conv_block_2 = ConvBlock(64, 128, config[1]) + self.conv_block_3 = ConvBlock(128, 256, config[2]) + self.conv_block_4 = ConvBlock(256, 512, config[3]) + self.conv_block_5 = ConvBlock(512, 512, config[4]) + + self.relu = nn.ReLU() + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + + for idx, block in enumerate([ + self.conv_block_1, self.conv_block_2, self.conv_block_3, + self.conv_block_4, self.conv_block_5 + ]): + if self.stop_grad_layers >= idx + 1: + for param in block.parameters(): + param.trainable = False + + self.drop = Dropout(p=0.5, mode="downscale_in_infer") + self.fc1 = Linear(7 * 7 * 512, 4096) + self.fc2 = Linear(4096, 4096) + self.fc3 = Linear(4096, class_num) + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, inputs): + x = self.conv_block_1(inputs) + x = self.conv_block_2(x) + x = self.conv_block_3(x) + x = self.conv_block_4(x) + x = self.conv_block_5(x) + x = self.flatten(x) + x = self.fc1(x) + x = self.relu(x) + x = self.drop(x) + x = self.fc2(x) + x = self.relu(x) + x = self.drop(x) + x = self.fc3(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def VGG11(pretrained=False, use_ssld=False, **kwargs): + """ + VGG11 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `VGG11` model depends on args. + """ + model = VGGNet( + config=NET_CONFIG[11], + stages_pattern=MODEL_STAGES_PATTERN["VGG"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["VGG11"], use_ssld) + return model + + +def VGG13(pretrained=False, use_ssld=False, **kwargs): + """ + VGG13 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `VGG13` model depends on args. 
+ """ + model = VGGNet( + config=NET_CONFIG[13], + stages_pattern=MODEL_STAGES_PATTERN["VGG"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["VGG13"], use_ssld) + return model + + +def VGG16(pretrained=False, use_ssld=False, **kwargs): + """ + VGG16 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `VGG16` model depends on args. + """ + model = VGGNet( + config=NET_CONFIG[16], + stages_pattern=MODEL_STAGES_PATTERN["VGG"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["VGG16"], use_ssld) + return model + + +def VGG19(pretrained=False, use_ssld=False, **kwargs): + """ + VGG19 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `VGG19` model depends on args. + """ + model = VGGNet( + config=NET_CONFIG[19], + stages_pattern=MODEL_STAGES_PATTERN["VGG"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["VGG19"], use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__init__.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..63bd229 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/alexnet.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/alexnet.cpython-39.pyc new file mode 100644 index 0000000..5b84c99 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/alexnet.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/cspnet.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/cspnet.cpython-39.pyc new file mode 100644 index 0000000..1d996fd Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/cspnet.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/darknet.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/darknet.cpython-39.pyc new file mode 100644 index 0000000..8323836 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/darknet.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/densenet.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/densenet.cpython-39.pyc new file mode 100644 index 0000000..7d8af7d Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/densenet.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/distilled_vision_transformer.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/distilled_vision_transformer.cpython-39.pyc new file mode 100644 index 0000000..64f04d6 Binary files /dev/null and 
b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/distilled_vision_transformer.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/dla.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/dla.cpython-39.pyc new file mode 100644 index 0000000..bb12bae Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/dla.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/dpn.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/dpn.cpython-39.pyc new file mode 100644 index 0000000..73eb68d Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/dpn.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/efficientnet.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/efficientnet.cpython-39.pyc new file mode 100644 index 0000000..ef17886 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/efficientnet.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/ghostnet.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/ghostnet.cpython-39.pyc new file mode 100644 index 0000000..2f81872 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/ghostnet.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/googlenet.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/googlenet.cpython-39.pyc new file mode 100644 index 0000000..0326e19 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/googlenet.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/gvt.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/gvt.cpython-39.pyc new file mode 100644 index 0000000..54b1d13 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/gvt.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/hardnet.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/hardnet.cpython-39.pyc new file mode 100644 index 0000000..5919e76 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/hardnet.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/inception_v4.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/inception_v4.cpython-39.pyc new file mode 100644 index 0000000..8cb333a Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/inception_v4.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/levit.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/levit.cpython-39.pyc new file mode 100644 index 0000000..bd35a6b Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/levit.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/mixnet.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/mixnet.cpython-39.pyc new file mode 100644 index 0000000..0607c52 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/mixnet.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/mobilenet_v2.cpython-39.pyc 
b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/mobilenet_v2.cpython-39.pyc new file mode 100644 index 0000000..b6b5e94 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/mobilenet_v2.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/pvt_v2.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/pvt_v2.cpython-39.pyc new file mode 100644 index 0000000..391ff9b Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/pvt_v2.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/rednet.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/rednet.cpython-39.pyc new file mode 100644 index 0000000..019481a Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/rednet.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/regnet.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/regnet.cpython-39.pyc new file mode 100644 index 0000000..7a1a00a Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/regnet.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/repvgg.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/repvgg.cpython-39.pyc new file mode 100644 index 0000000..f518d93 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/repvgg.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/res2net.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/res2net.cpython-39.pyc new file mode 100644 index 0000000..edb79ea Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/res2net.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/res2net_vd.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/res2net_vd.cpython-39.pyc new file mode 100644 index 0000000..d7f0f91 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/res2net_vd.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/resnest.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/resnest.cpython-39.pyc new file mode 100644 index 0000000..1466a3f Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/resnest.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/resnet_vc.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/resnet_vc.cpython-39.pyc new file mode 100644 index 0000000..144f063 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/resnet_vc.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/resnext.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/resnext.cpython-39.pyc new file mode 100644 index 0000000..9d915d8 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/resnext.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/resnext101_wsl.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/resnext101_wsl.cpython-39.pyc new file mode 100644 index 0000000..85ad8fa Binary files /dev/null and 
b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/resnext101_wsl.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/resnext_vd.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/resnext_vd.cpython-39.pyc new file mode 100644 index 0000000..6a76923 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/resnext_vd.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/rexnet.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/rexnet.cpython-39.pyc new file mode 100644 index 0000000..7f387c4 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/rexnet.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/se_resnet_vd.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/se_resnet_vd.cpython-39.pyc new file mode 100644 index 0000000..25a8437 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/se_resnet_vd.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/se_resnext.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/se_resnext.cpython-39.pyc new file mode 100644 index 0000000..ba0e14c Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/se_resnext.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/se_resnext_vd.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/se_resnext_vd.cpython-39.pyc new file mode 100644 index 0000000..fb46ef4 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/se_resnext_vd.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/shufflenet_v2.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/shufflenet_v2.cpython-39.pyc new file mode 100644 index 0000000..f27b598 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/shufflenet_v2.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/squeezenet.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/squeezenet.cpython-39.pyc new file mode 100644 index 0000000..bdd7361 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/squeezenet.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/swin_transformer.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/swin_transformer.cpython-39.pyc new file mode 100644 index 0000000..29ace62 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/swin_transformer.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/tnt.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/tnt.cpython-39.pyc new file mode 100644 index 0000000..c340279 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/tnt.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/vision_transformer.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/vision_transformer.cpython-39.pyc new file mode 100644 index 0000000..f0ac12a Binary files /dev/null and 
b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/vision_transformer.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/xception.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/xception.cpython-39.pyc new file mode 100644 index 0000000..336dfb4 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/xception.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/xception_deeplab.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/xception_deeplab.cpython-39.pyc new file mode 100644 index 0000000..f73991b Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/model_zoo/__pycache__/xception_deeplab.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/alexnet.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/alexnet.py new file mode 100644 index 0000000..b44901a --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/alexnet.py @@ -0,0 +1,168 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout, ReLU +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "AlexNet": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/AlexNet_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvPoolLayer(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter_size, + stride, + padding, + stdv, + groups=1, + act=None, + name=None): + super(ConvPoolLayer, self).__init__() + + self.relu = ReLU() if act == "relu" else None + + self._conv = Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr( + name=name + "_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr( + name=name + "_offset", initializer=Uniform(-stdv, stdv))) + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + + def forward(self, inputs): + x = self._conv(inputs) + if self.relu is not None: + x = self.relu(x) + x = self._pool(x) + return x + + +class AlexNetDY(nn.Layer): + def __init__(self, class_num=1000): + super(AlexNetDY, self).__init__() + + stdv = 1.0 / math.sqrt(3 * 11 * 11) + self._conv1 = ConvPoolLayer( + 3, 64, 11, 4, 2, stdv, act="relu", name="conv1") + stdv = 1.0 / math.sqrt(64 * 5 * 5) + self._conv2 = ConvPoolLayer( + 64, 192, 5, 1, 2, stdv, act="relu", name="conv2") + stdv = 1.0 / math.sqrt(192 * 3 * 3) + self._conv3 = Conv2D( + 192, + 384, + 3, + stride=1, + padding=1, + weight_attr=ParamAttr( + name="conv3_weights", initializer=Uniform(-stdv, 
stdv)), + bias_attr=ParamAttr( + name="conv3_offset", initializer=Uniform(-stdv, stdv))) + stdv = 1.0 / math.sqrt(384 * 3 * 3) + self._conv4 = Conv2D( + 384, + 256, + 3, + stride=1, + padding=1, + weight_attr=ParamAttr( + name="conv4_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr( + name="conv4_offset", initializer=Uniform(-stdv, stdv))) + stdv = 1.0 / math.sqrt(256 * 3 * 3) + self._conv5 = ConvPoolLayer( + 256, 256, 3, 1, 1, stdv, act="relu", name="conv5") + stdv = 1.0 / math.sqrt(256 * 6 * 6) + + self._drop1 = Dropout(p=0.5, mode="downscale_in_infer") + self._fc6 = Linear( + in_features=256 * 6 * 6, + out_features=4096, + weight_attr=ParamAttr( + name="fc6_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr( + name="fc6_offset", initializer=Uniform(-stdv, stdv))) + + self._drop2 = Dropout(p=0.5, mode="downscale_in_infer") + self._fc7 = Linear( + in_features=4096, + out_features=4096, + weight_attr=ParamAttr( + name="fc7_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr( + name="fc7_offset", initializer=Uniform(-stdv, stdv))) + self._fc8 = Linear( + in_features=4096, + out_features=class_num, + weight_attr=ParamAttr( + name="fc8_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr( + name="fc8_offset", initializer=Uniform(-stdv, stdv))) + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._conv2(x) + x = self._conv3(x) + x = F.relu(x) + x = self._conv4(x) + x = F.relu(x) + x = self._conv5(x) + x = paddle.flatten(x, start_axis=1, stop_axis=-1) + x = self._drop1(x) + x = self._fc6(x) + x = F.relu(x) + x = self._drop2(x) + x = self._fc7(x) + x = F.relu(x) + x = self._fc8(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def AlexNet(pretrained=False, use_ssld=False, **kwargs): + model = AlexNetDY(**kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["AlexNet"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/cspnet.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/cspnet.py new file mode 100644 index 0000000..ab5021f --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/cspnet.py @@ -0,0 +1,376 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
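Every `stdv` in `AlexNetDY.__init__` above is the classic fan-in bound 1/sqrt(in_channels * k * k), recomputed just before the layer it initializes; the three `Linear` layers then all reuse the last value, 1/sqrt(256 * 6 * 6), even though `_fc7` and `_fc8` have fan-in 4096. That is simply what the code does. A quick arithmetic check of those bounds (plain Python, standard library only):

    import math

    # Uniform init bounds used by AlexNetDY: bound = 1 / sqrt(fan_in).
    conv_fan_in = {"conv1": 3 * 11 * 11, "conv2": 64 * 5 * 5, "conv3": 192 * 3 * 3,
                   "conv4": 384 * 3 * 3, "conv5": 256 * 3 * 3}
    for name, fan_in in conv_fan_in.items():
        print(name, round(1.0 / math.sqrt(fan_in), 4))            # conv1 -> 0.0525
    print("fc6/fc7/fc8", round(1.0 / math.sqrt(256 * 6 * 6), 4))  # 0.0104, from the 6x6 pooled map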
+ +# Code was heavily based on https://github.com/rwightman/pytorch-image-models + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "CSPDarkNet53": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CSPDarkNet53_pretrained.pdparams" +} + +MODEL_CFGS = { + "CSPDarkNet53": dict( + stem=dict( + out_chs=32, kernel_size=3, stride=1, pool=''), + stage=dict( + out_chs=(64, 128, 256, 512, 1024), + depth=(1, 2, 8, 8, 4), + stride=(2, ) * 5, + exp_ratio=(2., ) + (1., ) * 4, + bottle_ratio=(0.5, ) + (1.0, ) * 4, + block_ratio=(1., ) + (0.5, ) * 4, + down_growth=True, )) +} + +__all__ = ['CSPDarkNet53' + ] # model_registry will add each entrypoint fn to this + + +class ConvBnAct(nn.Layer): + def __init__(self, + input_channels, + output_channels, + kernel_size=1, + stride=1, + padding=None, + dilation=1, + groups=1, + act_layer=nn.LeakyReLU, + norm_layer=nn.BatchNorm2D): + super().__init__() + if padding is None: + padding = (kernel_size - 1) // 2 + self.conv = nn.Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + weight_attr=ParamAttr(), + bias_attr=False) + + self.bn = norm_layer(num_features=output_channels) + self.act = act_layer() + + def forward(self, inputs): + x = self.conv(inputs) + x = self.bn(x) + if self.act is not None: + x = self.act(x) + return x + + +def create_stem(in_chans=3, + out_chs=32, + kernel_size=3, + stride=2, + pool='', + act_layer=None, + norm_layer=None): + stem = nn.Sequential() + if not isinstance(out_chs, (tuple, list)): + out_chs = [out_chs] + assert len(out_chs) + in_c = in_chans + for i, out_c in enumerate(out_chs): + conv_name = f'conv{i + 1}' + stem.add_sublayer( + conv_name, + ConvBnAct( + in_c, + out_c, + kernel_size, + stride=stride if i == 0 else 1, + act_layer=act_layer, + norm_layer=norm_layer)) + in_c = out_c + last_conv = conv_name + if pool: + stem.add_sublayer( + 'pool', nn.MaxPool2D( + kernel_size=3, stride=2, padding=1)) + return stem, dict( + num_chs=in_c, reduction=stride, module='.'.join(['stem', last_conv])) + + +class DarkBlock(nn.Layer): + def __init__(self, + in_chs, + out_chs, + dilation=1, + bottle_ratio=0.5, + groups=1, + act_layer=nn.ReLU, + norm_layer=nn.BatchNorm2D, + attn_layer=None, + drop_block=None): + super(DarkBlock, self).__init__() + mid_chs = int(round(out_chs * bottle_ratio)) + ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer) + self.conv1 = ConvBnAct(in_chs, mid_chs, kernel_size=1, **ckwargs) + self.conv2 = ConvBnAct( + mid_chs, + out_chs, + kernel_size=3, + dilation=dilation, + groups=groups, + **ckwargs) + + def forward(self, x): + shortcut = x + x = self.conv1(x) + x = self.conv2(x) + x = x + shortcut + return x + + +class CrossStage(nn.Layer): + def __init__(self, + in_chs, + out_chs, + stride, + dilation, + depth, + block_ratio=1., + bottle_ratio=1., + exp_ratio=1., + groups=1, + first_dilation=None, + down_growth=False, + cross_linear=False, + block_dpr=None, + block_fn=DarkBlock, + **block_kwargs): + super(CrossStage, self).__init__() + first_dilation = first_dilation or dilation + down_chs = out_chs if down_growth else in_chs + exp_chs = int(round(out_chs * exp_ratio)) + block_out_chs = int(round(out_chs * block_ratio)) + conv_kwargs = dict( + act_layer=block_kwargs.get('act_layer'), + 
norm_layer=block_kwargs.get('norm_layer')) + + if stride != 1 or first_dilation != dilation: + self.conv_down = ConvBnAct( + in_chs, + down_chs, + kernel_size=3, + stride=stride, + dilation=first_dilation, + groups=groups, + **conv_kwargs) + prev_chs = down_chs + else: + self.conv_down = None + prev_chs = in_chs + + self.conv_exp = ConvBnAct( + prev_chs, exp_chs, kernel_size=1, **conv_kwargs) + prev_chs = exp_chs // 2 # output of conv_exp is always split in two + + self.blocks = nn.Sequential() + for i in range(depth): + self.blocks.add_sublayer( + str(i), + block_fn(prev_chs, block_out_chs, dilation, bottle_ratio, + groups, **block_kwargs)) + prev_chs = block_out_chs + + # transition convs + self.conv_transition_b = ConvBnAct( + prev_chs, exp_chs // 2, kernel_size=1, **conv_kwargs) + self.conv_transition = ConvBnAct( + exp_chs, out_chs, kernel_size=1, **conv_kwargs) + + def forward(self, x): + if self.conv_down is not None: + x = self.conv_down(x) + x = self.conv_exp(x) + split = x.shape[1] // 2 + xs, xb = x[:, :split], x[:, split:] + xb = self.blocks(xb) + xb = self.conv_transition_b(xb) + out = self.conv_transition(paddle.concat([xs, xb], axis=1)) + return out + + +class DarkStage(nn.Layer): + def __init__(self, + in_chs, + out_chs, + stride, + dilation, + depth, + block_ratio=1., + bottle_ratio=1., + groups=1, + first_dilation=None, + block_fn=DarkBlock, + block_dpr=None, + **block_kwargs): + super().__init__() + first_dilation = first_dilation or dilation + + self.conv_down = ConvBnAct( + in_chs, + out_chs, + kernel_size=3, + stride=stride, + dilation=first_dilation, + groups=groups, + act_layer=block_kwargs.get('act_layer'), + norm_layer=block_kwargs.get('norm_layer')) + + prev_chs = out_chs + block_out_chs = int(round(out_chs * block_ratio)) + self.blocks = nn.Sequential() + for i in range(depth): + self.blocks.add_sublayer( + str(i), + block_fn(prev_chs, block_out_chs, dilation, bottle_ratio, + groups, **block_kwargs)) + prev_chs = block_out_chs + + def forward(self, x): + x = self.conv_down(x) + x = self.blocks(x) + return x + + +def _cfg_to_stage_args(cfg, curr_stride=2, output_stride=32): + # get per stage args for stage and containing blocks, calculate strides to meet target output_stride + num_stages = len(cfg['depth']) + if 'groups' not in cfg: + cfg['groups'] = (1, ) * num_stages + if 'down_growth' in cfg and not isinstance(cfg['down_growth'], + (list, tuple)): + cfg['down_growth'] = (cfg['down_growth'], ) * num_stages + stage_strides = [] + stage_dilations = [] + stage_first_dilations = [] + dilation = 1 + for cfg_stride in cfg['stride']: + stage_first_dilations.append(dilation) + if curr_stride >= output_stride: + dilation *= cfg_stride + stride = 1 + else: + stride = cfg_stride + curr_stride *= stride + stage_strides.append(stride) + stage_dilations.append(dilation) + cfg['stride'] = stage_strides + cfg['dilation'] = stage_dilations + cfg['first_dilation'] = stage_first_dilations + stage_args = [ + dict(zip(cfg.keys(), values)) for values in zip(*cfg.values()) + ] + return stage_args + + +class CSPNet(nn.Layer): + def __init__(self, + cfg, + in_chans=3, + class_num=1000, + output_stride=32, + global_pool='avg', + drop_rate=0., + act_layer=nn.LeakyReLU, + norm_layer=nn.BatchNorm2D, + zero_init_last_bn=True, + stage_fn=CrossStage, + block_fn=DarkBlock): + super().__init__() + self.class_num = class_num + self.drop_rate = drop_rate + assert output_stride in (8, 16, 32) + layer_args = dict(act_layer=act_layer, norm_layer=norm_layer) + + # Construct the stem + self.stem, 
stem_feat_info = create_stem(in_chans, **cfg['stem'], + **layer_args) + self.feature_info = [stem_feat_info] + prev_chs = stem_feat_info['num_chs'] + curr_stride = stem_feat_info[ + 'reduction'] # reduction does not include pool + if cfg['stem']['pool']: + curr_stride *= 2 + + # Construct the stages + per_stage_args = _cfg_to_stage_args( + cfg['stage'], curr_stride=curr_stride, output_stride=output_stride) + self.stages = nn.LayerList() + for i, sa in enumerate(per_stage_args): + self.stages.add_sublayer( + str(i), + stage_fn( + prev_chs, **sa, **layer_args, block_fn=block_fn)) + prev_chs = sa['out_chs'] + curr_stride *= sa['stride'] + self.feature_info += [ + dict( + num_chs=prev_chs, + reduction=curr_stride, + module=f'stages.{i}') + ] + + # Construct the head + self.num_features = prev_chs + + self.pool = nn.AdaptiveAvgPool2D(1) + self.flatten = nn.Flatten(1) + self.fc = nn.Linear( + prev_chs, + class_num, + weight_attr=ParamAttr(), + bias_attr=ParamAttr()) + + def forward(self, x): + x = self.stem(x) + for stage in self.stages: + x = stage(x) + x = self.pool(x) + x = self.flatten(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def CSPDarkNet53(pretrained=False, use_ssld=False, **kwargs): + model = CSPNet(MODEL_CFGS["CSPDarkNet53"], block_fn=DarkBlock, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["CSPDarkNet53"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/darknet.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/darknet.py new file mode 100644 index 0000000..75aafd8 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/darknet.py @@ -0,0 +1,197 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
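`CSPDarkNet53` above is a thin wrapper: `_cfg_to_stage_args` expands `MODEL_CFGS["CSPDarkNet53"]` into five `CrossStage` argument dicts (each with stride 2 under the default `output_stride=32`), and each stage splits its expanded features in half along the channel axis, runs only one half through the `DarkBlock` stack, and re-joins the halves in `conv_transition`. A hedged smoke-test sketch, assuming `paddle` is installed and the module path matches this diff; the input size is arbitrary:

    import paddle
    from ppcls.arch.backbone.model_zoo.cspnet import CSPDarkNet53

    net = CSPDarkNet53(pretrained=False)
    x = paddle.randn([1, 3, 256, 256])
    print(net(x).shape)            # expected: [1, 1000]
    for info in net.feature_info:  # stem entry plus one dict per CrossStage
        print(info["module"], info["num_chs"], info["reduction"])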
+ +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "DarkNet53": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DarkNet53_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter_size, + stride, + padding, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=filter_size, + stride=stride, + padding=padding, + weight_attr=ParamAttr(name=name + ".conv.weights"), + bias_attr=False) + + bn_name = name + ".bn" + self._bn = BatchNorm( + num_channels=output_channels, + act="relu", + param_attr=ParamAttr(name=bn_name + ".scale"), + bias_attr=ParamAttr(name=bn_name + ".offset"), + moving_mean_name=bn_name + ".mean", + moving_variance_name=bn_name + ".var") + + def forward(self, inputs): + x = self._conv(inputs) + x = self._bn(x) + return x + + +class BasicBlock(nn.Layer): + def __init__(self, input_channels, output_channels, name=None): + super(BasicBlock, self).__init__() + + self._conv1 = ConvBNLayer( + input_channels, output_channels, 1, 1, 0, name=name + ".0") + self._conv2 = ConvBNLayer( + output_channels, output_channels * 2, 3, 1, 1, name=name + ".1") + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._conv2(x) + return paddle.add(x=inputs, y=x) + + +class DarkNet(nn.Layer): + def __init__(self, class_num=1000): + super(DarkNet, self).__init__() + + self.stages = [1, 2, 8, 8, 4] + self._conv1 = ConvBNLayer(3, 32, 3, 1, 1, name="yolo_input") + self._conv2 = ConvBNLayer( + 32, 64, 3, 2, 1, name="yolo_input.downsample") + + self._basic_block_01 = BasicBlock(64, 32, name="stage.0.0") + self._downsample_0 = ConvBNLayer( + 64, 128, 3, 2, 1, name="stage.0.downsample") + + self._basic_block_11 = BasicBlock(128, 64, name="stage.1.0") + self._basic_block_12 = BasicBlock(128, 64, name="stage.1.1") + self._downsample_1 = ConvBNLayer( + 128, 256, 3, 2, 1, name="stage.1.downsample") + + self._basic_block_21 = BasicBlock(256, 128, name="stage.2.0") + self._basic_block_22 = BasicBlock(256, 128, name="stage.2.1") + self._basic_block_23 = BasicBlock(256, 128, name="stage.2.2") + self._basic_block_24 = BasicBlock(256, 128, name="stage.2.3") + self._basic_block_25 = BasicBlock(256, 128, name="stage.2.4") + self._basic_block_26 = BasicBlock(256, 128, name="stage.2.5") + self._basic_block_27 = BasicBlock(256, 128, name="stage.2.6") + self._basic_block_28 = BasicBlock(256, 128, name="stage.2.7") + self._downsample_2 = ConvBNLayer( + 256, 512, 3, 2, 1, name="stage.2.downsample") + + self._basic_block_31 = BasicBlock(512, 256, name="stage.3.0") + self._basic_block_32 = BasicBlock(512, 256, name="stage.3.1") + self._basic_block_33 = BasicBlock(512, 256, name="stage.3.2") + self._basic_block_34 = BasicBlock(512, 256, name="stage.3.3") + self._basic_block_35 = BasicBlock(512, 256, name="stage.3.4") + self._basic_block_36 = BasicBlock(512, 256, name="stage.3.5") + self._basic_block_37 = BasicBlock(512, 256, name="stage.3.6") + self._basic_block_38 = BasicBlock(512, 256, name="stage.3.7") + self._downsample_3 = ConvBNLayer( + 512, 1024, 3, 2, 1, 
name="stage.3.downsample") + + self._basic_block_41 = BasicBlock(1024, 512, name="stage.4.0") + self._basic_block_42 = BasicBlock(1024, 512, name="stage.4.1") + self._basic_block_43 = BasicBlock(1024, 512, name="stage.4.2") + self._basic_block_44 = BasicBlock(1024, 512, name="stage.4.3") + + self._pool = AdaptiveAvgPool2D(1) + + stdv = 1.0 / math.sqrt(1024.0) + self._out = Linear( + 1024, + class_num, + weight_attr=ParamAttr( + name="fc_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._conv2(x) + + x = self._basic_block_01(x) + x = self._downsample_0(x) + + x = self._basic_block_11(x) + x = self._basic_block_12(x) + x = self._downsample_1(x) + + x = self._basic_block_21(x) + x = self._basic_block_22(x) + x = self._basic_block_23(x) + x = self._basic_block_24(x) + x = self._basic_block_25(x) + x = self._basic_block_26(x) + x = self._basic_block_27(x) + x = self._basic_block_28(x) + x = self._downsample_2(x) + + x = self._basic_block_31(x) + x = self._basic_block_32(x) + x = self._basic_block_33(x) + x = self._basic_block_34(x) + x = self._basic_block_35(x) + x = self._basic_block_36(x) + x = self._basic_block_37(x) + x = self._basic_block_38(x) + x = self._downsample_3(x) + + x = self._basic_block_41(x) + x = self._basic_block_42(x) + x = self._basic_block_43(x) + x = self._basic_block_44(x) + + x = self._pool(x) + x = paddle.squeeze(x, axis=[2, 3]) + x = self._out(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def DarkNet53(pretrained=False, use_ssld=False, **kwargs): + model = DarkNet(**kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DarkNet53"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/densenet.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/densenet.py new file mode 100644 index 0000000..7e6e202 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/densenet.py @@ -0,0 +1,344 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "DenseNet121": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet121_pretrained.pdparams", + "DenseNet161": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet161_pretrained.pdparams", + "DenseNet169": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet169_pretrained.pdparams", + "DenseNet201": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet201_pretrained.pdparams", + "DenseNet264": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet264_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class BNACConvLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + pad=0, + groups=1, + act="relu", + name=None): + super(BNACConvLayer, self).__init__() + + self._batch_norm = BatchNorm( + num_channels, + act=act, + param_attr=ParamAttr(name=name + '_bn_scale'), + bias_attr=ParamAttr(name + '_bn_offset'), + moving_mean_name=name + '_bn_mean', + moving_variance_name=name + '_bn_variance') + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=pad, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + + def forward(self, input): + y = self._batch_norm(input) + y = self._conv(y) + return y + + +class DenseLayer(nn.Layer): + def __init__(self, num_channels, growth_rate, bn_size, dropout, name=None): + super(DenseLayer, self).__init__() + self.dropout = dropout + + self.bn_ac_func1 = BNACConvLayer( + num_channels=num_channels, + num_filters=bn_size * growth_rate, + filter_size=1, + pad=0, + stride=1, + name=name + "_x1") + + self.bn_ac_func2 = BNACConvLayer( + num_channels=bn_size * growth_rate, + num_filters=growth_rate, + filter_size=3, + pad=1, + stride=1, + name=name + "_x2") + + if dropout: + self.dropout_func = Dropout(p=dropout, mode="downscale_in_infer") + + def forward(self, input): + conv = self.bn_ac_func1(input) + conv = self.bn_ac_func2(conv) + if self.dropout: + conv = self.dropout_func(conv) + conv = paddle.concat([input, conv], axis=1) + return conv + + +class DenseBlock(nn.Layer): + def __init__(self, + num_channels, + num_layers, + bn_size, + growth_rate, + dropout, + name=None): + super(DenseBlock, self).__init__() + self.dropout = dropout + + self.dense_layer_func = [] + + pre_channel = num_channels + for layer in range(num_layers): + self.dense_layer_func.append( + self.add_sublayer( + "{}_{}".format(name, layer + 1), + DenseLayer( + num_channels=pre_channel, + growth_rate=growth_rate, + bn_size=bn_size, + dropout=dropout, + name=name + '_' + str(layer + 1)))) + pre_channel = pre_channel + growth_rate + + def forward(self, input): + conv = input + for func in self.dense_layer_func: + conv = func(conv) + return conv + + +class TransitionLayer(nn.Layer): + def __init__(self, num_channels, num_output_features, name=None): + super(TransitionLayer, self).__init__() + + self.conv_ac_func = BNACConvLayer( + num_channels=num_channels, + 
num_filters=num_output_features, + filter_size=1, + pad=0, + stride=1, + name=name) + + self.pool2d_avg = AvgPool2D(kernel_size=2, stride=2, padding=0) + + def forward(self, input): + y = self.conv_ac_func(input) + y = self.pool2d_avg(y) + return y + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + pad=0, + groups=1, + act="relu", + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=pad, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=name + '_bn_scale'), + bias_attr=ParamAttr(name + '_bn_offset'), + moving_mean_name=name + '_bn_mean', + moving_variance_name=name + '_bn_variance') + + def forward(self, input): + y = self._conv(input) + y = self._batch_norm(y) + return y + + +class DenseNet(nn.Layer): + def __init__(self, layers=60, bn_size=4, dropout=0, class_num=1000): + super(DenseNet, self).__init__() + + supported_layers = [121, 161, 169, 201, 264] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + densenet_spec = { + 121: (64, 32, [6, 12, 24, 16]), + 161: (96, 48, [6, 12, 36, 24]), + 169: (64, 32, [6, 12, 32, 32]), + 201: (64, 32, [6, 12, 48, 32]), + 264: (64, 32, [6, 12, 64, 48]) + } + num_init_features, growth_rate, block_config = densenet_spec[layers] + + self.conv1_func = ConvBNLayer( + num_channels=3, + num_filters=num_init_features, + filter_size=7, + stride=2, + pad=3, + act='relu', + name="conv1") + + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_config = block_config + + self.dense_block_func_list = [] + self.transition_func_list = [] + pre_num_channels = num_init_features + num_features = num_init_features + for i, num_layers in enumerate(block_config): + self.dense_block_func_list.append( + self.add_sublayer( + "db_conv_{}".format(i + 2), + DenseBlock( + num_channels=pre_num_channels, + num_layers=num_layers, + bn_size=bn_size, + growth_rate=growth_rate, + dropout=dropout, + name='conv' + str(i + 2)))) + + num_features = num_features + num_layers * growth_rate + pre_num_channels = num_features + + if i != len(block_config) - 1: + self.transition_func_list.append( + self.add_sublayer( + "tr_conv{}_blk".format(i + 2), + TransitionLayer( + num_channels=pre_num_channels, + num_output_features=num_features // 2, + name='conv' + str(i + 2) + "_blk"))) + pre_num_channels = num_features // 2 + num_features = num_features // 2 + + self.batch_norm = BatchNorm( + num_features, + act="relu", + param_attr=ParamAttr(name='conv5_blk_bn_scale'), + bias_attr=ParamAttr(name='conv5_blk_bn_offset'), + moving_mean_name='conv5_blk_bn_mean', + moving_variance_name='conv5_blk_bn_variance') + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + stdv = 1.0 / math.sqrt(num_features * 1.0) + + self.out = Linear( + num_features, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, input): + conv = self.conv1_func(input) + conv = self.pool2d_max(conv) + + for i, num_layers in enumerate(self.block_config): + conv = self.dense_block_func_list[i](conv) + if i != len(self.block_config) - 1: + conv = self.transition_func_list[i](conv) + + conv = self.batch_norm(conv) + y = 
self.pool2d_avg(conv) + y = paddle.flatten(y, start_axis=1, stop_axis=-1) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def DenseNet121(pretrained=False, use_ssld=False, **kwargs): + model = DenseNet(layers=121, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DenseNet121"], use_ssld=use_ssld) + return model + + +def DenseNet161(pretrained=False, use_ssld=False, **kwargs): + model = DenseNet(layers=161, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DenseNet161"], use_ssld=use_ssld) + return model + + +def DenseNet169(pretrained=False, use_ssld=False, **kwargs): + model = DenseNet(layers=169, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DenseNet169"], use_ssld=use_ssld) + return model + + +def DenseNet201(pretrained=False, use_ssld=False, **kwargs): + model = DenseNet(layers=201, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DenseNet201"], use_ssld=use_ssld) + return model + + +def DenseNet264(pretrained=False, use_ssld=False, **kwargs): + model = DenseNet(layers=264, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DenseNet264"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/distilled_vision_transformer.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/distilled_vision_transformer.py new file mode 100644 index 0000000..676a289 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/distilled_vision_transformer.py @@ -0,0 +1,272 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
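DenseNet's channel arithmetic above is easy to lose track of: each DenseLayer concatenates growth_rate new channels onto its input, and each TransitionLayer then halves the running width. For DenseNet121 (64 stem channels, growth rate 32, block config [6, 12, 24, 16]) the final BatchNorm and fc head therefore see 1024 channels; a worked check:

num_features = 64                   # num_init_features for layers=121
growth_rate = 32
for i, num_layers in enumerate([6, 12, 24, 16]):
    num_features += num_layers * growth_rate   # DenseBlock: one concat per layer
    if i != 3:                                 # TransitionLayer after all but the last block
        num_features //= 2
print(num_features)                            # 1024, the input width of self.out

One note on the DistilledVisionTransformer defined below: after creating dist_token, the code calls self.add_parameter("cls_token", self.cls_token), which re-registers the token inherited from VisionTransformer. Nothing breaks, because create_parameter already tracks dist_token, but add_parameter("dist_token", self.dist_token) reads as the likely intent.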
+ +# Code was heavily based on https://github.com/facebookresearch/deit + +import paddle +import paddle.nn as nn +from .vision_transformer import VisionTransformer, Identity, trunc_normal_, zeros_ + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "DeiT_tiny_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_tiny_patch16_224_pretrained.pdparams", + "DeiT_small_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_small_patch16_224_pretrained.pdparams", + "DeiT_base_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_base_patch16_224_pretrained.pdparams", + "DeiT_tiny_distilled_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_tiny_distilled_patch16_224_pretrained.pdparams", + "DeiT_small_distilled_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_small_distilled_patch16_224_pretrained.pdparams", + "DeiT_base_distilled_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_base_distilled_patch16_224_pretrained.pdparams", + "DeiT_base_patch16_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_base_patch16_384_pretrained.pdparams", + "DeiT_base_distilled_patch16_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_base_distilled_patch16_384_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class DistilledVisionTransformer(VisionTransformer): + def __init__(self, + img_size=224, + patch_size=16, + class_num=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + **kwargs): + super().__init__( + img_size=img_size, + patch_size=patch_size, + class_num=class_num, + embed_dim=embed_dim, + depth=depth, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + epsilon=epsilon, + **kwargs) + self.pos_embed = self.create_parameter( + shape=(1, self.patch_embed.num_patches + 2, self.embed_dim), + default_initializer=zeros_) + self.add_parameter("pos_embed", self.pos_embed) + + self.dist_token = self.create_parameter( + shape=(1, 1, self.embed_dim), default_initializer=zeros_) + self.add_parameter("cls_token", self.cls_token) + + self.head_dist = nn.Linear( + self.embed_dim, + self.class_num) if self.class_num > 0 else Identity() + + trunc_normal_(self.dist_token) + trunc_normal_(self.pos_embed) + self.head_dist.apply(self._init_weights) + + def forward_features(self, x): + B = paddle.shape(x)[0] + x = self.patch_embed(x) + + cls_tokens = self.cls_token.expand((B, -1, -1)) + dist_token = self.dist_token.expand((B, -1, -1)) + x = paddle.concat((cls_tokens, dist_token, x), axis=1) + + x = x + self.pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + return x[:, 0], x[:, 1] + + def forward(self, x): + x, x_dist = self.forward_features(x) + x = self.head(x) + x_dist = self.head_dist(x_dist) + return (x + x_dist) / 2 + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def DeiT_tiny_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=192, + depth=12, + num_heads=3, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_tiny_patch16_224"], + use_ssld=use_ssld) + return model + + +def DeiT_small_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=384, + depth=12, + num_heads=6, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_small_patch16_224"], + use_ssld=use_ssld) + return model + + +def DeiT_base_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_base_patch16_224"], + use_ssld=use_ssld) + return model + + +def DeiT_tiny_distilled_patch16_224(pretrained=False, use_ssld=False, + **kwargs): + model = DistilledVisionTransformer( + patch_size=16, + embed_dim=192, + depth=12, + num_heads=3, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_tiny_distilled_patch16_224"], + use_ssld=use_ssld) + return model + + +def DeiT_small_distilled_patch16_224(pretrained=False, + use_ssld=False, + **kwargs): + model = DistilledVisionTransformer( + patch_size=16, + embed_dim=384, + depth=12, + num_heads=6, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_small_distilled_patch16_224"], + use_ssld=use_ssld) + return model + + +def DeiT_base_distilled_patch16_224(pretrained=False, use_ssld=False, + **kwargs): + model = DistilledVisionTransformer( + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_base_distilled_patch16_224"], + use_ssld=use_ssld) + return model + + +def DeiT_base_patch16_384(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + img_size=384, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_base_patch16_384"], + use_ssld=use_ssld) + return model + + +def DeiT_base_distilled_patch16_384(pretrained=False, use_ssld=False, + **kwargs): + model = DistilledVisionTransformer( + img_size=384, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_base_distilled_patch16_384"], + use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/dla.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/dla.py new file mode 100644 index 0000000..b1c00b2 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/dla.py @@ -0,0 +1,528 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/ucbdrive/dla + +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddle.nn.initializer import Normal, Constant + +from ppcls.arch.backbone.base.theseus_layer import Identity +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "DLA34": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA34_pretrained.pdparams", + "DLA46_c": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA46_c_pretrained.pdparams", + "DLA46x_c": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA46x_c_pretrained.pdparams", + "DLA60": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA60_pretrained.pdparams", + "DLA60x": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA60x_pretrained.pdparams", + "DLA60x_c": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA60x_c_pretrained.pdparams", + "DLA102": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA102_pretrained.pdparams", + "DLA102x": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA102x_pretrained.pdparams", + "DLA102x2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA102x2_pretrained.pdparams", + "DLA169": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA169_pretrained.pdparams" +} + +__all__ = MODEL_URLS.keys() + +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) 
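One inconsistency worth flagging in the dla.py module setup above: the sibling files in this diff export __all__ = list(MODEL_URLS.keys()), while dla.py assigns the raw dict_keys view. Star-imports accept any iterable of names, so behavior is unchanged, but the view is not indexable; a small illustration:

MODEL_URLS = {"DLA34": "...", "DLA60": "..."}

exported = MODEL_URLS.keys()            # dict_keys view, as dla.py leaves __all__
print("DLA34" in exported)              # True; membership tests and iteration work
# exported[0]                           # would raise TypeError: not subscriptable

exported = list(MODEL_URLS.keys())      # the convention used elsewhere in this diff
print(exported[0])                      # "DLA34"

The zeros_ and ones_ Constant initializers defined just above are the callables DLA later applies to BatchNorm2D weights and biases in its initialization loop.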
+ + +class DlaBasic(nn.Layer): + def __init__(self, inplanes, planes, stride=1, dilation=1, **cargs): + super(DlaBasic, self).__init__() + self.conv1 = nn.Conv2D( + inplanes, + planes, + kernel_size=3, + stride=stride, + padding=dilation, + bias_attr=False, + dilation=dilation) + self.bn1 = nn.BatchNorm2D(planes) + self.relu = nn.ReLU() + self.conv2 = nn.Conv2D( + planes, + planes, + kernel_size=3, + stride=1, + padding=dilation, + bias_attr=False, + dilation=dilation) + self.bn2 = nn.BatchNorm2D(planes) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + out += residual + out = self.relu(out) + + return out + + +class DlaBottleneck(nn.Layer): + expansion = 2 + + def __init__(self, + inplanes, + outplanes, + stride=1, + dilation=1, + cardinality=1, + base_width=64): + super(DlaBottleneck, self).__init__() + self.stride = stride + mid_planes = int( + math.floor(outplanes * (base_width / 64)) * cardinality) + mid_planes = mid_planes // self.expansion + + self.conv1 = nn.Conv2D( + inplanes, mid_planes, kernel_size=1, bias_attr=False) + self.bn1 = nn.BatchNorm2D(mid_planes) + self.conv2 = nn.Conv2D( + mid_planes, + mid_planes, + kernel_size=3, + stride=stride, + padding=dilation, + bias_attr=False, + dilation=dilation, + groups=cardinality) + self.bn2 = nn.BatchNorm2D(mid_planes) + self.conv3 = nn.Conv2D( + mid_planes, outplanes, kernel_size=1, bias_attr=False) + self.bn3 = nn.BatchNorm2D(outplanes) + self.relu = nn.ReLU() + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += residual + out = self.relu(out) + + return out + + +class DlaRoot(nn.Layer): + def __init__(self, in_channels, out_channels, kernel_size, residual): + super(DlaRoot, self).__init__() + self.conv = nn.Conv2D( + in_channels, + out_channels, + 1, + stride=1, + bias_attr=False, + padding=(kernel_size - 1) // 2) + self.bn = nn.BatchNorm2D(out_channels) + self.relu = nn.ReLU() + self.residual = residual + + def forward(self, *x): + children = x + x = self.conv(paddle.concat(x, 1)) + x = self.bn(x) + if self.residual: + x += children[0] + x = self.relu(x) + + return x + + +class DlaTree(nn.Layer): + def __init__(self, + levels, + block, + in_channels, + out_channels, + stride=1, + dilation=1, + cardinality=1, + base_width=64, + level_root=False, + root_dim=0, + root_kernel_size=1, + root_residual=False): + super(DlaTree, self).__init__() + if root_dim == 0: + root_dim = 2 * out_channels + if level_root: + root_dim += in_channels + + self.downsample = nn.MaxPool2D( + stride, stride=stride) if stride > 1 else Identity() + self.project = Identity() + cargs = dict( + dilation=dilation, cardinality=cardinality, base_width=base_width) + + if levels == 1: + self.tree1 = block(in_channels, out_channels, stride, **cargs) + self.tree2 = block(out_channels, out_channels, 1, **cargs) + if in_channels != out_channels: + self.project = nn.Sequential( + nn.Conv2D( + in_channels, + out_channels, + kernel_size=1, + stride=1, + bias_attr=False), + nn.BatchNorm2D(out_channels)) + else: + cargs.update( + dict( + root_kernel_size=root_kernel_size, + root_residual=root_residual)) + self.tree1 = DlaTree( + levels - 1, + block, + in_channels, + out_channels, + 
stride, + root_dim=0, + **cargs) + self.tree2 = DlaTree( + levels - 1, + block, + out_channels, + out_channels, + root_dim=root_dim + out_channels, + **cargs) + + if levels == 1: + self.root = DlaRoot(root_dim, out_channels, root_kernel_size, + root_residual) + + self.level_root = level_root + self.root_dim = root_dim + self.levels = levels + + def forward(self, x, residual=None, children=None): + children = [] if children is None else children + bottom = self.downsample(x) + residual = self.project(bottom) + + if self.level_root: + children.append(bottom) + x1 = self.tree1(x, residual) + + if self.levels == 1: + x2 = self.tree2(x1) + x = self.root(x2, x1, *children) + else: + children.append(x1) + x = self.tree2(x1, children=children) + return x + + +class DLA(nn.Layer): + def __init__(self, + levels, + channels, + in_chans=3, + cardinality=1, + base_width=64, + block=DlaBottleneck, + residual_root=False, + drop_rate=0.0, + class_num=1000, + with_pool=True): + super(DLA, self).__init__() + self.channels = channels + self.class_num = class_num + self.with_pool = with_pool + self.cardinality = cardinality + self.base_width = base_width + self.drop_rate = drop_rate + + self.base_layer = nn.Sequential( + nn.Conv2D( + in_chans, + channels[0], + kernel_size=7, + stride=1, + padding=3, + bias_attr=False), + nn.BatchNorm2D(channels[0]), + nn.ReLU()) + + self.level0 = self._make_conv_level(channels[0], channels[0], + levels[0]) + self.level1 = self._make_conv_level( + channels[0], channels[1], levels[1], stride=2) + + cargs = dict( + cardinality=cardinality, + base_width=base_width, + root_residual=residual_root) + + self.level2 = DlaTree( + levels[2], + block, + channels[1], + channels[2], + 2, + level_root=False, + **cargs) + self.level3 = DlaTree( + levels[3], + block, + channels[2], + channels[3], + 2, + level_root=True, + **cargs) + self.level4 = DlaTree( + levels[4], + block, + channels[3], + channels[4], + 2, + level_root=True, + **cargs) + self.level5 = DlaTree( + levels[5], + block, + channels[4], + channels[5], + 2, + level_root=True, + **cargs) + + self.feature_info = [ + # rare to have a meaningful stride 1 level + dict( + num_chs=channels[0], reduction=1, module='level0'), + dict( + num_chs=channels[1], reduction=2, module='level1'), + dict( + num_chs=channels[2], reduction=4, module='level2'), + dict( + num_chs=channels[3], reduction=8, module='level3'), + dict( + num_chs=channels[4], reduction=16, module='level4'), + dict( + num_chs=channels[5], reduction=32, module='level5'), + ] + + self.num_features = channels[-1] + + if with_pool: + self.global_pool = nn.AdaptiveAvgPool2D(1) + + if class_num > 0: + self.fc = nn.Conv2D(self.num_features, class_num, 1) + + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + normal_ = Normal(mean=0.0, std=math.sqrt(2. 
/ n)) + normal_(m.weight) + elif isinstance(m, nn.BatchNorm2D): + ones_(m.weight) + zeros_(m.bias) + + def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1): + modules = [] + for i in range(convs): + modules.extend([ + nn.Conv2D( + inplanes, + planes, + kernel_size=3, + stride=stride if i == 0 else 1, + padding=dilation, + bias_attr=False, + dilation=dilation), nn.BatchNorm2D(planes), nn.ReLU() + ]) + inplanes = planes + return nn.Sequential(*modules) + + def forward_features(self, x): + x = self.base_layer(x) + + x = self.level0(x) + x = self.level1(x) + x = self.level2(x) + x = self.level3(x) + x = self.level4(x) + x = self.level5(x) + + return x + + def forward(self, x): + x = self.forward_features(x) + + if self.with_pool: + x = self.global_pool(x) + + if self.drop_rate > 0.: + x = F.dropout(x, p=self.drop_rate, training=self.training) + + if self.class_num > 0: + x = self.fc(x) + x = x.flatten(1) + + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def DLA34(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 2, 2, 1), + channels=(16, 32, 64, 128, 256, 512), + block=DlaBasic, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA34"]) + return model + + +def DLA46_c(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 2, 2, 1), + channels=(16, 32, 64, 64, 128, 256), + block=DlaBottleneck, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA46_c"]) + return model + + +def DLA46x_c(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 2, 2, 1), + channels=(16, 32, 64, 64, 128, 256), + block=DlaBottleneck, + cardinality=32, + base_width=4, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA46x_c"]) + return model + + +def DLA60(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 2, 3, 1), + channels=(16, 32, 128, 256, 512, 1024), + block=DlaBottleneck, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA60"]) + return model + + +def DLA60x(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 2, 3, 1), + channels=(16, 32, 128, 256, 512, 1024), + block=DlaBottleneck, + cardinality=32, + base_width=4, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA60x"]) + return model + + +def DLA60x_c(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 2, 3, 1), + channels=(16, 32, 64, 64, 128, 256), + block=DlaBottleneck, + cardinality=32, + base_width=4, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA60x_c"]) + return model + + +def DLA102(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 3, 4, 1), + channels=(16, 32, 128, 256, 512, 1024), + block=DlaBottleneck, + residual_root=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA102"]) + return model + + +def DLA102x(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 3, 4, 1), + channels=(16, 32, 128, 256, 512, 1024), + block=DlaBottleneck, + cardinality=32, + base_width=4, + residual_root=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA102x"]) + return model + + +def DLA102x2(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 3, 4, 1), + channels=(16, 32, 128, 256, 512, 1024), + 
block=DlaBottleneck, + cardinality=64, + base_width=4, + residual_root=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA102x2"]) + return model + + +def DLA169(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 2, 3, 5, 1), + channels=(16, 32, 128, 256, 512, 1024), + block=DlaBottleneck, + residual_root=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA169"]) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/dpn.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/dpn.py new file mode 100644 index 0000000..55953ed --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/dpn.py @@ -0,0 +1,451 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import sys +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "DPN68": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DPN68_pretrained.pdparams", + "DPN92": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DPN92_pretrained.pdparams", + "DPN98": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DPN98_pretrained.pdparams", + "DPN107": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DPN107_pretrained.pdparams", + "DPN131": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DPN131_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + pad=0, + groups=1, + act="relu", + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=pad, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=name + '_bn_scale'), + bias_attr=ParamAttr(name + '_bn_offset'), + moving_mean_name=name + '_bn_mean', + moving_variance_name=name + '_bn_variance') + + def forward(self, input): + y = self._conv(input) + y = self._batch_norm(y) + return y + + +class BNACConvLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + pad=0, + groups=1, + act="relu", + name=None): + super(BNACConvLayer, self).__init__() + self.num_channels = num_channels + + self._batch_norm = BatchNorm( + num_channels, + act=act, + param_attr=ParamAttr(name=name + '_bn_scale'), + bias_attr=ParamAttr(name + '_bn_offset'), + moving_mean_name=name + '_bn_mean', + moving_variance_name=name + 
'_bn_variance') + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=pad, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + + def forward(self, input): + y = self._batch_norm(input) + y = self._conv(y) + return y + + +class DualPathFactory(nn.Layer): + def __init__(self, + num_channels, + num_1x1_a, + num_3x3_b, + num_1x1_c, + inc, + G, + _type='normal', + name=None): + super(DualPathFactory, self).__init__() + + self.num_1x1_c = num_1x1_c + self.inc = inc + self.name = name + + kw = 3 + kh = 3 + pw = (kw - 1) // 2 + ph = (kh - 1) // 2 + + # type + if _type == 'proj': + key_stride = 1 + self.has_proj = True + elif _type == 'down': + key_stride = 2 + self.has_proj = True + elif _type == 'normal': + key_stride = 1 + self.has_proj = False + else: + print("not implemented now!!!") + sys.exit(1) + + data_in_ch = sum(num_channels) if isinstance(num_channels, + list) else num_channels + + if self.has_proj: + self.c1x1_w_func = BNACConvLayer( + num_channels=data_in_ch, + num_filters=num_1x1_c + 2 * inc, + filter_size=(1, 1), + pad=(0, 0), + stride=(key_stride, key_stride), + name=name + "_match") + + self.c1x1_a_func = BNACConvLayer( + num_channels=data_in_ch, + num_filters=num_1x1_a, + filter_size=(1, 1), + pad=(0, 0), + name=name + "_conv1") + + self.c3x3_b_func = BNACConvLayer( + num_channels=num_1x1_a, + num_filters=num_3x3_b, + filter_size=(kw, kh), + pad=(pw, ph), + stride=(key_stride, key_stride), + groups=G, + name=name + "_conv2") + + self.c1x1_c_func = BNACConvLayer( + num_channels=num_3x3_b, + num_filters=num_1x1_c + inc, + filter_size=(1, 1), + pad=(0, 0), + name=name + "_conv3") + + def forward(self, input): + # PROJ + if isinstance(input, list): + data_in = paddle.concat([input[0], input[1]], axis=1) + else: + data_in = input + + if self.has_proj: + c1x1_w = self.c1x1_w_func(data_in) + data_o1, data_o2 = paddle.split( + c1x1_w, num_or_sections=[self.num_1x1_c, 2 * self.inc], axis=1) + else: + data_o1 = input[0] + data_o2 = input[1] + + c1x1_a = self.c1x1_a_func(data_in) + c3x3_b = self.c3x3_b_func(c1x1_a) + c1x1_c = self.c1x1_c_func(c3x3_b) + + c1x1_c1, c1x1_c2 = paddle.split( + c1x1_c, num_or_sections=[self.num_1x1_c, self.inc], axis=1) + + # OUTPUTS + summ = paddle.add(x=data_o1, y=c1x1_c1) + dense = paddle.concat([data_o2, c1x1_c2], axis=1) + # tensor, channels + return [summ, dense] + + +class DPN(nn.Layer): + def __init__(self, layers=68, class_num=1000): + super(DPN, self).__init__() + + self._class_num = class_num + + args = self.get_net_args(layers) + bws = args['bw'] + inc_sec = args['inc_sec'] + rs = args['r'] + k_r = args['k_r'] + k_sec = args['k_sec'] + G = args['G'] + init_num_filter = args['init_num_filter'] + init_filter_size = args['init_filter_size'] + init_padding = args['init_padding'] + + self.k_sec = k_sec + + self.conv1_x_1_func = ConvBNLayer( + num_channels=3, + num_filters=init_num_filter, + filter_size=init_filter_size, + stride=2, + pad=init_padding, + act='relu', + name="conv1") + + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + num_channel_dpn = init_num_filter + + self.dpn_func_list = [] + #conv2 - conv5 + match_list, num = [], 0 + for gc in range(4): + bw = bws[gc] + inc = inc_sec[gc] + R = (k_r * bw) // rs[gc] + if gc == 0: + _type1 = 'proj' + _type2 = 'normal' + match = 1 + else: + _type1 = 'down' + _type2 = 'normal' + match = match + k_sec[gc - 1] + match_list.append(match) + self.dpn_func_list.append( + 
self.add_sublayer( + "dpn{}".format(match), + DualPathFactory( + num_channels=num_channel_dpn, + num_1x1_a=R, + num_3x3_b=R, + num_1x1_c=bw, + inc=inc, + G=G, + _type=_type1, + name="dpn" + str(match)))) + num_channel_dpn = [bw, 3 * inc] + + for i_ly in range(2, k_sec[gc] + 1): + num += 1 + if num in match_list: + num += 1 + self.dpn_func_list.append( + self.add_sublayer( + "dpn{}".format(num), + DualPathFactory( + num_channels=num_channel_dpn, + num_1x1_a=R, + num_3x3_b=R, + num_1x1_c=bw, + inc=inc, + G=G, + _type=_type2, + name="dpn" + str(num)))) + + num_channel_dpn = [ + num_channel_dpn[0], num_channel_dpn[1] + inc + ] + + out_channel = sum(num_channel_dpn) + + self.conv5_x_x_bn = BatchNorm( + num_channels=sum(num_channel_dpn), + act="relu", + param_attr=ParamAttr(name='final_concat_bn_scale'), + bias_attr=ParamAttr('final_concat_bn_offset'), + moving_mean_name='final_concat_bn_mean', + moving_variance_name='final_concat_bn_variance') + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + stdv = 0.01 + + self.out = Linear( + out_channel, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, input): + conv1_x_1 = self.conv1_x_1_func(input) + convX_x_x = self.pool2d_max(conv1_x_1) + + dpn_idx = 0 + for gc in range(4): + convX_x_x = self.dpn_func_list[dpn_idx](convX_x_x) + dpn_idx += 1 + for i_ly in range(2, self.k_sec[gc] + 1): + convX_x_x = self.dpn_func_list[dpn_idx](convX_x_x) + dpn_idx += 1 + + conv5_x_x = paddle.concat(convX_x_x, axis=1) + conv5_x_x = self.conv5_x_x_bn(conv5_x_x) + + y = self.pool2d_avg(conv5_x_x) + y = paddle.flatten(y, start_axis=1, stop_axis=-1) + y = self.out(y) + return y + + def get_net_args(self, layers): + if layers == 68: + k_r = 128 + G = 32 + k_sec = [3, 4, 12, 3] + inc_sec = [16, 32, 32, 64] + bw = [64, 128, 256, 512] + r = [64, 64, 64, 64] + init_num_filter = 10 + init_filter_size = 3 + init_padding = 1 + elif layers == 92: + k_r = 96 + G = 32 + k_sec = [3, 4, 20, 3] + inc_sec = [16, 32, 24, 128] + bw = [256, 512, 1024, 2048] + r = [256, 256, 256, 256] + init_num_filter = 64 + init_filter_size = 7 + init_padding = 3 + elif layers == 98: + k_r = 160 + G = 40 + k_sec = [3, 6, 20, 3] + inc_sec = [16, 32, 32, 128] + bw = [256, 512, 1024, 2048] + r = [256, 256, 256, 256] + init_num_filter = 96 + init_filter_size = 7 + init_padding = 3 + elif layers == 107: + k_r = 200 + G = 50 + k_sec = [4, 8, 20, 3] + inc_sec = [20, 64, 64, 128] + bw = [256, 512, 1024, 2048] + r = [256, 256, 256, 256] + init_num_filter = 128 + init_filter_size = 7 + init_padding = 3 + elif layers == 131: + k_r = 160 + G = 40 + k_sec = [4, 8, 28, 3] + inc_sec = [16, 32, 32, 128] + bw = [256, 512, 1024, 2048] + r = [256, 256, 256, 256] + init_num_filter = 128 + init_filter_size = 7 + init_padding = 3 + else: + raise NotImplementedError + net_arg = { + 'k_r': k_r, + 'G': G, + 'k_sec': k_sec, + 'inc_sec': inc_sec, + 'bw': bw, + 'r': r + } + net_arg['init_num_filter'] = init_num_filter + net_arg['init_filter_size'] = init_filter_size + net_arg['init_padding'] = init_padding + + return net_arg + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def DPN68(pretrained=False, use_ssld=False, **kwargs): + model = DPN(layers=68, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DPN68"]) + return model + + +def DPN92(pretrained=False, use_ssld=False, **kwargs): + model = DPN(layers=92, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DPN92"]) + return model + + +def DPN98(pretrained=False, use_ssld=False, **kwargs): + model = DPN(layers=98, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DPN98"]) + return model + + +def DPN107(pretrained=False, use_ssld=False, **kwargs): + model = DPN(layers=107, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DPN107"]) + return model + + +def DPN131(pretrained=False, use_ssld=False, **kwargs): + model = DPN(layers=131, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DPN131"]) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/efficientnet.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/efficientnet.py new file mode 100644 index 0000000..bd0cffa --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/efficientnet.py @@ -0,0 +1,976 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/lukemelas/EfficientNet-PyTorch + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +import math +import collections +import re +import copy + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "EfficientNetB0_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB0_small_pretrained.pdparams", + "EfficientNetB0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB0_pretrained.pdparams", + "EfficientNetB1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB1_pretrained.pdparams", + "EfficientNetB2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB2_pretrained.pdparams", + "EfficientNetB3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB3_pretrained.pdparams", + "EfficientNetB4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB4_pretrained.pdparams", + "EfficientNetB5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB5_pretrained.pdparams", + "EfficientNetB6": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB6_pretrained.pdparams", + "EfficientNetB7": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB7_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +GlobalParams = collections.namedtuple('GlobalParams', [ + 'batch_norm_momentum', + 'batch_norm_epsilon', + 'dropout_rate', + 'num_classes', + 'width_coefficient', + 'depth_coefficient', + 'depth_divisor', + 'min_depth', 
+ 'drop_connect_rate', +]) + +BlockArgs = collections.namedtuple('BlockArgs', [ + 'kernel_size', 'num_repeat', 'input_filters', 'output_filters', + 'expand_ratio', 'id_skip', 'stride', 'se_ratio' +]) + +GlobalParams.__new__.__defaults__ = (None, ) * len(GlobalParams._fields) +BlockArgs.__new__.__defaults__ = (None, ) * len(BlockArgs._fields) + + +def efficientnet_params(model_name): + """ Map EfficientNet model name to parameter coefficients. """ + params_dict = { + # Coefficients: width,depth,resolution,dropout + 'efficientnet-b0': (1.0, 1.0, 224, 0.2), + 'efficientnet-b1': (1.0, 1.1, 240, 0.2), + 'efficientnet-b2': (1.1, 1.2, 260, 0.3), + 'efficientnet-b3': (1.2, 1.4, 300, 0.3), + 'efficientnet-b4': (1.4, 1.8, 380, 0.4), + 'efficientnet-b5': (1.6, 2.2, 456, 0.4), + 'efficientnet-b6': (1.8, 2.6, 528, 0.5), + 'efficientnet-b7': (2.0, 3.1, 600, 0.5), + } + return params_dict[model_name] + + +def efficientnet(width_coefficient=None, + depth_coefficient=None, + dropout_rate=0.2, + drop_connect_rate=0.2): + """ Get block arguments according to parameter and coefficients. """ + blocks_args = [ + 'r1_k3_s11_e1_i32_o16_se0.25', + 'r2_k3_s22_e6_i16_o24_se0.25', + 'r2_k5_s22_e6_i24_o40_se0.25', + 'r3_k3_s22_e6_i40_o80_se0.25', + 'r3_k5_s11_e6_i80_o112_se0.25', + 'r4_k5_s22_e6_i112_o192_se0.25', + 'r1_k3_s11_e6_i192_o320_se0.25', + ] + blocks_args = BlockDecoder.decode(blocks_args) + + global_params = GlobalParams( + batch_norm_momentum=0.99, + batch_norm_epsilon=1e-3, + dropout_rate=dropout_rate, + drop_connect_rate=drop_connect_rate, + num_classes=1000, + width_coefficient=width_coefficient, + depth_coefficient=depth_coefficient, + depth_divisor=8, + min_depth=None) + + return blocks_args, global_params + + +def get_model_params(model_name, override_params): + """ Get the block args and global params for a given model """ + if model_name.startswith('efficientnet'): + w, d, _, p = efficientnet_params(model_name) + blocks_args, global_params = efficientnet( + width_coefficient=w, depth_coefficient=d, dropout_rate=p) + else: + raise NotImplementedError('model name is not pre-defined: %s' % + model_name) + if override_params: + global_params = global_params._replace(**override_params) + return blocks_args, global_params + + +def round_filters(filters, global_params): + """ Calculate and round number of filters based on depth multiplier. """ + multiplier = global_params.width_coefficient + if not multiplier: + return filters + divisor = global_params.depth_divisor + min_depth = global_params.min_depth + filters *= multiplier + min_depth = min_depth or divisor + new_filters = max(min_depth, + int(filters + divisor / 2) // divisor * divisor) + if new_filters < 0.9 * filters: # prevent rounding by more than 10% + new_filters += divisor + return int(new_filters) + + +def round_repeats(repeats, global_params): + """ Round number of filters based on depth multiplier. """ + multiplier = global_params.depth_coefficient + if not multiplier: + return repeats + return int(math.ceil(multiplier * repeats)) + + +class BlockDecoder(object): + """ + Block Decoder, straight from the official TensorFlow repository. + """ + + @staticmethod + def _decode_block_string(block_string): + """ Gets a block through a string notation of arguments. 
""" + assert isinstance(block_string, str) + + ops = block_string.split('_') + options = {} + for op in ops: + splits = re.split(r'(\d.*)', op) + if len(splits) >= 2: + key, value = splits[:2] + options[key] = value + + # Check stride + cond_1 = ('s' in options and len(options['s']) == 1) + cond_2 = ((len(options['s']) == 2) and + (options['s'][0] == options['s'][1])) + assert (cond_1 or cond_2) + + return BlockArgs( + kernel_size=int(options['k']), + num_repeat=int(options['r']), + input_filters=int(options['i']), + output_filters=int(options['o']), + expand_ratio=int(options['e']), + id_skip=('noskip' not in block_string), + se_ratio=float(options['se']) if 'se' in options else None, + stride=[int(options['s'][0])]) + + @staticmethod + def _encode_block_string(block): + """Encodes a block to a string.""" + args = [ + 'r%d' % block.num_repeat, 'k%d' % block.kernel_size, 's%d%d' % + (block.strides[0], block.strides[1]), 'e%s' % block.expand_ratio, + 'i%d' % block.input_filters, 'o%d' % block.output_filters + ] + if 0 < block.se_ratio <= 1: + args.append('se%s' % block.se_ratio) + if block.id_skip is False: + args.append('noskip') + return '_'.join(args) + + @staticmethod + def decode(string_list): + """ + Decode a list of string notations to specify blocks in the network. + + string_list: list of strings, each string is a notation of block + return + list of BlockArgs namedtuples of block args + """ + assert isinstance(string_list, list) + blocks_args = [] + for block_string in string_list: + blocks_args.append(BlockDecoder._decode_block_string(block_string)) + return blocks_args + + @staticmethod + def encode(blocks_args): + """ + Encodes a list of BlockArgs to a list of strings. + + :param blocks_args: a list of BlockArgs namedtuples of block args + :return: a list of strings, each string is a notation of block + """ + block_strings = [] + for block in blocks_args: + block_strings.append(BlockDecoder._encode_block_string(block)) + return block_strings + + +def initial_type(name, use_bias=False): + param_attr = ParamAttr(name=name + "_weights") + if use_bias: + bias_attr = ParamAttr(name=name + "_offset") + else: + bias_attr = False + return param_attr, bias_attr + + +def init_batch_norm_layer(name="batch_norm"): + param_attr = ParamAttr(name=name + "_scale") + bias_attr = ParamAttr(name=name + "_offset") + return param_attr, bias_attr + + +def init_fc_layer(name="fc"): + param_attr = ParamAttr(name=name + "_weights") + bias_attr = ParamAttr(name=name + "_offset") + return param_attr, bias_attr + + +def cal_padding(img_size, stride, filter_size, dilation=1): + """Calculate padding size.""" + if img_size % stride == 0: + out_size = max(filter_size - stride, 0) + else: + out_size = max(filter_size - (img_size % stride), 0) + return out_size // 2, out_size - out_size // 2 + + +inp_shape = { + "b0_small": [224, 112, 112, 56, 28, 14, 14, 7], + "b0": [224, 112, 112, 56, 28, 14, 14, 7], + "b1": [240, 120, 120, 60, 30, 15, 15, 8], + "b2": [260, 130, 130, 65, 33, 17, 17, 9], + "b3": [300, 150, 150, 75, 38, 19, 19, 10], + "b4": [380, 190, 190, 95, 48, 24, 24, 12], + "b5": [456, 228, 228, 114, 57, 29, 29, 15], + "b6": [528, 264, 264, 132, 66, 33, 33, 17], + "b7": [600, 300, 300, 150, 75, 38, 38, 19] +} + + +def _drop_connect(inputs, prob, is_test): + if is_test: + output = inputs + else: + keep_prob = 1.0 - prob + inputs_shape = paddle.shape(inputs) + random_tensor = keep_prob + paddle.rand( + shape=[inputs_shape[0], 1, 1, 1]) + binary_tensor = paddle.floor(random_tensor) + output = 
paddle.multiply(inputs, binary_tensor) / keep_prob + return output + + +class Conv2ds(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter_size, + stride=1, + padding=0, + groups=None, + name="conv2d", + act=None, + use_bias=False, + padding_type=None, + model_name=None, + cur_stage=None): + super(Conv2ds, self).__init__() + assert act in [None, "swish", "sigmoid"] + self.act = act + + param_attr, bias_attr = initial_type(name=name, use_bias=use_bias) + + def get_padding(filter_size, stride=1, dilation=1): + padding = ((stride - 1) + dilation * (filter_size - 1)) // 2 + return padding + + inps = 1 if model_name == None and cur_stage == None else inp_shape[ + model_name][cur_stage] + self.need_crop = False + if padding_type == "SAME": + top_padding, bottom_padding = cal_padding(inps, stride, + filter_size) + left_padding, right_padding = cal_padding(inps, stride, + filter_size) + height_padding = bottom_padding + width_padding = right_padding + if top_padding != bottom_padding or left_padding != right_padding: + height_padding = top_padding + stride + width_padding = left_padding + stride + self.need_crop = True + padding = [height_padding, width_padding] + elif padding_type == "VALID": + height_padding = 0 + width_padding = 0 + padding = [height_padding, width_padding] + elif padding_type == "DYNAMIC": + padding = get_padding(filter_size, stride) + else: + padding = padding_type + + groups = 1 if groups is None else groups + self._conv = Conv2D( + input_channels, + output_channels, + filter_size, + groups=groups, + stride=stride, + # act=act, + padding=padding, + weight_attr=param_attr, + bias_attr=bias_attr) + + def forward(self, inputs): + x = self._conv(inputs) + if self.act == "swish": + x = F.swish(x) + elif self.act == "sigmoid": + x = F.sigmoid(x) + + if self.need_crop: + x = x[:, :, 1:, 1:] + return x + + +class ConvBNLayer(nn.Layer): + def __init__(self, + input_channels, + filter_size, + output_channels, + stride=1, + num_groups=1, + padding_type="SAME", + conv_act=None, + bn_act="swish", + use_bn=True, + use_bias=False, + name=None, + conv_name=None, + bn_name=None, + model_name=None, + cur_stage=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2ds( + input_channels=input_channels, + output_channels=output_channels, + filter_size=filter_size, + stride=stride, + groups=num_groups, + act=conv_act, + padding_type=padding_type, + name=conv_name, + use_bias=use_bias, + model_name=model_name, + cur_stage=cur_stage) + self.use_bn = use_bn + if use_bn is True: + bn_name = name + bn_name + param_attr, bias_attr = init_batch_norm_layer(bn_name) + + self._bn = BatchNorm( + num_channels=output_channels, + act=bn_act, + momentum=0.99, + epsilon=0.001, + moving_mean_name=bn_name + "_mean", + moving_variance_name=bn_name + "_variance", + param_attr=param_attr, + bias_attr=bias_attr) + + def forward(self, inputs): + if self.use_bn: + x = self._conv(inputs) + x = self._bn(x) + return x + else: + return self._conv(inputs) + + +class ExpandConvNorm(nn.Layer): + def __init__(self, + input_channels, + block_args, + padding_type, + name=None, + model_name=None, + cur_stage=None): + super(ExpandConvNorm, self).__init__() + + self.oup = block_args.input_filters * block_args.expand_ratio + self.expand_ratio = block_args.expand_ratio + + if self.expand_ratio != 1: + self._conv = ConvBNLayer( + input_channels, + 1, + self.oup, + bn_act=None, + padding_type=padding_type, + name=name, + conv_name=name + "_expand_conv", + bn_name="_bn0", + model_name=model_name, + 
cur_stage=cur_stage) + + def forward(self, inputs): + if self.expand_ratio != 1: + return self._conv(inputs) + else: + return inputs + + +class DepthwiseConvNorm(nn.Layer): + def __init__(self, + input_channels, + block_args, + padding_type, + name=None, + model_name=None, + cur_stage=None): + super(DepthwiseConvNorm, self).__init__() + + self.k = block_args.kernel_size + self.s = block_args.stride + if isinstance(self.s, list) or isinstance(self.s, tuple): + self.s = self.s[0] + oup = block_args.input_filters * block_args.expand_ratio + + self._conv = ConvBNLayer( + input_channels, + self.k, + oup, + self.s, + num_groups=input_channels, + bn_act=None, + padding_type=padding_type, + name=name, + conv_name=name + "_depthwise_conv", + bn_name="_bn1", + model_name=model_name, + cur_stage=cur_stage) + + def forward(self, inputs): + return self._conv(inputs) + + +class ProjectConvNorm(nn.Layer): + def __init__(self, + input_channels, + block_args, + padding_type, + name=None, + model_name=None, + cur_stage=None): + super(ProjectConvNorm, self).__init__() + + final_oup = block_args.output_filters + + self._conv = ConvBNLayer( + input_channels, + 1, + final_oup, + bn_act=None, + padding_type=padding_type, + name=name, + conv_name=name + "_project_conv", + bn_name="_bn2", + model_name=model_name, + cur_stage=cur_stage) + + def forward(self, inputs): + return self._conv(inputs) + + +class SEBlock(nn.Layer): + def __init__(self, + input_channels, + num_squeezed_channels, + oup, + padding_type, + name=None, + model_name=None, + cur_stage=None): + super(SEBlock, self).__init__() + + self._pool = AdaptiveAvgPool2D(1) + self._conv1 = Conv2ds( + input_channels, + num_squeezed_channels, + 1, + use_bias=True, + padding_type=padding_type, + act="swish", + name=name + "_se_reduce") + + self._conv2 = Conv2ds( + num_squeezed_channels, + oup, + 1, + act="sigmoid", + use_bias=True, + padding_type=padding_type, + name=name + "_se_expand") + + def forward(self, inputs): + x = self._pool(inputs) + x = self._conv1(x) + x = self._conv2(x) + out = paddle.multiply(inputs, x) + return out + + +class MbConvBlock(nn.Layer): + def __init__(self, + input_channels, + block_args, + padding_type, + use_se, + name=None, + drop_connect_rate=None, + model_name=None, + cur_stage=None): + super(MbConvBlock, self).__init__() + + oup = block_args.input_filters * block_args.expand_ratio + self.block_args = block_args + self.has_se = use_se and (block_args.se_ratio is not None) and ( + 0 < block_args.se_ratio <= 1) + self.id_skip = block_args.id_skip + self.expand_ratio = block_args.expand_ratio + self.drop_connect_rate = drop_connect_rate + + if self.expand_ratio != 1: + self._ecn = ExpandConvNorm( + input_channels, + block_args, + padding_type=padding_type, + name=name, + model_name=model_name, + cur_stage=cur_stage) + + self._dcn = DepthwiseConvNorm( + input_channels * block_args.expand_ratio, + block_args, + padding_type=padding_type, + name=name, + model_name=model_name, + cur_stage=cur_stage) + + if self.has_se: + num_squeezed_channels = max( + 1, int(block_args.input_filters * block_args.se_ratio)) + self._se = SEBlock( + input_channels * block_args.expand_ratio, + num_squeezed_channels, + oup, + padding_type=padding_type, + name=name, + model_name=model_name, + cur_stage=cur_stage) + + self._pcn = ProjectConvNorm( + input_channels * block_args.expand_ratio, + block_args, + padding_type=padding_type, + name=name, + model_name=model_name, + cur_stage=cur_stage) + + def forward(self, inputs): + x = inputs + if self.expand_ratio != 
1: + x = self._ecn(x) + x = F.swish(x) + + x = self._dcn(x) + x = F.swish(x) + if self.has_se: + x = self._se(x) + x = self._pcn(x) + + if self.id_skip and \ + self.block_args.stride == 1 and \ + self.block_args.input_filters == self.block_args.output_filters: + if self.drop_connect_rate: + x = _drop_connect(x, self.drop_connect_rate, not self.training) + x = paddle.add(x, inputs) + return x + + +class ConvStemNorm(nn.Layer): + def __init__(self, + input_channels, + padding_type, + _global_params, + name=None, + model_name=None, + cur_stage=None): + super(ConvStemNorm, self).__init__() + + output_channels = round_filters(32, _global_params) + self._conv = ConvBNLayer( + input_channels, + filter_size=3, + output_channels=output_channels, + stride=2, + bn_act=None, + padding_type=padding_type, + name="", + conv_name="_conv_stem", + bn_name="_bn0", + model_name=model_name, + cur_stage=cur_stage) + + def forward(self, inputs): + return self._conv(inputs) + + +class ExtractFeatures(nn.Layer): + def __init__(self, + input_channels, + _block_args, + _global_params, + padding_type, + use_se, + model_name=None): + super(ExtractFeatures, self).__init__() + + self._global_params = _global_params + + self._conv_stem = ConvStemNorm( + input_channels, + padding_type=padding_type, + _global_params=_global_params, + model_name=model_name, + cur_stage=0) + + self.block_args_copy = copy.deepcopy(_block_args) + idx = 0 + block_size = 0 + for block_arg in self.block_args_copy: + block_arg = block_arg._replace( + input_filters=round_filters(block_arg.input_filters, + _global_params), + output_filters=round_filters(block_arg.output_filters, + _global_params), + num_repeat=round_repeats(block_arg.num_repeat, _global_params)) + block_size += 1 + for _ in range(block_arg.num_repeat - 1): + block_size += 1 + + self.conv_seq = [] + cur_stage = 1 + for block_args in _block_args: + block_args = block_args._replace( + input_filters=round_filters(block_args.input_filters, + _global_params), + output_filters=round_filters(block_args.output_filters, + _global_params), + num_repeat=round_repeats(block_args.num_repeat, + _global_params)) + + drop_connect_rate = self._global_params.drop_connect_rate + if drop_connect_rate: + drop_connect_rate *= float(idx) / block_size + + _mc_block = self.add_sublayer( + "_blocks." + str(idx) + ".", + MbConvBlock( + block_args.input_filters, + block_args=block_args, + padding_type=padding_type, + use_se=use_se, + name="_blocks." + str(idx) + ".", + drop_connect_rate=drop_connect_rate, + model_name=model_name, + cur_stage=cur_stage)) + self.conv_seq.append(_mc_block) + idx += 1 + if block_args.num_repeat > 1: + block_args = block_args._replace( + input_filters=block_args.output_filters, stride=1) + for _ in range(block_args.num_repeat - 1): + drop_connect_rate = self._global_params.drop_connect_rate + if drop_connect_rate: + drop_connect_rate *= float(idx) / block_size + _mc_block = self.add_sublayer( + "block." + str(idx) + ".", + MbConvBlock( + block_args.input_filters, + block_args, + padding_type=padding_type, + use_se=use_se, + name="_blocks." 
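`MbConvBlock.forward` above calls `_drop_connect`, which is defined earlier in this file; as a point of reference, here is a minimal sketch of the standard drop-connect (stochastic depth) computation it performs — the helper below is an illustrative stand-in, not this file's exact implementation:

import paddle

def drop_connect_sketch(x, prob, is_test):
    # Inference keeps the branch untouched: training already rescales
    # the surviving samples by 1 / keep_prob.
    if is_test:
        return x
    keep_prob = 1.0 - prob
    # One Bernoulli draw per sample: floor(keep_prob + U[0, 1)) is 1
    # with probability keep_prob, else 0.
    random_tensor = keep_prob + paddle.rand([x.shape[0], 1, 1, 1])
    binary_mask = paddle.floor(random_tensor)
    return x / keep_prob * binary_mask

Note also that `ExtractFeatures` scales `drop_connect_rate` by `idx / block_size`, so later blocks are dropped more aggressively than early ones.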
+ str(idx) + ".", + drop_connect_rate=drop_connect_rate, + model_name=model_name, + cur_stage=cur_stage)) + self.conv_seq.append(_mc_block) + idx += 1 + cur_stage += 1 + + def forward(self, inputs): + x = self._conv_stem(inputs) + x = F.swish(x) + for _mc_block in self.conv_seq: + x = _mc_block(x) + return x + + +class EfficientNet(nn.Layer): + def __init__(self, + name="b0", + padding_type="SAME", + override_params=None, + use_se=True, + class_num=1000): + super(EfficientNet, self).__init__() + + model_name = 'efficientnet-' + name + self.name = name + self._block_args, self._global_params = get_model_params( + model_name, override_params) + self.padding_type = padding_type + self.use_se = use_se + + self._ef = ExtractFeatures( + 3, + self._block_args, + self._global_params, + self.padding_type, + self.use_se, + model_name=self.name) + + output_channels = round_filters(1280, self._global_params) + if name == "b0_small" or name == "b0" or name == "b1": + oup = 320 + elif name == "b2": + oup = 352 + elif name == "b3": + oup = 384 + elif name == "b4": + oup = 448 + elif name == "b5": + oup = 512 + elif name == "b6": + oup = 576 + elif name == "b7": + oup = 640 + self._conv = ConvBNLayer( + oup, + 1, + output_channels, + bn_act="swish", + padding_type=self.padding_type, + name="", + conv_name="_conv_head", + bn_name="_bn1", + model_name=self.name, + cur_stage=7) + self._pool = AdaptiveAvgPool2D(1) + + if self._global_params.dropout_rate: + self._drop = Dropout( + p=self._global_params.dropout_rate, mode="upscale_in_train") + + param_attr, bias_attr = init_fc_layer("_fc") + self._fc = Linear( + output_channels, + class_num, + weight_attr=param_attr, + bias_attr=bias_attr) + + def forward(self, inputs): + x = self._ef(inputs) + x = self._conv(x) + x = self._pool(x) + if self._global_params.dropout_rate: + x = self._drop(x) + x = paddle.squeeze(x, axis=[2, 3]) + x = self._fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
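The hard-coded `oup` table above is the last block's output width per variant, i.e. 320 scaled by each model's width coefficient (320 x 1.1 = 352 for b2, 320 x 2.0 = 640 for b7). The scaling itself is done by `round_filters`, defined earlier in this file; a simplified sketch of the usual EfficientNet rounding rule (the real helper also honors `min_depth` and skips scaling when no multiplier is set):

def round_filters_sketch(filters, width_coefficient, depth_divisor=8):
    filters *= width_coefficient
    new_filters = int(filters + depth_divisor / 2) // depth_divisor * depth_divisor
    new_filters = max(depth_divisor, new_filters)
    if new_filters < 0.9 * filters:  # never round down by more than 10%
        new_filters += depth_divisor
    return int(new_filters)

print(round_filters_sketch(1280, 1.0))  # 1280 for b0
print(round_filters_sketch(1280, 2.0))  # 2560 for b7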
+ ) + + +def EfficientNetB0_small(padding_type='DYNAMIC', + override_params=None, + use_se=False, + pretrained=False, + use_ssld=False, + **kwargs): + model = EfficientNet( + name='b0', + padding_type=padding_type, + override_params=override_params, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB0_small"], use_ssld=use_ssld) + return model + + +def EfficientNetB0(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + model = EfficientNet( + name='b0', + padding_type=padding_type, + override_params=override_params, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB0"], use_ssld=use_ssld) + return model + + +def EfficientNetB1(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + model = EfficientNet( + name='b1', + padding_type=padding_type, + override_params=override_params, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB1"], use_ssld=use_ssld) + return model + + +def EfficientNetB2(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + model = EfficientNet( + name='b2', + padding_type=padding_type, + override_params=override_params, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB2"], use_ssld=use_ssld) + return model + + +def EfficientNetB3(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + model = EfficientNet( + name='b3', + padding_type=padding_type, + override_params=override_params, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB3"], use_ssld=use_ssld) + return model + + +def EfficientNetB4(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + model = EfficientNet( + name='b4', + padding_type=padding_type, + override_params=override_params, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB4"], use_ssld=use_ssld) + return model + + +def EfficientNetB5(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + model = EfficientNet( + name='b5', + padding_type=padding_type, + override_params=override_params, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB5"], use_ssld=use_ssld) + return model + + +def EfficientNetB6(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + model = EfficientNet( + name='b6', + padding_type=padding_type, + override_params=override_params, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB6"], use_ssld=use_ssld) + return model + + +def EfficientNetB7(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + model = EfficientNet( + name='b7', + padding_type=padding_type, + override_params=override_params, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB7"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/ghostnet.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/ghostnet.py new file mode 100644 index 0000000..4d338c1 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/ghostnet.py @@ -0,0 +1,363 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
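A usage sketch for the builders above (illustrative; downloading pretrained weights requires network access):

import paddle

model = EfficientNetB0(pretrained=False, class_num=1000)
x = paddle.randn([1, 3, 224, 224])
logits = model(x)
print(logits.shape)  # [1, 1000]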
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/huawei-noah/CV-Backbones/tree/master/ghostnet_pytorch + +import math +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, AdaptiveAvgPool2D, Linear +from paddle.regularizer import L2Decay +from paddle.nn.initializer import Uniform, KaimingNormal + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "GhostNet_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/GhostNet_x0_5_pretrained.pdparams", + "GhostNet_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/GhostNet_x1_0_pretrained.pdparams", + "GhostNet_x1_3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/GhostNet_x1_3_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act="relu", + name=None): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr( + initializer=KaimingNormal(), name=name + "_weights"), + bias_attr=False) + bn_name = name + "_bn" + + self._batch_norm = BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr( + name=bn_name + "_scale", regularizer=L2Decay(0.0)), + bias_attr=ParamAttr( + name=bn_name + "_offset", regularizer=L2Decay(0.0)), + moving_mean_name=bn_name + "_mean", + moving_variance_name=bn_name + "_variance") + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class SEBlock(nn.Layer): + def __init__(self, num_channels, reduction_ratio=4, name=None): + super(SEBlock, self).__init__() + self.pool2d_gap = AdaptiveAvgPool2D(1) + self._num_channels = num_channels + stdv = 1.0 / math.sqrt(num_channels * 1.0) + med_ch = num_channels // reduction_ratio + self.squeeze = Linear( + num_channels, + med_ch, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_1_weights"), + bias_attr=ParamAttr(name=name + "_1_offset")) + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = Linear( + med_ch, + num_channels, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_2_weights"), + bias_attr=ParamAttr(name=name + "_2_offset")) + + def forward(self, inputs): + pool = self.pool2d_gap(inputs) + pool = paddle.squeeze(pool, axis=[2, 3]) + squeeze = self.squeeze(pool) + squeeze = F.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = paddle.clip(x=excitation, min=0, max=1) + excitation = paddle.unsqueeze(excitation, axis=[2, 3]) + out = paddle.multiply(inputs, excitation) + return out + + +class GhostModule(nn.Layer): + def __init__(self, + in_channels, + output_channels, + kernel_size=1, + ratio=2, + dw_size=3, + 
stride=1, + relu=True, + name=None): + super(GhostModule, self).__init__() + init_channels = int(math.ceil(output_channels / ratio)) + new_channels = int(init_channels * (ratio - 1)) + self.primary_conv = ConvBNLayer( + in_channels=in_channels, + out_channels=init_channels, + kernel_size=kernel_size, + stride=stride, + groups=1, + act="relu" if relu else None, + name=name + "_primary_conv") + self.cheap_operation = ConvBNLayer( + in_channels=init_channels, + out_channels=new_channels, + kernel_size=dw_size, + stride=1, + groups=init_channels, + act="relu" if relu else None, + name=name + "_cheap_operation") + + def forward(self, inputs): + x = self.primary_conv(inputs) + y = self.cheap_operation(x) + out = paddle.concat([x, y], axis=1) + return out + + +class GhostBottleneck(nn.Layer): + def __init__(self, + in_channels, + hidden_dim, + output_channels, + kernel_size, + stride, + use_se, + name=None): + super(GhostBottleneck, self).__init__() + self._stride = stride + self._use_se = use_se + self._num_channels = in_channels + self._output_channels = output_channels + self.ghost_module_1 = GhostModule( + in_channels=in_channels, + output_channels=hidden_dim, + kernel_size=1, + stride=1, + relu=True, + name=name + "_ghost_module_1") + if stride == 2: + self.depthwise_conv = ConvBNLayer( + in_channels=hidden_dim, + out_channels=hidden_dim, + kernel_size=kernel_size, + stride=stride, + groups=hidden_dim, + act=None, + name=name + + "_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. + ) + if use_se: + self.se_block = SEBlock(num_channels=hidden_dim, name=name + "_se") + self.ghost_module_2 = GhostModule( + in_channels=hidden_dim, + output_channels=output_channels, + kernel_size=1, + relu=False, + name=name + "_ghost_module_2") + if stride != 1 or in_channels != output_channels: + self.shortcut_depthwise = ConvBNLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + stride=stride, + groups=in_channels, + act=None, + name=name + + "_shortcut_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. 
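The channel bookkeeping behind `GhostModule` above: the primary 1x1 conv produces ceil(output_channels / ratio) features and the cheap depthwise conv expands them by a factor of (ratio - 1), so the concat yields init_channels * ratio features. A quick check (illustrative; the widths GhostNet passes in are pre-rounded by `_make_divisible`, so the concat lands exactly on the requested width):

import math

def ghost_channels(output_channels, ratio=2):
    init_channels = int(math.ceil(output_channels / ratio))
    new_channels = int(init_channels * (ratio - 1))
    return init_channels, new_channels

print(ghost_channels(16))  # (8, 8) -> concat gives exactly 16
print(ghost_channels(15))  # (8, 8) -> 16; an odd width would overshoot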
+ ) + self.shortcut_conv = ConvBNLayer( + in_channels=in_channels, + out_channels=output_channels, + kernel_size=1, + stride=1, + groups=1, + act=None, + name=name + "_shortcut_conv") + + def forward(self, inputs): + x = self.ghost_module_1(inputs) + if self._stride == 2: + x = self.depthwise_conv(x) + if self._use_se: + x = self.se_block(x) + x = self.ghost_module_2(x) + if self._stride == 1 and self._num_channels == self._output_channels: + shortcut = inputs + else: + shortcut = self.shortcut_depthwise(inputs) + shortcut = self.shortcut_conv(shortcut) + return paddle.add(x=x, y=shortcut) + + +class GhostNet(nn.Layer): + def __init__(self, scale, class_num=1000): + super(GhostNet, self).__init__() + self.cfgs = [ + # k, t, c, SE, s + [3, 16, 16, 0, 1], + [3, 48, 24, 0, 2], + [3, 72, 24, 0, 1], + [5, 72, 40, 1, 2], + [5, 120, 40, 1, 1], + [3, 240, 80, 0, 2], + [3, 200, 80, 0, 1], + [3, 184, 80, 0, 1], + [3, 184, 80, 0, 1], + [3, 480, 112, 1, 1], + [3, 672, 112, 1, 1], + [5, 672, 160, 1, 2], + [5, 960, 160, 0, 1], + [5, 960, 160, 1, 1], + [5, 960, 160, 0, 1], + [5, 960, 160, 1, 1] + ] + self.scale = scale + output_channels = int(self._make_divisible(16 * self.scale, 4)) + self.conv1 = ConvBNLayer( + in_channels=3, + out_channels=output_channels, + kernel_size=3, + stride=2, + groups=1, + act="relu", + name="conv1") + # build inverted residual blocks + idx = 0 + self.ghost_bottleneck_list = [] + for k, exp_size, c, use_se, s in self.cfgs: + in_channels = output_channels + output_channels = int(self._make_divisible(c * self.scale, 4)) + hidden_dim = int(self._make_divisible(exp_size * self.scale, 4)) + ghost_bottleneck = self.add_sublayer( + name="_ghostbottleneck_" + str(idx), + sublayer=GhostBottleneck( + in_channels=in_channels, + hidden_dim=hidden_dim, + output_channels=output_channels, + kernel_size=k, + stride=s, + use_se=use_se, + name="_ghostbottleneck_" + str(idx))) + self.ghost_bottleneck_list.append(ghost_bottleneck) + idx += 1 + # build last several layers + in_channels = output_channels + output_channels = int(self._make_divisible(exp_size * self.scale, 4)) + self.conv_last = ConvBNLayer( + in_channels=in_channels, + out_channels=output_channels, + kernel_size=1, + stride=1, + groups=1, + act="relu", + name="conv_last") + self.pool2d_gap = AdaptiveAvgPool2D(1) + in_channels = output_channels + self._fc0_output_channels = 1280 + self.fc_0 = ConvBNLayer( + in_channels=in_channels, + out_channels=self._fc0_output_channels, + kernel_size=1, + stride=1, + act="relu", + name="fc_0") + self.dropout = nn.Dropout(p=0.2) + stdv = 1.0 / math.sqrt(self._fc0_output_channels * 1.0) + self.fc_1 = Linear( + self._fc0_output_channels, + class_num, + weight_attr=ParamAttr( + name="fc_1_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr(name="fc_1_offset")) + + def forward(self, inputs): + x = self.conv1(inputs) + for ghost_bottleneck in self.ghost_bottleneck_list: + x = ghost_bottleneck(x) + x = self.conv_last(x) + x = self.pool2d_gap(x) + x = self.fc_0(x) + x = self.dropout(x) + x = paddle.reshape(x, shape=[-1, self._fc0_output_channels]) + x = self.fc_1(x) + return x + + def _make_divisible(self, v, divisor, min_value=None): + """ + This function is taken from the original tf repo. 
+ It ensures that all layers have a channel number that is divisible by 8 + It can be seen here: + https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py + """ + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def GhostNet_x0_5(pretrained=False, use_ssld=False, **kwargs): + model = GhostNet(scale=0.5, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["GhostNet_x0_5"], use_ssld=use_ssld) + return model + + +def GhostNet_x1_0(pretrained=False, use_ssld=False, **kwargs): + model = GhostNet(scale=1.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["GhostNet_x1_0"], use_ssld=use_ssld) + return model + + +def GhostNet_x1_3(pretrained=False, use_ssld=False, **kwargs): + model = GhostNet(scale=1.3, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["GhostNet_x1_3"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/googlenet.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/googlenet.py new file mode 100644 index 0000000..2252842 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/googlenet.py @@ -0,0 +1,229 @@ +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "GoogLeNet": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/GoogLeNet_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +def xavier(channels, filter_size, name): + stdv = (3.0 / (filter_size**2 * channels))**0.5 + param_attr = ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_weights") + return param_attr + + +class ConvLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + + def forward(self, inputs): + y = self._conv(inputs) + return y + + +class Inception(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter1, + filter3R, + filter3, + filter5R, + filter5, + proj, + name=None): + super(Inception, self).__init__() + + self._conv1 = ConvLayer( + input_channels, filter1, 1, name="inception_" + name + "_1x1") + self._conv3r = ConvLayer( + input_channels, + filter3R, + 1, + name="inception_" + name + "_3x3_reduce") + self._conv3 = ConvLayer( + filter3R, filter3, 3, name="inception_" + name + "_3x3") + self._conv5r = ConvLayer( + input_channels, + filter5R, + 1, + name="inception_" + name + 
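Each `Inception` block defined here concatenates four branches (1x1, 3x3, 5x5, and the pooling projection), so its output width is filter1 + filter3 + filter5 + proj; the `output_channels` argument is carried in the signature but never used. A quick check against the stage widths wired up in `GoogLeNetDY` below:

def inception_width(filter1, filter3, filter5, proj):
    return filter1 + filter3 + filter5 + proj

assert inception_width(64, 128, 32, 32) == 256   # ince3a output
assert inception_width(128, 192, 96, 64) == 480  # ince3b output -> ince4a input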
"_5x5_reduce") + self._conv5 = ConvLayer( + filter5R, filter5, 5, name="inception_" + name + "_5x5") + self._pool = MaxPool2D(kernel_size=3, stride=1, padding=1) + + self._convprj = ConvLayer( + input_channels, proj, 1, name="inception_" + name + "_3x3_proj") + + def forward(self, inputs): + conv1 = self._conv1(inputs) + + conv3r = self._conv3r(inputs) + conv3 = self._conv3(conv3r) + + conv5r = self._conv5r(inputs) + conv5 = self._conv5(conv5r) + + pool = self._pool(inputs) + convprj = self._convprj(pool) + + cat = paddle.concat([conv1, conv3, conv5, convprj], axis=1) + cat = F.relu(cat) + return cat + + +class GoogLeNetDY(nn.Layer): + def __init__(self, class_num=1000): + super(GoogLeNetDY, self).__init__() + self._conv = ConvLayer(3, 64, 7, 2, name="conv1") + self._pool = MaxPool2D(kernel_size=3, stride=2) + self._conv_1 = ConvLayer(64, 64, 1, name="conv2_1x1") + self._conv_2 = ConvLayer(64, 192, 3, name="conv2_3x3") + + self._ince3a = Inception( + 192, 192, 64, 96, 128, 16, 32, 32, name="ince3a") + self._ince3b = Inception( + 256, 256, 128, 128, 192, 32, 96, 64, name="ince3b") + + self._ince4a = Inception( + 480, 480, 192, 96, 208, 16, 48, 64, name="ince4a") + self._ince4b = Inception( + 512, 512, 160, 112, 224, 24, 64, 64, name="ince4b") + self._ince4c = Inception( + 512, 512, 128, 128, 256, 24, 64, 64, name="ince4c") + self._ince4d = Inception( + 512, 512, 112, 144, 288, 32, 64, 64, name="ince4d") + self._ince4e = Inception( + 528, 528, 256, 160, 320, 32, 128, 128, name="ince4e") + + self._ince5a = Inception( + 832, 832, 256, 160, 320, 32, 128, 128, name="ince5a") + self._ince5b = Inception( + 832, 832, 384, 192, 384, 48, 128, 128, name="ince5b") + + self._pool_5 = AdaptiveAvgPool2D(1) + + self._drop = Dropout(p=0.4, mode="downscale_in_infer") + self._fc_out = Linear( + 1024, + class_num, + weight_attr=xavier(1024, 1, "out"), + bias_attr=ParamAttr(name="out_offset")) + self._pool_o1 = AvgPool2D(kernel_size=5, stride=3) + self._conv_o1 = ConvLayer(512, 128, 1, name="conv_o1") + self._fc_o1 = Linear( + 1152, + 1024, + weight_attr=xavier(2048, 1, "fc_o1"), + bias_attr=ParamAttr(name="fc_o1_offset")) + self._drop_o1 = Dropout(p=0.7, mode="downscale_in_infer") + self._out1 = Linear( + 1024, + class_num, + weight_attr=xavier(1024, 1, "out1"), + bias_attr=ParamAttr(name="out1_offset")) + self._pool_o2 = AvgPool2D(kernel_size=5, stride=3) + self._conv_o2 = ConvLayer(528, 128, 1, name="conv_o2") + self._fc_o2 = Linear( + 1152, + 1024, + weight_attr=xavier(2048, 1, "fc_o2"), + bias_attr=ParamAttr(name="fc_o2_offset")) + self._drop_o2 = Dropout(p=0.7, mode="downscale_in_infer") + self._out2 = Linear( + 1024, + class_num, + weight_attr=xavier(1024, 1, "out2"), + bias_attr=ParamAttr(name="out2_offset")) + + def forward(self, inputs): + x = self._conv(inputs) + x = self._pool(x) + x = self._conv_1(x) + x = self._conv_2(x) + x = self._pool(x) + + x = self._ince3a(x) + x = self._ince3b(x) + x = self._pool(x) + + ince4a = self._ince4a(x) + x = self._ince4b(ince4a) + x = self._ince4c(x) + ince4d = self._ince4d(x) + x = self._ince4e(ince4d) + x = self._pool(x) + + x = self._ince5a(x) + ince5b = self._ince5b(x) + + x = self._pool_5(ince5b) + x = self._drop(x) + x = paddle.squeeze(x, axis=[2, 3]) + out = self._fc_out(x) + + x = self._pool_o1(ince4a) + x = self._conv_o1(x) + x = paddle.flatten(x, start_axis=1, stop_axis=-1) + x = self._fc_o1(x) + x = F.relu(x) + x = self._drop_o1(x) + out1 = self._out1(x) + + x = self._pool_o2(ince4d) + x = self._conv_o2(x) + x = paddle.flatten(x, start_axis=1, 
stop_axis=-1) + x = self._fc_o2(x) + x = self._drop_o2(x) + out2 = self._out2(x) + return [out, out1, out2] + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def GoogLeNet(pretrained=False, use_ssld=False, **kwargs): + model = GoogLeNetDY(**kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["GoogLeNet"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/gvt.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/gvt.py new file mode 100644 index 0000000..2af7ccf --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/gvt.py @@ -0,0 +1,693 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/Meituan-AutoML/Twins + +from functools import partial + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.regularizer import L2Decay + +from .vision_transformer import trunc_normal_, normal_, zeros_, ones_, to_2tuple, DropPath, Identity, Mlp +from .vision_transformer import Block as ViTBlock + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "pcpvt_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/pcpvt_small_pretrained.pdparams", + "pcpvt_base": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/pcpvt_base_pretrained.pdparams", + "pcpvt_large": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/pcpvt_large_pretrained.pdparams", + "alt_gvt_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/alt_gvt_small_pretrained.pdparams", + "alt_gvt_base": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/alt_gvt_base_pretrained.pdparams", + "alt_gvt_large": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/alt_gvt_large_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class GroupAttention(nn.Layer): + """LSA: self attention within a group. 
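`GoogLeNetDY.forward` above returns `[out, out1, out2]` — the main logits plus two auxiliary heads — so training code has to fold the auxiliaries into the loss. A hedged sketch: the 0.3 weighting follows the original GoogLeNet paper, while PaddleClas itself configures this through its loss settings:

import paddle.nn.functional as F

def googlenet_loss(outputs, label, aux_weight=0.3):
    out, out1, out2 = outputs
    # The auxiliary classifiers only regularize training;
    # inference uses `out` alone.
    loss = F.cross_entropy(out, label)
    loss = loss + aux_weight * F.cross_entropy(out1, label)
    loss = loss + aux_weight * F.cross_entropy(out2, label)
    return loss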
+ """ + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + ws=1): + super().__init__() + if ws == 1: + raise Exception("ws {ws} should not be 1") + if dim % num_heads != 0: + raise Exception( + "dim {dim} should be divided by num_heads {num_heads}.") + + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.ws = ws + + def forward(self, x, H, W): + B, N, C = x.shape + h_group, w_group = H // self.ws, W // self.ws + total_groups = h_group * w_group + x = x.reshape([B, h_group, self.ws, w_group, self.ws, C]).transpose( + [0, 1, 3, 2, 4, 5]) + qkv = self.qkv(x).reshape([ + B, total_groups, self.ws**2, 3, self.num_heads, C // self.num_heads + ]).transpose([3, 0, 1, 4, 2, 5]) + q, k, v = qkv[0], qkv[1], qkv[2] + attn = paddle.matmul(q, k.transpose([0, 1, 2, 4, 3])) * self.scale + + attn = nn.Softmax(axis=-1)(attn) + attn = self.attn_drop(attn) + attn = paddle.matmul(attn, v).transpose([0, 1, 3, 2, 4]).reshape( + [B, h_group, w_group, self.ws, self.ws, C]) + + x = attn.transpose([0, 1, 3, 2, 4, 5]).reshape([B, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Attention(nn.Layer): + """GSA: using a key to summarize the information for a group to be efficient. + """ + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + sr_ratio=1): + super().__init__() + assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." + + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.q = nn.Linear(dim, dim, bias_attr=qkv_bias) + self.kv = nn.Linear(dim, dim * 2, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.sr_ratio = sr_ratio + if sr_ratio > 1: + self.sr = nn.Conv2D( + dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.norm = nn.LayerNorm(dim) + + def forward(self, x, H, W): + B, N, C = x.shape + q = self.q(x).reshape( + [B, N, self.num_heads, C // self.num_heads]).transpose( + [0, 2, 1, 3]) + + if self.sr_ratio > 1: + x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W]) + tmp_n = H * W // self.sr_ratio**2 + x_ = self.sr(x_).reshape([B, C, tmp_n]).transpose([0, 2, 1]) + x_ = self.norm(x_) + kv = self.kv(x_).reshape( + [B, tmp_n, 2, self.num_heads, C // self.num_heads]).transpose( + [2, 0, 3, 1, 4]) + else: + kv = self.kv(x).reshape( + [B, N, 2, self.num_heads, C // self.num_heads]).transpose( + [2, 0, 3, 1, 4]) + k, v = kv[0], kv[1] + + attn = paddle.matmul(q, k.transpose([0, 1, 3, 2])) * self.scale + attn = nn.Softmax(axis=-1)(attn) + attn = self.attn_drop(attn) + + x = paddle.matmul(attn, v).transpose([0, 2, 1, 3]).reshape([B, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + sr_ratio=1): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + sr_ratio=sr_ratio) + self.drop_path = 
DropPath(drop_path) if drop_path > 0. else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class SBlock(ViTBlock): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + sr_ratio=1): + super().__init__(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop, + attn_drop, drop_path, act_layer, norm_layer) + + def forward(self, x, H, W): + return super().forward(x) + + +class GroupBlock(ViTBlock): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + sr_ratio=1, + ws=1): + super().__init__(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop, + attn_drop, drop_path, act_layer, norm_layer) + del self.attn + if ws == 1: + self.attn = Attention(dim, num_heads, qkv_bias, qk_scale, + attn_drop, drop, sr_ratio) + else: + self.attn = GroupAttention(dim, num_heads, qkv_bias, qk_scale, + attn_drop, drop, ws) + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding. + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + if img_size % patch_size != 0: + raise Exception( + f"img_size {img_size} should be divided by patch_size {patch_size}." 
+ ) + + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + + self.img_size = img_size + self.patch_size = patch_size + self.H, self.W = img_size[0] // patch_size[0], img_size[ + 1] // patch_size[1] + self.num_patches = self.H * self.W + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.norm = nn.LayerNorm(embed_dim) + + def forward(self, x): + B, C, H, W = x.shape + x = self.proj(x).flatten(2).transpose([0, 2, 1]) + x = self.norm(x) + H, W = H // self.patch_size[0], W // self.patch_size[1] + return x, (H, W) + + +# borrowed from PVT https://github.com/whai362/PVT.git +class PyramidVisionTransformer(nn.Layer): + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + class_num=1000, + embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + block_cls=Block): + super().__init__() + self.class_num = class_num + self.depths = depths + + # patch_embed + self.patch_embeds = nn.LayerList() + self.pos_embeds = nn.ParameterList() + self.pos_drops = nn.LayerList() + self.blocks = nn.LayerList() + + for i in range(len(depths)): + if i == 0: + self.patch_embeds.append( + PatchEmbed(img_size, patch_size, in_chans, embed_dims[i])) + else: + self.patch_embeds.append( + PatchEmbed(img_size // patch_size // 2**(i - 1), 2, + embed_dims[i - 1], embed_dims[i])) + patch_num = self.patch_embeds[i].num_patches + 1 if i == len( + embed_dims) - 1 else self.patch_embeds[i].num_patches + self.pos_embeds.append( + self.create_parameter( + shape=[1, patch_num, embed_dims[i]], + default_initializer=zeros_)) + self.pos_drops.append(nn.Dropout(p=drop_rate)) + + dpr = [ + x.numpy()[0] + for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + cur = 0 + for k in range(len(depths)): + _block = nn.LayerList([ + block_cls( + dim=embed_dims[k], + num_heads=num_heads[k], + mlp_ratio=mlp_ratios[k], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + i], + norm_layer=norm_layer, + sr_ratio=sr_ratios[k]) for i in range(depths[k]) + ]) + self.blocks.append(_block) + cur += depths[k] + + self.norm = norm_layer(embed_dims[-1]) + + # cls_token + self.cls_token = self.create_parameter( + shape=[1, 1, embed_dims[-1]], + default_initializer=zeros_, + attr=paddle.ParamAttr(regularizer=L2Decay(0.0))) + + # classification head + self.head = nn.Linear(embed_dims[-1], + class_num) if class_num > 0 else Identity() + + # init weights + for pos_emb in self.pos_embeds: + trunc_normal_(pos_emb) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + B = x.shape[0] + for i in range(len(self.depths)): + x, (H, W) = self.patch_embeds[i](x) + if i == len(self.depths) - 1: + cls_tokens = self.cls_token.expand([B, -1, -1]) + x = paddle.concat([cls_tokens, x], axis=1) + x = x + self.pos_embeds[i] + x = self.pos_drops[i](x) + for blk in self.blocks[i]: + x = blk(x, H, W) + if i < len(self.depths) - 1: + x = x.reshape([B, H, W, -1]).transpose( + [0, 3, 1, 2]) + x = self.norm(x) + return x[:, 0] + + def forward(self, x): + x = 
self.forward_features(x) + x = self.head(x) + return x + + +# PEG from https://arxiv.org/abs/2102.10882 +class PosCNN(nn.Layer): + def __init__(self, in_chans, embed_dim=768, s=1): + super().__init__() + self.proj = nn.Sequential( + nn.Conv2D( + in_chans, + embed_dim, + 3, + s, + 1, + bias_attr=paddle.ParamAttr(regularizer=L2Decay(0.0)), + groups=embed_dim, + weight_attr=paddle.ParamAttr(regularizer=L2Decay(0.0)), )) + self.s = s + + def forward(self, x, H, W): + B, N, C = x.shape + feat_token = x + cnn_feat = feat_token.transpose([0, 2, 1]).reshape([B, C, H, W]) + if self.s == 1: + x = self.proj(cnn_feat) + cnn_feat + else: + x = self.proj(cnn_feat) + x = x.flatten(2).transpose([0, 2, 1]) + return x + + +class CPVTV2(PyramidVisionTransformer): + """ + Uses two results from CPVT, PEG and GAP, so the cls token is no longer required. + PEG encodes the absolute position on the fly, which matters when the input resolution + changes between training and inference (e.g. for segmentation or detection). + """ + + def __init__(self, + img_size=224, + patch_size=4, + in_chans=3, + class_num=1000, + embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + block_cls=Block): + super().__init__(img_size, patch_size, in_chans, class_num, embed_dims, + num_heads, mlp_ratios, qkv_bias, qk_scale, drop_rate, + attn_drop_rate, drop_path_rate, norm_layer, depths, + sr_ratios, block_cls) + del self.pos_embeds + del self.cls_token + self.pos_block = nn.LayerList( + [PosCNN(embed_dim, embed_dim) for embed_dim in embed_dims]) + self.apply(self._init_weights) + + def _init_weights(self, m): + import math + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + fan_out //= m._groups + normal_(0, math.sqrt(2.0 / fan_out))(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2D): + ones_(m.weight) + zeros_(m.bias) + + def forward_features(self, x): + B = x.shape[0] + + for i in range(len(self.depths)): + x, (H, W) = self.patch_embeds[i](x) + x = self.pos_drops[i](x) + + for j, blk in enumerate(self.blocks[i]): + x = blk(x, H, W) + if j == 0: + x = self.pos_block[i](x, H, W) # PEG here + + if i < len(self.depths) - 1: + x = x.reshape([B, H, W, x.shape[-1]]).transpose([0, 3, 1, 2]) + + x = self.norm(x) + return x.mean(axis=1) # GAP here + + +class PCPVT(CPVTV2): + def __init__(self, + img_size=224, + patch_size=4, + in_chans=3, + class_num=1000, + embed_dims=[64, 128, 256], + num_heads=[1, 2, 4], + mlp_ratios=[4, 4, 4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + depths=[4, 4, 4], + sr_ratios=[4, 2, 1], + block_cls=SBlock): + super().__init__(img_size, patch_size, in_chans, class_num, embed_dims, + num_heads, mlp_ratios, qkv_bias, qk_scale, drop_rate, + attn_drop_rate, drop_path_rate, norm_layer, depths, + sr_ratios, block_cls) + + +class ALTGVT(PCPVT): + """ + alias Twins-SVT + """ + + def __init__(self, + img_size=224, + patch_size=4, + in_chans=3, + class_num=1000, + embed_dims=[64, 128, 256], + num_heads=[1, 2, 4], + mlp_ratios=[4, 4, 
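`PosCNN` above is the PEG module from the paper linked in the comment: a 3x3 depthwise conv over the re-gridded token map, added back residually when s == 1, so the positional signal is recomputed for whatever resolution arrives. A small shape check (assumes the definitions above are in scope):

import paddle

B, H, W, C = 2, 14, 14, 64
tokens = paddle.randn([B, H * W, C])
peg = PosCNN(in_chans=C, embed_dim=C)
out = peg(tokens, H, W)
print(out.shape)  # [2, 196, 64] -- token layout unchanged, now position-aware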
4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + depths=[4, 4, 4], + sr_ratios=[4, 2, 1], + block_cls=GroupBlock, + wss=[7, 7, 7]): + super().__init__(img_size, patch_size, in_chans, class_num, embed_dims, + num_heads, mlp_ratios, qkv_bias, qk_scale, drop_rate, + attn_drop_rate, drop_path_rate, norm_layer, depths, + sr_ratios, block_cls) + del self.blocks + self.wss = wss + # transformer encoder + dpr = [ + x.numpy()[0] + for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + cur = 0 + self.blocks = nn.LayerList() + for k in range(len(depths)): + _block = nn.LayerList([ + block_cls( + dim=embed_dims[k], + num_heads=num_heads[k], + mlp_ratio=mlp_ratios[k], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + i], + norm_layer=norm_layer, + sr_ratio=sr_ratios[k], + ws=1 if i % 2 == 1 else wss[k]) for i in range(depths[k]) + ]) + self.blocks.append(_block) + cur += depths[k] + self.apply(self._init_weights) + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def pcpvt_small(pretrained=False, use_ssld=False, **kwargs): + model = CPVTV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["pcpvt_small"], use_ssld=use_ssld) + return model + + +def pcpvt_base(pretrained=False, use_ssld=False, **kwargs): + model = CPVTV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 4, 18, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["pcpvt_base"], use_ssld=use_ssld) + return model + + +def pcpvt_large(pretrained=False, use_ssld=False, **kwargs): + model = CPVTV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 8, 27, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["pcpvt_large"], use_ssld=use_ssld) + return model + + +def alt_gvt_small(pretrained=False, use_ssld=False, **kwargs): + model = ALTGVT( + patch_size=4, + embed_dims=[64, 128, 256, 512], + num_heads=[2, 4, 8, 16], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 10, 4], + wss=[7, 7, 7, 7], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["alt_gvt_small"], use_ssld=use_ssld) + return model + + +def alt_gvt_base(pretrained=False, use_ssld=False, **kwargs): + model = ALTGVT( + patch_size=4, + embed_dims=[96, 192, 384, 768], + num_heads=[3, 6, 12, 24], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 18, 2], + wss=[7, 7, 7, 7], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + 
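The `ws=1 if i % 2 == 1 else wss[k]` line above is the core of Twins-SVT: within every stage, even-indexed blocks run windowed local attention (LSA, ws = 7) and odd-indexed blocks run globally sub-sampled attention (GSA, since ws == 1 routes `GroupBlock` to `Attention`). The resulting schedule, for illustration:

depths = [2, 2, 10, 4]  # alt_gvt_small
for k, d in enumerate(depths):
    schedule = ["GSA" if i % 2 == 1 else "LSA(ws=7)" for i in range(d)]
    print(f"stage {k}: {schedule}")
# every stage alternates LSA, GSA, LSA, GSA, ...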
pretrained, model, MODEL_URLS["alt_gvt_base"], use_ssld=use_ssld) + return model + + +def alt_gvt_large(pretrained=False, use_ssld=False, **kwargs): + model = ALTGVT( + patch_size=4, + embed_dims=[128, 256, 512, 1024], + num_heads=[4, 8, 16, 32], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 18, 2], + wss=[7, 7, 7, 7], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["alt_gvt_large"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/hardnet.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/hardnet.py new file mode 100644 index 0000000..fffd3a4 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/hardnet.py @@ -0,0 +1,293 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/PingoLH/Pytorch-HarDNet + +import paddle +import paddle.nn as nn + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + 'HarDNet39_ds': + 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/HarDNet39_ds_pretrained.pdparams', + 'HarDNet68_ds': + 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/HarDNet68_ds_pretrained.pdparams', + 'HarDNet68': + 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/HarDNet68_pretrained.pdparams', + 'HarDNet85': + 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/HarDNet85_pretrained.pdparams' +} + +__all__ = MODEL_URLS.keys() + + +def ConvLayer(in_channels, + out_channels, + kernel_size=3, + stride=1, + bias_attr=False): + layer = nn.Sequential( + ('conv', nn.Conv2D( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=kernel_size // 2, + groups=1, + bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels)), + ('relu', nn.ReLU6())) + return layer + + +def DWConvLayer(in_channels, + out_channels, + kernel_size=3, + stride=1, + bias_attr=False): + layer = nn.Sequential( + ('dwconv', nn.Conv2D( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=1, + groups=out_channels, + bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels))) + return layer + + +def CombConvLayer(in_channels, out_channels, kernel_size=1, stride=1): + layer = nn.Sequential( + ('layer1', ConvLayer( + in_channels, out_channels, kernel_size=kernel_size)), + ('layer2', DWConvLayer( + out_channels, out_channels, stride=stride))) + return layer + + +class HarDBlock(nn.Layer): + def __init__(self, + in_channels, + growth_rate, + grmul, + n_layers, + keepBase=False, + residual_out=False, + dwconv=False): + super().__init__() + self.keepBase = keepBase + self.links = [] + layers_ = [] + self.out_channels = 0 # if upsample else in_channels + for i in range(n_layers): + outch, inch, link = self.get_link(i + 1, in_channels, growth_rate, + grmul) + self.links.append(link) + if dwconv: + 
layers_.append(CombConvLayer(inch, outch)) + else: + layers_.append(ConvLayer(inch, outch)) + + if (i % 2 == 0) or (i == n_layers - 1): + self.out_channels += outch + # print("Blk out =",self.out_channels) + self.layers = nn.LayerList(layers_) + + def get_link(self, layer, base_ch, growth_rate, grmul): + if layer == 0: + return base_ch, 0, [] + out_channels = growth_rate + + link = [] + for i in range(10): + dv = 2**i + if layer % dv == 0: + k = layer - dv + link.append(k) + if i > 0: + out_channels *= grmul + + out_channels = int(int(out_channels + 1) / 2) * 2 + in_channels = 0 + + for i in link: + ch, _, _ = self.get_link(i, base_ch, growth_rate, grmul) + in_channels += ch + + return out_channels, in_channels, link + + def forward(self, x): + layers_ = [x] + + for layer in range(len(self.layers)): + link = self.links[layer] + tin = [] + for i in link: + tin.append(layers_[i]) + if len(tin) > 1: + x = paddle.concat(tin, 1) + else: + x = tin[0] + out = self.layers[layer](x) + layers_.append(out) + + t = len(layers_) + out_ = [] + for i in range(t): + if (i == 0 and self.keepBase) or (i == t - 1) or (i % 2 == 1): + out_.append(layers_[i]) + out = paddle.concat(out_, 1) + + return out + + +class HarDNet(nn.Layer): + def __init__(self, + depth_wise=False, + arch=85, + class_num=1000, + with_pool=True): + super().__init__() + first_ch = [32, 64] + second_kernel = 3 + max_pool = True + grmul = 1.7 + drop_rate = 0.1 + + # HarDNet68 + ch_list = [128, 256, 320, 640, 1024] + gr = [14, 16, 20, 40, 160] + n_layers = [8, 16, 16, 16, 4] + downSamp = [1, 0, 1, 1, 0] + + if arch == 85: + # HarDNet85 + first_ch = [48, 96] + ch_list = [192, 256, 320, 480, 720, 1280] + gr = [24, 24, 28, 36, 48, 256] + n_layers = [8, 16, 16, 16, 16, 4] + downSamp = [1, 0, 1, 0, 1, 0] + drop_rate = 0.2 + + elif arch == 39: + # HarDNet39 + first_ch = [24, 48] + ch_list = [96, 320, 640, 1024] + grmul = 1.6 + gr = [16, 20, 64, 160] + n_layers = [4, 16, 8, 4] + downSamp = [1, 1, 1, 0] + + if depth_wise: + second_kernel = 1 + max_pool = False + drop_rate = 0.05 + + blks = len(n_layers) + self.base = nn.LayerList([]) + + # First Layer: Standard Conv3x3, Stride=2 + self.base.append( + ConvLayer( + in_channels=3, + out_channels=first_ch[0], + kernel_size=3, + stride=2, + bias_attr=False)) + + # Second Layer + self.base.append( + ConvLayer( + first_ch[0], first_ch[1], kernel_size=second_kernel)) + + # Maxpooling or DWConv3x3 downsampling + if max_pool: + self.base.append(nn.MaxPool2D(kernel_size=3, stride=2, padding=1)) + else: + self.base.append(DWConvLayer(first_ch[1], first_ch[1], stride=2)) + + # Build all HarDNet blocks + ch = first_ch[1] + for i in range(blks): + blk = HarDBlock(ch, gr[i], grmul, n_layers[i], dwconv=depth_wise) + ch = blk.out_channels + self.base.append(blk) + + if i == blks - 1 and arch == 85: + self.base.append(nn.Dropout(0.1)) + + self.base.append(ConvLayer(ch, ch_list[i], kernel_size=1)) + ch = ch_list[i] + if downSamp[i] == 1: + if max_pool: + self.base.append(nn.MaxPool2D(kernel_size=2, stride=2)) + else: + self.base.append(DWConvLayer(ch, ch, stride=2)) + + ch = ch_list[blks - 1] + + layers = [] + + if with_pool: + layers.append(nn.AdaptiveAvgPool2D((1, 1))) + + if class_num > 0: + layers.append(nn.Flatten()) + layers.append(nn.Dropout(drop_rate)) + layers.append(nn.Linear(ch, class_num)) + + self.base.append(nn.Sequential(*layers)) + + def forward(self, x): + for layer in self.base: + x = layer(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is 
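`HarDBlock.get_link` above connects layer L to the earlier layers at power-of-two distances (L - 1, L - 2, L - 4, ...) and widens the output by `grmul` for every extra hop, rounding to an even channel count. A standalone trace of that rule for the first block of HarDNet68 (growth_rate 14, grmul 1.7):

def trace_links(n_layers, growth_rate=14, grmul=1.7):
    # Re-derives the HarDBlock link pattern for inspection.
    for layer in range(1, n_layers + 1):
        out, link = growth_rate, []
        for i in range(10):
            dv = 2**i
            if layer % dv == 0:
                link.append(layer - dv)
                if i > 0:
                    out *= grmul
        print(layer, link, int(int(out + 1) / 2) * 2)

trace_links(4)
# 1 [0] 14
# 2 [1, 0] 24
# 3 [2] 14
# 4 [3, 2, 0] 40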
False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def HarDNet39_ds(pretrained=False, **kwargs): + model = HarDNet(arch=39, depth_wise=True, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HarDNet39_ds"]) + return model + + +def HarDNet68_ds(pretrained=False, **kwargs): + model = HarDNet(arch=68, depth_wise=True, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HarDNet68_ds"]) + return model + + +def HarDNet68(pretrained=False, **kwargs): + model = HarDNet(arch=68, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HarDNet68"]) + return model + + +def HarDNet85(pretrained=False, **kwargs): + model = HarDNet(arch=85, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HarDNet85"]) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/inception_v4.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/inception_v4.py new file mode 100644 index 0000000..e0460d4 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/inception_v4.py @@ -0,0 +1,477 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "InceptionV4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/InceptionV4_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + padding=0, + groups=1, + act='relu', + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + bn_name = name + "_bn" + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + "_scale"), + bias_attr=ParamAttr(name=bn_name + "_offset"), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class InceptionStem(nn.Layer): + def __init__(self): + super(InceptionStem, self).__init__() + self._conv_1 = ConvBNLayer( + 3, 32, 3, stride=2, act="relu", name="conv1_3x3_s2") + self._conv_2 = ConvBNLayer(32, 32, 3, act="relu", name="conv2_3x3_s1") + self._conv_3 = ConvBNLayer( + 32, 64, 3, padding=1, act="relu", name="conv3_3x3_s1") + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + self._conv2 = ConvBNLayer( + 64, 96, 3, stride=2, act="relu", name="inception_stem1_3x3_s2") + self._conv1_1 = ConvBNLayer( + 160, 64, 1, act="relu", name="inception_stem2_3x3_reduce") + self._conv1_2 = ConvBNLayer( + 64, 96, 3, act="relu", name="inception_stem2_3x3") + self._conv2_1 = ConvBNLayer( + 160, 64, 1, act="relu", name="inception_stem2_1x7_reduce") + self._conv2_2 = ConvBNLayer( + 64, + 64, (7, 1), + padding=(3, 0), + act="relu", + name="inception_stem2_1x7") + self._conv2_3 = ConvBNLayer( + 64, + 64, (1, 7), + padding=(0, 3), + act="relu", + name="inception_stem2_7x1") + self._conv2_4 = ConvBNLayer( + 64, 96, 3, act="relu", name="inception_stem2_3x3_2") + self._conv3 = ConvBNLayer( + 192, 192, 3, stride=2, act="relu", name="inception_stem3_3x3_s2") + + def forward(self, inputs): + conv = self._conv_1(inputs) + conv = self._conv_2(conv) + conv = self._conv_3(conv) + + pool1 = self._pool(conv) + conv2 = self._conv2(conv) + concat = paddle.concat([pool1, conv2], axis=1) + + conv1 = self._conv1_1(concat) + conv1 = self._conv1_2(conv1) + + conv2 = self._conv2_1(concat) + conv2 = self._conv2_2(conv2) + conv2 = self._conv2_3(conv2) + conv2 = self._conv2_4(conv2) + + concat = paddle.concat([conv1, conv2], axis=1) + + conv1 = self._conv3(concat) + pool1 = self._pool(concat) + + concat = paddle.concat([conv1, pool1], axis=1) + return concat + + +class InceptionA(nn.Layer): + def __init__(self, name): + super(InceptionA, self).__init__() + self._pool = AvgPool2D(kernel_size=3, stride=1, padding=1) + self._conv1 = ConvBNLayer( + 384, 96, 1, act="relu", name="inception_a" + name + "_1x1") + self._conv2 = ConvBNLayer( + 384, 96, 1, act="relu", name="inception_a" + name + "_1x1_2") + self._conv3_1 = ConvBNLayer( + 384, 64, 1, act="relu", name="inception_a" + name + "_3x3_reduce") + 
self._conv3_2 = ConvBNLayer( + 64, + 96, + 3, + padding=1, + act="relu", + name="inception_a" + name + "_3x3") + self._conv4_1 = ConvBNLayer( + 384, + 64, + 1, + act="relu", + name="inception_a" + name + "_3x3_2_reduce") + self._conv4_2 = ConvBNLayer( + 64, + 96, + 3, + padding=1, + act="relu", + name="inception_a" + name + "_3x3_2") + self._conv4_3 = ConvBNLayer( + 96, + 96, + 3, + padding=1, + act="relu", + name="inception_a" + name + "_3x3_3") + + def forward(self, inputs): + pool1 = self._pool(inputs) + conv1 = self._conv1(pool1) + + conv2 = self._conv2(inputs) + + conv3 = self._conv3_1(inputs) + conv3 = self._conv3_2(conv3) + + conv4 = self._conv4_1(inputs) + conv4 = self._conv4_2(conv4) + conv4 = self._conv4_3(conv4) + + concat = paddle.concat([conv1, conv2, conv3, conv4], axis=1) + return concat + + +class ReductionA(nn.Layer): + def __init__(self): + super(ReductionA, self).__init__() + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + self._conv2 = ConvBNLayer( + 384, 384, 3, stride=2, act="relu", name="reduction_a_3x3") + self._conv3_1 = ConvBNLayer( + 384, 192, 1, act="relu", name="reduction_a_3x3_2_reduce") + self._conv3_2 = ConvBNLayer( + 192, 224, 3, padding=1, act="relu", name="reduction_a_3x3_2") + self._conv3_3 = ConvBNLayer( + 224, 256, 3, stride=2, act="relu", name="reduction_a_3x3_3") + + def forward(self, inputs): + pool1 = self._pool(inputs) + conv2 = self._conv2(inputs) + conv3 = self._conv3_1(inputs) + conv3 = self._conv3_2(conv3) + conv3 = self._conv3_3(conv3) + concat = paddle.concat([pool1, conv2, conv3], axis=1) + return concat + + +class InceptionB(nn.Layer): + def __init__(self, name=None): + super(InceptionB, self).__init__() + self._pool = AvgPool2D(kernel_size=3, stride=1, padding=1) + self._conv1 = ConvBNLayer( + 1024, 128, 1, act="relu", name="inception_b" + name + "_1x1") + self._conv2 = ConvBNLayer( + 1024, 384, 1, act="relu", name="inception_b" + name + "_1x1_2") + self._conv3_1 = ConvBNLayer( + 1024, + 192, + 1, + act="relu", + name="inception_b" + name + "_1x7_reduce") + self._conv3_2 = ConvBNLayer( + 192, + 224, (1, 7), + padding=(0, 3), + act="relu", + name="inception_b" + name + "_1x7") + self._conv3_3 = ConvBNLayer( + 224, + 256, (7, 1), + padding=(3, 0), + act="relu", + name="inception_b" + name + "_7x1") + self._conv4_1 = ConvBNLayer( + 1024, + 192, + 1, + act="relu", + name="inception_b" + name + "_7x1_2_reduce") + self._conv4_2 = ConvBNLayer( + 192, + 192, (1, 7), + padding=(0, 3), + act="relu", + name="inception_b" + name + "_1x7_2") + self._conv4_3 = ConvBNLayer( + 192, + 224, (7, 1), + padding=(3, 0), + act="relu", + name="inception_b" + name + "_7x1_2") + self._conv4_4 = ConvBNLayer( + 224, + 224, (1, 7), + padding=(0, 3), + act="relu", + name="inception_b" + name + "_1x7_3") + self._conv4_5 = ConvBNLayer( + 224, + 256, (7, 1), + padding=(3, 0), + act="relu", + name="inception_b" + name + "_7x1_3") + + def forward(self, inputs): + pool1 = self._pool(inputs) + conv1 = self._conv1(pool1) + + conv2 = self._conv2(inputs) + + conv3 = self._conv3_1(inputs) + conv3 = self._conv3_2(conv3) + conv3 = self._conv3_3(conv3) + + conv4 = self._conv4_1(inputs) + conv4 = self._conv4_2(conv4) + conv4 = self._conv4_3(conv4) + conv4 = self._conv4_4(conv4) + conv4 = self._conv4_5(conv4) + + concat = paddle.concat([conv1, conv2, conv3, conv4], axis=1) + return concat + + +class ReductionB(nn.Layer): + def __init__(self): + super(ReductionB, self).__init__() + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + self._conv2_1 = ConvBNLayer( + 
1024, 192, 1, act="relu", name="reduction_b_3x3_reduce") + self._conv2_2 = ConvBNLayer( + 192, 192, 3, stride=2, act="relu", name="reduction_b_3x3") + self._conv3_1 = ConvBNLayer( + 1024, 256, 1, act="relu", name="reduction_b_1x7_reduce") + self._conv3_2 = ConvBNLayer( + 256, + 256, (1, 7), + padding=(0, 3), + act="relu", + name="reduction_b_1x7") + self._conv3_3 = ConvBNLayer( + 256, + 320, (7, 1), + padding=(3, 0), + act="relu", + name="reduction_b_7x1") + self._conv3_4 = ConvBNLayer( + 320, 320, 3, stride=2, act="relu", name="reduction_b_3x3_2") + + def forward(self, inputs): + pool1 = self._pool(inputs) + + conv2 = self._conv2_1(inputs) + conv2 = self._conv2_2(conv2) + + conv3 = self._conv3_1(inputs) + conv3 = self._conv3_2(conv3) + conv3 = self._conv3_3(conv3) + conv3 = self._conv3_4(conv3) + + concat = paddle.concat([pool1, conv2, conv3], axis=1) + + return concat + + +class InceptionC(nn.Layer): + def __init__(self, name=None): + super(InceptionC, self).__init__() + self._pool = AvgPool2D(kernel_size=3, stride=1, padding=1) + self._conv1 = ConvBNLayer( + 1536, 256, 1, act="relu", name="inception_c" + name + "_1x1") + self._conv2 = ConvBNLayer( + 1536, 256, 1, act="relu", name="inception_c" + name + "_1x1_2") + self._conv3_0 = ConvBNLayer( + 1536, 384, 1, act="relu", name="inception_c" + name + "_1x1_3") + self._conv3_1 = ConvBNLayer( + 384, + 256, (1, 3), + padding=(0, 1), + act="relu", + name="inception_c" + name + "_1x3") + self._conv3_2 = ConvBNLayer( + 384, + 256, (3, 1), + padding=(1, 0), + act="relu", + name="inception_c" + name + "_3x1") + self._conv4_0 = ConvBNLayer( + 1536, 384, 1, act="relu", name="inception_c" + name + "_1x1_4") + self._conv4_00 = ConvBNLayer( + 384, + 448, (1, 3), + padding=(0, 1), + act="relu", + name="inception_c" + name + "_1x3_2") + self._conv4_000 = ConvBNLayer( + 448, + 512, (3, 1), + padding=(1, 0), + act="relu", + name="inception_c" + name + "_3x1_2") + self._conv4_1 = ConvBNLayer( + 512, + 256, (1, 3), + padding=(0, 1), + act="relu", + name="inception_c" + name + "_1x3_3") + self._conv4_2 = ConvBNLayer( + 512, + 256, (3, 1), + padding=(1, 0), + act="relu", + name="inception_c" + name + "_3x1_3") + + def forward(self, inputs): + pool1 = self._pool(inputs) + conv1 = self._conv1(pool1) + + conv2 = self._conv2(inputs) + + conv3 = self._conv3_0(inputs) + conv3_1 = self._conv3_1(conv3) + conv3_2 = self._conv3_2(conv3) + + conv4 = self._conv4_0(inputs) + conv4 = self._conv4_00(conv4) + conv4 = self._conv4_000(conv4) + conv4_1 = self._conv4_1(conv4) + conv4_2 = self._conv4_2(conv4) + + concat = paddle.concat( + [conv1, conv2, conv3_1, conv3_2, conv4_1, conv4_2], axis=1) + + return concat + + +class InceptionV4DY(nn.Layer): + def __init__(self, class_num=1000): + super(InceptionV4DY, self).__init__() + self._inception_stem = InceptionStem() + + self._inceptionA_1 = InceptionA(name="1") + self._inceptionA_2 = InceptionA(name="2") + self._inceptionA_3 = InceptionA(name="3") + self._inceptionA_4 = InceptionA(name="4") + self._reductionA = ReductionA() + + self._inceptionB_1 = InceptionB(name="1") + self._inceptionB_2 = InceptionB(name="2") + self._inceptionB_3 = InceptionB(name="3") + self._inceptionB_4 = InceptionB(name="4") + self._inceptionB_5 = InceptionB(name="5") + self._inceptionB_6 = InceptionB(name="6") + self._inceptionB_7 = InceptionB(name="7") + self._reductionB = ReductionB() + + self._inceptionC_1 = InceptionC(name="1") + self._inceptionC_2 = InceptionC(name="2") + self._inceptionC_3 = InceptionC(name="3") + + self.avg_pool = 
AdaptiveAvgPool2D(1) + self._drop = Dropout(p=0.2, mode="downscale_in_infer") + stdv = 1.0 / math.sqrt(1536 * 1.0) + self.out = Linear( + 1536, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="final_fc_weights"), + bias_attr=ParamAttr(name="final_fc_offset")) + + def forward(self, inputs): + x = self._inception_stem(inputs) + + x = self._inceptionA_1(x) + x = self._inceptionA_2(x) + x = self._inceptionA_3(x) + x = self._inceptionA_4(x) + x = self._reductionA(x) + + x = self._inceptionB_1(x) + x = self._inceptionB_2(x) + x = self._inceptionB_3(x) + x = self._inceptionB_4(x) + x = self._inceptionB_5(x) + x = self._inceptionB_6(x) + x = self._inceptionB_7(x) + x = self._reductionB(x) + + x = self._inceptionC_1(x) + x = self._inceptionC_2(x) + x = self._inceptionC_3(x) + + x = self.avg_pool(x) + x = paddle.squeeze(x, axis=[2, 3]) + x = self._drop(x) + x = self.out(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def InceptionV4(pretrained=False, use_ssld=False, **kwargs): + model = InceptionV4DY(**kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["InceptionV4"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/levit.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/levit.py new file mode 100644 index 0000000..991f832 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/levit.py @@ -0,0 +1,589 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
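Every backbone file in this patch closes with the same loading convention seen in the InceptionV4 factory above: `_load_pretrained` accepts `False` (random initialization), `True` (download the weights registered in `MODEL_URLS`), or a string path to a local `.pdparams` file, and anything else raises a `RuntimeError`. A minimal usage sketch, assuming `src/PaddleClas` is on `PYTHONPATH`; the module path and the 299x299 input size are illustrative assumptions, not part of this diff:

import paddle
# assumed module path for the InceptionV4 file shown above
from ppcls.arch.backbone.model_zoo.inception_v4 import InceptionV4

model = InceptionV4(pretrained=False, class_num=1000)  # pretrained=True would fetch MODEL_URLS["InceptionV4"]
model.eval()
x = paddle.randn([1, 3, 299, 299])  # Inception-v4 is conventionally evaluated at 299x299
logits = model(x)  # shape [1, 1000]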
+ +# Code was based on https://github.com/facebookresearch/LeViT + +import itertools +import math +import warnings + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import TruncatedNormal, Constant +from paddle.regularizer import L2Decay + +from .vision_transformer import trunc_normal_, zeros_, ones_, Identity + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "LeViT_128S": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/LeViT_128S_pretrained.pdparams", + "LeViT_128": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/LeViT_128_pretrained.pdparams", + "LeViT_192": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/LeViT_192_pretrained.pdparams", + "LeViT_256": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/LeViT_256_pretrained.pdparams", + "LeViT_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/LeViT_384_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +def cal_attention_biases(attention_biases, attention_bias_idxs): + gather_list = [] + attention_bias_t = paddle.transpose(attention_biases, (1, 0)) + nums = attention_bias_idxs.shape[0] + for idx in range(nums): + gather = paddle.gather(attention_bias_t, attention_bias_idxs[idx]) + gather_list.append(gather) + shape0, shape1 = attention_bias_idxs.shape + gather = paddle.concat(gather_list) + return paddle.transpose(gather, (1, 0)).reshape((0, shape0, shape1)) + + +class Conv2d_BN(nn.Sequential): + def __init__(self, + a, + b, + ks=1, + stride=1, + pad=0, + dilation=1, + groups=1, + bn_weight_init=1, + resolution=-10000): + super().__init__() + self.add_sublayer( + 'c', + nn.Conv2D( + a, b, ks, stride, pad, dilation, groups, bias_attr=False)) + bn = nn.BatchNorm2D(b) + ones_(bn.weight) + zeros_(bn.bias) + self.add_sublayer('bn', bn) + + +class Linear_BN(nn.Sequential): + def __init__(self, a, b, bn_weight_init=1): + super().__init__() + self.add_sublayer('c', nn.Linear(a, b, bias_attr=False)) + bn = nn.BatchNorm1D(b) + if bn_weight_init == 0: + zeros_(bn.weight) + else: + ones_(bn.weight) + zeros_(bn.bias) + self.add_sublayer('bn', bn) + + def forward(self, x): + l, bn = self._sub_layers.values() + x = l(x) + return paddle.reshape(bn(x.flatten(0, 1)), x.shape) + + +class BN_Linear(nn.Sequential): + def __init__(self, a, b, bias=True, std=0.02): + super().__init__() + self.add_sublayer('bn', nn.BatchNorm1D(a)) + l = nn.Linear(a, b, bias_attr=bias) + trunc_normal_(l.weight) + if bias: + zeros_(l.bias) + self.add_sublayer('l', l) + + +def b16(n, activation, resolution=224): + return nn.Sequential( + Conv2d_BN( + 3, n // 8, 3, 2, 1, resolution=resolution), + activation(), + Conv2d_BN( + n // 8, n // 4, 3, 2, 1, resolution=resolution // 2), + activation(), + Conv2d_BN( + n // 4, n // 2, 3, 2, 1, resolution=resolution // 4), + activation(), + Conv2d_BN( + n // 2, n, 3, 2, 1, resolution=resolution // 8)) + + +class Residual(nn.Layer): + def __init__(self, m, drop): + super().__init__() + self.m = m + self.drop = drop + + def forward(self, x): + if self.training and self.drop > 0: + y = paddle.rand( + shape=[x.shape[0], 1, 1]).__ge__(self.drop).astype("float32") + y = y.divide(paddle.full_like(y, 1 - self.drop)) + return paddle.add(x, self.m(x) * y) + else: + return paddle.add(x, self.m(x)) + + +class Attention(nn.Layer): + def __init__(self, + dim, + key_dim, + num_heads=8, + attn_ratio=4, + activation=None, + resolution=14): + super().__init__() + self.num_heads = 
num_heads + self.scale = key_dim**-0.5 + self.key_dim = key_dim + self.nh_kd = nh_kd = key_dim * num_heads + self.d = int(attn_ratio * key_dim) + self.dh = int(attn_ratio * key_dim) * num_heads + self.attn_ratio = attn_ratio + self.h = self.dh + nh_kd * 2 + self.qkv = Linear_BN(dim, self.h) + self.proj = nn.Sequential( + activation(), Linear_BN( + self.dh, dim, bn_weight_init=0)) + points = list(itertools.product(range(resolution), range(resolution))) + N = len(points) + attention_offsets = {} + idxs = [] + for p1 in points: + for p2 in points: + offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1])) + if offset not in attention_offsets: + attention_offsets[offset] = len(attention_offsets) + idxs.append(attention_offsets[offset]) + self.attention_biases = self.create_parameter( + shape=(num_heads, len(attention_offsets)), + default_initializer=zeros_, + attr=paddle.ParamAttr(regularizer=L2Decay(0.0))) + tensor_idxs = paddle.to_tensor(idxs, dtype='int64') + self.register_buffer('attention_bias_idxs', + paddle.reshape(tensor_idxs, [N, N])) + + @paddle.no_grad() + def train(self, mode=True): + if mode: + super().train() + else: + super().eval() + if mode and hasattr(self, 'ab'): + del self.ab + else: + self.ab = cal_attention_biases(self.attention_biases, + self.attention_bias_idxs) + + def forward(self, x): + self.training = True + B, N, C = x.shape + qkv = self.qkv(x) + qkv = paddle.reshape(qkv, + [B, N, self.num_heads, self.h // self.num_heads]) + q, k, v = paddle.split( + qkv, [self.key_dim, self.key_dim, self.d], axis=3) + q = paddle.transpose(q, perm=[0, 2, 1, 3]) + k = paddle.transpose(k, perm=[0, 2, 1, 3]) + v = paddle.transpose(v, perm=[0, 2, 1, 3]) + k_transpose = paddle.transpose(k, perm=[0, 1, 3, 2]) + + if self.training: + attention_biases = cal_attention_biases(self.attention_biases, + self.attention_bias_idxs) + else: + attention_biases = self.ab + attn = (paddle.matmul(q, k_transpose) * self.scale + attention_biases) + attn = F.softmax(attn) + x = paddle.transpose(paddle.matmul(attn, v), perm=[0, 2, 1, 3]) + x = paddle.reshape(x, [B, N, self.dh]) + x = self.proj(x) + return x + + +class Subsample(nn.Layer): + def __init__(self, stride, resolution): + super().__init__() + self.stride = stride + self.resolution = resolution + + def forward(self, x): + B, N, C = x.shape + x = paddle.reshape(x, [B, self.resolution, self.resolution, C]) + end1, end2 = x.shape[1], x.shape[2] + x = x[:, 0:end1:self.stride, 0:end2:self.stride] + x = paddle.reshape(x, [B, -1, C]) + return x + + +class AttentionSubsample(nn.Layer): + def __init__(self, + in_dim, + out_dim, + key_dim, + num_heads=8, + attn_ratio=2, + activation=None, + stride=2, + resolution=14, + resolution_=7): + super().__init__() + self.num_heads = num_heads + self.scale = key_dim**-0.5 + self.key_dim = key_dim + self.nh_kd = nh_kd = key_dim * num_heads + self.d = int(attn_ratio * key_dim) + self.dh = int(attn_ratio * key_dim) * self.num_heads + self.attn_ratio = attn_ratio + self.resolution_ = resolution_ + self.resolution_2 = resolution_**2 + self.training = True + h = self.dh + nh_kd + self.kv = Linear_BN(in_dim, h) + + self.q = nn.Sequential( + Subsample(stride, resolution), Linear_BN(in_dim, nh_kd)) + self.proj = nn.Sequential(activation(), Linear_BN(self.dh, out_dim)) + + self.stride = stride + self.resolution = resolution + points = list(itertools.product(range(resolution), range(resolution))) + points_ = list( + itertools.product(range(resolution_), range(resolution_))) + + N = len(points) + N_ = len(points_) + attention_offsets = 
{} + idxs = [] + i = 0 + j = 0 + for p1 in points_: + i += 1 + for p2 in points: + j += 1 + size = 1 + offset = (abs(p1[0] * stride - p2[0] + (size - 1) / 2), + abs(p1[1] * stride - p2[1] + (size - 1) / 2)) + if offset not in attention_offsets: + attention_offsets[offset] = len(attention_offsets) + idxs.append(attention_offsets[offset]) + self.attention_biases = self.create_parameter( + shape=(num_heads, len(attention_offsets)), + default_initializer=zeros_, + attr=paddle.ParamAttr(regularizer=L2Decay(0.0))) + + tensor_idxs_ = paddle.to_tensor(idxs, dtype='int64') + self.register_buffer('attention_bias_idxs', + paddle.reshape(tensor_idxs_, [N_, N])) + + @paddle.no_grad() + def train(self, mode=True): + if mode: + super().train() + else: + super().eval() + if mode and hasattr(self, 'ab'): + del self.ab + else: + self.ab = cal_attention_biases(self.attention_biases, + self.attention_bias_idxs) + + def forward(self, x): + self.training = True + B, N, C = x.shape + kv = self.kv(x) + kv = paddle.reshape(kv, [B, N, self.num_heads, -1]) + k, v = paddle.split(kv, [self.key_dim, self.d], axis=3) + k = paddle.transpose(k, perm=[0, 2, 1, 3]) # BHNC + v = paddle.transpose(v, perm=[0, 2, 1, 3]) + q = paddle.reshape( + self.q(x), [B, self.resolution_2, self.num_heads, self.key_dim]) + q = paddle.transpose(q, perm=[0, 2, 1, 3]) + + if self.training: + attention_biases = cal_attention_biases(self.attention_biases, + self.attention_bias_idxs) + else: + attention_biases = self.ab + + attn = (paddle.matmul( + q, paddle.transpose( + k, perm=[0, 1, 3, 2]))) * self.scale + attention_biases + attn = F.softmax(attn) + + x = paddle.reshape( + paddle.transpose( + paddle.matmul(attn, v), perm=[0, 2, 1, 3]), [B, -1, self.dh]) + x = self.proj(x) + return x + + +class LeViT(nn.Layer): + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + class_num=1000, + embed_dim=[192], + key_dim=[64], + depth=[12], + num_heads=[3], + attn_ratio=[2], + mlp_ratio=[2], + hybrid_backbone=None, + down_ops=[], + attention_activation=nn.Hardswish, + mlp_activation=nn.Hardswish, + distillation=True, + drop_path=0): + super().__init__() + + self.class_num = class_num + self.num_features = embed_dim[-1] + self.embed_dim = embed_dim + self.distillation = distillation + + self.patch_embed = hybrid_backbone + + self.blocks = [] + down_ops.append(['']) + resolution = img_size // patch_size + for i, (ed, kd, dpth, nh, ar, mr, do) in enumerate( + zip(embed_dim, key_dim, depth, num_heads, attn_ratio, + mlp_ratio, down_ops)): + for _ in range(dpth): + self.blocks.append( + Residual( + Attention( + ed, + kd, + nh, + attn_ratio=ar, + activation=attention_activation, + resolution=resolution, ), + drop_path)) + if mr > 0: + h = int(ed * mr) + self.blocks.append( + Residual( + nn.Sequential( + Linear_BN(ed, h), + mlp_activation(), + Linear_BN( + h, ed, bn_weight_init=0), ), + drop_path)) + if do[0] == 'Subsample': + #('Subsample',key_dim, num_heads, attn_ratio, mlp_ratio, stride) + resolution_ = (resolution - 1) // do[5] + 1 + self.blocks.append( + AttentionSubsample( + *embed_dim[i:i + 2], + key_dim=do[1], + num_heads=do[2], + attn_ratio=do[3], + activation=attention_activation, + stride=do[5], + resolution=resolution, + resolution_=resolution_)) + resolution = resolution_ + if do[4] > 0: # mlp_ratio + h = int(embed_dim[i + 1] * do[4]) + self.blocks.append( + Residual( + nn.Sequential( + Linear_BN(embed_dim[i + 1], h), + mlp_activation(), + Linear_BN( + h, 
embed_dim[i + 1], bn_weight_init=0), ), + drop_path)) + self.blocks = nn.Sequential(*self.blocks) + + # Classifier head + self.head = BN_Linear(embed_dim[-1], + class_num) if class_num > 0 else Identity() + if distillation: + self.head_dist = BN_Linear( + embed_dim[-1], class_num) if class_num > 0 else Identity() + + def forward(self, x): + x = self.patch_embed(x) + x = x.flatten(2) + x = paddle.transpose(x, perm=[0, 2, 1]) + x = self.blocks(x) + x = x.mean(1) + + x = paddle.reshape(x, [-1, self.embed_dim[-1]]) + if self.distillation: + x = self.head(x), self.head_dist(x) + if not self.training: + x = (x[0] + x[1]) / 2 + else: + x = self.head(x) + return x + + +def model_factory(C, D, X, N, drop_path, class_num, distillation): + embed_dim = [int(x) for x in C.split('_')] + num_heads = [int(x) for x in N.split('_')] + depth = [int(x) for x in X.split('_')] + act = nn.Hardswish + model = LeViT( + patch_size=16, + embed_dim=embed_dim, + num_heads=num_heads, + key_dim=[D] * 3, + depth=depth, + attn_ratio=[2, 2, 2], + mlp_ratio=[2, 2, 2], + down_ops=[ + #('Subsample',key_dim, num_heads, attn_ratio, mlp_ratio, stride) + ['Subsample', D, embed_dim[0] // D, 4, 2, 2], + ['Subsample', D, embed_dim[1] // D, 4, 2, 2], + ], + attention_activation=act, + mlp_activation=act, + hybrid_backbone=b16(embed_dim[0], activation=act), + class_num=class_num, + drop_path=drop_path, + distillation=distillation) + + return model + + +specification = { + 'LeViT_128S': { + 'C': '128_256_384', + 'D': 16, + 'N': '4_6_8', + 'X': '2_3_4', + 'drop_path': 0 + }, + 'LeViT_128': { + 'C': '128_256_384', + 'D': 16, + 'N': '4_8_12', + 'X': '4_4_4', + 'drop_path': 0 + }, + 'LeViT_192': { + 'C': '192_288_384', + 'D': 32, + 'N': '3_5_6', + 'X': '4_4_4', + 'drop_path': 0 + }, + 'LeViT_256': { + 'C': '256_384_512', + 'D': 32, + 'N': '4_6_8', + 'X': '4_4_4', + 'drop_path': 0 + }, + 'LeViT_384': { + 'C': '384_512_768', + 'D': 32, + 'N': '6_9_12', + 'X': '4_4_4', + 'drop_path': 0.1 + }, +} + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def LeViT_128S(pretrained=False, + use_ssld=False, + class_num=1000, + distillation=False, + **kwargs): + model = model_factory( + **specification['LeViT_128S'], + class_num=class_num, + distillation=distillation) + _load_pretrained( + pretrained, model, MODEL_URLS["LeViT_128S"], use_ssld=use_ssld) + return model + + +def LeViT_128(pretrained=False, + use_ssld=False, + class_num=1000, + distillation=False, + **kwargs): + model = model_factory( + **specification['LeViT_128'], + class_num=class_num, + distillation=distillation) + _load_pretrained( + pretrained, model, MODEL_URLS["LeViT_128"], use_ssld=use_ssld) + return model + + +def LeViT_192(pretrained=False, + use_ssld=False, + class_num=1000, + distillation=False, + **kwargs): + model = model_factory( + **specification['LeViT_192'], + class_num=class_num, + distillation=distillation) + _load_pretrained( + pretrained, model, MODEL_URLS["LeViT_192"], use_ssld=use_ssld) + return model + + +def LeViT_256(pretrained=False, + use_ssld=False, + class_num=1000, + distillation=False, + **kwargs): + model = model_factory( + **specification['LeViT_256'], + class_num=class_num, + distillation=distillation) + _load_pretrained( + pretrained, model, MODEL_URLS["LeViT_256"], use_ssld=use_ssld) + return model + + +def LeViT_384(pretrained=False, + use_ssld=False, + class_num=1000, + distillation=False, + **kwargs): + model = model_factory( + **specification['LeViT_384'], + class_num=class_num, + distillation=distillation) + _load_pretrained( + pretrained, model, MODEL_URLS["LeViT_384"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/mixnet.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/mixnet.py new file mode 100644 index 0000000..c2a1adb --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/mixnet.py @@ -0,0 +1,815 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + MixNet for ImageNet-1K, implemented in Paddle. + Original paper: 'MixConv: Mixed Depthwise Convolutional Kernels,' + https://arxiv.org/abs/1907.09595. +""" + +import os +from inspect import isfunction +from functools import reduce +import paddle +import paddle.nn as nn + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "MixNet_S": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MixNet_S_pretrained.pdparams", + "MixNet_M": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MixNet_M_pretrained.pdparams", + "MixNet_L": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MixNet_L_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class Identity(nn.Layer): + """ + Identity block. + """ + + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +def round_channels(channels, divisor=8): + """ + Round weighted channel number (make divisible operation). 
+ + Parameters: + ---------- + channels : int or float + Original number of channels. + divisor : int, default 8 + Alignment value. + + Returns: + ------- + int + Weighted number of channels. + """ + rounded_channels = max( + int(channels + divisor / 2.0) // divisor * divisor, divisor) + if float(rounded_channels) < 0.9 * channels: + rounded_channels += divisor + return rounded_channels + + +def get_activation_layer(activation): + """ + Create activation layer from string/function. + + Parameters: + ---------- + activation : function, or str, or nn.Layer + Activation function or name of activation function. + + Returns: + ------- + nn.Layer + Activation layer. + """ + assert activation is not None + if isfunction(activation): + return activation() + elif isinstance(activation, str): + if activation == "relu": + return nn.ReLU() + elif activation == "relu6": + return nn.ReLU6() + elif activation == "swish": + return nn.Swish() + elif activation == "hswish": + return nn.Hardswish() + elif activation == "sigmoid": + return nn.Sigmoid() + elif activation == "hsigmoid": + return nn.Hardsigmoid() + elif activation == "identity": + return Identity() + else: + raise NotImplementedError() + else: + assert isinstance(activation, nn.Layer) + return activation + + +class ConvBlock(nn.Layer): + """ + Standard convolution block with Batch normalization and activation. + + Parameters: + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int or tuple/list of 2 int + Convolution window size. + stride : int or tuple/list of 2 int + Strides of the convolution. + padding : int, or tuple/list of 2 int, or tuple/list of 4 int + Padding value for convolution layer. + dilation : int or tuple/list of 2 int, default 1 + Dilation value for convolution layer. + groups : int, default 1 + Number of groups. + bias : bool, default False + Whether the layer uses a bias vector. + use_bn : bool, default True + Whether to use BatchNorm layer. + bn_eps : float, default 1e-5 + Small float added to variance in Batch norm. + activation : function or str or None, default nn.ReLU() + Activation function or name of activation function. 
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=1, + groups=1, + bias=False, + use_bn=True, + bn_eps=1e-5, + activation=nn.ReLU()): + super(ConvBlock, self).__init__() + self.activate = (activation is not None) + self.use_bn = use_bn + self.use_pad = (isinstance(padding, (list, tuple)) and + (len(padding) == 4)) + + if self.use_pad: + self.pad = padding + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=bias, + weight_attr=None) + if self.use_bn: + self.bn = nn.BatchNorm2D(num_features=out_channels, epsilon=bn_eps) + if self.activate: + self.activ = get_activation_layer(activation) + + def forward(self, x): + x = self.conv(x) + if self.use_bn: + x = self.bn(x) + if self.activate: + x = self.activ(x) + return x + + +class SEBlock(nn.Layer): + def __init__(self, + channels, + reduction=16, + mid_channels=None, + round_mid=False, + use_conv=True, + mid_activation=nn.ReLU(), + out_activation=nn.Sigmoid()): + super(SEBlock, self).__init__() + self.use_conv = use_conv + if mid_channels is None: + mid_channels = channels // reduction if not round_mid else round_channels( + float(channels) / reduction) + + self.pool = nn.AdaptiveAvgPool2D(output_size=1) + if use_conv: + self.conv1 = nn.Conv2D( + in_channels=channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + groups=1, + bias_attr=True, + weight_attr=None) + + else: + self.fc1 = nn.Linear( + in_features=channels, out_features=mid_channels) + self.activ = get_activation_layer(mid_activation) + if use_conv: + self.conv2 = nn.Conv2D( + in_channels=mid_channels, + out_channels=channels, + kernel_size=1, + stride=1, + groups=1, + bias_attr=True, + weight_attr=None) + else: + self.fc2 = nn.Linear( + in_features=mid_channels, out_features=channels) + self.sigmoid = get_activation_layer(out_activation) + + def forward(self, x): + w = self.pool(x) + if not self.use_conv: + w = w.reshape(shape=[w.shape[0], -1]) + w = self.conv1(w) if self.use_conv else self.fc1(w) + w = self.activ(w) + w = self.conv2(w) if self.use_conv else self.fc2(w) + w = self.sigmoid(w) + if not self.use_conv: + w = w.unsqueeze(2).unsqueeze(3) + x = x * w + return x + + +class MixConv(nn.Layer): + """ + Mixed convolution layer from 'MixConv: Mixed Depthwise Convolutional Kernels,' + https://arxiv.org/abs/1907.09595. + + Parameters: + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int or tuple/list of int, or tuple/list of tuple/list of 2 int + Convolution window size. + stride : int or tuple/list of 2 int + Strides of the convolution. + padding : int or tuple/list of int, or tuple/list of tuple/list of 2 int + Padding value for convolution layer. + dilation : int or tuple/list of 2 int, default 1 + Dilation value for convolution layer. + groups : int, default 1 + Number of groups. + bias : bool, default False + Whether the layer uses a bias vector. + axis : int, default 1 + The axis on which to concatenate the outputs. 
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=1, + groups=1, + bias=False, + axis=1): + super(MixConv, self).__init__() + kernel_size = kernel_size if isinstance(kernel_size, + list) else [kernel_size] + padding = padding if isinstance(padding, list) else [padding] + kernel_count = len(kernel_size) + self.splitted_in_channels = self.split_channels(in_channels, + kernel_count) + splitted_out_channels = self.split_channels(out_channels, kernel_count) + for i, kernel_size_i in enumerate(kernel_size): + in_channels_i = self.splitted_in_channels[i] + out_channels_i = splitted_out_channels[i] + padding_i = padding[i] + _ = self.add_sublayer( + name=str(i), + sublayer=nn.Conv2D( + in_channels=in_channels_i, + out_channels=out_channels_i, + kernel_size=kernel_size_i, + stride=stride, + padding=padding_i, + dilation=dilation, + groups=(out_channels_i + if out_channels == groups else groups), + bias_attr=bias, + weight_attr=None)) + self.axis = axis + + def forward(self, x): + xx = paddle.split(x, self.splitted_in_channels, axis=self.axis) + xx = paddle.split(x, self.splitted_in_channels, axis=self.axis) + out = [ + conv_i(x_i) for x_i, conv_i in zip(xx, self._sub_layers.values()) + ] + x = paddle.concat(tuple(out), axis=self.axis) + return x + + @staticmethod + def split_channels(channels, kernel_count): + splitted_channels = [channels // kernel_count] * kernel_count + splitted_channels[0] += channels - sum(splitted_channels) + return splitted_channels + + +class MixConvBlock(nn.Layer): + """ + Mixed convolution block with Batch normalization and activation. + + Parameters: + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int or tuple/list of int, or tuple/list of tuple/list of 2 int + Convolution window size. + stride : int or tuple/list of 2 int + Strides of the convolution. + padding : int or tuple/list of int, or tuple/list of tuple/list of 2 int + Padding value for convolution layer. + dilation : int or tuple/list of 2 int, default 1 + Dilation value for convolution layer. + groups : int, default 1 + Number of groups. + bias : bool, default False + Whether the layer uses a bias vector. + use_bn : bool, default True + Whether to use BatchNorm layer. + bn_eps : float, default 1e-5 + Small float added to variance in Batch norm. + activation : function or str or None, default nn.ReLU() + Activation function or name of activation function. + activate : bool, default True + Whether activate the convolution block. 
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=1, + groups=1, + bias=False, + use_bn=True, + bn_eps=1e-5, + activation=nn.ReLU()): + super(MixConvBlock, self).__init__() + self.activate = (activation is not None) + self.use_bn = use_bn + + self.conv = MixConv( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias) + if self.use_bn: + self.bn = nn.BatchNorm2D(num_features=out_channels, epsilon=bn_eps) + if self.activate: + self.activ = get_activation_layer(activation) + + def forward(self, x): + x = self.conv(x) + if self.use_bn: + x = self.bn(x) + if self.activate: + x = self.activ(x) + return x + + +def mixconv1x1_block(in_channels, + out_channels, + kernel_count, + stride=1, + groups=1, + bias=False, + use_bn=True, + bn_eps=1e-5, + activation=nn.ReLU()): + """ + 1x1 version of the mixed convolution block. + + Parameters: + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_count : int + Kernel count. + stride : int or tuple/list of 2 int, default 1 + Strides of the convolution. + groups : int, default 1 + Number of groups. + bias : bool, default False + Whether the layer uses a bias vector. + use_bn : bool, default True + Whether to use BatchNorm layer. + bn_eps : float, default 1e-5 + Small float added to variance in Batch norm. + activation : function or str, or None, default nn.ReLU() + Activation function or name of activation function. + """ + return MixConvBlock( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=([1] * kernel_count), + stride=stride, + padding=([0] * kernel_count), + groups=groups, + bias=bias, + use_bn=use_bn, + bn_eps=bn_eps, + activation=activation) + + +class MixUnit(nn.Layer): + """ + MixNet unit. + + Parameters: + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. exp_channels : int + Number of middle (expanded) channels. + stride : int or tuple/list of 2 int + Strides of the second convolution layer. + exp_kernel_count : int + Expansion convolution kernel count for each unit. + conv1_kernel_count : int + Conv1 kernel count for each unit. + conv2_kernel_count : int + Conv2 kernel count for each unit. + exp_factor : int + Expansion factor for each unit. + se_factor : int + SE reduction factor for each unit. + activation : str + Activation function or name of activation function. 
+ """ + + def __init__(self, in_channels, out_channels, stride, exp_kernel_count, + conv1_kernel_count, conv2_kernel_count, exp_factor, se_factor, + activation): + super(MixUnit, self).__init__() + assert exp_factor >= 1 + assert se_factor >= 0 + self.residual = (in_channels == out_channels) and (stride == 1) + self.use_se = se_factor > 0 + mid_channels = exp_factor * in_channels + self.use_exp_conv = exp_factor > 1 + + if self.use_exp_conv: + if exp_kernel_count == 1: + self.exp_conv = ConvBlock( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias=False, + use_bn=True, + bn_eps=1e-5, + activation=activation) + else: + self.exp_conv = mixconv1x1_block( + in_channels=in_channels, + out_channels=mid_channels, + kernel_count=exp_kernel_count, + activation=activation) + if conv1_kernel_count == 1: + self.conv1 = ConvBlock( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=3, + stride=stride, + padding=1, + dilation=1, + groups=mid_channels, + bias=False, + use_bn=True, + bn_eps=1e-5, + activation=activation) + else: + self.conv1 = MixConvBlock( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=[3 + 2 * i for i in range(conv1_kernel_count)], + stride=stride, + padding=[1 + i for i in range(conv1_kernel_count)], + groups=mid_channels, + activation=activation) + if self.use_se: + self.se = SEBlock( + channels=mid_channels, + reduction=(exp_factor * se_factor), + round_mid=False, + mid_activation=activation) + if conv2_kernel_count == 1: + self.conv2 = ConvBlock( + in_channels=mid_channels, + out_channels=out_channels, + activation=None, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias=False, + use_bn=True, + bn_eps=1e-5) + else: + self.conv2 = mixconv1x1_block( + in_channels=mid_channels, + out_channels=out_channels, + kernel_count=conv2_kernel_count, + activation=None) + + def forward(self, x): + if self.residual: + identity = x + if self.use_exp_conv: + x = self.exp_conv(x) + x = self.conv1(x) + if self.use_se: + x = self.se(x) + x = self.conv2(x) + if self.residual: + x = x + identity + return x + + +class MixInitBlock(nn.Layer): + """ + MixNet specific initial block. + + Parameters: + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + """ + + def __init__(self, in_channels, out_channels): + super(MixInitBlock, self).__init__() + self.conv1 = ConvBlock( + in_channels=in_channels, + out_channels=out_channels, + stride=2, + kernel_size=3, + padding=1) + self.conv2 = MixUnit( + in_channels=out_channels, + out_channels=out_channels, + stride=1, + exp_kernel_count=1, + conv1_kernel_count=1, + conv2_kernel_count=1, + exp_factor=1, + se_factor=0, + activation="relu") + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + +class MixNet(nn.Layer): + """ + MixNet model from 'MixConv: Mixed Depthwise Convolutional Kernels,' + https://arxiv.org/abs/1907.09595. + + Parameters: + ---------- + channels : list of list of int + Number of output channels for each unit. + init_block_channels : int + Number of output channels for the initial unit. + final_block_channels : int + Number of output channels for the final block of the feature extractor. + exp_kernel_counts : list of list of int + Expansion convolution kernel count for each unit. + conv1_kernel_counts : list of list of int + Conv1 kernel count for each unit. + conv2_kernel_counts : list of list of int + Conv2 kernel count for each unit. 
+ exp_factors : list of list of int + Expansion factor for each unit. + se_factors : list of list of int + SE reduction factor for each unit. + in_channels : int, default 3 + Number of input channels. + in_size : tuple of two ints, default (224, 224) + Spatial size of the expected input image. + class_num : int, default 1000 + Number of classification classes. + """ + + def __init__(self, + channels, + init_block_channels, + final_block_channels, + exp_kernel_counts, + conv1_kernel_counts, + conv2_kernel_counts, + exp_factors, + se_factors, + in_channels=3, + in_size=(224, 224), + class_num=1000): + super(MixNet, self).__init__() + self.in_size = in_size + self.class_num = class_num + + self.features = nn.Sequential() + self.features.add_sublayer( + "init_block", + MixInitBlock( + in_channels=in_channels, out_channels=init_block_channels)) + in_channels = init_block_channels + for i, channels_per_stage in enumerate(channels): + stage = nn.Sequential() + for j, out_channels in enumerate(channels_per_stage): + stride = 2 if ((j == 0) and (i != 3)) or ( + (j == len(channels_per_stage) // 2) and (i == 3)) else 1 + exp_kernel_count = exp_kernel_counts[i][j] + conv1_kernel_count = conv1_kernel_counts[i][j] + conv2_kernel_count = conv2_kernel_counts[i][j] + exp_factor = exp_factors[i][j] + se_factor = se_factors[i][j] + activation = "relu" if i == 0 else "swish" + stage.add_sublayer( + "unit{}".format(j + 1), + MixUnit( + in_channels=in_channels, + out_channels=out_channels, + stride=stride, + exp_kernel_count=exp_kernel_count, + conv1_kernel_count=conv1_kernel_count, + conv2_kernel_count=conv2_kernel_count, + exp_factor=exp_factor, + se_factor=se_factor, + activation=activation)) + in_channels = out_channels + self.features.add_sublayer("stage{}".format(i + 1), stage) + self.features.add_sublayer( + "final_block", + ConvBlock( + in_channels=in_channels, + out_channels=final_block_channels, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias=False, + use_bn=True, + bn_eps=1e-5, + activation=nn.ReLU())) + in_channels = final_block_channels + self.features.add_sublayer( + "final_pool", nn.AvgPool2D( + kernel_size=7, stride=1)) + + self.output = nn.Linear( + in_features=in_channels, out_features=class_num) + + def forward(self, x): + x = self.features(x) + reshape_dim = reduce(lambda x, y: x * y, x.shape[1:]) + x = x.reshape(shape=[x.shape[0], reshape_dim]) + x = self.output(x) + return x + + +def get_mixnet(version, width_scale, model_name=None, **kwargs): + """ + Create MixNet model with specific parameters. + + Parameters: + ---------- + version : str + Version of MixNet ('s' or 'm'). + width_scale : float + Scale factor for width of layers. + model_name : str or None, default None + Model name. 
+ """ + + if version == "s": + init_block_channels = 16 + channels = [[24, 24], [40, 40, 40, 40], [80, 80, 80], + [120, 120, 120, 200, 200, 200]] + exp_kernel_counts = [[2, 2], [1, 2, 2, 2], [1, 1, 1], + [2, 2, 2, 1, 1, 1]] + conv1_kernel_counts = [[1, 1], [3, 2, 2, 2], [3, 2, 2], + [3, 4, 4, 5, 4, 4]] + conv2_kernel_counts = [[2, 2], [1, 2, 2, 2], [2, 2, 2], + [2, 2, 2, 1, 2, 2]] + exp_factors = [[6, 3], [6, 6, 6, 6], [6, 6, 6], [6, 3, 3, 6, 6, 6]] + se_factors = [[0, 0], [2, 2, 2, 2], [4, 4, 4], [2, 2, 2, 2, 2, 2]] + elif version == "m": + init_block_channels = 24 + channels = [[32, 32], [40, 40, 40, 40], [80, 80, 80, 80], + [120, 120, 120, 120, 200, 200, 200, 200]] + exp_kernel_counts = [[2, 2], [1, 2, 2, 2], [1, 2, 2, 2], + [1, 2, 2, 2, 1, 1, 1, 1]] + conv1_kernel_counts = [[3, 1], [4, 2, 2, 2], [3, 4, 4, 4], + [1, 4, 4, 4, 4, 4, 4, 4]] + conv2_kernel_counts = [[2, 2], [1, 2, 2, 2], [1, 2, 2, 2], + [1, 2, 2, 2, 1, 2, 2, 2]] + exp_factors = [[6, 3], [6, 6, 6, 6], [6, 6, 6, 6], + [6, 3, 3, 3, 6, 6, 6, 6]] + se_factors = [[0, 0], [2, 2, 2, 2], [4, 4, 4, 4], + [2, 2, 2, 2, 2, 2, 2, 2]] + else: + raise ValueError("Unsupported MixNet version {}".format(version)) + + final_block_channels = 1536 + + if width_scale != 1.0: + channels = [[round_channels(cij * width_scale) for cij in ci] + for ci in channels] + init_block_channels = round_channels(init_block_channels * width_scale) + + net = MixNet( + channels=channels, + init_block_channels=init_block_channels, + final_block_channels=final_block_channels, + exp_kernel_counts=exp_kernel_counts, + conv1_kernel_counts=conv1_kernel_counts, + conv2_kernel_counts=conv2_kernel_counts, + exp_factors=exp_factors, + se_factors=se_factors, + **kwargs) + + return net + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def MixNet_S(pretrained=False, use_ssld=False, **kwargs): + """ + MixNet-S model from 'MixConv: Mixed Depthwise Convolutional Kernels,' + https://arxiv.org/abs/1907.09595. + """ + model = get_mixnet( + version="s", width_scale=1.0, model_name="MixNet_S", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MixNet_S"], use_ssld=use_ssld) + return model + + +def MixNet_M(pretrained=False, use_ssld=False, **kwargs): + """ + MixNet-M model from 'MixConv: Mixed Depthwise Convolutional Kernels,' + https://arxiv.org/abs/1907.09595. + """ + model = get_mixnet( + version="m", width_scale=1.0, model_name="MixNet_M", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MixNet_M"], use_ssld=use_ssld) + return model + + +def MixNet_L(pretrained=False, use_ssld=False, **kwargs): + """ + MixNet-S model from 'MixConv: Mixed Depthwise Convolutional Kernels,' + https://arxiv.org/abs/1907.09595. + """ + model = get_mixnet( + version="m", width_scale=1.3, model_name="MixNet_L", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MixNet_L"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/mobilenet_v2.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/mobilenet_v2.py new file mode 100644 index 0000000..b32c025 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/mobilenet_v2.py @@ -0,0 +1,287 @@ +# copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D + +import math + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "MobileNetV2_x0_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_25_pretrained.pdparams", + "MobileNetV2_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_5_pretrained.pdparams", + "MobileNetV2_x0_75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_75_pretrained.pdparams", + "MobileNetV2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_pretrained.pdparams", + "MobileNetV2_x1_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x1_5_pretrained.pdparams", + "MobileNetV2_x2_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x2_0_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + name=None, + use_cudnn=True): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + + self._batch_norm = BatchNorm( + num_filters, + param_attr=ParamAttr(name=name + "_bn_scale"), + bias_attr=ParamAttr(name=name + "_bn_offset"), + moving_mean_name=name + "_bn_mean", + moving_variance_name=name + "_bn_variance") + + def forward(self, inputs, if_act=True): + y = self._conv(inputs) + y = self._batch_norm(y) + if if_act: + y = F.relu6(y) + return y + + +class InvertedResidualUnit(nn.Layer): + def __init__(self, num_channels, num_in_filter, num_filters, stride, + filter_size, padding, expansion_factor, name): + super(InvertedResidualUnit, self).__init__() + num_expfilter = int(round(num_in_filter * expansion_factor)) + self._expand_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=num_expfilter, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + name=name + "_expand") + + self._bottleneck_conv = ConvBNLayer( + num_channels=num_expfilter, + num_filters=num_expfilter, + filter_size=filter_size, + stride=stride, + padding=padding, + num_groups=num_expfilter, + use_cudnn=False, + name=name + "_dwise") + + self._linear_conv = ConvBNLayer( + num_channels=num_expfilter, + num_filters=num_filters, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + name=name + "_linear") + + def forward(self, inputs, ifshortcut): + y 
= self._expand_conv(inputs, if_act=True) + y = self._bottleneck_conv(y, if_act=True) + y = self._linear_conv(y, if_act=False) + if ifshortcut: + y = paddle.add(inputs, y) + return y + + +class InvresiBlocks(nn.Layer): + def __init__(self, in_c, t, c, n, s, name): + super(InvresiBlocks, self).__init__() + + self._first_block = InvertedResidualUnit( + num_channels=in_c, + num_in_filter=in_c, + num_filters=c, + stride=s, + filter_size=3, + padding=1, + expansion_factor=t, + name=name + "_1") + + self._block_list = [] + for i in range(1, n): + block = self.add_sublayer( + name + "_" + str(i + 1), + sublayer=InvertedResidualUnit( + num_channels=c, + num_in_filter=c, + num_filters=c, + stride=1, + filter_size=3, + padding=1, + expansion_factor=t, + name=name + "_" + str(i + 1))) + self._block_list.append(block) + + def forward(self, inputs): + y = self._first_block(inputs, ifshortcut=False) + for block in self._block_list: + y = block(y, ifshortcut=True) + return y + + +class MobileNet(nn.Layer): + def __init__(self, class_num=1000, scale=1.0, prefix_name=""): + super(MobileNet, self).__init__() + self.scale = scale + self.class_num = class_num + + bottleneck_params_list = [ + (1, 16, 1, 1), + (6, 24, 2, 2), + (6, 32, 3, 2), + (6, 64, 4, 2), + (6, 96, 3, 1), + (6, 160, 3, 2), + (6, 320, 1, 1), + ] + + self.conv1 = ConvBNLayer( + num_channels=3, + num_filters=int(32 * scale), + filter_size=3, + stride=2, + padding=1, + name=prefix_name + "conv1_1") + + self.block_list = [] + i = 1 + in_c = int(32 * scale) + for layer_setting in bottleneck_params_list: + t, c, n, s = layer_setting + i += 1 + block = self.add_sublayer( + prefix_name + "conv" + str(i), + sublayer=InvresiBlocks( + in_c=in_c, + t=t, + c=int(c * scale), + n=n, + s=s, + name=prefix_name + "conv" + str(i))) + self.block_list.append(block) + in_c = int(c * scale) + + self.out_c = int(1280 * scale) if scale > 1.0 else 1280 + self.conv9 = ConvBNLayer( + num_channels=in_c, + num_filters=self.out_c, + filter_size=1, + stride=1, + padding=0, + name=prefix_name + "conv9") + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.out = Linear( + self.out_c, + class_num, + weight_attr=ParamAttr(name=prefix_name + "fc10_weights"), + bias_attr=ParamAttr(name=prefix_name + "fc10_offset")) + + def forward(self, inputs): + y = self.conv1(inputs, if_act=True) + for block in self.block_list: + y = block(y) + y = self.conv9(y, if_act=True) + y = self.pool2d_avg(y) + y = paddle.flatten(y, start_axis=1, stop_axis=-1) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def MobileNetV2_x0_25(pretrained=False, use_ssld=False, **kwargs): + model = MobileNet(scale=0.25, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNetV2_x0_25"], use_ssld=use_ssld) + return model + + +def MobileNetV2_x0_5(pretrained=False, use_ssld=False, **kwargs): + model = MobileNet(scale=0.5, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNetV2_x0_5"], use_ssld=use_ssld) + return model + + +def MobileNetV2_x0_75(pretrained=False, use_ssld=False, **kwargs): + model = MobileNet(scale=0.75, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNetV2_x0_75"], use_ssld=use_ssld) + return model + + +def MobileNetV2(pretrained=False, use_ssld=False, **kwargs): + model = MobileNet(scale=1.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNetV2"], use_ssld=use_ssld) + return model + + +def MobileNetV2_x1_5(pretrained=False, use_ssld=False, **kwargs): + model = MobileNet(scale=1.5, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNetV2_x1_5"], use_ssld=use_ssld) + return model + + +def MobileNetV2_x2_0(pretrained=False, use_ssld=False, **kwargs): + model = MobileNet(scale=2.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNetV2_x2_0"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/pvt_v2.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/pvt_v2.py new file mode 100644 index 0000000..e2fdfd4 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/pvt_v2.py @@ -0,0 +1,492 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
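The MobileNetV2 factories above differ only in the `scale` width multiplier handed to `MobileNet`: every block's channel count is multiplied by `scale`, while the 1280-channel head is widened only when `scale > 1.0` (`self.out_c = int(1280 * scale) if scale > 1.0 else 1280`). A small sketch of the effect, under the same PYTHONPATH assumption as the earlier example; the module path follows this diff's file layout:

import paddle
from ppcls.arch.backbone.model_zoo.mobilenet_v2 import MobileNetV2_x0_5, MobileNetV2_x2_0

small = MobileNetV2_x0_5()  # bottleneck channels halved, head stays at 1280
wide = MobileNetV2_x2_0()   # channels doubled, head widened to int(1280 * 2.0) = 2560
x = paddle.randn([1, 3, 224, 224])
print(small(x).shape, wide(x).shape)  # both [1, 1000]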
+ +# Code was heavily based on https://github.com/whai362/PVT + +from functools import partial +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import TruncatedNormal, Constant + +from .vision_transformer import trunc_normal_, zeros_, ones_, to_2tuple, DropPath, Identity, drop_path + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "PVT_V2_B0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B0_pretrained.pdparams", + "PVT_V2_B1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B1_pretrained.pdparams", + "PVT_V2_B2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B2_pretrained.pdparams", + "PVT_V2_B2_Linear": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B2_Linear_pretrained.pdparams", + "PVT_V2_B3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B3_pretrained.pdparams", + "PVT_V2_B4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B4_pretrained.pdparams", + "PVT_V2_B5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B5_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +@paddle.jit.not_to_static +def swapdim(x, dim1, dim2): + a = list(range(len(x.shape))) + a[dim1], a[dim2] = a[dim2], a[dim1] + return x.transpose(a) + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0., + linear=False): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + self.linear = linear + if self.linear: + self.relu = nn.ReLU() + + def forward(self, x, H, W): + x = self.fc1(x) + if self.linear: + x = self.relu(x) + x = self.dwconv(x, H, W) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + sr_ratio=1, + linear=False): + super().__init__() + assert dim % num_heads == 0 + + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.q = nn.Linear(dim, dim, bias_attr=qkv_bias) + self.kv = nn.Linear(dim, dim * 2, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.linear = linear + self.sr_ratio = sr_ratio + if not linear: + if sr_ratio > 1: + self.sr = nn.Conv2D( + dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.norm = nn.LayerNorm(dim) + else: + self.pool = nn.AdaptiveAvgPool2D(7) + self.sr = nn.Conv2D(dim, dim, kernel_size=1, stride=1) + self.norm = nn.LayerNorm(dim) + self.act = nn.GELU() + + def forward(self, x, H, W): + B, N, C = x.shape + q = self.q(x).reshape( + [B, N, self.num_heads, C // self.num_heads]).transpose( + [0, 2, 1, 3]) + + if not self.linear: + if self.sr_ratio > 1: + x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W]) + x_ = self.sr(x_) + h_, w_ = x_.shape[-2:] + x_ = x_.reshape([B, C, h_ * w_]).transpose([0, 2, 1]) + x_ = self.norm(x_) + kv = self.kv(x_) + kv = kv.reshape([ + B, kv.shape[2] * kv.shape[1] // 2 // C, 2, self.num_heads, 
+ C // self.num_heads + ]).transpose([2, 0, 3, 1, 4]) + else: + kv = self.kv(x) + kv = kv.reshape([ + B, kv.shape[2] * kv.shape[1] // 2 // C, 2, self.num_heads, + C // self.num_heads + ]).transpose([2, 0, 3, 1, 4]) + else: + x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W]) + x_ = self.sr(self.pool(x_)) + x_ = x_.reshape([B, C, x_.shape[2] * x_.shape[3]]).transpose( + [0, 2, 1]) + x_ = self.norm(x_) + x_ = self.act(x_) + kv = self.kv(x_) + kv = kv.reshape([ + B, kv.shape[2] * kv.shape[1] // 2 // C, 2, self.num_heads, + C // self.num_heads + ]).transpose([2, 0, 3, 1, 4]) + k, v = kv[0], kv[1] + + attn = (q @swapdim(k, -2, -1)) * self.scale + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = swapdim((attn @v), 1, 2).reshape([B, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + sr_ratio=1, + linear=False): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + sr_ratio=sr_ratio, + linear=linear) + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + linear=linear) + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + + return x + + +class OverlapPatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=224, + patch_size=7, + stride=4, + in_chans=3, + embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + + self.img_size = img_size + self.patch_size = patch_size + self.H, self.W = img_size[0] // patch_size[0], img_size[ + 1] // patch_size[1] + self.num_patches = self.H * self.W + self.proj = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=stride, + padding=(patch_size[0] // 2, patch_size[1] // 2)) + self.norm = nn.LayerNorm(embed_dim) + + def forward(self, x): + x = self.proj(x) + _, _, H, W = x.shape + x = x.flatten(2) + x = swapdim(x, 1, 2) + x = self.norm(x) + + return x, H, W + + +class PyramidVisionTransformerV2(nn.Layer): + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + class_num=1000, + embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + num_stages=4, + linear=False): + super().__init__() + self.class_num = class_num + self.depths = depths + self.num_stages = num_stages + + dpr = [x for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + cur = 0 + + for i in range(num_stages): + patch_embed = OverlapPatchEmbed( + img_size=img_size if i == 0 else img_size // (2**(i + 1)), + patch_size=7 if i == 0 else 3, + stride=4 if i == 0 else 2, + in_chans=in_chans if i == 0 else embed_dims[i - 1], + embed_dim=embed_dims[i]) + + block = nn.LayerList([ + Block( + dim=embed_dims[i], + num_heads=num_heads[i], + mlp_ratio=mlp_ratios[i], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + 
drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + j], + norm_layer=norm_layer, + sr_ratio=sr_ratios[i], + linear=linear) for j in range(depths[i]) + ]) + norm = norm_layer(embed_dims[i]) + cur += depths[i] + + setattr(self, f"patch_embed{i + 1}", patch_embed) + setattr(self, f"block{i + 1}", block) + setattr(self, f"norm{i + 1}", norm) + + # classification head + self.head = nn.Linear(embed_dims[3], + class_num) if class_num > 0 else Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + B = x.shape[0] + + for i in range(self.num_stages): + patch_embed = getattr(self, f"patch_embed{i + 1}") + block = getattr(self, f"block{i + 1}") + norm = getattr(self, f"norm{i + 1}") + x, H, W = patch_embed(x) + for blk in block: + x = blk(x, H, W) + x = norm(x) + if i != self.num_stages - 1: + x = x.reshape([B, H, W, x.shape[2]]).transpose([0, 3, 1, 2]) + + return x.mean(axis=1) + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + + return x + + +class DWConv(nn.Layer): + def __init__(self, dim=768): + super().__init__() + self.dwconv = nn.Conv2D(dim, dim, 3, 1, 1, bias_attr=True, groups=dim) + + def forward(self, x, H, W): + B, N, C = x.shape + x = swapdim(x, 1, 2) + x = x.reshape([B, C, H, W]) + x = self.dwconv(x) + x = x.flatten(2) + x = swapdim(x, 1, 2) + + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def PVT_V2_B0(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[32, 64, 160, 256], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B0"], use_ssld=use_ssld) + return model + + +def PVT_V2_B1(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B1"], use_ssld=use_ssld) + return model + + +def PVT_V2_B2(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B2"], use_ssld=use_ssld) + return model + + +def PVT_V2_B3(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 4, 18, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B3"], use_ssld=use_ssld) + return model + + +def PVT_V2_B4(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 8, 27, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B4"], use_ssld=use_ssld) + return model + + +def PVT_V2_B5(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 6, 40, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B5"], use_ssld=use_ssld) + return model + + +def PVT_V2_B2_Linear(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + linear=True, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B2_Linear"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/rednet.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/rednet.py new file mode 100644 index 0000000..be84da1 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/rednet.py @@ -0,0 +1,203 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Code was based on https://github.com/d-li14/involution
+
+import paddle
+import paddle.nn as nn
+
+from paddle.vision.models import resnet
+
+from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url
+
+MODEL_URLS = {
+    "RedNet26":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RedNet26_pretrained.pdparams",
+    "RedNet38":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RedNet38_pretrained.pdparams",
+    "RedNet50":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RedNet50_pretrained.pdparams",
+    "RedNet101":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RedNet101_pretrained.pdparams",
+    "RedNet152":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RedNet152_pretrained.pdparams"
+}
+
+__all__ = list(MODEL_URLS.keys())
+
+
+class Involution(nn.Layer):
+    def __init__(self, channels, kernel_size, stride):
+        super(Involution, self).__init__()
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.channels = channels
+        reduction_ratio = 4
+        self.group_channels = 16
+        self.groups = self.channels // self.group_channels
+        self.conv1 = nn.Sequential(
+            ('conv', nn.Conv2D(
+                in_channels=channels,
+                out_channels=channels // reduction_ratio,
+                kernel_size=1,
+                bias_attr=False)),
+            ('bn', nn.BatchNorm2D(channels // reduction_ratio)),
+            ('activate', nn.ReLU()))
+        self.conv2 = nn.Sequential(('conv', nn.Conv2D(
+            in_channels=channels // reduction_ratio,
+            out_channels=kernel_size**2 * self.groups,
+            kernel_size=1,
+            stride=1)))
+        if stride > 1:
+            self.avgpool = nn.AvgPool2D(stride, stride)
+
+    def forward(self, x):
+        # generate position-specific kernels from the (optionally pooled) input
+        weight = self.conv2(
+            self.conv1(x if self.stride == 1 else self.avgpool(x)))
+        b, c, h, w = weight.shape
+        weight = weight.reshape(
+            (b, self.groups, self.kernel_size**2, h, w)).unsqueeze(2)
+
+        # unfold the input and apply the generated kernels group-wise
+        out = nn.functional.unfold(x, self.kernel_size, self.stride,
+                                   (self.kernel_size - 1) // 2, 1)
+        out = out.reshape(
+            (b, self.groups, self.group_channels, self.kernel_size**2, h, w))
+        out = (weight * out).sum(axis=3).reshape((b, self.channels, h, w))
+        return out
+
+
+class BottleneckBlock(resnet.BottleneckBlock):
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 downsample=None,
+                 groups=1,
+                 base_width=64,
+                 dilation=1,
+                 norm_layer=None):
+        super(BottleneckBlock, self).__init__(inplanes, planes, stride,
+                                              downsample, groups, base_width,
+                                              dilation, norm_layer)
+        width = int(planes * (base_width / 64.)) * groups
+        # replace the standard 3x3 conv with a 7x7 involution
+        self.conv2 = Involution(width, 7, stride)
+
+
+class RedNet(resnet.ResNet):
+    def __init__(self, block, depth, class_num=1000, with_pool=True):
+        # build a canonical ResNet-50 skeleton first; the requested depth is
+        # realized by re-creating the four stages below
+        super(RedNet, self).__init__(
+            block=block, depth=50, num_classes=class_num, with_pool=with_pool)
+        layer_cfg = {
+            26: [1, 2, 4, 1],
+            38: [2, 3, 5, 2],
+            50: [3, 4, 6, 3],
+            101: [3, 4, 23, 3],
+            152: [3, 8, 36, 3]
+        }
+        layers = layer_cfg[depth]
+
+        # drop the ResNet stem; RedNet uses an involution stem instead
+        self.conv1 = None
+        self.bn1 = None
+        self.relu = None
+        self.inplanes = 64
+        self.class_num = class_num
+        self.stem = nn.Sequential(
+            nn.Sequential(
+                ('conv', nn.Conv2D(
+                    in_channels=3,
+                    out_channels=self.inplanes // 2,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1, +
bias_attr=False)), + ('bn', nn.BatchNorm2D(self.inplanes // 2)), + ('activate', nn.ReLU())), + Involution(self.inplanes // 2, 3, 1), + nn.BatchNorm2D(self.inplanes // 2), + nn.ReLU(), + nn.Sequential( + ('conv', nn.Conv2D( + in_channels=self.inplanes // 2, + out_channels=self.inplanes, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False)), ('bn', nn.BatchNorm2D(self.inplanes)), + ('activate', nn.ReLU()))) + + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + + def forward(self, x): + x = self.stem(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + if self.with_pool: + x = self.avgpool(x) + + if self.class_num > 0: + x = paddle.flatten(x, 1) + x = self.fc(x) + + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def RedNet26(pretrained=False, **kwargs): + model = RedNet(BottleneckBlock, 26, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["RedNet26"]) + return model + + +def RedNet38(pretrained=False, **kwargs): + model = RedNet(BottleneckBlock, 38, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["RedNet38"]) + return model + + +def RedNet50(pretrained=False, **kwargs): + model = RedNet(BottleneckBlock, 50, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["RedNet50"]) + return model + + +def RedNet101(pretrained=False, **kwargs): + model = RedNet(BottleneckBlock, 101, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["RedNet101"]) + return model + + +def RedNet152(pretrained=False, **kwargs): + model = RedNet(BottleneckBlock, 152, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["RedNet152"]) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/regnet.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/regnet.py new file mode 100644 index 0000000..dc381cb --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/regnet.py @@ -0,0 +1,431 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
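+
+# Editor's note (hedged sketch, not part of the upstream file): the RegNet
+# builders below are driven by generate_regnet(), which lays out a linear
+# width ramp u_j = w_0 + w_a * j for j in [0, d), snaps each u_j to
+# w_0 * w_m**k for a rounded integer k, and quantizes the result to a
+# multiple of q. With the RegNetX_200MF parameters used later
+# (w_a=36.44, w_0=24, w_m=2.49, d=13) this should yield per-block widths
+# [24, 56, 152, 152, 152, 152, 368, ...], which get_stages_from_blocks()
+# then collapses into four stages of widths [24, 56, 152, 368] and depths
+# [1, 1, 4, 7].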
+ +# Code was based on https://github.com/facebookresearch/pycls + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "RegNetX_200MF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_200MF_pretrained.pdparams", + "RegNetX_4GF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_4GF_pretrained.pdparams", + "RegNetX_32GF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_32GF_pretrained.pdparams", + "RegNetY_200MF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetY_200MF_pretrained.pdparams", + "RegNetY_4GF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetY_4GF_pretrained.pdparams", + "RegNetY_32GF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetY_32GF_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +def quantize_float(f, q): + """Converts a float to closest non-zero int divisible by q.""" + return int(round(f / q) * q) + + +def adjust_ws_gs_comp(ws, bms, gs): + """Adjusts the compatibility of widths and groups.""" + ws_bot = [int(w * b) for w, b in zip(ws, bms)] + gs = [min(g, w_bot) for g, w_bot in zip(gs, ws_bot)] + ws_bot = [quantize_float(w_bot, g) for w_bot, g in zip(ws_bot, gs)] + ws = [int(w_bot / b) for w_bot, b in zip(ws_bot, bms)] + return ws, gs + + +def get_stages_from_blocks(ws, rs): + """Gets ws/ds of network at each stage from per block values.""" + ts = [ + w != wp or r != rp + for w, wp, r, rp in zip(ws + [0], [0] + ws, rs + [0], [0] + rs) + ] + s_ws = [w for w, t in zip(ws, ts[:-1]) if t] + s_ds = np.diff([d for d, t in zip(range(len(ts)), ts) if t]).tolist() + return s_ws, s_ds + + +def generate_regnet(w_a, w_0, w_m, d, q=8): + """Generates per block ws from RegNet parameters.""" + assert w_a >= 0 and w_0 > 0 and w_m > 1 and w_0 % q == 0 + ws_cont = np.arange(d) * w_a + w_0 + ks = np.round(np.log(ws_cont / w_0) / np.log(w_m)) + ws = w_0 * np.power(w_m, ks) + ws = np.round(np.divide(ws, q)) * q + num_stages, max_stage = len(np.unique(ws)), ks.max() + 1 + ws, ws_cont = ws.astype(int).tolist(), ws_cont.tolist() + return ws, num_stages, max_stage, ws_cont + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + padding=0, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr(name=name + ".conv2d.output.1.w_0"), + bias_attr=ParamAttr(name=name + ".conv2d.output.1.b_0")) + bn_name = name + "_bn" + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + ".output.1.w_0"), + bias_attr=ParamAttr(bn_name + ".output.1.b_0"), + moving_mean_name=bn_name + "_mean", + moving_variance_name=bn_name + "_variance") + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, 
+ stride, + bm, + gw, + se_on, + se_r, + shortcut=True, + name=None): + super(BottleneckBlock, self).__init__() + + # Compute the bottleneck width + w_b = int(round(num_filters * bm)) + # Compute the number of groups + num_gs = w_b // gw + self.se_on = se_on + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=w_b, + filter_size=1, + padding=0, + act="relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=w_b, + num_filters=w_b, + filter_size=3, + stride=stride, + padding=1, + groups=num_gs, + act="relu", + name=name + "_branch2b") + if se_on: + w_se = int(round(num_channels * se_r)) + self.se_block = SELayer( + num_channels=w_b, + num_filters=w_b, + reduction_ratio=w_se, + name=name + "_branch2se") + self.conv2 = ConvBNLayer( + num_channels=w_b, + num_filters=num_filters, + filter_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + if self.se_on: + conv1 = self.se_block(conv1) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class SELayer(nn.Layer): + def __init__(self, num_channels, num_filters, reduction_ratio, name=None): + super(SELayer, self).__init__() + + self.pool2d_gap = AdaptiveAvgPool2D(1) + + self._num_channels = num_channels + + med_ch = int(num_channels / reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.squeeze = Linear( + num_channels, + med_ch, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_sqz_weights"), + bias_attr=ParamAttr(name=name + "_sqz_offset")) + + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = Linear( + med_ch, + num_filters, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_exc_weights"), + bias_attr=ParamAttr(name=name + "_exc_offset")) + + def forward(self, input): + pool = self.pool2d_gap(input) + pool = paddle.reshape(pool, shape=[-1, self._num_channels]) + squeeze = self.squeeze(pool) + squeeze = F.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = F.sigmoid(excitation) + excitation = paddle.reshape( + excitation, shape=[-1, self._num_channels, 1, 1]) + out = input * excitation + return out + + +class RegNet(nn.Layer): + def __init__(self, + w_a, + w_0, + w_m, + d, + group_w, + bot_mul, + q=8, + se_on=False, + class_num=1000): + super(RegNet, self).__init__() + + # Generate RegNet ws per block + b_ws, num_s, max_s, ws_cont = generate_regnet(w_a, w_0, w_m, d, q) + # Convert to per stage format + ws, ds = get_stages_from_blocks(b_ws, b_ws) + # Generate group widths and bot muls + gws = [group_w for _ in range(num_s)] + bms = [bot_mul for _ in range(num_s)] + # Adjust the compatibility of ws and gws + ws, gws = adjust_ws_gs_comp(ws, bms, gws) + # Use the same stride for each stage + ss = [2 for _ in range(num_s)] + # Use SE for RegNetY + se_r = 0.25 + # Construct the model + # Group params by stage + stage_params = list(zip(ds, ws, ss, bms, gws)) + # Construct the stem + stem_type = "simple_stem_in" + stem_w = 32 + block_type = "res_bottleneck_block" + + self.conv = ConvBNLayer( + num_channels=3, + num_filters=stem_w, + filter_size=3, + stride=2, + padding=1, + act="relu", + name="stem_conv") + + self.block_list = [] + for block, (d, 
w_out, stride, bm, gw) in enumerate(stage_params):
+            shortcut = False
+            for i in range(d):
+                num_channels = stem_w if block == i == 0 else in_channels
+                # Stride applies only to the first block of each stage
+                b_stride = stride if i == 0 else 1
+                conv_name = "s" + str(block + 1) + "_b" + str(i + 1)  # chr(97 + i)
+                bottleneck_block = self.add_sublayer(
+                    conv_name,
+                    BottleneckBlock(
+                        num_channels=num_channels,
+                        num_filters=w_out,
+                        stride=b_stride,
+                        bm=bm,
+                        gw=gw,
+                        se_on=se_on,
+                        se_r=se_r,
+                        shortcut=shortcut,
+                        name=conv_name))
+                in_channels = w_out
+                self.block_list.append(bottleneck_block)
+                shortcut = True
+
+        self.pool2d_avg = AdaptiveAvgPool2D(1)
+
+        self.pool2d_avg_channels = w_out
+
+        stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0)
+
+        self.out = Linear(
+            self.pool2d_avg_channels,
+            class_num,
+            weight_attr=ParamAttr(
+                initializer=Uniform(-stdv, stdv), name="fc_0.w_0"),
+            bias_attr=ParamAttr(name="fc_0.b_0"))
+
+    def forward(self, inputs):
+        y = self.conv(inputs)
+        for block in self.block_list:
+            y = block(y)
+        y = self.pool2d_avg(y)
+        y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels])
+        y = self.out(y)
+        return y
+
+
+def _load_pretrained(pretrained, model, model_url, use_ssld=False):
+    if pretrained is False:
+        pass
+    elif pretrained is True:
+        load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld)
+    elif isinstance(pretrained, str):
+        load_dygraph_pretrain(model, pretrained)
+    else:
+        raise RuntimeError(
+            "pretrained type is not available. Please use `string` or `boolean` type."
+        )
+
+
+def RegNetX_200MF(pretrained=False, use_ssld=False, **kwargs):
+    model = RegNet(
+        w_a=36.44,
+        w_0=24,
+        w_m=2.49,
+        d=13,
+        group_w=8,
+        bot_mul=1.0,
+        q=8,
+        **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["RegNetX_200MF"], use_ssld=use_ssld)
+    return model
+
+
+def RegNetX_4GF(pretrained=False, use_ssld=False, **kwargs):
+    model = RegNet(
+        w_a=38.65,
+        w_0=96,
+        w_m=2.43,
+        d=23,
+        group_w=40,
+        bot_mul=1.0,
+        q=8,
+        **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["RegNetX_4GF"], use_ssld=use_ssld)
+    return model
+
+
+def RegNetX_32GF(pretrained=False, use_ssld=False, **kwargs):
+    model = RegNet(
+        w_a=69.86,
+        w_0=320,
+        w_m=2.0,
+        d=23,
+        group_w=168,
+        bot_mul=1.0,
+        q=8,
+        **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["RegNetX_32GF"], use_ssld=use_ssld)
+    return model
+
+
+def RegNetY_200MF(pretrained=False, use_ssld=False, **kwargs):
+    model = RegNet(
+        w_a=36.44,
+        w_0=24,
+        w_m=2.49,
+        d=13,
+        group_w=8,
+        bot_mul=1.0,
+        q=8,
+        se_on=True,
+        **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["RegNetY_200MF"], use_ssld=use_ssld)
+    return model
+
+
+def RegNetY_4GF(pretrained=False, use_ssld=False, **kwargs):
+    model = RegNet(
+        w_a=31.41,
+        w_0=96,
+        w_m=2.24,
+        d=22,
+        group_w=64,
+        bot_mul=1.0,
+        q=8,
+        se_on=True,
+        **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["RegNetY_4GF"], use_ssld=use_ssld)
+    return model
+
+
+def RegNetY_32GF(pretrained=False, use_ssld=False, **kwargs):
+    model = RegNet(
+        w_a=115.89,
+        w_0=232,
+        w_m=2.53,
+        d=20,
+        group_w=232,
+        bot_mul=1.0,
+        q=8,
+        se_on=True,
+        **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["RegNetY_32GF"], use_ssld=use_ssld)
+    return model
diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/repvgg.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/repvgg.py new file mode 100644 index 0000000..1218be7 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/repvgg.py @@ -0,0 +1,382 @@
+# copyright (c) 2020
PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/DingXiaoH/RepVGG + +import paddle.nn as nn +import paddle +import numpy as np + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "RepVGG_A0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_A0_pretrained.pdparams", + "RepVGG_A1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_A1_pretrained.pdparams", + "RepVGG_A2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_A2_pretrained.pdparams", + "RepVGG_B0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B0_pretrained.pdparams", + "RepVGG_B1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B1_pretrained.pdparams", + "RepVGG_B2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B2_pretrained.pdparams", + "RepVGG_B1g2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B1g2_pretrained.pdparams", + "RepVGG_B1g4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B1g4_pretrained.pdparams", + "RepVGG_B2g4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B2g4_pretrained.pdparams", + "RepVGG_B3g4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B3g4_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +optional_groupwise_layers = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26] +g2_map = {l: 2 for l in optional_groupwise_layers} +g4_map = {l: 4 for l in optional_groupwise_layers} + + +class ConvBN(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1): + super(ConvBN, self).__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias_attr=False) + self.bn = nn.BatchNorm2D(num_features=out_channels) + + def forward(self, x): + y = self.conv(x) + y = self.bn(y) + return y + + +class RepVGGBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros'): + super(RepVGGBlock, self).__init__() + self.is_repped = False + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.padding_mode = padding_mode + + assert kernel_size == 3 + assert padding == 1 + + padding_11 = padding - kernel_size // 2 + + self.nonlinearity = nn.ReLU() + + self.rbr_identity = nn.BatchNorm2D( + num_features=in_channels + ) if out_channels == in_channels and stride == 1 else None + self.rbr_dense = ConvBN( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups) + self.rbr_1x1 = ConvBN( + 
in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=padding_11, + groups=groups) + + def forward(self, inputs): + if not self.training and not self.is_repped: + self.rep() + self.is_repped = True + if self.training and self.is_repped: + self.is_repped = False + + if not self.training: + return self.nonlinearity(self.rbr_reparam(inputs)) + + if self.rbr_identity is None: + id_out = 0 + else: + id_out = self.rbr_identity(inputs) + return self.nonlinearity( + self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out) + + def rep(self): + if not hasattr(self, 'rbr_reparam'): + self.rbr_reparam = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + padding_mode=self.padding_mode) + kernel, bias = self.get_equivalent_kernel_bias() + self.rbr_reparam.weight.set_value(kernel) + self.rbr_reparam.bias.set_value(bias) + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) + kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) + return kernel3x3 + self._pad_1x1_to_3x3_tensor( + kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch): + if branch is None: + return 0, 0 + if isinstance(branch, ConvBN): + kernel = branch.conv.weight + running_mean = branch.bn._mean + running_var = branch.bn._variance + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn._epsilon + else: + assert isinstance(branch, nn.BatchNorm2D) + if not hasattr(self, 'id_tensor'): + input_dim = self.in_channels // self.groups + kernel_value = np.zeros( + (self.in_channels, input_dim, 3, 3), dtype=np.float32) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 1, 1] = 1 + self.id_tensor = paddle.to_tensor(kernel_value) + kernel = self.id_tensor + running_mean = branch._mean + running_var = branch._variance + gamma = branch.weight + beta = branch.bias + eps = branch._epsilon + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + +class RepVGG(nn.Layer): + def __init__(self, + num_blocks, + width_multiplier=None, + override_groups_map=None, + class_num=1000): + super(RepVGG, self).__init__() + + assert len(width_multiplier) == 4 + self.override_groups_map = override_groups_map or dict() + + assert 0 not in self.override_groups_map + + self.in_planes = min(64, int(64 * width_multiplier[0])) + + self.stage0 = RepVGGBlock( + in_channels=3, + out_channels=self.in_planes, + kernel_size=3, + stride=2, + padding=1) + self.cur_layer_idx = 1 + self.stage1 = self._make_stage( + int(64 * width_multiplier[0]), num_blocks[0], stride=2) + self.stage2 = self._make_stage( + int(128 * width_multiplier[1]), num_blocks[1], stride=2) + self.stage3 = self._make_stage( + int(256 * width_multiplier[2]), num_blocks[2], stride=2) + self.stage4 = self._make_stage( + int(512 * width_multiplier[3]), num_blocks[3], stride=2) + self.gap = nn.AdaptiveAvgPool2D(output_size=1) + self.linear = nn.Linear(int(512 * width_multiplier[3]), class_num) + + def _make_stage(self, planes, num_blocks, stride): + strides = [stride] + [1] * (num_blocks - 1) + blocks = [] + for stride in strides: + 
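# groups for this block come from the override map: cur_layer_idx counts
+            # RepVGG blocks globally across stages, and the g2/g4 variants
+            # pass maps keyed on the even indices 2, 4, ..., 26, so every
+            # second block uses a group-wise 3x3 conv
+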
cur_groups = self.override_groups_map.get(self.cur_layer_idx, 1) + blocks.append( + RepVGGBlock( + in_channels=self.in_planes, + out_channels=planes, + kernel_size=3, + stride=stride, + padding=1, + groups=cur_groups)) + self.in_planes = planes + self.cur_layer_idx += 1 + return nn.Sequential(*blocks) + + def forward(self, x): + out = self.stage0(x) + out = self.stage1(out) + out = self.stage2(out) + out = self.stage3(out) + out = self.stage4(out) + out = self.gap(out) + out = paddle.flatten(out, start_axis=1) + out = self.linear(out) + return out + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def RepVGG_A0(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[2, 4, 14, 1], + width_multiplier=[0.75, 0.75, 0.75, 2.5], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_A0"], use_ssld=use_ssld) + return model + + +def RepVGG_A1(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[2, 4, 14, 1], + width_multiplier=[1, 1, 1, 2.5], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_A1"], use_ssld=use_ssld) + return model + + +def RepVGG_A2(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[2, 4, 14, 1], + width_multiplier=[1.5, 1.5, 1.5, 2.75], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_A2"], use_ssld=use_ssld) + return model + + +def RepVGG_B0(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[1, 1, 1, 2.5], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B0"], use_ssld=use_ssld) + return model + + +def RepVGG_B1(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[2, 2, 2, 4], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B1"], use_ssld=use_ssld) + return model + + +def RepVGG_B1g2(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[2, 2, 2, 4], + override_groups_map=g2_map, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B1g2"], use_ssld=use_ssld) + return model + + +def RepVGG_B1g4(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[2, 2, 2, 4], + override_groups_map=g4_map, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B1g4"], use_ssld=use_ssld) + return model + + +def RepVGG_B2(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[2.5, 2.5, 2.5, 5], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B2"], use_ssld=use_ssld) + return model + + +def RepVGG_B2g4(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[2.5, 2.5, 2.5, 5], + override_groups_map=g4_map, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B2g4"], use_ssld=use_ssld) + return model + + 
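+
+# Editor's usage sketch (hedged, not part of the upstream file):
+# re-parameterization is driven by the training flag rather than an explicit
+# convert call. In eval mode each RepVGGBlock fuses its 3x3, 1x1 and identity
+# branches into the single rbr_reparam conv on its next forward pass:
+#
+#     import paddle
+#     model = RepVGG_A0(pretrained=False)
+#     model.eval()  # the next forward triggers block.rep()
+#     out = model(paddle.rand([1, 3, 224, 224]))  # single-branch inference
+#
+# Calling model.train() clears is_repped, so training uses the multi-branch
+# structure again.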
+def RepVGG_B3g4(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[3, 3, 3, 5], + override_groups_map=g4_map, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B3g4"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/res2net.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/res2net.py new file mode 100644 index 0000000..191cc84 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/res2net.py @@ -0,0 +1,264 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "Res2Net50_26w_4s": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Res2Net50_26w_4s_pretrained.pdparams", + "Res2Net50_14w_8s": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Res2Net50_14w_8s_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels1, + num_channels2, + num_filters, + stride, + scales, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + self.stride = stride + self.scales = scales + self.conv0 = ConvBNLayer( + num_channels=num_channels1, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a") + self.conv1_list = [] + for s in range(scales - 1): + conv1 = self.add_sublayer( + name + '_branch2b_' + str(s + 1), + ConvBNLayer( + num_channels=num_filters // scales, + num_filters=num_filters // scales, + filter_size=3, + stride=stride, + act='relu', + name=name + '_branch2b_' + str(s + 1))) + 
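# each 3x3 conv handles one of the `scales` channel splits; in
+            # forward(), every later split is fed x_s plus the previous
+            # split's output, forming Res2Net's hierarchical residual
+            # connections
+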
self.conv1_list.append(conv1) + self.pool2d_avg = AvgPool2D(kernel_size=3, stride=stride, padding=1) + + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_channels2, + filter_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels1, + num_filters=num_channels2, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + xs = paddle.split(y, self.scales, 1) + ys = [] + for s, conv1 in enumerate(self.conv1_list): + if s == 0 or self.stride == 2: + ys.append(conv1(xs[s])) + else: + ys.append(conv1(paddle.add(xs[s], ys[-1]))) + if self.stride == 1: + ys.append(xs[-1]) + else: + ys.append(self.pool2d_avg(xs[-1])) + conv1 = paddle.concat(ys, axis=1) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class Res2Net(nn.Layer): + def __init__(self, layers=50, scales=4, width=26, class_num=1000): + super(Res2Net, self).__init__() + + self.layers = layers + self.scales = scales + self.width = width + basic_width = self.width * self.scales + supported_layers = [50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, 1024] + num_channels2 = [256, 512, 1024, 2048] + num_filters = [basic_width * t for t in [1, 2, 4, 8]] + + self.conv1 = ConvBNLayer( + num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu', + name="conv1") + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels1=num_channels[block] + if i == 0 else num_channels2[block], + num_channels2=num_channels2[block], + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + scales=scales, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, inputs): + y = self.conv1(inputs) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise 
RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def Res2Net50_26w_4s(pretrained=False, use_ssld=False, **kwargs): + model = Res2Net(layers=50, scales=4, width=26, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["Res2Net50_26w_4s"], use_ssld=use_ssld) + return model + + +def Res2Net50_14w_8s(pretrained=False, use_ssld=False, **kwargs): + model = Res2Net(layers=50, scales=8, width=14, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["Res2Net50_14w_8s"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/res2net_vd.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/res2net_vd.py new file mode 100644 index 0000000..a375679 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/res2net_vd.py @@ -0,0 +1,305 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "Res2Net50_vd_26w_4s": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Res2Net50_vd_26w_4s_pretrained.pdparams", + "Res2Net101_vd_26w_4s": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Res2Net101_vd_26w_4s_pretrained.pdparams", + "Res2Net200_vd_26w_4s": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Res2Net200_vd_26w_4s_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels1, + num_channels2, + 
num_filters, + stride, + scales, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + self.stride = stride + self.scales = scales + self.conv0 = ConvBNLayer( + num_channels=num_channels1, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a") + self.conv1_list = [] + for s in range(scales - 1): + conv1 = self.add_sublayer( + name + '_branch2b_' + str(s + 1), + ConvBNLayer( + num_channels=num_filters // scales, + num_filters=num_filters // scales, + filter_size=3, + stride=stride, + act='relu', + name=name + '_branch2b_' + str(s + 1))) + self.conv1_list.append(conv1) + self.pool2d_avg = AvgPool2D(kernel_size=3, stride=stride, padding=1) + + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_channels2, + filter_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels1, + num_filters=num_channels2, + filter_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + xs = paddle.split(y, self.scales, 1) + ys = [] + for s, conv1 in enumerate(self.conv1_list): + if s == 0 or self.stride == 2: + ys.append(conv1(xs[s])) + else: + ys.append(conv1(xs[s] + ys[-1])) + if self.stride == 1: + ys.append(xs[-1]) + else: + ys.append(self.pool2d_avg(xs[-1])) + conv1 = paddle.concat(ys, axis=1) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class Res2Net_vd(nn.Layer): + def __init__(self, layers=50, scales=4, width=26, class_num=1000): + super(Res2Net_vd, self).__init__() + + self.layers = layers + self.scales = scales + self.width = width + basic_width = self.width * self.scales + supported_layers = [50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, 1024] + num_channels2 = [256, 512, 1024, 2048] + num_filters = [basic_width * t for t in [1, 2, 4, 8]] + + self.conv1_1 = ConvBNLayer( + num_channels=3, + num_filters=32, + filter_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + num_channels=32, + num_filters=32, + filter_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + num_channels=32, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv1_3") + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152, 200] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels1=num_channels[block] + if i == 0 else num_channels2[block], + num_channels2=num_channels2[block], + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + scales=scales, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + self.block_list.append(bottleneck_block) 
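+                # the projection shortcut is only needed for the first block
+                # of each stage; subsequent blocks use the identity shortcut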
+ shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def Res2Net50_vd_26w_4s(pretrained=False, use_ssld=False, **kwargs): + model = Res2Net_vd(layers=50, scales=4, width=26, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["Res2Net50_vd_26w_4s"], + use_ssld=use_ssld) + return model + + +def Res2Net101_vd_26w_4s(pretrained=False, use_ssld=False, **kwargs): + model = Res2Net_vd(layers=101, scales=4, width=26, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["Res2Net101_vd_26w_4s"], + use_ssld=use_ssld) + return model + + +def Res2Net200_vd_26w_4s(pretrained=False, use_ssld=False, **kwargs): + model = Res2Net_vd(layers=200, scales=4, width=26, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["Res2Net200_vd_26w_4s"], + use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/resnest.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/resnest.py new file mode 100644 index 0000000..88eee8a --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/resnest.py @@ -0,0 +1,740 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
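+
+# Editor's note (hedged, not from the upstream file): the heart of ResNeSt is
+# the Split-Attention block defined below. SplatConv expands the input to
+# radix groups of `channels` feature maps, sums the splits, squeezes the
+# result through a 1x1 bottleneck, and emits per-split attention logits that
+# rSoftmax normalizes across the radix dimension. With radix=2 and
+# cardinality=1, an input of shape [N, C, H, W] becomes [N, 2C, H, W] after
+# conv1; the two C-channel splits are summed and pooled to [N, C, 1, 1], and
+# the resulting [N, 2C, 1, 1] attention tensor is softmax-normalized over the
+# two splits before re-weighting and re-summing them.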
+ +# Code was based on https://github.com/zhanghang1989/ResNeSt + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +import math +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import KaimingNormal +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.regularizer import L2Decay + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "ResNeSt50_fast_1s1x64d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeSt50_fast_1s1x64d_pretrained.pdparams", + "ResNeSt50": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeSt50_pretrained.pdparams", + "ResNeSt101": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeSt101_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + dilation=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + bn_decay = 0.0 + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + dilation=dilation, + groups=groups, + weight_attr=ParamAttr(name=name + "_weight"), + bias_attr=False) + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr( + name=name + "_scale", regularizer=L2Decay(bn_decay)), + bias_attr=ParamAttr( + name + "_offset", regularizer=L2Decay(bn_decay)), + moving_mean_name=name + "_mean", + moving_variance_name=name + "_variance") + + def forward(self, x): + x = self._conv(x) + x = self._batch_norm(x) + return x + + +class rSoftmax(nn.Layer): + def __init__(self, radix, cardinality): + super(rSoftmax, self).__init__() + self.radix = radix + self.cardinality = cardinality + + def forward(self, x): + cardinality = self.cardinality + radix = self.radix + + batch, r, h, w = x.shape + if self.radix > 1: + x = paddle.reshape( + x=x, + shape=[ + batch, cardinality, radix, + int(r * h * w / cardinality / radix) + ]) + x = paddle.transpose(x=x, perm=[0, 2, 1, 3]) + x = nn.functional.softmax(x, axis=1) + x = paddle.reshape(x=x, shape=[batch, r * h * w, 1, 1]) + else: + x = nn.functional.sigmoid(x) + return x + + +class SplatConv(nn.Layer): + def __init__(self, + in_channels, + channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + radix=2, + reduction_factor=4, + rectify_avg=False, + name=None): + super(SplatConv, self).__init__() + + self.radix = radix + + self.conv1 = ConvBNLayer( + num_channels=in_channels, + num_filters=channels * radix, + filter_size=kernel_size, + stride=stride, + groups=groups * radix, + act="relu", + name=name + "_1_weights") + + self.avg_pool2d = AdaptiveAvgPool2D(1) + + inter_channels = int(max(in_channels * radix // reduction_factor, 32)) + + # to calc gap + self.conv2 = ConvBNLayer( + num_channels=channels, + num_filters=inter_channels, + filter_size=1, + stride=1, + groups=groups, + act="relu", + name=name + "_2_weights") + + # to calc atten + self.conv3 = Conv2D( + in_channels=inter_channels, + out_channels=channels * radix, + kernel_size=1, + stride=1, + padding=0, + groups=groups, + weight_attr=ParamAttr( + name=name + "_weights", initializer=KaimingNormal()), + bias_attr=False) + + 
self.rsoftmax = rSoftmax(radix=radix, cardinality=groups) + + def forward(self, x): + x = self.conv1(x) + + if self.radix > 1: + splited = paddle.split(x, num_or_sections=self.radix, axis=1) + gap = paddle.add_n(splited) + else: + gap = x + + gap = self.avg_pool2d(gap) + gap = self.conv2(gap) + + atten = self.conv3(gap) + atten = self.rsoftmax(atten) + + if self.radix > 1: + attens = paddle.split(atten, num_or_sections=self.radix, axis=1) + y = paddle.add_n([ + paddle.multiply(split, att) + for (att, split) in zip(attens, splited) + ]) + else: + y = paddle.multiply(x, atten) + + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + inplanes, + planes, + stride=1, + radix=1, + cardinality=1, + bottleneck_width=64, + avd=False, + avd_first=False, + dilation=1, + is_first=False, + rectify_avg=False, + last_gamma=False, + avg_down=False, + name=None): + super(BottleneckBlock, self).__init__() + self.inplanes = inplanes + self.planes = planes + self.stride = stride + self.radix = radix + self.cardinality = cardinality + self.avd = avd + self.avd_first = avd_first + self.dilation = dilation + self.is_first = is_first + self.rectify_avg = rectify_avg + self.last_gamma = last_gamma + self.avg_down = avg_down + + group_width = int(planes * (bottleneck_width / 64.)) * cardinality + + self.conv1 = ConvBNLayer( + num_channels=self.inplanes, + num_filters=group_width, + filter_size=1, + stride=1, + groups=1, + act="relu", + name=name + "_conv1") + + if avd and avd_first and (stride > 1 or is_first): + self.avg_pool2d_1 = AvgPool2D( + kernel_size=3, stride=stride, padding=1) + + if radix >= 1: + self.conv2 = SplatConv( + in_channels=group_width, + channels=group_width, + kernel_size=3, + stride=1, + padding=dilation, + dilation=dilation, + groups=cardinality, + bias=False, + radix=radix, + rectify_avg=rectify_avg, + name=name + "_splat") + else: + self.conv2 = ConvBNLayer( + num_channels=group_width, + num_filters=group_width, + filter_size=3, + stride=1, + dilation=dilation, + groups=cardinality, + act="relu", + name=name + "_conv2") + + if avd and avd_first == False and (stride > 1 or is_first): + self.avg_pool2d_2 = AvgPool2D( + kernel_size=3, stride=stride, padding=1) + + self.conv3 = ConvBNLayer( + num_channels=group_width, + num_filters=planes * 4, + filter_size=1, + stride=1, + groups=1, + act=None, + name=name + "_conv3") + + if stride != 1 or self.inplanes != self.planes * 4: + if avg_down: + if dilation == 1: + self.avg_pool2d_3 = AvgPool2D( + kernel_size=stride, stride=stride, padding=0) + else: + self.avg_pool2d_3 = AvgPool2D( + kernel_size=1, stride=1, padding=0, ceil_mode=True) + + self.conv4 = Conv2D( + in_channels=self.inplanes, + out_channels=planes * 4, + kernel_size=1, + stride=1, + padding=0, + groups=1, + weight_attr=ParamAttr( + name=name + "_weights", initializer=KaimingNormal()), + bias_attr=False) + else: + self.conv4 = Conv2D( + in_channels=self.inplanes, + out_channels=planes * 4, + kernel_size=1, + stride=stride, + padding=0, + groups=1, + weight_attr=ParamAttr( + name=name + "_shortcut_weights", + initializer=KaimingNormal()), + bias_attr=False) + + bn_decay = 0.0 + self._batch_norm = BatchNorm( + planes * 4, + act=None, + param_attr=ParamAttr( + name=name + "_shortcut_scale", + regularizer=L2Decay(bn_decay)), + bias_attr=ParamAttr( + name + "_shortcut_offset", regularizer=L2Decay(bn_decay)), + moving_mean_name=name + "_shortcut_mean", + moving_variance_name=name + "_shortcut_variance") + + def forward(self, x): + short = x + + x = self.conv1(x) + if self.avd 
and self.avd_first and (self.stride > 1 or self.is_first): + x = self.avg_pool2d_1(x) + + x = self.conv2(x) + + if self.avd and self.avd_first == False and (self.stride > 1 or + self.is_first): + x = self.avg_pool2d_2(x) + + x = self.conv3(x) + + if self.stride != 1 or self.inplanes != self.planes * 4: + if self.avg_down: + short = self.avg_pool2d_3(short) + + short = self.conv4(short) + + short = self._batch_norm(short) + + y = paddle.add(x=short, y=x) + y = F.relu(y) + return y + + +class ResNeStLayer(nn.Layer): + def __init__(self, + inplanes, + planes, + blocks, + radix, + cardinality, + bottleneck_width, + avg_down, + avd, + avd_first, + rectify_avg, + last_gamma, + stride=1, + dilation=1, + is_first=True, + name=None): + super(ResNeStLayer, self).__init__() + self.inplanes = inplanes + self.planes = planes + self.blocks = blocks + self.radix = radix + self.cardinality = cardinality + self.bottleneck_width = bottleneck_width + self.avg_down = avg_down + self.avd = avd + self.avd_first = avd_first + self.rectify_avg = rectify_avg + self.last_gamma = last_gamma + self.is_first = is_first + + if dilation == 1 or dilation == 2: + bottleneck_func = self.add_sublayer( + name + "_bottleneck_0", + BottleneckBlock( + inplanes=self.inplanes, + planes=planes, + stride=stride, + radix=radix, + cardinality=cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + dilation=1, + is_first=is_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + name=name + "_bottleneck_0")) + elif dilation == 4: + bottleneck_func = self.add_sublayer( + name + "_bottleneck_0", + BottleneckBlock( + inplanes=self.inplanes, + planes=planes, + stride=stride, + radix=radix, + cardinality=cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + dilation=2, + is_first=is_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + name=name + "_bottleneck_0")) + else: + raise RuntimeError("=>unknown dilation size") + + self.inplanes = planes * 4 + self.bottleneck_block_list = [bottleneck_func] + for i in range(1, blocks): + curr_name = name + "_bottleneck_" + str(i) + + bottleneck_func = self.add_sublayer( + curr_name, + BottleneckBlock( + inplanes=self.inplanes, + planes=planes, + radix=radix, + cardinality=cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + dilation=dilation, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + name=curr_name)) + self.bottleneck_block_list.append(bottleneck_func) + + def forward(self, x): + for bottleneck_block in self.bottleneck_block_list: + x = bottleneck_block(x) + return x + + +class ResNeSt(nn.Layer): + def __init__(self, + layers, + radix=1, + groups=1, + bottleneck_width=64, + dilated=False, + dilation=1, + deep_stem=False, + stem_width=64, + avg_down=False, + rectify_avg=False, + avd=False, + avd_first=False, + final_drop=0.0, + last_gamma=False, + class_num=1000): + super(ResNeSt, self).__init__() + + self.cardinality = groups + self.bottleneck_width = bottleneck_width + # ResNet-D params + self.inplanes = stem_width * 2 if deep_stem else 64 + self.avg_down = avg_down + self.last_gamma = last_gamma + # ResNeSt params + self.radix = radix + self.avd = avd + self.avd_first = avd_first + + self.deep_stem = deep_stem + self.stem_width = stem_width + self.layers = layers + self.final_drop = final_drop + self.dilated = dilated + self.dilation = dilation + + self.rectify_avg = rectify_avg + + if self.deep_stem: + 
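# Deep stem (ResNet-C/D style): three stacked 3x3 convs, stem_width -> stem_width -> 2*stem_width, stand in for the single 7x7 conv of the plain stem below. +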
self.stem = nn.Sequential( + ("conv1", ConvBNLayer( + num_channels=3, + num_filters=stem_width, + filter_size=3, + stride=2, + act="relu", + name="conv1")), ("conv2", ConvBNLayer( + num_channels=stem_width, + num_filters=stem_width, + filter_size=3, + stride=1, + act="relu", + name="conv2")), ("conv3", ConvBNLayer( + num_channels=stem_width, + num_filters=stem_width * 2, + filter_size=3, + stride=1, + act="relu", + name="conv3"))) + else: + self.stem = ConvBNLayer( + num_channels=3, + num_filters=stem_width, + filter_size=7, + stride=2, + act="relu", + name="conv1") + + self.max_pool2d = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.layer1 = ResNeStLayer( + inplanes=self.stem_width * 2 + if self.deep_stem else self.stem_width, + planes=64, + blocks=self.layers[0], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=1, + dilation=1, + is_first=False, + name="layer1") + + # return + + self.layer2 = ResNeStLayer( + inplanes=256, + planes=128, + blocks=self.layers[1], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=2, + name="layer2") + + if self.dilated or self.dilation == 4: + self.layer3 = ResNeStLayer( + inplanes=512, + planes=256, + blocks=self.layers[2], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=1, + dilation=2, + name="layer3") + self.layer4 = ResNeStLayer( + inplanes=1024, + planes=512, + blocks=self.layers[3], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=1, + dilation=4, + name="layer4") + elif self.dilation == 2: + self.layer3 = ResNeStLayer( + inplanes=512, + planes=256, + blocks=self.layers[2], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=2, + dilation=1, + name="layer3") + self.layer4 = ResNeStLayer( + inplanes=1024, + planes=512, + blocks=self.layers[3], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=1, + dilation=2, + name="layer4") + else: + self.layer3 = ResNeStLayer( + inplanes=512, + planes=256, + blocks=self.layers[2], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=2, + name="layer3") + self.layer4 = ResNeStLayer( + inplanes=1024, + planes=512, + blocks=self.layers[3], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=2, + name="layer4") + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.out_channels = 2048 + + stdv = 1.0 / math.sqrt(self.out_channels * 1.0) + + self.out = Linear( + self.out_channels, + class_num, + 
weight_attr=ParamAttr( + initializer=nn.initializer.Uniform(-stdv, stdv), + name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, x): + x = self.stem(x) + x = self.max_pool2d(x) + x = self.layer1(x) + x = self.layer2(x) + + x = self.layer3(x) + + x = self.layer4(x) + x = self.pool2d_avg(x) + x = paddle.reshape(x, shape=[-1, self.out_channels]) + x = self.out(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNeSt50_fast_1s1x64d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeSt( + layers=[3, 4, 6, 3], + radix=1, + groups=1, + bottleneck_width=64, + deep_stem=True, + stem_width=32, + avg_down=True, + avd=True, + avd_first=True, + final_drop=0.0, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeSt50_fast_1s1x64d"], + use_ssld=use_ssld) + return model + + +def ResNeSt50(pretrained=False, use_ssld=False, **kwargs): + model = ResNeSt( + layers=[3, 4, 6, 3], + radix=2, + groups=1, + bottleneck_width=64, + deep_stem=True, + stem_width=32, + avg_down=True, + avd=True, + avd_first=False, + final_drop=0.0, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeSt50"], use_ssld=use_ssld) + return model + + +def ResNeSt101(pretrained=False, use_ssld=False, **kwargs): + model = ResNeSt( + layers=[3, 4, 23, 3], + radix=2, + groups=1, + bottleneck_width=64, + deep_stem=True, + stem_width=64, + avg_down=True, + avd=True, + avd_first=False, + final_drop=0.0, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeSt101"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/resnet_vc.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/resnet_vc.py new file mode 100644 index 0000000..6b972dc --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/resnet_vc.py @@ -0,0 +1,309 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
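A usage sketch for the ResNeSt factories defined above (a minimal example, assuming paddle >= 2.0 is installed and this repository is on PYTHONPATH; the module path is assumed by analogy with the sibling files in this diff, and the 224x224 input size is the ImageNet convention, not something this file mandates):

    import paddle
    # hypothetical module path, inferred from the neighbouring model_zoo files
    from ppcls.arch.backbone.model_zoo.resnest import ResNeSt50

    model = ResNeSt50(pretrained=False)   # pretrained=True would fetch MODEL_URLS["ResNeSt50"]
    model.eval()
    x = paddle.randn([1, 3, 224, 224])    # NCHW, ImageNet-sized batch (illustrative)
    with paddle.no_grad():
        logits = model(x)                 # [1, 1000] with the default class_num
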
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "ResNet50_vc": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vc_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + self._num_channels_out = num_filters * 4 + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +class 
ResNet_vc(nn.Layer): + def __init__(self, layers=50, class_num=1000): + super(ResNet_vc, self).__init__() + + self.layers = layers + supported_layers = [18, 34, 50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, + 1024] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv1_1 = ConvBNLayer( + num_channels=3, + num_filters=32, + filter_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + num_channels=32, + num_filters=32, + filter_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + num_channels=32, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv1_3") + + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block], + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + self.block_list.append(basic_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_0.w_0"), + bias_attr=ParamAttr(name="fc_0.b_0")) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def ResNet50_vc(pretrained=False, use_ssld=False, **kwargs): + model = ResNet_vc(layers=50, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNet50_vc"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/resnext.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/resnext.py new file mode 100644 index 0000000..1aef811 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/resnext.py @@ -0,0 +1,298 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "ResNeXt50_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt50_32x4d_pretrained.pdparams", + "ResNeXt50_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt50_64x4d_pretrained.pdparams", + "ResNeXt101_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x4d_pretrained.pdparams", + "ResNeXt101_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_64x4d_pretrained.pdparams", + "ResNeXt152_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt152_32x4d_pretrained.pdparams", + "ResNeXt152_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt152_64x4d_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None, + data_format="NCHW"): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False, + data_format=data_format) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance', + data_layout=data_format) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + cardinality, + shortcut=True, + name=None, + data_format="NCHW"): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer( + 
num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a", + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + groups=cardinality, + stride=stride, + act='relu', + name=name + "_branch2b", + data_format=data_format) + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 2 if cardinality == 32 else num_filters, + filter_size=1, + act=None, + name=name + "_branch2c", + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 2 + if cardinality == 32 else num_filters, + filter_size=1, + stride=stride, + name=name + "_branch1", + data_format=data_format) + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class ResNeXt(nn.Layer): + def __init__(self, + layers=50, + class_num=1000, + cardinality=32, + input_image_channel=3, + data_format="NCHW"): + super(ResNeXt, self).__init__() + + self.layers = layers + self.data_format = data_format + self.input_image_channel = input_image_channel + self.cardinality = cardinality + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + supported_cardinality = [32, 64] + assert cardinality in supported_cardinality, \ + "supported cardinality is {} but input cardinality is {}" \ + .format(supported_cardinality, cardinality) + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, 1024] + num_filters = [128, 256, 512, + 1024] if cardinality == 32 else [256, 512, 1024, 2048] + + self.conv = ConvBNLayer( + num_channels=self.input_image_channel, + num_filters=64, + filter_size=7, + stride=2, + act='relu', + name="res_conv1", + data_format=self.data_format) + self.pool2d_max = MaxPool2D( + kernel_size=3, stride=2, padding=1, data_format=self.data_format) + + self.block_list = [] + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] if i == 0 else + num_filters[block] * int(64 // self.cardinality), + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=self.cardinality, + shortcut=shortcut, + name=conv_name, + data_format=self.data_format)) + self.block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1, data_format=self.data_format) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, inputs): + with paddle.static.amp.fp16_guard(): + if self.data_format == "NHWC": + inputs = 
paddle.tensor.transpose(inputs, [0, 2, 3, 1]) + inputs.stop_gradient = True + y = self.conv(inputs) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNeXt50_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=50, cardinality=32, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt50_32x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt50_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=50, cardinality=64, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt50_64x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt101_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=101, cardinality=32, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt101_32x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt101_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=101, cardinality=64, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt101_64x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt152_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=152, cardinality=32, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt152_32x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt152_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=152, cardinality=64, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt152_64x4d"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/resnext101_wsl.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/resnext101_wsl.py new file mode 100644 index 0000000..e85e133 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/resnext101_wsl.py @@ -0,0 +1,490 @@ +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "ResNeXt101_32x8d_wsl": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x8d_wsl_pretrained.pdparams", + "ResNeXt101_32x16d_wsl": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x16_wsl_pretrained.pdparams", + "ResNeXt101_32x32d_wsl": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x32d_wsl_pretrained.pdparams", + "ResNeXt101_32x48d_wsl": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x48d_wsl_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + if "downsample" in name: + conv_name = name + ".0" 
+ else: + conv_name = name + self._conv = Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=conv_name + ".weight"), + bias_attr=False) + if "downsample" in name: + bn_name = name[:9] + "downsample.1" + else: + if "conv1" == name: + bn_name = "bn" + name[-1] + else: + bn_name = (name[:10] if name[7:9].isdigit() else name[:9] + ) + "bn" + name[-1] + self._bn = BatchNorm( + num_channels=output_channels, + act=act, + param_attr=ParamAttr(name=bn_name + ".weight"), + bias_attr=ParamAttr(name=bn_name + ".bias"), + moving_mean_name=bn_name + ".running_mean", + moving_variance_name=bn_name + ".running_var") + + def forward(self, inputs): + x = self._conv(inputs) + x = self._bn(x) + return x + + +class ShortCut(nn.Layer): + def __init__(self, input_channels, output_channels, stride, name=None): + super(ShortCut, self).__init__() + + self.input_channels = input_channels + self.output_channels = output_channels + self.stride = stride + if input_channels != output_channels or stride != 1: + self._conv = ConvBNLayer( + input_channels, + output_channels, + filter_size=1, + stride=stride, + name=name) + + def forward(self, inputs): + if self.input_channels != self.output_channels or self.stride != 1: + return self._conv(inputs) + return inputs + + +class BottleneckBlock(nn.Layer): + def __init__(self, input_channels, output_channels, stride, cardinality, + width, name): + super(BottleneckBlock, self).__init__() + + self._conv0 = ConvBNLayer( + input_channels, + output_channels, + filter_size=1, + act="relu", + name=name + ".conv1") + self._conv1 = ConvBNLayer( + output_channels, + output_channels, + filter_size=3, + act="relu", + stride=stride, + groups=cardinality, + name=name + ".conv2") + self._conv2 = ConvBNLayer( + output_channels, + output_channels // (width // 8), + filter_size=1, + act=None, + name=name + ".conv3") + self._short = ShortCut( + input_channels, + output_channels // (width // 8), + stride=stride, + name=name + ".downsample") + + def forward(self, inputs): + x = self._conv0(inputs) + x = self._conv1(x) + x = self._conv2(x) + y = self._short(inputs) + y = paddle.add(x, y) + y = F.relu(y) + return y + + +class ResNeXt101WSL(nn.Layer): + def __init__(self, layers=101, cardinality=32, width=48, class_num=1000): + super(ResNeXt101WSL, self).__init__() + + self.class_num = class_num + + self.layers = layers + self.cardinality = cardinality + self.width = width + self.scale = width // 8 + + self.depth = [3, 4, 23, 3] + self.base_width = cardinality * width + num_filters = [self.base_width * i + for i in [1, 2, 4, 8]] # [256, 512, 1024, 2048] + self._conv_stem = ConvBNLayer( + 3, 64, 7, stride=2, act="relu", name="conv1") + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self._conv1_0 = BottleneckBlock( + 64, + num_filters[0], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer1.0") + self._conv1_1 = BottleneckBlock( + num_filters[0] // (width // 8), + num_filters[0], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer1.1") + self._conv1_2 = BottleneckBlock( + num_filters[0] // (width // 8), + num_filters[0], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer1.2") + + self._conv2_0 = BottleneckBlock( + num_filters[0] // (width // 8), + num_filters[1], + stride=2, + cardinality=self.cardinality, + width=self.width, + name="layer2.0") + self._conv2_1 = 
BottleneckBlock( + num_filters[1] // (width // 8), + num_filters[1], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer2.1") + self._conv2_2 = BottleneckBlock( + num_filters[1] // (width // 8), + num_filters[1], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer2.2") + self._conv2_3 = BottleneckBlock( + num_filters[1] // (width // 8), + num_filters[1], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer2.3") + + self._conv3_0 = BottleneckBlock( + num_filters[1] // (width // 8), + num_filters[2], + stride=2, + cardinality=self.cardinality, + width=self.width, + name="layer3.0") + self._conv3_1 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.1") + self._conv3_2 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.2") + self._conv3_3 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.3") + self._conv3_4 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.4") + self._conv3_5 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.5") + self._conv3_6 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.6") + self._conv3_7 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.7") + self._conv3_8 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.8") + self._conv3_9 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.9") + self._conv3_10 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.10") + self._conv3_11 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.11") + self._conv3_12 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.12") + self._conv3_13 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.13") + self._conv3_14 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.14") + self._conv3_15 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.15") + self._conv3_16 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.16") + self._conv3_17 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + 
name="layer3.17") + self._conv3_18 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.18") + self._conv3_19 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.19") + self._conv3_20 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.20") + self._conv3_21 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.21") + self._conv3_22 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.22") + + self._conv4_0 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[3], + stride=2, + cardinality=self.cardinality, + width=self.width, + name="layer4.0") + self._conv4_1 = BottleneckBlock( + num_filters[3] // (width // 8), + num_filters[3], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer4.1") + self._conv4_2 = BottleneckBlock( + num_filters[3] // (width // 8), + num_filters[3], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer4.2") + + self._avg_pool = AdaptiveAvgPool2D(1) + self._out = Linear( + num_filters[3] // (width // 8), + class_num, + weight_attr=ParamAttr(name="fc.weight"), + bias_attr=ParamAttr(name="fc.bias")) + + def forward(self, inputs): + x = self._conv_stem(inputs) + x = self._pool(x) + + x = self._conv1_0(x) + x = self._conv1_1(x) + x = self._conv1_2(x) + + x = self._conv2_0(x) + x = self._conv2_1(x) + x = self._conv2_2(x) + x = self._conv2_3(x) + + x = self._conv3_0(x) + x = self._conv3_1(x) + x = self._conv3_2(x) + x = self._conv3_3(x) + x = self._conv3_4(x) + x = self._conv3_5(x) + x = self._conv3_6(x) + x = self._conv3_7(x) + x = self._conv3_8(x) + x = self._conv3_9(x) + x = self._conv3_10(x) + x = self._conv3_11(x) + x = self._conv3_12(x) + x = self._conv3_13(x) + x = self._conv3_14(x) + x = self._conv3_15(x) + x = self._conv3_16(x) + x = self._conv3_17(x) + x = self._conv3_18(x) + x = self._conv3_19(x) + x = self._conv3_20(x) + x = self._conv3_21(x) + x = self._conv3_22(x) + + x = self._conv4_0(x) + x = self._conv4_1(x) + x = self._conv4_2(x) + + x = self._avg_pool(x) + x = paddle.squeeze(x, axis=[2, 3]) + x = self._out(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def ResNeXt101_32x8d_wsl(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt101WSL(cardinality=32, width=8, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt101_32x8d_wsl"], + use_ssld=use_ssld) + return model + + +def ResNeXt101_32x16d_wsl(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt101WSL(cardinality=32, width=16, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt101_32x16d_wsl"], + use_ssld=use_ssld) + return model + + +def ResNeXt101_32x32d_wsl(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt101WSL(cardinality=32, width=32, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt101_32x32d_wsl"], + use_ssld=use_ssld) + return model + + +def ResNeXt101_32x48d_wsl(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt101WSL(cardinality=32, width=48, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt101_32x48d_wsl"], + use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/resnext_vd.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/resnext_vd.py new file mode 100644 index 0000000..b2bd484 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/resnext_vd.py @@ -0,0 +1,317 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
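The `_load_pretrained` helper repeated across these files dispatches on the type of `pretrained`: `False` skips loading, `True` downloads the weights registered in `MODEL_URLS`, and a string is treated as a local checkpoint path; anything else raises. A minimal sketch of the three cases (the local path is a hypothetical placeholder):

    from ppcls.arch.backbone.model_zoo.resnext101_wsl import ResNeXt101_32x8d_wsl

    m = ResNeXt101_32x8d_wsl(pretrained=False)             # random initialization
    m = ResNeXt101_32x8d_wsl(pretrained=True)              # fetch released weights by URL
    m = ResNeXt101_32x8d_wsl(pretrained="local.pdparams")  # load a local .pdparams file (placeholder path)
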
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "ResNeXt50_vd_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt50_vd_32x4d_pretrained.pdparams", + "ResNeXt50_vd_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt50_vd_64x4d_pretrained.pdparams", + "ResNeXt101_vd_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_vd_32x4d_pretrained.pdparams", + "ResNeXt101_vd_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_vd_64x4d_pretrained.pdparams", + "ResNeXt152_vd_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt152_vd_32x4d_pretrained.pdparams", + "ResNeXt152_vd_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt152_vd_64x4d_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + cardinality, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + groups=cardinality, + stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 2 if cardinality == 32 else num_filters, + filter_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 2 + if cardinality == 32 else num_filters, + filter_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = 
paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class ResNeXt(nn.Layer): + def __init__(self, layers=50, class_num=1000, cardinality=32): + super(ResNeXt, self).__init__() + + self.layers = layers + self.cardinality = cardinality + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + supported_cardinality = [32, 64] + assert cardinality in supported_cardinality, \ + "supported cardinality is {} but input cardinality is {}" \ + .format(supported_cardinality, cardinality) + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, 1024] + num_filters = [128, 256, 512, + 1024] if cardinality == 32 else [256, 512, 1024, 2048] + + self.conv1_1 = ConvBNLayer( + num_channels=3, + num_filters=32, + filter_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + num_channels=32, + num_filters=32, + filter_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + num_channels=32, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv1_3") + + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] if i == 0 else + num_filters[block] * int(64 // self.cardinality), + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=self.cardinality, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def ResNeXt50_vd_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=50, cardinality=32, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt50_vd_32x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt50_vd_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=50, cardinality=64, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt50_vd_64x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt101_vd_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=101, cardinality=32, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt101_vd_32x4d"], + use_ssld=use_ssld) + return model + + +def ResNeXt101_vd_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=101, cardinality=64, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt101_vd_64x4d"], + use_ssld=use_ssld) + return model + + +def ResNeXt152_vd_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=152, cardinality=32, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt152_vd_32x4d"], + use_ssld=use_ssld) + return model + + +def ResNeXt152_vd_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=152, cardinality=64, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt152_vd_64x4d"], + use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/rexnet.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/rexnet.py new file mode 100644 index 0000000..1556a01 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/rexnet.py @@ -0,0 +1,281 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
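The `is_vd_mode` branch in the vd variants above swaps the strided 1x1 shortcut conv for an AvgPool2D followed by a stride-1 1x1 conv, so the projection averages over every input pixel instead of sampling one in four. A shape-only sketch of the two downsampling paths (layer construction here is illustrative, not taken from this file):

    import paddle
    import paddle.nn as nn

    x = paddle.randn([1, 64, 56, 56])
    plain = nn.Conv2D(64, 256, kernel_size=1, stride=2)         # reads 1 of every 4 positions
    vd = nn.Sequential(
        nn.AvgPool2D(kernel_size=2, stride=2, ceil_mode=True),  # average first ...
        nn.Conv2D(64, 256, kernel_size=1, stride=1))            # ... then project
    print(plain(x).shape, vd(x).shape)                          # both [1, 256, 28, 28]
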
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from math import ceil + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "ReXNet_1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ReXNet_1_0_pretrained.pdparams", + "ReXNet_1_3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ReXNet_1_3_pretrained.pdparams", + "ReXNet_1_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ReXNet_1_5_pretrained.pdparams", + "ReXNet_2_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ReXNet_2_0_pretrained.pdparams", + "ReXNet_3_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ReXNet_3_0_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +def conv_bn_act(out, + in_channels, + channels, + kernel=1, + stride=1, + pad=0, + num_group=1, + active=True, + relu6=False): + out.append( + nn.Conv2D( + in_channels, + channels, + kernel, + stride, + pad, + groups=num_group, + bias_attr=False)) + out.append(nn.BatchNorm2D(channels)) + if active: + out.append(nn.ReLU6() if relu6 else nn.ReLU()) + + +def conv_bn_swish(out, + in_channels, + channels, + kernel=1, + stride=1, + pad=0, + num_group=1): + out.append( + nn.Conv2D( + in_channels, + channels, + kernel, + stride, + pad, + groups=num_group, + bias_attr=False)) + out.append(nn.BatchNorm2D(channels)) + out.append(nn.Swish()) + + +class SE(nn.Layer): + def __init__(self, in_channels, channels, se_ratio=12): + super(SE, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2D(1) + self.fc = nn.Sequential( + nn.Conv2D( + in_channels, channels // se_ratio, kernel_size=1, padding=0), + nn.BatchNorm2D(channels // se_ratio), + nn.ReLU(), + nn.Conv2D( + channels // se_ratio, channels, kernel_size=1, padding=0), + nn.Sigmoid()) + + def forward(self, x): + y = self.avg_pool(x) + y = self.fc(y) + return x * y + + +class LinearBottleneck(nn.Layer): + def __init__(self, + in_channels, + channels, + t, + stride, + use_se=True, + se_ratio=12, + **kwargs): + super(LinearBottleneck, self).__init__(**kwargs) + self.use_shortcut = stride == 1 and in_channels <= channels + self.in_channels = in_channels + self.out_channels = channels + + out = [] + if t != 1: + dw_channels = in_channels * t + conv_bn_swish(out, in_channels=in_channels, channels=dw_channels) + else: + dw_channels = in_channels + + conv_bn_act( + out, + in_channels=dw_channels, + channels=dw_channels, + kernel=3, + stride=stride, + pad=1, + num_group=dw_channels, + active=False) + + if use_se: + out.append(SE(dw_channels, dw_channels, se_ratio)) + + out.append(nn.ReLU6()) + conv_bn_act( + out, + in_channels=dw_channels, + channels=channels, + active=False, + relu6=True) + self.out = nn.Sequential(*out) + + def forward(self, x): + out = self.out(x) + if self.use_shortcut: + out[:, 0:self.in_channels] += x + + return out + + +class ReXNetV1(nn.Layer): + def __init__(self, + input_ch=16, + final_ch=180, + width_mult=1.0, + depth_mult=1.0, + class_num=1000, + use_se=True, + se_ratio=12, + dropout_ratio=0.2, + bn_momentum=0.9): + super(ReXNetV1, self).__init__() + + layers = [1, 2, 2, 3, 3, 5] + strides = [1, 2, 2, 2, 1, 2] + use_ses = [False, False, True, True, True, True] + + layers = [ceil(element * depth_mult) for element in layers] + strides = sum([[element] + [1] * (layers[idx] - 1) + for idx, element in enumerate(strides)], 
[]) + if use_se: + use_ses = sum([[element] * layers[idx] + for idx, element in enumerate(use_ses)], []) + else: + use_ses = [False] * sum(layers[:]) + ts = [1] * layers[0] + [6] * sum(layers[1:]) + + self.depth = sum(layers[:]) * 3 + stem_channel = 32 / width_mult if width_mult < 1.0 else 32 + inplanes = input_ch / width_mult if width_mult < 1.0 else input_ch + + features = [] + in_channels_group = [] + channels_group = [] + + # The following channel configuration is a simple instance to make each layer become an expand layer. + for i in range(self.depth // 3): + if i == 0: + in_channels_group.append(int(round(stem_channel * width_mult))) + channels_group.append(int(round(inplanes * width_mult))) + else: + in_channels_group.append(int(round(inplanes * width_mult))) + inplanes += final_ch / (self.depth // 3 * 1.0) + channels_group.append(int(round(inplanes * width_mult))) + + conv_bn_swish( + features, + 3, + int(round(stem_channel * width_mult)), + kernel=3, + stride=2, + pad=1) + + for block_idx, (in_c, c, t, s, se) in enumerate( + zip(in_channels_group, channels_group, ts, strides, use_ses)): + features.append( + LinearBottleneck( + in_channels=in_c, + channels=c, + t=t, + stride=s, + use_se=se, + se_ratio=se_ratio)) + + pen_channels = int(1280 * width_mult) + conv_bn_swish(features, c, pen_channels) + + features.append(nn.AdaptiveAvgPool2D(1)) + self.features = nn.Sequential(*features) + self.output = nn.Sequential( + nn.Dropout(dropout_ratio), + nn.Conv2D( + pen_channels, class_num, 1, bias_attr=True)) + + def forward(self, x): + x = self.features(x) + x = self.output(x).squeeze(axis=-1).squeeze(axis=-1) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ReXNet_1_0(pretrained=False, use_ssld=False, **kwargs): + model = ReXNetV1(width_mult=1.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ReXNet_1_0"], use_ssld=use_ssld) + return model + + +def ReXNet_1_3(pretrained=False, use_ssld=False, **kwargs): + model = ReXNetV1(width_mult=1.3, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ReXNet_1_3"], use_ssld=use_ssld) + return model + + +def ReXNet_1_5(pretrained=False, use_ssld=False, **kwargs): + model = ReXNetV1(width_mult=1.5, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ReXNet_1_5"], use_ssld=use_ssld) + return model + + +def ReXNet_2_0(pretrained=False, use_ssld=False, **kwargs): + model = ReXNetV1(width_mult=2.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ReXNet_2_0"], use_ssld=use_ssld) + return model + + +def ReXNet_3_0(pretrained=False, use_ssld=False, **kwargs): + model = ReXNetV1(width_mult=3.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ReXNet_3_0"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/se_resnet_vd.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/se_resnet_vd.py new file mode 100644 index 0000000..205feec --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/se_resnet_vd.py @@ -0,0 +1,390 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "SE_ResNet18_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNet18_vd_pretrained.pdparams", + "SE_ResNet34_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNet34_vd_pretrained.pdparams", + "SE_ResNet50_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNet50_vd_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + reduction_ratio=16, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None, + name=name + "_branch2c") + self.scale = SELayer( + num_channels=num_filters * 4, + num_filters=num_filters * 4, + reduction_ratio=reduction_ratio, + name='fc_' + name) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + scale = self.scale(conv2) 
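+ # `scale` is conv2 recalibrated channel-wise by the SE branch; it joins the (optionally projected) shortcut below.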
+ + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=scale) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + reduction_ratio=16, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + act=None, + name=name + "_branch2b") + + self.scale = SELayer( + num_channels=num_filters, + num_filters=num_filters, + reduction_ratio=reduction_ratio, + name='fc_' + name) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + scale = self.scale(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=scale) + y = F.relu(y) + return y + + +class SELayer(nn.Layer): + def __init__(self, num_channels, num_filters, reduction_ratio, name=None): + super(SELayer, self).__init__() + + self.pool2d_gap = AdaptiveAvgPool2D(1) + + self._num_channels = num_channels + + med_ch = int(num_channels / reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.squeeze = Linear( + num_channels, + med_ch, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_sqz_weights"), + bias_attr=ParamAttr(name=name + '_sqz_offset')) + + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = Linear( + med_ch, + num_filters, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_exc_weights"), + bias_attr=ParamAttr(name=name + '_exc_offset')) + + def forward(self, input): + pool = self.pool2d_gap(input) + pool = paddle.squeeze(pool, axis=[2, 3]) + squeeze = self.squeeze(pool) + squeeze = F.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = F.sigmoid(excitation) + excitation = paddle.unsqueeze(excitation, axis=[2, 3]) + out = input * excitation + return out + + +class SE_ResNet_vd(nn.Layer): + def __init__(self, layers=50, class_num=1000): + super(SE_ResNet_vd, self).__init__() + + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, + 1024] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv1_1 = ConvBNLayer( + num_channels=3, + num_filters=32, + filter_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + num_channels=32, + num_filters=32, + filter_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + num_channels=32, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv1_3") + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if layers >= 50: + for block in 
range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block], + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + self.block_list.append(basic_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc6_weights"), + bias_attr=ParamAttr(name="fc6_offset")) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def SE_ResNet18_vd(pretrained=False, use_ssld=False, **kwargs): + model = SE_ResNet_vd(layers=18, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SE_ResNet18_vd"], use_ssld=use_ssld) + return model + + +def SE_ResNet34_vd(pretrained=False, use_ssld=False, **kwargs): + model = SE_ResNet_vd(layers=34, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SE_ResNet34_vd"], use_ssld=use_ssld) + return model + + +def SE_ResNet50_vd(pretrained=False, use_ssld=False, **kwargs): + model = SE_ResNet_vd(layers=50, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SE_ResNet50_vd"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/se_resnext.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/se_resnext.py new file mode 100644 index 0000000..8b7149e --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/se_resnext.py @@ -0,0 +1,364 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "SE_ResNeXt50_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNeXt50_32x4d_pretrained.pdparams", + "SE_ResNeXt101_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNeXt101_32x4d_pretrained.pdparams", + "SE_ResNeXt152_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNeXt152_64x4d_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None, + data_format='NCHW'): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False, + data_format=data_format) + bn_name = name + '_bn' + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance', + data_layout=data_format) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + cardinality, + reduction_ratio, + shortcut=True, + if_first=False, + name=None, + data_format="NCHW"): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name='conv' + name + '_x1', + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + groups=cardinality, + stride=stride, + act='relu', + name='conv' + name + '_x2', + data_format=data_format) + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 2 if cardinality == 32 else num_filters, + filter_size=1, + act=None, + name='conv' + name + '_x3', + data_format=data_format) + self.scale = SELayer( + num_channels=num_filters * 2 if cardinality == 32 else num_filters, + num_filters=num_filters * 2 if cardinality == 32 else num_filters, + reduction_ratio=reduction_ratio, + name='fc' + name, + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 2 + if cardinality == 32 else num_filters, + filter_size=1, + stride=stride, + name='conv' + name + '_prj', + data_format=data_format) 
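+        # `shortcut=True` means the input and output shapes already match and
+        # the identity can be added directly; otherwise `self.short` projects
+        # the input with a strided 1x1 conv so the residual add is well-defined.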
+ + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + scale = self.scale(conv2) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=scale) + y = F.relu(y) + return y + + +class SELayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + reduction_ratio, + name=None, + data_format="NCHW"): + super(SELayer, self).__init__() + + self.data_format = data_format + self.pool2d_gap = AdaptiveAvgPool2D(1, data_format=self.data_format) + + self._num_channels = num_channels + + med_ch = int(num_channels / reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.squeeze = Linear( + num_channels, + med_ch, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_sqz_weights"), + bias_attr=ParamAttr(name=name + '_sqz_offset')) + self.relu = nn.ReLU() + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = Linear( + med_ch, + num_filters, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_exc_weights"), + bias_attr=ParamAttr(name=name + '_exc_offset')) + self.sigmoid = nn.Sigmoid() + + def forward(self, input): + pool = self.pool2d_gap(input) + if self.data_format == "NHWC": + pool = paddle.squeeze(pool, axis=[1, 2]) + else: + pool = paddle.squeeze(pool, axis=[2, 3]) + squeeze = self.squeeze(pool) + squeeze = self.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = self.sigmoid(excitation) + if self.data_format == "NHWC": + excitation = paddle.unsqueeze(excitation, axis=[1, 2]) + else: + excitation = paddle.unsqueeze(excitation, axis=[2, 3]) + out = input * excitation + return out + + +class ResNeXt(nn.Layer): + def __init__(self, + layers=50, + class_num=1000, + cardinality=32, + input_image_channel=3, + data_format="NCHW"): + super(ResNeXt, self).__init__() + + self.layers = layers + self.cardinality = cardinality + self.reduction_ratio = 16 + self.data_format = data_format + self.input_image_channel = input_image_channel + + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + supported_cardinality = [32, 64] + assert cardinality in supported_cardinality, \ + "supported cardinality is {} but input cardinality is {}" \ + .format(supported_cardinality, cardinality) + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, 1024] + num_filters = [128, 256, 512, + 1024] if cardinality == 32 else [256, 512, 1024, 2048] + if layers < 152: + self.conv = ConvBNLayer( + num_channels=self.input_image_channel, + num_filters=64, + filter_size=7, + stride=2, + act='relu', + name="conv1", + data_format=self.data_format) + else: + self.conv1_1 = ConvBNLayer( + num_channels=self.input_image_channel, + num_filters=64, + filter_size=3, + stride=2, + act='relu', + name="conv1", + data_format=self.data_format) + self.conv1_2 = ConvBNLayer( + num_channels=64, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv2", + data_format=self.data_format) + self.conv1_3 = ConvBNLayer( + num_channels=64, + num_filters=128, + filter_size=3, + stride=1, + act='relu', + name="conv3", + data_format=self.data_format) + + self.pool2d_max = MaxPool2D( + kernel_size=3, stride=2, padding=1, data_format=self.data_format) + + self.block_list = [] + n = 1 if layers == 50 or layers == 101 else 
3 + for block in range(len(depth)): + n += 1 + shortcut = False + for i in range(depth[block]): + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] if i == 0 else + num_filters[block] * int(64 // self.cardinality), + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=self.cardinality, + reduction_ratio=self.reduction_ratio, + shortcut=shortcut, + if_first=block == 0, + name=str(n) + '_' + str(i + 1), + data_format=self.data_format)) + self.block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1, data_format=self.data_format) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc6_weights"), + bias_attr=ParamAttr(name="fc6_offset")) + + def forward(self, inputs): + with paddle.static.amp.fp16_guard(): + if self.data_format == "NHWC": + inputs = paddle.tensor.transpose(inputs, [0, 2, 3, 1]) + inputs.stop_gradient = True + if self.layers < 152: + y = self.conv(inputs) + else: + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for i, block in enumerate(self.block_list): + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def SE_ResNeXt50_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=50, cardinality=32, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SE_ResNeXt50_32x4d"], use_ssld=use_ssld) + return model + + +def SE_ResNeXt101_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=101, cardinality=32, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SE_ResNeXt101_32x4d"], + use_ssld=use_ssld) + return model + + +def SE_ResNeXt152_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=152, cardinality=64, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SE_ResNeXt152_64x4d"], + use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/se_resnext_vd.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/se_resnext_vd.py new file mode 100644 index 0000000..ef63025 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/se_resnext_vd.py @@ -0,0 +1,309 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
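+# A minimal usage sketch (illustrative only; the 224x224 input below is the
+# conventional ImageNet resolution, an assumption rather than a constraint
+# enforced by this file):
+#
+#     import paddle
+#     model = SE_ResNeXt50_vd_32x4d(pretrained=False, class_num=1000)
+#     x = paddle.rand([1, 3, 224, 224])
+#     logits = model(x)  # expected shape: [1, 1000]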
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "SE_ResNeXt50_vd_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNeXt50_vd_32x4d_pretrained.pdparams", + "SENet154_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SENet154_vd_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + bn_name = name + '_bn' + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + cardinality, + reduction_ratio, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name='conv' + name + '_x1') + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + groups=cardinality, + stride=stride, + act='relu', + name='conv' + name + '_x2') + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 2 if cardinality == 32 else num_filters, + filter_size=1, + act=None, + name='conv' + name + '_x3') + self.scale = SELayer( + num_channels=num_filters * 2 if cardinality == 32 else num_filters, + num_filters=num_filters * 2 if cardinality == 32 else num_filters, + reduction_ratio=reduction_ratio, + name='fc' + name) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 2 + if cardinality == 32 else num_filters, + filter_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name='conv' + name + '_prj') + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + scale = self.scale(conv2) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=scale) + y = F.relu(y) + return y + + +class SELayer(nn.Layer): + def __init__(self, num_channels, num_filters, reduction_ratio, name=None): + super(SELayer, self).__init__() + + self.pool2d_gap = AdaptiveAvgPool2D(1) + + self._num_channels = num_channels + + med_ch = int(num_channels 
/ reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.squeeze = Linear( + num_channels, + med_ch, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_sqz_weights"), + bias_attr=ParamAttr(name=name + '_sqz_offset')) + self.relu = nn.ReLU() + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = Linear( + med_ch, + num_filters, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_exc_weights"), + bias_attr=ParamAttr(name=name + '_exc_offset')) + self.sigmoid = nn.Sigmoid() + + def forward(self, input): + pool = self.pool2d_gap(input) + pool = paddle.squeeze(pool, axis=[2, 3]) + squeeze = self.squeeze(pool) + squeeze = self.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = self.sigmoid(excitation) + excitation = paddle.unsqueeze(excitation, axis=[2, 3]) + out = paddle.multiply(input, excitation) + return out + + +class ResNeXt(nn.Layer): + def __init__(self, layers=50, class_num=1000, cardinality=32): + super(ResNeXt, self).__init__() + + self.layers = layers + self.cardinality = cardinality + self.reduction_ratio = 16 + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + supported_cardinality = [32, 64] + assert cardinality in supported_cardinality, \ + "supported cardinality is {} but input cardinality is {}" \ + .format(supported_cardinality, cardinality) + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [128, 256, 512, 1024] + num_filters = [128, 256, 512, + 1024] if cardinality == 32 else [256, 512, 1024, 2048] + + self.conv1_1 = ConvBNLayer( + num_channels=3, + num_filters=64, + filter_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + num_channels=64, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + num_channels=64, + num_filters=128, + filter_size=3, + stride=1, + act='relu', + name="conv1_3") + + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + n = 1 if layers == 50 or layers == 101 else 3 + for block in range(len(depth)): + n += 1 + shortcut = False + for i in range(depth[block]): + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] if i == 0 else + num_filters[block] * int(64 // self.cardinality), + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=self.cardinality, + reduction_ratio=self.reduction_ratio, + shortcut=shortcut, + if_first=block == 0, + name=str(n) + '_' + str(i + 1))) + self.block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc6_weights"), + bias_attr=ParamAttr(name="fc6_offset")) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is 
False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def SE_ResNeXt50_vd_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=50, cardinality=32, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SE_ResNeXt50_vd_32x4d"], + use_ssld=use_ssld) + return model + + +def SENet154_vd(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=152, cardinality=64, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SENet154_vd"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/shufflenet_v2.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/shufflenet_v2.py new file mode 100644 index 0000000..d8bb69f --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/shufflenet_v2.py @@ -0,0 +1,362 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr, reshape, transpose, concat, split +from paddle.nn import Layer, Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm, Linear +from paddle.nn.initializer import KaimingNormal +from paddle.nn.functional import swish + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "ShuffleNetV2_x0_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x0_25_pretrained.pdparams", + "ShuffleNetV2_x0_33": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x0_33_pretrained.pdparams", + "ShuffleNetV2_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x0_5_pretrained.pdparams", + "ShuffleNetV2_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x1_0_pretrained.pdparams", + "ShuffleNetV2_x1_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x1_5_pretrained.pdparams", + "ShuffleNetV2_x2_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x2_0_pretrained.pdparams", + "ShuffleNetV2_swish": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_swish_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +def channel_shuffle(x, groups): + batch_size, num_channels, height, width = x.shape[0:4] + channels_per_group = num_channels // groups + + # reshape + x = reshape( + x=x, shape=[batch_size, groups, channels_per_group, height, width]) + + # transpose + x = transpose(x=x, perm=[0, 2, 1, 3, 4]) + + # flatten + x = reshape(x=x, shape=[batch_size, num_channels, height, width]) + return x + + +class ConvBNLayer(Layer): + def __init__( + self, + in_channels, + out_channels, + 
kernel_size, + stride, + padding, + groups=1, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr( + initializer=KaimingNormal(), name=name + "_weights"), + bias_attr=False) + + self._batch_norm = BatchNorm( + out_channels, + param_attr=ParamAttr(name=name + "_bn_scale"), + bias_attr=ParamAttr(name=name + "_bn_offset"), + act=act, + moving_mean_name=name + "_bn_mean", + moving_variance_name=name + "_bn_variance") + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class InvertedResidual(Layer): + def __init__(self, + in_channels, + out_channels, + stride, + act="relu", + name=None): + super(InvertedResidual, self).__init__() + self._conv_pw = ConvBNLayer( + in_channels=in_channels // 2, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv1') + self._conv_dw = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=3, + stride=stride, + padding=1, + groups=out_channels // 2, + act=None, + name='stage_' + name + '_conv2') + self._conv_linear = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv3') + + def forward(self, inputs): + x1, x2 = split( + inputs, + num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2], + axis=1) + x2 = self._conv_pw(x2) + x2 = self._conv_dw(x2) + x2 = self._conv_linear(x2) + out = concat([x1, x2], axis=1) + return channel_shuffle(out, 2) + + +class InvertedResidualDS(Layer): + def __init__(self, + in_channels, + out_channels, + stride, + act="relu", + name=None): + super(InvertedResidualDS, self).__init__() + + # branch1 + self._conv_dw_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=stride, + padding=1, + groups=in_channels, + act=None, + name='stage_' + name + '_conv4') + self._conv_linear_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv5') + # branch2 + self._conv_pw_2 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv1') + self._conv_dw_2 = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=3, + stride=stride, + padding=1, + groups=out_channels // 2, + act=None, + name='stage_' + name + '_conv2') + self._conv_linear_2 = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv3') + + def forward(self, inputs): + x1 = self._conv_dw_1(inputs) + x1 = self._conv_linear_1(x1) + x2 = self._conv_pw_2(inputs) + x2 = self._conv_dw_2(x2) + x2 = self._conv_linear_2(x2) + out = concat([x1, x2], axis=1) + + return channel_shuffle(out, 2) + + +class ShuffleNet(Layer): + def __init__(self, class_num=1000, scale=1.0, act="relu"): + super(ShuffleNet, self).__init__() + self.scale = scale + self.class_num = class_num + stage_repeats = [4, 8, 4] + + if scale == 0.25: + stage_out_channels = [-1, 24, 24, 48, 96, 512] + elif scale == 0.33: + 
stage_out_channels = [-1, 24, 32, 64, 128, 512] + elif scale == 0.5: + stage_out_channels = [-1, 24, 48, 96, 192, 1024] + elif scale == 1.0: + stage_out_channels = [-1, 24, 116, 232, 464, 1024] + elif scale == 1.5: + stage_out_channels = [-1, 24, 176, 352, 704, 1024] + elif scale == 2.0: + stage_out_channels = [-1, 24, 224, 488, 976, 2048] + else: + raise NotImplementedError("This scale size:[" + str(scale) + + "] is not implemented!") + # 1. conv1 + self._conv1 = ConvBNLayer( + in_channels=3, + out_channels=stage_out_channels[1], + kernel_size=3, + stride=2, + padding=1, + act=act, + name='stage1_conv') + self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1) + + # 2. bottleneck sequences + self._block_list = [] + for stage_id, num_repeat in enumerate(stage_repeats): + for i in range(num_repeat): + if i == 0: + block = self.add_sublayer( + name=str(stage_id + 2) + '_' + str(i + 1), + sublayer=InvertedResidualDS( + in_channels=stage_out_channels[stage_id + 1], + out_channels=stage_out_channels[stage_id + 2], + stride=2, + act=act, + name=str(stage_id + 2) + '_' + str(i + 1))) + else: + block = self.add_sublayer( + name=str(stage_id + 2) + '_' + str(i + 1), + sublayer=InvertedResidual( + in_channels=stage_out_channels[stage_id + 2], + out_channels=stage_out_channels[stage_id + 2], + stride=1, + act=act, + name=str(stage_id + 2) + '_' + str(i + 1))) + self._block_list.append(block) + # 3. last_conv + self._last_conv = ConvBNLayer( + in_channels=stage_out_channels[-2], + out_channels=stage_out_channels[-1], + kernel_size=1, + stride=1, + padding=0, + act=act, + name='conv5') + # 4. pool + self._pool2d_avg = AdaptiveAvgPool2D(1) + self._out_c = stage_out_channels[-1] + # 5. fc + self._fc = Linear( + stage_out_channels[-1], + class_num, + weight_attr=ParamAttr(name='fc6_weights'), + bias_attr=ParamAttr(name='fc6_offset')) + + def forward(self, inputs): + y = self._conv1(inputs) + y = self._max_pool(y) + for inv in self._block_list: + y = inv(y) + y = self._last_conv(y) + y = self._pool2d_avg(y) + y = paddle.flatten(y, start_axis=1, stop_axis=-1) + y = self._fc(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def ShuffleNetV2_x0_25(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=0.25, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_x0_25"], use_ssld=use_ssld) + return model + + +def ShuffleNetV2_x0_33(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=0.33, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_x0_33"], use_ssld=use_ssld) + return model + + +def ShuffleNetV2_x0_5(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=0.5, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_x0_5"], use_ssld=use_ssld) + return model + + +def ShuffleNetV2_x1_0(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=1.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_x1_0"], use_ssld=use_ssld) + return model + + +def ShuffleNetV2_x1_5(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=1.5, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_x1_5"], use_ssld=use_ssld) + return model + + +def ShuffleNetV2_x2_0(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=2.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_x2_0"], use_ssld=use_ssld) + return model + + +def ShuffleNetV2_swish(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=1.0, act="swish", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_swish"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/squeezenet.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/squeezenet.py new file mode 100644 index 0000000..647cd2e --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/squeezenet.py @@ -0,0 +1,194 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
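+# A minimal usage sketch (illustrative only; input size and batch size are
+# assumptions, not requirements enforced by this file):
+#
+#     import paddle
+#     model = SqueezeNet1_1(pretrained=False, class_num=1000)
+#     x = paddle.rand([4, 3, 224, 224])
+#     logits = model(x)  # expected shape: [4, 1000]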
+ +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "SqueezeNet1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SqueezeNet1_0_pretrained.pdparams", + "SqueezeNet1_1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SqueezeNet1_1_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class MakeFireConv(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter_size, + padding=0, + name=None): + super(MakeFireConv, self).__init__() + self._conv = Conv2D( + input_channels, + output_channels, + filter_size, + padding=padding, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=ParamAttr(name=name + "_offset")) + + def forward(self, x): + x = self._conv(x) + x = F.relu(x) + return x + + +class MakeFire(nn.Layer): + def __init__(self, + input_channels, + squeeze_channels, + expand1x1_channels, + expand3x3_channels, + name=None): + super(MakeFire, self).__init__() + self._conv = MakeFireConv( + input_channels, squeeze_channels, 1, name=name + "_squeeze1x1") + self._conv_path1 = MakeFireConv( + squeeze_channels, expand1x1_channels, 1, name=name + "_expand1x1") + self._conv_path2 = MakeFireConv( + squeeze_channels, + expand3x3_channels, + 3, + padding=1, + name=name + "_expand3x3") + + def forward(self, inputs): + x = self._conv(inputs) + x1 = self._conv_path1(x) + x2 = self._conv_path2(x) + return paddle.concat([x1, x2], axis=1) + + +class SqueezeNet(nn.Layer): + def __init__(self, version, class_num=1000): + super(SqueezeNet, self).__init__() + self.version = version + + if self.version == "1.0": + self._conv = Conv2D( + 3, + 96, + 7, + stride=2, + weight_attr=ParamAttr(name="conv1_weights"), + bias_attr=ParamAttr(name="conv1_offset")) + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + self._conv1 = MakeFire(96, 16, 64, 64, name="fire2") + self._conv2 = MakeFire(128, 16, 64, 64, name="fire3") + self._conv3 = MakeFire(128, 32, 128, 128, name="fire4") + + self._conv4 = MakeFire(256, 32, 128, 128, name="fire5") + self._conv5 = MakeFire(256, 48, 192, 192, name="fire6") + self._conv6 = MakeFire(384, 48, 192, 192, name="fire7") + self._conv7 = MakeFire(384, 64, 256, 256, name="fire8") + + self._conv8 = MakeFire(512, 64, 256, 256, name="fire9") + else: + self._conv = Conv2D( + 3, + 64, + 3, + stride=2, + padding=1, + weight_attr=ParamAttr(name="conv1_weights"), + bias_attr=ParamAttr(name="conv1_offset")) + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + self._conv1 = MakeFire(64, 16, 64, 64, name="fire2") + self._conv2 = MakeFire(128, 16, 64, 64, name="fire3") + + self._conv3 = MakeFire(128, 32, 128, 128, name="fire4") + self._conv4 = MakeFire(256, 32, 128, 128, name="fire5") + + self._conv5 = MakeFire(256, 48, 192, 192, name="fire6") + self._conv6 = MakeFire(384, 48, 192, 192, name="fire7") + self._conv7 = MakeFire(384, 64, 256, 256, name="fire8") + self._conv8 = MakeFire(512, 64, 256, 256, name="fire9") + + self._drop = Dropout(p=0.5, mode="downscale_in_infer") + self._conv9 = Conv2D( + 512, + class_num, + 1, + weight_attr=ParamAttr(name="conv10_weights"), + bias_attr=ParamAttr(name="conv10_offset")) + self._avg_pool = AdaptiveAvgPool2D(1) + + def forward(self, inputs): + x = self._conv(inputs) + x = F.relu(x) + x = 
self._pool(x) + if self.version == "1.0": + x = self._conv1(x) + x = self._conv2(x) + x = self._conv3(x) + x = self._pool(x) + x = self._conv4(x) + x = self._conv5(x) + x = self._conv6(x) + x = self._conv7(x) + x = self._pool(x) + x = self._conv8(x) + else: + x = self._conv1(x) + x = self._conv2(x) + x = self._pool(x) + x = self._conv3(x) + x = self._conv4(x) + x = self._pool(x) + x = self._conv5(x) + x = self._conv6(x) + x = self._conv7(x) + x = self._conv8(x) + x = self._drop(x) + x = self._conv9(x) + x = F.relu(x) + x = self._avg_pool(x) + x = paddle.squeeze(x, axis=[2, 3]) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def SqueezeNet1_0(pretrained=False, use_ssld=False, **kwargs): + model = SqueezeNet(version="1.0", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SqueezeNet1_0"], use_ssld=use_ssld) + return model + + +def SqueezeNet1_1(pretrained=False, use_ssld=False, **kwargs): + model = SqueezeNet(version="1.1", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SqueezeNet1_1"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/swin_transformer.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/swin_transformer.py new file mode 100644 index 0000000..c783ec6 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/swin_transformer.py @@ -0,0 +1,857 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
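+# Shape sanity sketch for the windowing helpers defined below (illustrative
+# values only): window_partition and window_reverse are inverse operations.
+#
+#     import paddle
+#     x = paddle.rand([2, 56, 56, 96])         # (B, H, W, C)
+#     wins = window_partition(x, 7)            # (2*8*8, 7, 7, 96)
+#     y = window_reverse(wins, 7, 56, 56, 96)  # restores (2, 56, 56, 96)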
+ +# Code was based on https://github.com/microsoft/Swin-Transformer + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import TruncatedNormal, Constant + +from .vision_transformer import trunc_normal_, zeros_, ones_, to_2tuple, DropPath, Identity + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "SwinTransformer_tiny_patch4_window7_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformer_tiny_patch4_window7_224_pretrained.pdparams", + "SwinTransformer_small_patch4_window7_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformer_small_patch4_window7_224_pretrained.pdparams", + "SwinTransformer_base_patch4_window7_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformer_base_patch4_window7_224_pretrained.pdparams", + "SwinTransformer_base_patch4_window12_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformer_base_patch4_window12_384_pretrained.pdparams", + "SwinTransformer_large_patch4_window7_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformer_large_patch4_window7_224_22kto1k_pretrained.pdparams", + "SwinTransformer_large_patch4_window12_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformer_large_patch4_window12_384_22kto1k_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.reshape( + [B, H // window_size, window_size, W // window_size, window_size, C]) + windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape( + [-1, window_size, window_size, C]) + return windows + + +def window_reverse(windows, window_size, H, W, C): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, H, W, C) + """ + x = windows.reshape( + [-1, H // window_size, W // window_size, window_size, window_size, C]) + x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C]) + return x + + +class WindowAttention(nn.Layer): + r""" Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__(self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + # 2*Wh-1 * 2*Ww-1, nH + self.relative_position_bias_table = self.create_parameter( + shape=((2 * window_size[0] - 1) * (2 * window_size[1] - 1), + num_heads), + default_initializer=zeros_) + self.add_parameter("relative_position_bias_table", + self.relative_position_bias_table) + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(self.window_size[0]) + coords_w = paddle.arange(self.window_size[1]) + coords = paddle.stack(paddle.meshgrid( + [coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + + coords_flatten_1 = coords_flatten.unsqueeze(axis=2) + coords_flatten_2 = coords_flatten.unsqueeze(axis=1) + relative_coords = coords_flatten_1 - coords_flatten_2 + + relative_coords = relative_coords.transpose( + [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[ + 0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", + relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table) + self.softmax = nn.Softmax(axis=-1) + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape( + [B_, N, 3, self.num_heads, C // self.num_heads]).transpose( + [2, 0, 3, 1, 4]) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + attn = paddle.mm(q, k.transpose([0, 1, 3, 2])) + + index = self.relative_position_index.reshape([-1]) + + relative_position_bias = paddle.index_select( + self.relative_position_bias_table, index) + relative_position_bias = relative_position_bias.reshape([ + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], -1 + ]) # Wh*Ww,Wh*Ww,nH + + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.reshape([B_ // nW, nW, self.num_heads, N, N + ]) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.reshape([-1, self.num_heads, N, N]) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + # x = (attn @ v).transpose(1, 2).reshape([B_, N, C]) + x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([B_, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def extra_repr(self): + return "dim={}, window_size={}, num_heads={}".format( + self.dim, self.window_size, self.num_heads) + + def flops(self, N): + # calculate flops for 1 window with token length of N + flops = 0 + # qkv = self.qkv(x) + flops += N * self.dim * 3 * self.dim + # attn = (q @ k.transpose(-2, -1)) + flops += self.num_heads * N * (self.dim // 
self.num_heads) * N + # x = (attn @ v) + flops += self.num_heads * N * N * (self.dim // self.num_heads) + # x = self.proj(x) + flops += N * self.dim * self.dim + return flops + + +class SwinTransformerBlock(nn.Layer): + r""" Swin Transformer Block. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, + dim, + input_resolution, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + if min(self.input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(self.input_resolution) + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + if self.shift_size > 0: + # calculate attention mask for SW-MSA + H, W = self.input_resolution + img_mask = paddle.zeros((1, H, W, 1)) # 1 H W 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.reshape( + [-1, self.window_size * self.window_size]) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + + huns = -100.0 * paddle.ones_like(attn_mask) + attn_mask = huns * (attn_mask != 0).astype("float32") + else: + attn_mask = None + + self.register_buffer("attn_mask", attn_mask) + + def forward(self, x): + H, W = self.input_resolution + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.reshape([B, H, W, C]) + + # cyclic shift + if self.shift_size > 0: + shifted_x = paddle.roll( + x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2)) + else: + shifted_x = x + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.reshape( + [-1, self.window_size * self.window_size, + C]) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn( + x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.reshape( + [-1, self.window_size, self.window_size, C]) + shifted_x = window_reverse(attn_windows, self.window_size, H, W, + C) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = paddle.roll( + shifted_x, + shifts=(self.shift_size, self.shift_size), + axis=(1, 2)) + else: + x = shifted_x + x = x.reshape([B, H * W, C]) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + def extra_repr(self): + return "dim={}, input_resolution={}, num_heads={}, window_size={}, shift_size={}, mlp_ratio={}".format( + self.dim, self.input_resolution, self.num_heads, self.window_size, + self.shift_size, self.mlp_ratio) + + def flops(self): + flops = 0 + H, W = self.input_resolution + # norm1 + flops += self.dim * H * W + # W-MSA/SW-MSA + nW = H * W / self.window_size / self.window_size + flops += nW * self.attn.flops(self.window_size * self.window_size) + # mlp + flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio + # norm2 + flops += self.dim * H * W + return flops + + +class PatchMerging(nn.Layer): + r""" Patch Merging Layer. + + Args: + input_resolution (tuple[int]): Resolution of input feature. + dim (int): Number of input channels. + norm_layer (nn.Layer, optional): Normalization layer. 
Default: nn.LayerNorm + """ + + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x): + """ + x: B, H*W, C + """ + H, W = self.input_resolution + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + assert H % 2 == 0 and W % 2 == 0, "x size ({}*{}) are not even.".format( + H, W) + + x = x.reshape([B, H, W, C]) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = paddle.concat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.reshape([B, H * W // 4, 4 * C]) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + def extra_repr(self): + return "input_resolution={}, dim={}".format(self.input_resolution, + self.dim) + + def flops(self): + H, W = self.input_resolution + flops = H * W * self.dim + flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim + return flops + + +class BasicLayer(nn.Layer): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
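+
+        Note:
+            Blocks in this stage alternate between regular window attention
+            (shift_size=0 at even block indices) and shifted window attention
+            (shift_size=window_size // 2 at odd indices).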
+ """ + + def __init__(self, + dim, + input_resolution, + depth, + num_heads, + window_size, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False): + + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.LayerList([ + SwinTransformerBlock( + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer) for i in range(depth) + ]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample( + input_resolution, dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x): + for blk in self.blocks: + x = blk(x) + if self.downsample is not None: + x = self.downsample(x) + return x + + def extra_repr(self): + return "dim={}, input_resolution={}, depth={}".format( + self.dim, self.input_resolution, self.depth) + + def flops(self): + flops = 0 + for blk in self.blocks: + flops += blk.flops() + if self.downsample is not None: + flops += self.downsample.flops() + return flops + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + + Args: + img_size (int): Image size. Default: 224. + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Layer, optional): Normalization layer. Default: None + """ + + def __init__(self, + img_size=224, + patch_size=4, + in_chans=3, + embed_dim=96, + norm_layer=None): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + img_size[0] // patch_size[0], img_size[1] // patch_size[1] + ] + self.img_size = img_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + B, C, H, W = x.shape + # TODO (littletomatodonkey), uncomment the line will cause failure of jit.save + # assert [H, W] == self.img_size[:2], "Input image size ({H}*{W}) doesn't match model ({}*{}).".format(H, W, self.img_size[0], self.img_size[1]) + x = self.proj(x) + + x = x.flatten(2).transpose([0, 2, 1]) # B Ph*Pw C + if self.norm is not None: + x = self.norm(x) + return x + + def flops(self): + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * ( + self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops + + +class SwinTransformer(nn.Layer): + """ Swin Transformer + A PaddlePaddle impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + + Args: + img_size (int | tuple(int)): Input image size. Default 224 + patch_size (int | tuple(int)): Patch size. 
Default: 4 + in_chans (int): Number of input image channels. Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + embed_dim (int): Patch embedding dimension. Default: 96 + depths (tuple(int)): Depth of each Swin Transformer layer. + num_heads (tuple(int)): Number of attention heads in different layers. + window_size (int): Window size. Default: 7 + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None + drop_rate (float): Dropout rate. Default: 0 + attn_drop_rate (float): Attention dropout rate. Default: 0 + drop_path_rate (float): Stochastic depth rate. Default: 0.1 + norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False + patch_norm (bool): If True, add normalization after patch embedding. Default: True + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False + """ + + def __init__(self, + img_size=224, + patch_size=4, + in_chans=3, + class_num=1000, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + use_checkpoint=False, + **kwargs): + super(SwinTransformer, self).__init__() + + self.num_classes = num_classes = class_num + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.num_features = int(embed_dim * 2**(self.num_layers - 1)) + self.mlp_ratio = mlp_ratio + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + + # absolute position embedding + if self.ape: + self.absolute_pos_embed = self.create_parameter( + shape=(1, num_patches, embed_dim), default_initializer=zeros_) + self.add_parameter("absolute_pos_embed", self.absolute_pos_embed) + trunc_normal_(self.absolute_pos_embed) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = np.linspace(0, drop_path_rate, + sum(depths)).tolist() # stochastic depth decay rule + + # build layers + self.layers = nn.LayerList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2**i_layer), + input_resolution=(patches_resolution[0] // (2**i_layer), + patches_resolution[1] // (2**i_layer)), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging + if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint) + self.layers.append(layer) + + self.norm = norm_layer(self.num_features) + self.avgpool = nn.AdaptiveAvgPool1D(1) + self.head = nn.Linear( + self.num_features, + num_classes) if self.num_classes > 0 else nn.Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + 
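+        # Standard ViT/Swin weight init: truncated normal (std=0.02) for
+        # Linear weights, zeros for Linear and LayerNorm biases, ones for the
+        # LayerNorm scale; trunc_normal_, zeros_ and ones_ are the shared
+        # initializer instances used throughout this file.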
if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            zeros_(m.bias)
+            ones_(m.weight)
+
+    def forward_features(self, x):
+        x = self.patch_embed(x)
+        if self.ape:
+            x = x + self.absolute_pos_embed
+        x = self.pos_drop(x)
+
+        for layer in self.layers:
+            x = layer(x)
+
+        x = self.norm(x)  # B L C
+        x = self.avgpool(x.transpose([0, 2, 1]))  # B C 1
+        x = paddle.flatten(x, 1)
+        return x
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.head(x)
+        return x
+
+    def flops(self):
+        flops = 0
+        flops += self.patch_embed.flops()
+        for _, layer in enumerate(self.layers):
+            flops += layer.flops()
+        flops += self.num_features * self.patches_resolution[
+            0] * self.patches_resolution[1] // (2**self.num_layers)
+        flops += self.num_features * self.num_classes
+        return flops
+
+
+def _load_pretrained(pretrained, model, model_url, use_ssld=False):
+    if pretrained is False:
+        pass
+    elif pretrained is True:
+        load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld)
+    elif isinstance(pretrained, str):
+        load_dygraph_pretrain(model, pretrained)
+    else:
+        raise RuntimeError(
+            "pretrained type is not available. Please use `string` or `boolean` type."
+        )
+
+
+def SwinTransformer_tiny_patch4_window7_224(pretrained=False,
+                                            use_ssld=False,
+                                            **kwargs):
+    model = SwinTransformer(
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        drop_path_rate=0.2,
+        **kwargs)
+    _load_pretrained(
+        pretrained,
+        model,
+        MODEL_URLS["SwinTransformer_tiny_patch4_window7_224"],
+        use_ssld=use_ssld)
+    return model
+
+
+def SwinTransformer_small_patch4_window7_224(pretrained=False,
+                                             use_ssld=False,
+                                             **kwargs):
+    model = SwinTransformer(
+        embed_dim=96,
+        depths=[2, 2, 18, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        **kwargs)
+    _load_pretrained(
+        pretrained,
+        model,
+        MODEL_URLS["SwinTransformer_small_patch4_window7_224"],
+        use_ssld=use_ssld)
+    return model
+
+
+def SwinTransformer_base_patch4_window7_224(pretrained=False,
+                                            use_ssld=False,
+                                            **kwargs):
+    model = SwinTransformer(
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=7,
+        drop_path_rate=0.5,
+        **kwargs)
+    _load_pretrained(
+        pretrained,
+        model,
+        MODEL_URLS["SwinTransformer_base_patch4_window7_224"],
+        use_ssld=use_ssld)
+    return model
+
+
+def SwinTransformer_base_patch4_window12_384(pretrained=False,
+                                             use_ssld=False,
+                                             **kwargs):
+    model = SwinTransformer(
+        img_size=384,
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        drop_path_rate=0.5,  # NOTE: does not appear in the official code
+        **kwargs)
+    _load_pretrained(
+        pretrained,
+        model,
+        MODEL_URLS["SwinTransformer_base_patch4_window12_384"],
+        use_ssld=use_ssld)
+    return model
+
+
+def SwinTransformer_large_patch4_window7_224(pretrained=False,
+                                             use_ssld=False,
+                                             **kwargs):
+    model = SwinTransformer(
+        embed_dim=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=7,
+        **kwargs)
+    _load_pretrained(
+        pretrained,
+        model,
+        MODEL_URLS["SwinTransformer_large_patch4_window7_224"],
+        use_ssld=use_ssld)
+    return model
+
+
+def SwinTransformer_large_patch4_window12_384(pretrained=False,
+                                              use_ssld=False,
+                                              **kwargs):
+    model = SwinTransformer(
+        img_size=384,
+        embed_dim=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=12,
+        **kwargs)
+    _load_pretrained(
+        pretrained,
+        model,
+        MODEL_URLS["SwinTransformer_large_patch4_window12_384"],
use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/tnt.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/tnt.py new file mode 100644 index 0000000..dcffcf4 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/tnt.py @@ -0,0 +1,386 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/huawei-noah/CV-Backbones/tree/master/tnt_pytorch + +import math +import numpy as np + +import paddle +import paddle.nn as nn + +from paddle.nn.initializer import TruncatedNormal, Constant + +from ppcls.arch.backbone.base.theseus_layer import Identity +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "TNT_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/TNT_small_pretrained.pdparams" +} + +__all__ = MODEL_URLS.keys() + +trunc_normal_ = TruncatedNormal(std=.02) +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = paddle.add(keep_prob, paddle.rand(shape, dtype=x.dtype)) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
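+
+    With keep probability p = 1 - drop_prob, drop_path above draws a
+    per-sample Bernoulli mask of shape (batch, 1, ..., 1), zeroes the whole
+    residual branch for dropped samples and rescales kept ones by 1 / p, so
+    the expectation is preserved: E[out] = p * (x / p) + (1 - p) * 0 = x.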
+ """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + hidden_dim, + num_heads=8, + qkv_bias=False, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.hidden_dim = hidden_dim + self.num_heads = num_heads + head_dim = hidden_dim // num_heads + self.head_dim = head_dim + self.scale = head_dim**-0.5 + + self.qk = nn.Linear(dim, hidden_dim * 2, bias_attr=qkv_bias) + self.v = nn.Linear(dim, dim, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qk = self.qk(x).reshape( + (B, N, 2, self.num_heads, self.head_dim)).transpose( + (2, 0, 3, 1, 4)) + + q, k = qk[0], qk[1] + v = self.v(x).reshape( + (B, N, self.num_heads, x.shape[-1] // self.num_heads)).transpose( + (0, 2, 1, 3)) + + attn = paddle.matmul(q, k.transpose((0, 1, 3, 2))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = paddle.matmul(attn, v) + x = x.transpose((0, 2, 1, 3)).reshape( + (B, N, x.shape[-1] * x.shape[-3])) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + in_dim, + num_pixel, + num_heads=12, + in_num_head=4, + mlp_ratio=4., + qkv_bias=False, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super().__init__() + # Inner transformer + self.norm_in = norm_layer(in_dim) + self.attn_in = Attention( + in_dim, + in_dim, + num_heads=in_num_head, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop) + + self.norm_mlp_in = norm_layer(in_dim) + self.mlp_in = Mlp(in_features=in_dim, + hidden_features=int(in_dim * 4), + out_features=in_dim, + act_layer=act_layer, + drop=drop) + + self.norm1_proj = norm_layer(in_dim) + self.proj = nn.Linear(in_dim * num_pixel, dim) + # Outer transformer + self.norm_out = norm_layer(dim) + self.attn_out = Attention( + dim, + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + + self.norm_mlp = norm_layer(dim) + self.mlp = Mlp(in_features=dim, + hidden_features=int(dim * mlp_ratio), + out_features=dim, + act_layer=act_layer, + drop=drop) + + def forward(self, pixel_embed, patch_embed): + # inner + pixel_embed = paddle.add( + pixel_embed, + self.drop_path(self.attn_in(self.norm_in(pixel_embed)))) + pixel_embed = paddle.add( + pixel_embed, + self.drop_path(self.mlp_in(self.norm_mlp_in(pixel_embed)))) + # outer + B, N, C = patch_embed.shape + norm1_proj = self.norm1_proj(pixel_embed) + norm1_proj = norm1_proj.reshape( + (B, N - 1, norm1_proj.shape[1] * norm1_proj.shape[2])) + patch_embed[:, 1:] = paddle.add(patch_embed[:, 1:], + self.proj(norm1_proj)) + patch_embed = paddle.add( + patch_embed, + self.drop_path(self.attn_out(self.norm_out(patch_embed)))) + patch_embed = paddle.add( + patch_embed, self.drop_path(self.mlp(self.norm_mlp(patch_embed)))) + return pixel_embed, patch_embed + + +class PixelEmbed(nn.Layer): + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + in_dim=48, + stride=4): + super().__init__() + num_patches = (img_size // patch_size)**2 + self.img_size = img_size + self.num_patches = num_patches + self.in_dim = in_dim + new_patch_size = math.ceil(patch_size / stride) + self.new_patch_size = new_patch_size + + self.proj = nn.Conv2D( + in_chans, self.in_dim, kernel_size=7, padding=3, stride=stride) + + def forward(self, x, pixel_pos): + B, C, H, W = x.shape + assert H == self.img_size and W == self.img_size, f"Input image size ({H}*{W}) doesn't match model ({self.img_size}*{self.img_size})." + + x = self.proj(x) + x = nn.functional.unfold(x, self.new_patch_size, self.new_patch_size) + x = x.transpose((0, 2, 1)).reshape( + (-1, self.in_dim, self.new_patch_size, self.new_patch_size)) + x = x + pixel_pos + x = x.reshape((-1, self.in_dim, self.new_patch_size * + self.new_patch_size)).transpose((0, 2, 1)) + return x + + +class TNT(nn.Layer): + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + in_dim=48, + depth=12, + num_heads=12, + in_num_head=4, + mlp_ratio=4., + qkv_bias=False, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + first_stride=4, + class_num=1000): + super().__init__() + self.class_num = class_num + # num_features for consistency with other models + self.num_features = self.embed_dim = embed_dim + + self.pixel_embed = PixelEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + in_dim=in_dim, + stride=first_stride) + num_patches = self.pixel_embed.num_patches + self.num_patches = num_patches + new_patch_size = self.pixel_embed.new_patch_size + num_pixel = new_patch_size**2 + + self.norm1_proj = norm_layer(num_pixel * in_dim) + self.proj = nn.Linear(num_pixel * in_dim, embed_dim) + self.norm2_proj = norm_layer(embed_dim) + + self.cls_token = self.create_parameter( + shape=(1, 1, embed_dim), default_initializer=zeros_) + self.add_parameter("cls_token", self.cls_token) + + self.patch_pos = self.create_parameter( + shape=(1, num_patches + 1, embed_dim), default_initializer=zeros_) + self.add_parameter("patch_pos", self.patch_pos) + + self.pixel_pos = self.create_parameter( + shape=(1, in_dim, new_patch_size, new_patch_size), + default_initializer=zeros_) + self.add_parameter("pixel_pos", self.pixel_pos) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth decay rule + dpr = np.linspace(0, drop_path_rate, depth) + + blocks = [] + for i in range(depth): + blocks.append( + Block( + dim=embed_dim, + 
in_dim=in_dim, + num_pixel=num_pixel, + num_heads=num_heads, + in_num_head=in_num_head, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer)) + self.blocks = nn.LayerList(blocks) + self.norm = norm_layer(embed_dim) + + if class_num > 0: + self.head = nn.Linear(embed_dim, class_num) + + trunc_normal_(self.cls_token) + trunc_normal_(self.patch_pos) + trunc_normal_(self.pixel_pos) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + B = paddle.shape(x)[0] + pixel_embed = self.pixel_embed(x, self.pixel_pos) + + patch_embed = self.norm2_proj( + self.proj( + self.norm1_proj( + pixel_embed.reshape((-1, self.num_patches, pixel_embed. + shape[-1] * pixel_embed.shape[-2]))))) + patch_embed = paddle.concat( + (self.cls_token.expand((B, -1, -1)), patch_embed), axis=1) + patch_embed = patch_embed + self.patch_pos + patch_embed = self.pos_drop(patch_embed) + + for blk in self.blocks: + pixel_embed, patch_embed = blk(pixel_embed, patch_embed) + + patch_embed = self.norm(patch_embed) + return patch_embed[:, 0] + + def forward(self, x): + x = self.forward_features(x) + + if self.class_num > 0: + x = self.head(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def TNT_small(pretrained=False, use_ssld=False, **kwargs): + model = TNT(patch_size=16, + embed_dim=384, + in_dim=24, + depth=12, + num_heads=6, + in_num_head=4, + qkv_bias=False, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["TNT_small"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/vision_transformer.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/vision_transformer.py new file mode 100644 index 0000000..c71c026 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/vision_transformer.py @@ -0,0 +1,458 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
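+
+# A minimal usage sketch of the factory functions defined at the bottom of
+# this file (pretrained=False keeps everything offline):
+#
+#     import paddle
+#     model = ViT_base_patch16_224(pretrained=False)
+#     x = paddle.randn([1, 3, 224, 224])
+#     logits = model(x)  # [1, 1000] with the default class_num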
+ +# Code was based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py + +from collections.abc import Callable + +import numpy as np +import paddle +import paddle.nn as nn +from paddle.nn.initializer import TruncatedNormal, Constant, Normal + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "ViT_small_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_small_patch16_224_pretrained.pdparams", + "ViT_base_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams", + "ViT_base_patch16_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_384_pretrained.pdparams", + "ViT_base_patch32_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch32_384_pretrained.pdparams", + "ViT_large_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_large_patch16_224_pretrained.pdparams", + "ViT_large_patch16_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_large_patch16_384_pretrained.pdparams", + "ViT_large_patch32_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_large_patch32_384_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +trunc_normal_ = TruncatedNormal(std=.02) +normal_ = Normal +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def to_2tuple(x): + return tuple([x] * 2) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
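+
+    In the Block class below this wraps each residual branch, i.e.
+    x = x + drop_path(attn(norm1(x))); when a sample's branch is dropped its
+    skip connection is left intact, so the layer degenerates to the identity
+    for that sample.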
+ """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + # B= paddle.shape(x)[0] + N, C = x.shape[1:] + qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C // + self.num_heads)).transpose((2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-5): + super().__init__() + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm1 = norm_layer(dim) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm2 = norm_layer(dim) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * \ + (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + + x = self.proj(x).flatten(2).transpose((0, 2, 1)) + return x + + +class VisionTransformer(nn.Layer): + """ Vision Transformer with support for patch input + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + class_num=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer='nn.LayerNorm', + epsilon=1e-5, + **kwargs): + super().__init__() + self.class_num = class_num + + self.num_features = self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.pos_embed = self.create_parameter( + shape=(1, num_patches + 1, embed_dim), default_initializer=zeros_) + self.add_parameter("pos_embed", self.pos_embed) + self.cls_token = self.create_parameter( + shape=(1, 1, embed_dim), default_initializer=zeros_) + self.add_parameter("cls_token", self.cls_token) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = np.linspace(0, drop_path_rate, depth) + + self.blocks = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + epsilon=epsilon) for i in range(depth) + ]) + + self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon) + + # Classifier head + self.head = nn.Linear(embed_dim, + class_num) if class_num > 0 else Identity() + + trunc_normal_(self.pos_embed) + trunc_normal_(self.cls_token) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + # B = x.shape[0] + B = paddle.shape(x)[0] + x = self.patch_embed(x) + cls_tokens = self.cls_token.expand((B, -1, -1)) + x = paddle.concat((cls_tokens, x), axis=1) + x = x + self.pos_embed + x = self.pos_drop(x) + for blk in self.blocks: + x = blk(x) + x = self.norm(x) + return x[:, 0] + + def forward(self, x): + x = self.forward_features(x) + x 
= self.head(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ViT_small_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=768, + depth=8, + num_heads=8, + mlp_ratio=3, + qk_scale=768**-0.5, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_small_patch16_224"], + use_ssld=use_ssld) + return model + + +def ViT_base_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_base_patch16_224"], + use_ssld=use_ssld) + return model + + +def ViT_base_patch16_384(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + img_size=384, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_base_patch16_384"], + use_ssld=use_ssld) + return model + + +def ViT_base_patch32_384(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + img_size=384, + patch_size=32, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_base_patch32_384"], + use_ssld=use_ssld) + return model + + +def ViT_large_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_large_patch16_224"], + use_ssld=use_ssld) + return model + + +def ViT_large_patch16_384(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + img_size=384, + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_large_patch16_384"], + use_ssld=use_ssld) + return model + + +def ViT_large_patch32_384(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + img_size=384, + patch_size=32, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_large_patch32_384"], + use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/xception.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/xception.py new file mode 100644 index 0000000..2b84378 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/xception.py @@ -0,0 +1,377 @@ +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math +import sys + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + 
"Xception41": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Xception41_pretrained.pdparams", + "Xception65": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Xception65_pretrained.pdparams", + "Xception71": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Xception71_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + bn_name = "bn_" + name + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + "_scale"), + bias_attr=ParamAttr(name=bn_name + "_offset"), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class SeparableConv(nn.Layer): + def __init__(self, input_channels, output_channels, stride=1, name=None): + super(SeparableConv, self).__init__() + + self._pointwise_conv = ConvBNLayer( + input_channels, output_channels, 1, name=name + "_sep") + self._depthwise_conv = ConvBNLayer( + output_channels, + output_channels, + 3, + stride=stride, + groups=output_channels, + name=name + "_dw") + + def forward(self, inputs): + x = self._pointwise_conv(inputs) + x = self._depthwise_conv(x) + return x + + +class EntryFlowBottleneckBlock(nn.Layer): + def __init__(self, + input_channels, + output_channels, + stride=2, + name=None, + relu_first=False): + super(EntryFlowBottleneckBlock, self).__init__() + self.relu_first = relu_first + + self._short = Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=1, + stride=stride, + padding=0, + weight_attr=ParamAttr(name + "_branch1_weights"), + bias_attr=False) + self._conv1 = SeparableConv( + input_channels, + output_channels, + stride=1, + name=name + "_branch2a_weights") + self._conv2 = SeparableConv( + output_channels, + output_channels, + stride=1, + name=name + "_branch2b_weights") + self._pool = MaxPool2D(kernel_size=3, stride=stride, padding=1) + + def forward(self, inputs): + conv0 = inputs + short = self._short(inputs) + if self.relu_first: + conv0 = F.relu(conv0) + conv1 = self._conv1(conv0) + conv2 = F.relu(conv1) + conv2 = self._conv2(conv2) + pool = self._pool(conv2) + return paddle.add(x=short, y=pool) + + +class EntryFlow(nn.Layer): + def __init__(self, block_num=3): + super(EntryFlow, self).__init__() + + name = "entry_flow" + self.block_num = block_num + self._conv1 = ConvBNLayer( + 3, 32, 3, stride=2, act="relu", name=name + "_conv1") + self._conv2 = ConvBNLayer(32, 64, 3, act="relu", name=name + "_conv2") + if block_num == 3: + self._conv_0 = EntryFlowBottleneckBlock( + 64, 128, stride=2, name=name + "_0", relu_first=False) + self._conv_1 = EntryFlowBottleneckBlock( + 128, 256, stride=2, name=name + "_1", relu_first=True) + self._conv_2 = EntryFlowBottleneckBlock( + 256, 728, stride=2, name=name + "_2", relu_first=True) + elif block_num == 5: + self._conv_0 = EntryFlowBottleneckBlock( + 64, 128, stride=2, name=name + "_0", relu_first=False) + self._conv_1 = EntryFlowBottleneckBlock( + 128, 256, stride=1, name=name + "_1", relu_first=True) + self._conv_2 = 
EntryFlowBottleneckBlock( + 256, 256, stride=2, name=name + "_2", relu_first=True) + self._conv_3 = EntryFlowBottleneckBlock( + 256, 728, stride=1, name=name + "_3", relu_first=True) + self._conv_4 = EntryFlowBottleneckBlock( + 728, 728, stride=2, name=name + "_4", relu_first=True) + else: + sys.exit(-1) + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._conv2(x) + + if self.block_num == 3: + x = self._conv_0(x) + x = self._conv_1(x) + x = self._conv_2(x) + elif self.block_num == 5: + x = self._conv_0(x) + x = self._conv_1(x) + x = self._conv_2(x) + x = self._conv_3(x) + x = self._conv_4(x) + return x + + +class MiddleFlowBottleneckBlock(nn.Layer): + def __init__(self, input_channels, output_channels, name): + super(MiddleFlowBottleneckBlock, self).__init__() + + self._conv_0 = SeparableConv( + input_channels, + output_channels, + stride=1, + name=name + "_branch2a_weights") + self._conv_1 = SeparableConv( + output_channels, + output_channels, + stride=1, + name=name + "_branch2b_weights") + self._conv_2 = SeparableConv( + output_channels, + output_channels, + stride=1, + name=name + "_branch2c_weights") + + def forward(self, inputs): + conv0 = F.relu(inputs) + conv0 = self._conv_0(conv0) + conv1 = F.relu(conv0) + conv1 = self._conv_1(conv1) + conv2 = F.relu(conv1) + conv2 = self._conv_2(conv2) + return paddle.add(x=inputs, y=conv2) + + +class MiddleFlow(nn.Layer): + def __init__(self, block_num=8): + super(MiddleFlow, self).__init__() + + self.block_num = block_num + self._conv_0 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_0") + self._conv_1 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_1") + self._conv_2 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_2") + self._conv_3 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_3") + self._conv_4 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_4") + self._conv_5 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_5") + self._conv_6 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_6") + self._conv_7 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_7") + if block_num == 16: + self._conv_8 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_8") + self._conv_9 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_9") + self._conv_10 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_10") + self._conv_11 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_11") + self._conv_12 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_12") + self._conv_13 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_13") + self._conv_14 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_14") + self._conv_15 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_15") + + def forward(self, inputs): + x = self._conv_0(inputs) + x = self._conv_1(x) + x = self._conv_2(x) + x = self._conv_3(x) + x = self._conv_4(x) + x = self._conv_5(x) + x = self._conv_6(x) + x = self._conv_7(x) + if self.block_num == 16: + x = self._conv_8(x) + x = self._conv_9(x) + x = self._conv_10(x) + x = self._conv_11(x) + x = self._conv_12(x) + x = self._conv_13(x) + x = self._conv_14(x) + x = self._conv_15(x) + return x + + +class ExitFlowBottleneckBlock(nn.Layer): + def __init__(self, input_channels, output_channels1, output_channels2, + name): + super(ExitFlowBottleneckBlock, self).__init__() + + self._short = Conv2D( + in_channels=input_channels, + out_channels=output_channels2, + kernel_size=1, + stride=2, + padding=0, + 
weight_attr=ParamAttr(name + "_branch1_weights"), + bias_attr=False) + self._conv_1 = SeparableConv( + input_channels, + output_channels1, + stride=1, + name=name + "_branch2a_weights") + self._conv_2 = SeparableConv( + output_channels1, + output_channels2, + stride=1, + name=name + "_branch2b_weights") + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=1) + + def forward(self, inputs): + short = self._short(inputs) + conv0 = F.relu(inputs) + conv1 = self._conv_1(conv0) + conv2 = F.relu(conv1) + conv2 = self._conv_2(conv2) + pool = self._pool(conv2) + return paddle.add(x=short, y=pool) + + +class ExitFlow(nn.Layer): + def __init__(self, class_num): + super(ExitFlow, self).__init__() + + name = "exit_flow" + + self._conv_0 = ExitFlowBottleneckBlock( + 728, 728, 1024, name=name + "_1") + self._conv_1 = SeparableConv(1024, 1536, stride=1, name=name + "_2") + self._conv_2 = SeparableConv(1536, 2048, stride=1, name=name + "_3") + self._pool = AdaptiveAvgPool2D(1) + stdv = 1.0 / math.sqrt(2048 * 1.0) + self._out = Linear( + 2048, + class_num, + weight_attr=ParamAttr( + name="fc_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, inputs): + conv0 = self._conv_0(inputs) + conv1 = self._conv_1(conv0) + conv1 = F.relu(conv1) + conv2 = self._conv_2(conv1) + conv2 = F.relu(conv2) + pool = self._pool(conv2) + pool = paddle.flatten(pool, start_axis=1, stop_axis=-1) + out = self._out(pool) + return out + + +class Xception(nn.Layer): + def __init__(self, + entry_flow_block_num=3, + middle_flow_block_num=8, + class_num=1000): + super(Xception, self).__init__() + self.entry_flow_block_num = entry_flow_block_num + self.middle_flow_block_num = middle_flow_block_num + self._entry_flow = EntryFlow(entry_flow_block_num) + self._middle_flow = MiddleFlow(middle_flow_block_num) + self._exit_flow = ExitFlow(class_num) + + def forward(self, inputs): + x = self._entry_flow(inputs) + x = self._middle_flow(x) + x = self._exit_flow(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def Xception41(pretrained=False, use_ssld=False, **kwargs): + model = Xception(entry_flow_block_num=3, middle_flow_block_num=8, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["Xception41"], use_ssld=use_ssld) + return model + + +def Xception65(pretrained=False, use_ssld=False, **kwargs): + model = Xception( + entry_flow_block_num=3, middle_flow_block_num=16, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["Xception65"], use_ssld=use_ssld) + return model + + +def Xception71(pretrained=False, use_ssld=False, **kwargs): + model = Xception( + entry_flow_block_num=5, middle_flow_block_num=16, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["Xception71"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/model_zoo/xception_deeplab.py b/src/PaddleClas/ppcls/arch/backbone/model_zoo/xception_deeplab.py new file mode 100644 index 0000000..c52769b --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/model_zoo/xception_deeplab.py @@ -0,0 +1,421 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from paddle import ParamAttr
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn import Conv2D, BatchNorm, Linear, Dropout
+from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D
+
+from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url
+
+MODEL_URLS = {
+    "Xception41_deeplab":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Xception41_deeplab_pretrained.pdparams",
+    "Xception65_deeplab":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Xception65_deeplab_pretrained.pdparams"
+}
+
+__all__ = list(MODEL_URLS.keys())
+
+
+def check_data(data, number):
+    if type(data) == int:
+        return [data] * number
+    assert len(data) == number
+    return data
+
+
+def check_stride(s, os):
+    if s <= os:
+        return True
+    else:
+        return False
+
+
+def check_points(count, points):
+    if points is None:
+        return False
+    else:
+        if isinstance(points, list):
+            return (True if count in points else False)
+        else:
+            return (True if count == points else False)
+
+
+def gen_bottleneck_params(backbone='xception_65'):
+    if backbone == 'xception_65':
+        bottleneck_params = {
+            "entry_flow": (3, [2, 2, 2], [128, 256, 728]),
+            "middle_flow": (16, 1, 728),
+            "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
+        }
+    elif backbone == 'xception_41':
+        bottleneck_params = {
+            "entry_flow": (3, [2, 2, 2], [128, 256, 728]),
+            "middle_flow": (8, 1, 728),
+            "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
+        }
+    elif backbone == 'xception_71':
+        bottleneck_params = {
+            "entry_flow": (5, [2, 1, 2, 1, 2], [128, 256, 256, 728, 728]),
+            "middle_flow": (16, 1, 728),
+            "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
+        }
+    else:
+        raise Exception(
+            "xception backbone only supports xception_41/xception_65/xception_71"
+        )
+    return bottleneck_params
+
+
+class ConvBNLayer(nn.Layer):
+    def __init__(self,
+                 input_channels,
+                 output_channels,
+                 filter_size,
+                 stride=1,
+                 padding=0,
+                 act=None,
+                 name=None):
+        super(ConvBNLayer, self).__init__()
+
+        self._conv = Conv2D(
+            in_channels=input_channels,
+            out_channels=output_channels,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            weight_attr=ParamAttr(name=name + "/weights"),
+            bias_attr=False)
+        self._bn = BatchNorm(
+            num_channels=output_channels,
+            act=act,
+            epsilon=1e-3,
+            momentum=0.99,
+            param_attr=ParamAttr(name=name + "/BatchNorm/gamma"),
+            bias_attr=ParamAttr(name=name + "/BatchNorm/beta"),
+            moving_mean_name=name + "/BatchNorm/moving_mean",
+            moving_variance_name=name + "/BatchNorm/moving_variance")
+
+    def forward(self, inputs):
+        return self._bn(self._conv(inputs))
+
+
+class Seperate_Conv(nn.Layer):
+    def __init__(self,
+                 input_channels,
+                 output_channels,
+                 stride,
+                 filter,
+                 dilation=1,
+                 act=None,
+                 name=None):
+        super(Seperate_Conv, self).__init__()
+
+        self._conv1 = Conv2D(
+            in_channels=input_channels,
+            out_channels=input_channels,
+            kernel_size=filter,
+            stride=stride,
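+            # depthwise stage: groups == in_channels gives one k x k filter
+            # per input channel; the 1x1 pointwise conv below mixes channels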
+ groups=input_channels, + padding=(filter) // 2 * dilation, + dilation=dilation, + weight_attr=ParamAttr(name=name + "/depthwise/weights"), + bias_attr=False) + self._bn1 = BatchNorm( + input_channels, + act=act, + epsilon=1e-3, + momentum=0.99, + param_attr=ParamAttr(name=name + "/depthwise/BatchNorm/gamma"), + bias_attr=ParamAttr(name=name + "/depthwise/BatchNorm/beta"), + moving_mean_name=name + "/depthwise/BatchNorm/moving_mean", + moving_variance_name=name + "/depthwise/BatchNorm/moving_variance") + self._conv2 = Conv2D( + input_channels, + output_channels, + 1, + stride=1, + groups=1, + padding=0, + weight_attr=ParamAttr(name=name + "/pointwise/weights"), + bias_attr=False) + self._bn2 = BatchNorm( + output_channels, + act=act, + epsilon=1e-3, + momentum=0.99, + param_attr=ParamAttr(name=name + "/pointwise/BatchNorm/gamma"), + bias_attr=ParamAttr(name=name + "/pointwise/BatchNorm/beta"), + moving_mean_name=name + "/pointwise/BatchNorm/moving_mean", + moving_variance_name=name + "/pointwise/BatchNorm/moving_variance") + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._bn1(x) + x = self._conv2(x) + x = self._bn2(x) + return x + + +class Xception_Block(nn.Layer): + def __init__(self, + input_channels, + output_channels, + strides=1, + filter_size=3, + dilation=1, + skip_conv=True, + has_skip=True, + activation_fn_in_separable_conv=False, + name=None): + super(Xception_Block, self).__init__() + + repeat_number = 3 + output_channels = check_data(output_channels, repeat_number) + filter_size = check_data(filter_size, repeat_number) + strides = check_data(strides, repeat_number) + + self.has_skip = has_skip + self.skip_conv = skip_conv + self.activation_fn_in_separable_conv = activation_fn_in_separable_conv + if not activation_fn_in_separable_conv: + self._conv1 = Seperate_Conv( + input_channels, + output_channels[0], + stride=strides[0], + filter=filter_size[0], + dilation=dilation, + name=name + "/separable_conv1") + self._conv2 = Seperate_Conv( + output_channels[0], + output_channels[1], + stride=strides[1], + filter=filter_size[1], + dilation=dilation, + name=name + "/separable_conv2") + self._conv3 = Seperate_Conv( + output_channels[1], + output_channels[2], + stride=strides[2], + filter=filter_size[2], + dilation=dilation, + name=name + "/separable_conv3") + else: + self._conv1 = Seperate_Conv( + input_channels, + output_channels[0], + stride=strides[0], + filter=filter_size[0], + act="relu", + dilation=dilation, + name=name + "/separable_conv1") + self._conv2 = Seperate_Conv( + output_channels[0], + output_channels[1], + stride=strides[1], + filter=filter_size[1], + act="relu", + dilation=dilation, + name=name + "/separable_conv2") + self._conv3 = Seperate_Conv( + output_channels[1], + output_channels[2], + stride=strides[2], + filter=filter_size[2], + act="relu", + dilation=dilation, + name=name + "/separable_conv3") + + if has_skip and skip_conv: + self._short = ConvBNLayer( + input_channels, + output_channels[-1], + 1, + stride=strides[-1], + padding=0, + name=name + "/shortcut") + + def forward(self, inputs): + if not self.activation_fn_in_separable_conv: + x = F.relu(inputs) + x = self._conv1(x) + x = F.relu(x) + x = self._conv2(x) + x = F.relu(x) + x = self._conv3(x) + else: + x = self._conv1(inputs) + x = self._conv2(x) + x = self._conv3(x) + if self.has_skip: + if self.skip_conv: + skip = self._short(inputs) + else: + skip = inputs + return paddle.add(x, skip) + else: + return x + + +class XceptionDeeplab(nn.Layer): + def __init__(self, backbone, 
class_num=1000): + super(XceptionDeeplab, self).__init__() + + bottleneck_params = gen_bottleneck_params(backbone) + self.backbone = backbone + + self._conv1 = ConvBNLayer( + 3, + 32, + 3, + stride=2, + padding=1, + act="relu", + name=self.backbone + "/entry_flow/conv1") + self._conv2 = ConvBNLayer( + 32, + 64, + 3, + stride=1, + padding=1, + act="relu", + name=self.backbone + "/entry_flow/conv2") + + self.block_num = bottleneck_params["entry_flow"][0] + self.strides = bottleneck_params["entry_flow"][1] + self.chns = bottleneck_params["entry_flow"][2] + self.strides = check_data(self.strides, self.block_num) + self.chns = check_data(self.chns, self.block_num) + + self.entry_flow = [] + self.middle_flow = [] + + self.stride = 2 + self.output_stride = 32 + s = self.stride + + for i in range(self.block_num): + stride = self.strides[i] if check_stride(s * self.strides[i], + self.output_stride) else 1 + xception_block = self.add_sublayer( + self.backbone + "/entry_flow/block" + str(i + 1), + Xception_Block( + input_channels=64 if i == 0 else self.chns[i - 1], + output_channels=self.chns[i], + strides=[1, 1, self.stride], + name=self.backbone + "/entry_flow/block" + str(i + 1))) + self.entry_flow.append(xception_block) + s = s * stride + self.stride = s + + self.block_num = bottleneck_params["middle_flow"][0] + self.strides = bottleneck_params["middle_flow"][1] + self.chns = bottleneck_params["middle_flow"][2] + self.strides = check_data(self.strides, self.block_num) + self.chns = check_data(self.chns, self.block_num) + s = self.stride + + for i in range(self.block_num): + stride = self.strides[i] if check_stride(s * self.strides[i], + self.output_stride) else 1 + xception_block = self.add_sublayer( + self.backbone + "/middle_flow/block" + str(i + 1), + Xception_Block( + input_channels=728, + output_channels=728, + strides=[1, 1, self.strides[i]], + skip_conv=False, + name=self.backbone + "/middle_flow/block" + str(i + 1))) + self.middle_flow.append(xception_block) + s = s * stride + self.stride = s + + self.block_num = bottleneck_params["exit_flow"][0] + self.strides = bottleneck_params["exit_flow"][1] + self.chns = bottleneck_params["exit_flow"][2] + self.strides = check_data(self.strides, self.block_num) + self.chns = check_data(self.chns, self.block_num) + s = self.stride + stride = self.strides[0] if check_stride(s * self.strides[0], + self.output_stride) else 1 + self._exit_flow_1 = Xception_Block( + 728, + self.chns[0], [1, 1, stride], + name=self.backbone + "/exit_flow/block1") + s = s * stride + stride = self.strides[1] if check_stride(s * self.strides[1], + self.output_stride) else 1 + self._exit_flow_2 = Xception_Block( + self.chns[0][-1], + self.chns[1], [1, 1, stride], + dilation=2, + has_skip=False, + activation_fn_in_separable_conv=True, + name=self.backbone + "/exit_flow/block2") + s = s * stride + + self.stride = s + + self._drop = Dropout(p=0.5, mode="downscale_in_infer") + self._pool = AdaptiveAvgPool2D(1) + self._fc = Linear( + self.chns[1][-1], + class_num, + weight_attr=ParamAttr(name="fc_weights"), + bias_attr=ParamAttr(name="fc_bias")) + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._conv2(x) + for ef in self.entry_flow: + x = ef(x) + for mf in self.middle_flow: + x = mf(x) + x = self._exit_flow_1(x) + x = self._exit_flow_2(x) + x = self._drop(x) + x = self._pool(x) + x = paddle.squeeze(x, axis=[2, 3]) + x = self._fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained 
is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def Xception41_deeplab(pretrained=False, use_ssld=False, **kwargs): + model = XceptionDeeplab('xception_41', **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["Xception41_deeplab"], use_ssld=use_ssld) + return model + + +def Xception65_deeplab(pretrained=False, use_ssld=False, **kwargs): + model = XceptionDeeplab("xception_65", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["Xception65_deeplab"], use_ssld=use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/variant_models/__init__.py b/src/PaddleClas/ppcls/arch/backbone/variant_models/__init__.py new file mode 100644 index 0000000..75cf29f --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/variant_models/__init__.py @@ -0,0 +1,3 @@ +from .resnet_variant import ResNet50_last_stage_stride1 +from .vgg_variant import VGG19Sigmoid +from .pp_lcnet_variant import PPLCNet_x2_5_Tanh diff --git a/src/PaddleClas/ppcls/arch/backbone/variant_models/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/variant_models/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..429ee93 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/variant_models/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/variant_models/__pycache__/pp_lcnet_variant.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/variant_models/__pycache__/pp_lcnet_variant.cpython-39.pyc new file mode 100644 index 0000000..5f60704 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/variant_models/__pycache__/pp_lcnet_variant.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/variant_models/__pycache__/resnet_variant.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/variant_models/__pycache__/resnet_variant.cpython-39.pyc new file mode 100644 index 0000000..a86f989 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/variant_models/__pycache__/resnet_variant.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/variant_models/__pycache__/vgg_variant.cpython-39.pyc b/src/PaddleClas/ppcls/arch/backbone/variant_models/__pycache__/vgg_variant.cpython-39.pyc new file mode 100644 index 0000000..06aa183 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/backbone/variant_models/__pycache__/vgg_variant.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/backbone/variant_models/pp_lcnet_variant.py b/src/PaddleClas/ppcls/arch/backbone/variant_models/pp_lcnet_variant.py new file mode 100644 index 0000000..dc9747a --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/variant_models/pp_lcnet_variant.py @@ -0,0 +1,29 @@ +import paddle +from paddle.nn import Sigmoid +from paddle.nn import Tanh +from ppcls.arch.backbone.legendary_models.pp_lcnet import PPLCNet_x2_5 + +__all__ = ["PPLCNet_x2_5_Tanh"] + + +class TanhSuffix(paddle.nn.Layer): + def __init__(self, origin_layer): + super(TanhSuffix, self).__init__() + self.origin_layer = origin_layer + self.tanh = Tanh() + + def forward(self, input, res_dict=None, **kwargs): + x = self.origin_layer(input) + x = self.tanh(x) + return x + + +def PPLCNet_x2_5_Tanh(pretrained=False, use_ssld=False, **kwargs): + def replace_function(origin_layer, pattern): + new_layer = TanhSuffix(origin_layer) 
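+        # the wrapped layer replaces the matched "fc" sublayer when
+        # upgrade_sublayer is called below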
+ return new_layer + + pattern = "fc" + model = PPLCNet_x2_5(pretrained=pretrained, use_ssld=use_ssld, **kwargs) + model.upgrade_sublayer(pattern, replace_function) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/variant_models/resnet_variant.py b/src/PaddleClas/ppcls/arch/backbone/variant_models/resnet_variant.py new file mode 100644 index 0000000..0219344 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/variant_models/resnet_variant.py @@ -0,0 +1,23 @@ +from paddle.nn import Conv2D +from ppcls.arch.backbone.legendary_models.resnet import ResNet50, MODEL_URLS, _load_pretrained + +__all__ = ["ResNet50_last_stage_stride1"] + + +def ResNet50_last_stage_stride1(pretrained=False, use_ssld=False, **kwargs): + def replace_function(conv, pattern): + new_conv = Conv2D( + in_channels=conv._in_channels, + out_channels=conv._out_channels, + kernel_size=conv._kernel_size, + stride=1, + padding=conv._padding, + groups=conv._groups, + bias_attr=conv._bias_attr) + return new_conv + + pattern = ["blocks[13].conv1.conv", "blocks[13].short.conv"] + model = ResNet50(pretrained=False, use_ssld=use_ssld, **kwargs) + model.upgrade_sublayer(pattern, replace_function) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50"], use_ssld) + return model diff --git a/src/PaddleClas/ppcls/arch/backbone/variant_models/vgg_variant.py b/src/PaddleClas/ppcls/arch/backbone/variant_models/vgg_variant.py new file mode 100644 index 0000000..c1f75ba --- /dev/null +++ b/src/PaddleClas/ppcls/arch/backbone/variant_models/vgg_variant.py @@ -0,0 +1,28 @@ +import paddle +from paddle.nn import Sigmoid +from ppcls.arch.backbone.legendary_models.vgg import VGG19 + +__all__ = ["VGG19Sigmoid"] + + +class SigmoidSuffix(paddle.nn.Layer): + def __init__(self, origin_layer): + super().__init__() + self.origin_layer = origin_layer + self.sigmoid = Sigmoid() + + def forward(self, input, res_dict=None, **kwargs): + x = self.origin_layer(input) + x = self.sigmoid(x) + return x + + +def VGG19Sigmoid(pretrained=False, use_ssld=False, **kwargs): + def replace_function(origin_layer, pattern): + new_layer = SigmoidSuffix(origin_layer) + return new_layer + + pattern = "fc2" + model = VGG19(pretrained=pretrained, use_ssld=use_ssld, **kwargs) + model.upgrade_sublayer(pattern, replace_function) + return model diff --git a/src/PaddleClas/ppcls/arch/gears/__init__.py b/src/PaddleClas/ppcls/arch/gears/__init__.py new file mode 100644 index 0000000..75ca41d --- /dev/null +++ b/src/PaddleClas/ppcls/arch/gears/__init__.py @@ -0,0 +1,32 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
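All three variant models above follow the same recipe: wrap an existing sublayer in a small suffix layer, then swap it in with `upgrade_sublayer`, which these backbones inherit from TheseusLayer. A minimal usage sketch, assuming the `ppcls` package from this diff is importable and paddle is installed:

    import paddle
    from ppcls.arch.backbone.variant_models import VGG19Sigmoid

    # The factory builds a stock VGG19, then upgrade_sublayer("fc2", ...) swaps
    # the matched sublayer for SigmoidSuffix, so fc2's output is squashed
    # through a Sigmoid before the rest of the forward pass.
    model = VGG19Sigmoid(pretrained=False)
    out = model(paddle.rand([1, 3, 224, 224]))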
+ +from .arcmargin import ArcMargin +from .cosmargin import CosMargin +from .circlemargin import CircleMargin +from .fc import FC +from .vehicle_neck import VehicleNeck + +__all__ = ['build_gear'] + + +def build_gear(config): + support_dict = [ + 'ArcMargin', 'CosMargin', 'CircleMargin', 'FC', 'VehicleNeck' + ] + module_name = config.pop('name') + assert module_name in support_dict, ( + 'build_gear only supports {}'.format(support_dict)) + module_class = eval(module_name)(**config) + return module_class diff --git a/src/PaddleClas/ppcls/arch/gears/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/arch/gears/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..8e51faa Binary files /dev/null and b/src/PaddleClas/ppcls/arch/gears/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/gears/__pycache__/arcmargin.cpython-39.pyc b/src/PaddleClas/ppcls/arch/gears/__pycache__/arcmargin.cpython-39.pyc new file mode 100644 index 0000000..02443b1 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/gears/__pycache__/arcmargin.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/gears/__pycache__/circlemargin.cpython-39.pyc b/src/PaddleClas/ppcls/arch/gears/__pycache__/circlemargin.cpython-39.pyc new file mode 100644 index 0000000..f7d1261 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/gears/__pycache__/circlemargin.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/gears/__pycache__/cosmargin.cpython-39.pyc b/src/PaddleClas/ppcls/arch/gears/__pycache__/cosmargin.cpython-39.pyc new file mode 100644 index 0000000..85a7df3 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/gears/__pycache__/cosmargin.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/gears/__pycache__/fc.cpython-39.pyc b/src/PaddleClas/ppcls/arch/gears/__pycache__/fc.cpython-39.pyc new file mode 100644 index 0000000..361b877 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/gears/__pycache__/fc.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/gears/__pycache__/identity_head.cpython-39.pyc b/src/PaddleClas/ppcls/arch/gears/__pycache__/identity_head.cpython-39.pyc new file mode 100644 index 0000000..2a99da9 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/gears/__pycache__/identity_head.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/gears/__pycache__/vehicle_neck.cpython-39.pyc b/src/PaddleClas/ppcls/arch/gears/__pycache__/vehicle_neck.cpython-39.pyc new file mode 100644 index 0000000..ede4fd6 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/gears/__pycache__/vehicle_neck.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/gears/arcmargin.py b/src/PaddleClas/ppcls/arch/gears/arcmargin.py new file mode 100644 index 0000000..22cc76e --- /dev/null +++ b/src/PaddleClas/ppcls/arch/gears/arcmargin.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
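A sketch of how `build_gear` is meant to be fed (mirroring the `Neck`/`Head` blocks in the YAML configs later in this diff): the `name` key selects the class, and every remaining key becomes a constructor argument. Sizes below are illustrative:

    from ppcls.arch.gears import build_gear

    head_cfg = {
        "name": "ArcMargin",   # popped off and used to pick the class
        "embedding_size": 512,
        "class_num": 1000,     # illustrative sizes
        "margin": 0.2,
        "scale": 30,
    }
    head = build_gear(head_cfg)  # equivalent to ArcMargin(embedding_size=512, ...)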
+ +import paddle +import paddle.nn as nn +import math + + +class ArcMargin(nn.Layer): + def __init__(self, + embedding_size, + class_num, + margin=0.5, + scale=80.0, + easy_margin=False): + super().__init__() + self.embedding_size = embedding_size + self.class_num = class_num + self.margin = margin + self.scale = scale + self.easy_margin = easy_margin + self.weight = self.create_parameter( + shape=[self.embedding_size, self.class_num], + is_bias=False, + default_initializer=paddle.nn.initializer.XavierNormal()) + + def forward(self, input, label=None): + input_norm = paddle.sqrt( + paddle.sum(paddle.square(input), axis=1, keepdim=True)) + input = paddle.divide(input, input_norm) + + weight_norm = paddle.sqrt( + paddle.sum(paddle.square(self.weight), axis=0, keepdim=True)) + weight = paddle.divide(self.weight, weight_norm) + + cos = paddle.matmul(input, weight) + if not self.training or label is None: + return cos + sin = paddle.sqrt(1.0 - paddle.square(cos) + 1e-6) + cos_m = math.cos(self.margin) + sin_m = math.sin(self.margin) + phi = cos * cos_m - sin * sin_m + + th = math.cos(self.margin) * (-1) + mm = math.sin(self.margin) * self.margin + if self.easy_margin: + phi = self._paddle_where_more_than(cos, 0, phi, cos) + else: + phi = self._paddle_where_more_than(cos, th, phi, cos - mm) + + one_hot = paddle.nn.functional.one_hot(label, self.class_num) + one_hot = paddle.squeeze(one_hot, axis=[1]) + output = paddle.multiply(one_hot, phi) + paddle.multiply( + (1.0 - one_hot), cos) + output = output * self.scale + return output + + def _paddle_where_more_than(self, target, limit, x, y): + mask = paddle.cast(x=(target > limit), dtype='float32') + output = paddle.multiply(mask, x) + paddle.multiply((1.0 - mask), y) + return output diff --git a/src/PaddleClas/ppcls/arch/gears/circlemargin.py b/src/PaddleClas/ppcls/arch/gears/circlemargin.py new file mode 100644 index 0000000..d1bce83 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/gears/circlemargin.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
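For intuition, ArcMargin's target-class penalty is the additive angular margin cos(theta + m), computed via the identity cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m), exactly as in the forward above. A scalar check in pure Python (values illustrative):

    import math

    m = 0.5                           # the default margin above
    cos_t = 0.7                       # suppose cos(theta) for the target class
    sin_t = math.sqrt(1.0 - cos_t ** 2)
    phi = cos_t * math.cos(m) - sin_t * math.sin(m)
    assert abs(phi - math.cos(math.acos(cos_t) + m)) < 1e-9
    # phi < cos_t, so the target logit shrinks before scaling by scale=80.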
+ +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class CircleMargin(nn.Layer): + def __init__(self, embedding_size, class_num, margin, scale): + super(CircleMargin, self).__init__() + self.scale = scale + self.margin = margin + self.embedding_size = embedding_size + self.class_num = class_num + + self.weight = self.create_parameter( + shape=[self.embedding_size, self.class_num], + is_bias=False, + default_initializer=paddle.nn.initializer.XavierNormal()) + + def forward(self, input, label): + feat_norm = paddle.sqrt( + paddle.sum(paddle.square(input), axis=1, keepdim=True)) + input = paddle.divide(input, feat_norm) + + weight_norm = paddle.sqrt( + paddle.sum(paddle.square(self.weight), axis=0, keepdim=True)) + weight = paddle.divide(self.weight, weight_norm) + + logits = paddle.matmul(input, weight) + if not self.training or label is None: + return logits + + alpha_p = paddle.clip(-logits.detach() + 1 + self.margin, min=0.) + alpha_n = paddle.clip(logits.detach() + self.margin, min=0.) + delta_p = 1 - self.margin + delta_n = self.margin + + m_hot = F.one_hot(label.reshape([-1]), num_classes=logits.shape[1]) + + logits_p = alpha_p * (logits - delta_p) + logits_n = alpha_n * (logits - delta_n) + pre_logits = logits_p * m_hot + logits_n * (1 - m_hot) + pre_logits = self.scale * pre_logits + + return pre_logits diff --git a/src/PaddleClas/ppcls/arch/gears/cosmargin.py b/src/PaddleClas/ppcls/arch/gears/cosmargin.py new file mode 100644 index 0000000..578b64c --- /dev/null +++ b/src/PaddleClas/ppcls/arch/gears/cosmargin.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
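CircleMargin's adaptive re-weighting is easiest to read on scalar scores; the sketch below mirrors the alpha/delta arithmetic in the forward above (values illustrative):

    # s_p: cosine score for the target class, s_n: for a non-target class
    m = 0.25
    s_p, s_n = 0.6, 0.3

    alpha_p = max(0.0, -s_p + 1 + m)     # grows as s_p falls short of 1
    alpha_n = max(0.0, s_n + m)          # grows as s_n exceeds -m
    logit_p = alpha_p * (s_p - (1 - m))  # pulls s_p toward delta_p = 1 - m
    logit_n = alpha_n * (s_n - m)        # pushes s_n toward delta_n = m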
+ +import paddle +import math +import paddle.nn as nn + + +class CosMargin(paddle.nn.Layer): + def __init__(self, embedding_size, class_num, margin=0.35, scale=64.0): + super(CosMargin, self).__init__() + self.scale = scale + self.margin = margin + self.embedding_size = embedding_size + self.class_num = class_num + + self.weight = self.create_parameter( + shape=[self.embedding_size, self.class_num], + is_bias=False, + default_initializer=paddle.nn.initializer.XavierNormal()) + + def forward(self, input, label=None): + if label is not None: + label.stop_gradient = True + + input_norm = paddle.sqrt( + paddle.sum(paddle.square(input), axis=1, keepdim=True)) + input = paddle.divide(input, input_norm) + + weight_norm = paddle.sqrt( + paddle.sum(paddle.square(self.weight), axis=0, keepdim=True)) + weight = paddle.divide(self.weight, weight_norm) + + cos = paddle.matmul(input, weight) + if not self.training or label is None: + return cos + + cos_m = cos - self.margin + + one_hot = paddle.nn.functional.one_hot(label, self.class_num) + one_hot = paddle.squeeze(one_hot, axis=[1]) + output = paddle.multiply(one_hot, cos_m) + paddle.multiply( + (1.0 - one_hot), cos) + output = output * self.scale + return output diff --git a/src/PaddleClas/ppcls/arch/gears/fc.py b/src/PaddleClas/ppcls/arch/gears/fc.py new file mode 100644 index 0000000..b324741 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/gears/fc.py @@ -0,0 +1,35 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn + + +class FC(nn.Layer): + def __init__(self, embedding_size, class_num): + super(FC, self).__init__() + self.embedding_size = embedding_size + self.class_num = class_num + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.XavierNormal()) + self.fc = paddle.nn.Linear( + self.embedding_size, self.class_num, weight_attr=weight_attr) + + def forward(self, input, label=None): + out = self.fc(input) + return out diff --git a/src/PaddleClas/ppcls/arch/gears/identity_head.py b/src/PaddleClas/ppcls/arch/gears/identity_head.py new file mode 100644 index 0000000..7d11e57 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/gears/identity_head.py @@ -0,0 +1,9 @@ +from paddle import nn + + +class IdentityHead(nn.Layer): + def __init__(self): + super(IdentityHead, self).__init__() + + def forward(self, x, label=None): + return {"features": x, "logits": None} diff --git a/src/PaddleClas/ppcls/arch/gears/vehicle_neck.py b/src/PaddleClas/ppcls/arch/gears/vehicle_neck.py new file mode 100644 index 0000000..05f4e33 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/gears/vehicle_neck.py @@ -0,0 +1,52 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
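Compared with ArcMargin, CosMargin applies its margin directly in cosine space rather than in angle space; the one-hot routing of the penalized logit to the target class is the same. A scalar comparison (values illustrative):

    import math

    cos_t = 0.7
    cosface = cos_t - 0.35                      # CosMargin: cos(theta) - m
    arcface = math.cos(math.acos(cos_t) + 0.5)  # ArcMargin: cos(theta + m)
    # Both shrink the target logit; CosFace's penalty is constant in cosine
    # space, ArcFace's is constant in angle space.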
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn + + +class VehicleNeck(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format='NCHW'): + super().__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + padding_mode=padding_mode, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format) + self.flatten = nn.Flatten() + + def forward(self, x): + x = self.conv(x) + x = self.flatten(x) + return x diff --git a/src/PaddleClas/ppcls/arch/slim/__init__.py b/src/PaddleClas/ppcls/arch/slim/__init__.py new file mode 100644 index 0000000..3733059 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/slim/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ppcls.arch.slim.prune import prune_model +from ppcls.arch.slim.quant import quantize_model diff --git a/src/PaddleClas/ppcls/arch/slim/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/arch/slim/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..823977a Binary files /dev/null and b/src/PaddleClas/ppcls/arch/slim/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/slim/__pycache__/prune.cpython-39.pyc b/src/PaddleClas/ppcls/arch/slim/__pycache__/prune.cpython-39.pyc new file mode 100644 index 0000000..46a6c65 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/slim/__pycache__/prune.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/slim/__pycache__/quant.cpython-39.pyc b/src/PaddleClas/ppcls/arch/slim/__pycache__/quant.cpython-39.pyc new file mode 100644 index 0000000..70884e9 Binary files /dev/null and b/src/PaddleClas/ppcls/arch/slim/__pycache__/quant.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/arch/slim/prune.py b/src/PaddleClas/ppcls/arch/slim/prune.py new file mode 100644 index 0000000..c0c9d22 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/slim/prune.py @@ -0,0 +1,65 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
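The VehicleNeck gear defined above is just a 1x1 convolution followed by a flatten; a quick shape sketch (paddle required, sizes illustrative):

    import paddle
    from ppcls.arch.gears.vehicle_neck import VehicleNeck

    neck = VehicleNeck(in_channels=2048, out_channels=256)
    feat = paddle.rand([8, 2048, 7, 7])   # e.g. a ResNet50 feature map
    out = neck(feat)                      # conv1x1 -> [8, 256, 7, 7], then flatten
    print(out.shape)                      # [8, 256 * 7 * 7] = [8, 12544]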
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function +import paddle +from ppcls.utils import logger + + +def prune_model(config, model): + if config.get("Slim", False) and config["Slim"].get("prune", False): + import paddleslim + prune_method_name = config["Slim"]["prune"]["name"].lower() + assert prune_method_name in [ + "fpgm", "l1_norm" + ], "The prune methods only support 'fpgm' and 'l1_norm'" + if prune_method_name == "fpgm": + model.pruner = paddleslim.dygraph.FPGMFilterPruner( + model, [1] + config["Global"]["image_shape"]) + else: + model.pruner = paddleslim.dygraph.L1NormFilterPruner( + model, [1] + config["Global"]["image_shape"]) + + # prune model + _prune_model(config, model) + else: + model.pruner = None + + + +def _prune_model(config, model): + from paddleslim.analysis import dygraph_flops as flops + logger.info("FLOPs before pruning: {}GFLOPs".format( + flops(model, [1] + config["Global"]["image_shape"]) / 1e9)) + model.eval() + + params = [] + for sublayer in model.sublayers(): + for param in sublayer.parameters(include_sublayers=False): + if isinstance(sublayer, paddle.nn.Conv2D): + params.append(param.name) + ratios = {} + for param in params: + ratios[param] = config["Slim"]["prune"]["pruned_ratio"] + plan = model.pruner.prune_vars(ratios, [0]) + + logger.info("FLOPs after pruning: {}GFLOPs; pruned ratio: {}".format( + flops(model, [1] + config["Global"]["image_shape"]) / 1e9, + plan.pruned_flops)) + + for param in model.parameters(): + if "conv2d" in param.name: + logger.info("{}\t{}".format(param.name, param.shape)) + + model.train() diff --git a/src/PaddleClas/ppcls/arch/slim/quant.py b/src/PaddleClas/ppcls/arch/slim/quant.py new file mode 100644 index 0000000..b8f59a7 --- /dev/null +++ b/src/PaddleClas/ppcls/arch/slim/quant.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function +import paddle +from ppcls.utils import logger + +QUANT_CONFIG = { + # weight preprocess type, default is None and no preprocessing is performed. + 'weight_preprocess_type': None, + # activation preprocess type, default is None and no preprocessing is performed. 
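For reference, a sketch of the `Slim` config blocks that activate `prune_model` above and `quantize_model` below (key names taken from the code; the ratio is illustrative, and paddleslim must be installed):

    from ppcls.arch.slim import prune_model, quantize_model

    config = {
        "Global": {"image_shape": [3, 224, 224]},
        "Slim": {
            "prune": {"name": "fpgm", "pruned_ratio": 0.3},  # or "l1_norm"
            # "quant": {"name": "pact"},  # would route through quantize_model
        },
    }
    # prune_model(config, model)     # attaches model.pruner, prunes conv filters
    # quantize_model(config, model)  # attaches model.quanter when quant is set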
+ 'activation_preprocess_type': None, + # weight quantize type, default is 'channel_wise_abs_max' + 'weight_quantize_type': 'channel_wise_abs_max', + # activation quantize type, default is 'moving_average_abs_max' + 'activation_quantize_type': 'moving_average_abs_max', + # weight quantize bit num, default is 8 + 'weight_bits': 8, + # activation quantize bit num, default is 8 + 'activation_bits': 8, + # data type after quantization, such as 'uint8', 'int8', etc. default is 'int8' + 'dtype': 'int8', + # window size for 'range_abs_max' quantization. default is 10000 + 'window_size': 10000, + # The decay coefficient of moving average, default is 0.9 + 'moving_rate': 0.9, + # for dygraph quantization, layers of type in quantizable_layer_type will be quantized + 'quantizable_layer_type': ['Conv2D', 'Linear'], +} + + +def quantize_model(config, model): + if config.get("Slim", False) and config["Slim"].get("quant", False): + from paddleslim.dygraph.quant import QAT + assert config["Slim"]["quant"]["name"].lower( + ) == 'pact', 'Only PACT quantization method is supported now' + QUANT_CONFIG["activation_preprocess_type"] = "PACT" + model.quanter = QAT(config=QUANT_CONFIG) + model.quanter.quantize(model) + logger.info("QAT model summary:") + paddle.summary(model, (1, 3, 224, 224)) + else: + model.quanter = None + return diff --git a/src/PaddleClas/ppcls/arch/utils.py b/src/PaddleClas/ppcls/arch/utils.py new file mode 100644 index 0000000..308475d --- /dev/null +++ b/src/PaddleClas/ppcls/arch/utils.py @@ -0,0 +1,53 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import six +import types +from difflib import SequenceMatcher + +from . 
import backbone + + +def get_architectures(): + """ + get all of model architectures + """ + names = [] + for k, v in backbone.__dict__.items(): + if isinstance(v, (types.FunctionType, six.class_types)): + names.append(k) + return names + + +def get_blacklist_model_in_static_mode(): + from ppcls.arch.backbone import distilled_vision_transformer + from ppcls.arch.backbone import vision_transformer + blacklist = distilled_vision_transformer.__all__ + vision_transformer.__all__ + return blacklist + + +def similar_architectures(name='', names=[], thresh=0.1, topk=10): + """ + inferred similar architectures + """ + scores = [] + for idx, n in enumerate(names): + if n.startswith('__'): + continue + score = SequenceMatcher(None, n.lower(), name.lower()).quick_ratio() + if score > thresh: + scores.append((idx, score)) + scores.sort(key=lambda x: x[1], reverse=True) + similar_names = [names[s[0]] for s in scores[:min(topk, len(scores))]] + return similar_names diff --git a/src/PaddleClas/ppcls/configs/Cartoonface/ResNet50_icartoon.yaml b/src/PaddleClas/ppcls/configs/Cartoonface/ResNet50_icartoon.yaml new file mode 100644 index 0000000..3d1b993 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/Cartoonface/ResNet50_icartoon.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_mode: "retrieval" + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + image_shape: [3, 224, 224] + infer_imgs: + save_inference_dir: "./inference" + feature_normalize: True + +Arch: + name: "RecModel" + Backbone: + name: "ResNet50" + pretrained: True + BackboneStopLayer: + name: "flatten" + output_dim: 2048 + Head: + name: "FC" + class_num: 5013 + embedding_size: 2048 + # margin: 0.5 + # scale: 80 + infer_output_key: "features" + infer_add_softmax: "false" + +Loss: + Train: + - CELoss: + weight: 1.0 + # - TripletLoss: + # margin: 0.1 + # weight: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + +DataLoader: + Train: + dataset: + name: ICartoonDataset + image_root: "./dataset/iCartoonFace" + cls_label_path: "./dataset/iCartoonFace/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + #num_instances: 2 + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + + Eval: + Query: + dataset: + name: ICartoonDataset + image_root: "./dataset/iCartoonFace" + cls_label_path: "./dataset/iCartoonFace/query.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + + Gallery: + dataset: + name: ICartoonDataset + image_root: "./dataset/iCartoonFace" + cls_label_path: "./dataset/iCartoonFace/gallery.txt" + transform_ops: + - DecodeImage: + to_rgb: True + 
channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - Recallk: + topk: [1] diff --git a/src/PaddleClas/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml b/src/PaddleClas/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml new file mode 100644 index 0000000..626dd7c --- /dev/null +++ b/src/PaddleClas/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml @@ -0,0 +1,148 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: PPLCNet_x2_5 + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_reg_all_data.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_binary.yaml 
b/src/PaddleClas/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_binary.yaml new file mode 100644 index 0000000..728942f --- /dev/null +++ b/src/PaddleClas/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_binary.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + + #feature postprocess + feature_normalize: False + feature_binarize: "sign" + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: PPLCNet_x2_5_Tanh + pretrained: True + use_ssld: True + class_num: 512 + Head: + name: FC + embedding_size: 512 + class_num: 185341 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/all_data + cls_label_path: ./dataset/all_data/train_reg_all_data.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_dml.yaml b/src/PaddleClas/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_dml.yaml new file mode 100644 index 0000000..b6c4536 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_dml.yaml @@ -0,0 +1,188 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: true + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: 
[3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + +# model architecture +Arch: + name: "DistillationModel" + infer_output_key: features + infer_add_softmax: False + is_rec: True + infer_model_name: "Student" + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - False + - False + models: + - Teacher: + name: RecModel + infer_output_key: features + infer_add_softmax: False + Backbone: + name: PPLCNet_x2_5 + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + - Student: + name: RecModel + infer_output_key: features + infer_add_softmax: False + Backbone: + name: PPLCNet_x2_5 + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + key: "logits" + model_names: ["Student", "Teacher"] + - DistillationDMLLoss: + weight: 1.0 + key: "logits" + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_reg_all_data.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_udml.yaml b/src/PaddleClas/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_udml.yaml new file mode 100644 index 0000000..bcaea03 --- /dev/null +++ 
b/src/PaddleClas/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_udml.yaml @@ -0,0 +1,193 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: true + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + +# model architecture +Arch: + name: "DistillationModel" + infer_output_key: features + infer_add_softmax: False + is_rec: True + infer_model_name: "Student" + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - False + - False + models: + - Teacher: + name: RecModel + infer_output_key: features + infer_add_softmax: False + Backbone: + name: PPLCNet_x2_5 + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + - Student: + name: RecModel + infer_output_key: features + infer_add_softmax: False + Backbone: + name: PPLCNet_x2_5 + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + key: "logits" + model_names: ["Student", "Teacher"] + - DistillationDMLLoss: + weight: 1.0 + key: "logits" + model_name_pairs: + - ["Student", "Teacher"] + - DistillationDistanceLoss: + weight: 1.0 + key: "backbone" + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_reg_all_data.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + 
sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/AlexNet/AlexNet.yaml b/src/PaddleClas/ppcls/configs/ImageNet/AlexNet/AlexNet.yaml new file mode 100644 index 0000000..ea2e073 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/AlexNet/AlexNet.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: AlexNet + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + decay_epochs: [30, 60, 90] + values: [0.01, 0.001, 0.0001, 0.00001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/CSPNet/CSPDarkNet53.yaml b/src/PaddleClas/ppcls/configs/ImageNet/CSPNet/CSPDarkNet53.yaml new file mode 100644 index 0000000..4848cfc --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/CSPNet/CSPDarkNet53.yaml @@ -0,0 +1,131 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: CSPDarkNet53 + 
class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA102.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA102.yaml new file mode 100644 index 0000000..b6033f7 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA102.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DLA102 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + 
dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA102x.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA102x.yaml new file mode 100644 index 0000000..a1e2c09 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA102x.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DLA102x + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: 
ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA102x2.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA102x2.yaml new file mode 100644 index 0000000..8bd4c46 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA102x2.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DLA102x2 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA169.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA169.yaml new file mode 100644 index 0000000..18c244d --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA169.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DLA169 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + 
+Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA34.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA34.yaml new file mode 100644 index 0000000..d9218df --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA34.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DLA34 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + 
transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA46_c.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA46_c.yaml new file mode 100644 index 0000000..8d20341 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA46_c.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DLA46_c + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] 
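[Editor's note] The DLA configs in this part of the diff are identical apart from `Arch.name`: 120 epochs of SGD (momentum 0.9, L2 coeff 1e-4) with the step ("Piecewise") LR schedule shown above. As a minimal sketch of the epoch-to-LR mapping that the `decay_epochs`/`values` lists encode (plain Python; `piecewise_lr` is an illustrative helper, not a PaddleClas API):

# Illustrative only: mirrors the Piecewise blocks in the DLA configs above.
def piecewise_lr(epoch, decay_epochs=(30, 60, 90),
                 values=(0.1, 0.01, 0.001, 0.0001)):
    """LR for a 0-based epoch; values has one more entry than decay_epochs."""
    for boundary, value in zip(decay_epochs, values):
        if epoch < boundary:
            return value
    return values[-1]

# 30 epochs at each of 0.1 / 0.01 / 0.001, then 0.0001 until epoch 120:
assert [piecewise_lr(e) for e in (0, 30, 60, 90)] == [0.1, 0.01, 0.001, 0.0001]

Inside Paddle this is presumably lowered onto a piecewise-constant scheduler such as `paddle.optimizer.lr.PiecewiseDecay`; the YAML only has to supply the boundary and value lists.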
diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA46x_c.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA46x_c.yaml new file mode 100644 index 0000000..e7f7d67 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA46x_c.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DLA46x_c + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA60.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA60.yaml new file mode 100644 index 0000000..a255f05 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA60.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DLA60 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: 
[0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA60x.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA60x.yaml new file mode 100644 index 0000000..143b87f --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA60x.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DLA60x + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - 
NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA60x_c.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA60x_c.yaml new file mode 100644 index 0000000..7792819 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DLA/DLA60x_c.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DLA60x_c + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DPN/DPN107.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DPN/DPN107.yaml new file mode 
100644 index 0000000..7df1256 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DPN/DPN107.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DPN107 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DPN/DPN131.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DPN/DPN131.yaml new file mode 100644 index 0000000..88f1b57 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DPN/DPN131.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DPN131 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: 
./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DPN/DPN68.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DPN/DPN68.yaml new file mode 100644 index 0000000..c1e2808 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DPN/DPN68.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DPN68 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: 
False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DPN/DPN92.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DPN/DPN92.yaml new file mode 100644 index 0000000..fb5b0ed --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DPN/DPN92.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DPN92 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DPN/DPN98.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DPN/DPN98.yaml new file mode 100644 index 0000000..e394710 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DPN/DPN98.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + 
eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DPN98 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DarkNet/DarkNet53.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DarkNet/DarkNet53.yaml new file mode 100644 index 0000000..ec0f822 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DarkNet/DarkNet53.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DarkNet53 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 
0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_AutoAugment.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_AutoAugment.yaml new file mode 100644 index 0000000..ab4c29c --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_AutoAugment.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + 
- ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_Baseline.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_Baseline.yaml new file mode 100644 index 0000000..d75fede --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_Baseline.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutmix.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutmix.yaml new file mode 100644 index 0000000..2fefb9f --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutmix.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode 
and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - CutmixOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutout.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutout.yaml new file mode 100644 index 0000000..4bf5306 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutout.yaml @@ -0,0 +1,131 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - Cutout: + n_holes: 1 + length: 112 + + sampler: + name: DistributedBatchSampler 
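+      # Editor's note: DistributedBatchSampler shards the dataset across
+      # trainer processes, so the batch_size below is per card and the
+      # effective global batch scales with the number of devices used.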
+ batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_GridMask.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_GridMask.yaml new file mode 100644 index 0000000..c0016aa --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_GridMask.yaml @@ -0,0 +1,134 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - GridMask: + d1: 96 + d2: 224 + rotate: 1 + ratio: 0.5 + mode: 0 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - 
NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_HideAndSeek.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_HideAndSeek.yaml new file mode 100644 index 0000000..12e4ac8 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_HideAndSeek.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - HideAndSeek: + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_Mixup.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_Mixup.yaml new file mode 100644 index 0000000..3434cab --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_Mixup.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 
224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_RandAugment.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_RandAugment.yaml new file mode 100644 index 0000000..153451e --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_RandAugment.yaml @@ -0,0 +1,131 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - RandAugment: + num_layers: 2 + magnitude: 5 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + 
drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_RandomErasing.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_RandomErasing.yaml new file mode 100644 index 0000000..8e89c5c --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DataAugment/ResNet50_RandomErasing.yaml @@ -0,0 +1,134 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] 
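+        # Editor's gloss on the RandomErasing values above: EPSILON is the
+        # probability of erasing a patch, sl/sh bound the erased area to
+        # 2%-40% of the image, r1 is the minimum aspect ratio of the patch,
+        # and erased pixels are filled with the constant `mean` (zeros here).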
+ + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_224.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_224.yaml new file mode 100644 index 0000000..979a04a --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_224.yaml @@ -0,0 +1,156 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DeiT_base_distilled_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + 
- DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_384.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_384.yaml new file mode 100644 index 0000000..859f57d --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_384.yaml @@ -0,0 +1,156 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DeiT_base_distilled_patch16_384 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + 
order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_224.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_224.yaml new file mode 100644 index 0000000..3cdd102 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_224.yaml @@ -0,0 +1,156 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DeiT_base_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + 
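+# to_rgb converts the decoded image to RGB, assuming the default decode backend returns BGR (the OpenCV convention).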
channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_384.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_384.yaml new file mode 100644 index 0000000..88a8fba --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_384.yaml @@ -0,0 +1,156 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DeiT_base_patch16_384 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + 
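+# Topk keeps the 5 highest-scoring classes and resolves their ids to readable names through the label file below.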
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_small_distilled_patch16_224.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_small_distilled_patch16_224.yaml new file mode 100644 index 0000000..54d962e --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_small_distilled_patch16_224.yaml @@ -0,0 +1,156 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DeiT_small_distilled_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_small_patch16_224.yaml 
b/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_small_patch16_224.yaml new file mode 100644 index 0000000..05c3ac1 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_small_patch16_224.yaml @@ -0,0 +1,156 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DeiT_small_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_tiny_distilled_patch16_224.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_tiny_distilled_patch16_224.yaml new file mode 100644 index 0000000..f666176 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_tiny_distilled_patch16_224.yaml @@ -0,0 +1,156 @@ +# global configs +Global: + 
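+# checkpoints resumes a full training state (weights plus optimizer), while pretrained_model initializes weights only; both stay null when training from scratch.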
checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DeiT_tiny_distilled_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_tiny_patch16_224.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_tiny_patch16_224.yaml new file mode 100644 index 0000000..647050a --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DeiT/DeiT_tiny_patch16_224.yaml @@ -0,0 +1,156 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + 
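+# image_shape defines the input spec used when exporting the inference model, so it should match the eval crop size (224 here).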
save_inference_dir: ./inference + +# model architecture +Arch: + name: DeiT_tiny_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DenseNet/DenseNet121.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DenseNet/DenseNet121.yaml new file mode 100644 index 0000000..42c7e78 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DenseNet/DenseNet121.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DenseNet121 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + 
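+# Piecewise holds each value until the next decay_epochs boundary: epochs 0-29 run at 0.1, 30-59 at 0.01, 60-89 at 0.001, and 90-119 at 0.0001.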
learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DenseNet/DenseNet161.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DenseNet/DenseNet161.yaml new file mode 100644 index 0000000..3f9bbb6 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DenseNet/DenseNet161.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DenseNet161 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: 
True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DenseNet/DenseNet169.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DenseNet/DenseNet169.yaml new file mode 100644 index 0000000..3a046fb --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DenseNet/DenseNet169.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DenseNet169 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/src/PaddleClas/ppcls/configs/ImageNet/DenseNet/DenseNet201.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DenseNet/DenseNet201.yaml new file mode 100644 index 0000000..ba62682 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DenseNet/DenseNet201.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DenseNet201 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/DenseNet/DenseNet264.yaml b/src/PaddleClas/ppcls/configs/ImageNet/DenseNet/DenseNet264.yaml new file mode 100644 index 0000000..a0a8193 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/DenseNet/DenseNet264.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: DenseNet264 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + 
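+# DenseNet121/161/169/201/264 all share this 120-epoch Momentum + Piecewise recipe; only Arch.name differs across the five files.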
learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml b/src/PaddleClas/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml new file mode 100644 index 0000000..a7265b0 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml @@ -0,0 +1,157 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + use_dali: false + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + models: + - Teacher: + name: MobileNetV3_large_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + dropout_prob: null + - Student: + name: MobileNetV3_small_x1_0 + class_num: *class_num + pretrained: False + dropout_prob: null + + infer_model_name: "Student" + + +# loss function config for traing/eval process +Loss: + Train: + - DistillationCELoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.65 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" 
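+# each line of the list file pairs an image path (relative to image_root) with an integer label, e.g. "train/n01440764/xxx.JPEG 0" (illustrative path).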
+ cls_label_path: "./dataset/ILSVRC2012/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/val_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: DistillationPostProcess + func: Topk + topk: 5 + class_id_map_file: "ppcls/utils/imagenet1k_label_list.txt" + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ESNet/ESNet_x0_25.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ESNet/ESNet_x0_25.yaml new file mode 100644 index 0000000..b34ba07 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ESNet/ESNet_x0_25.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference +# model architecture +Arch: + name: ESNet_x0_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + 
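+# 1.0/255.0 and the 0.00392157 written in the distillation config above denote the same scale factor; the configs accept either spelling.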
order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ESNet/ESNet_x0_5.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ESNet/ESNet_x0_5.yaml new file mode 100644 index 0000000..0b82e08 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ESNet/ESNet_x0_5.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference +# model architecture +Arch: + name: ESNet_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ESNet/ESNet_x0_75.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ESNet/ESNet_x0_75.yaml new file mode 100644 index 0000000..7662397 --- /dev/null +++ 
b/src/PaddleClas/ppcls/configs/ImageNet/ESNet/ESNet_x0_75.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference +# model architecture +Arch: + name: ESNet_x0_75 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ESNet/ESNet_x1_0.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ESNet/ESNet_x1_0.yaml new file mode 100644 index 0000000..583efd2 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ESNet/ESNet_x1_0.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference +# model architecture +Arch: + name: ESNet_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: 
./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB0.yaml b/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB0.yaml new file mode 100644 index 0000000..2d5b7d0 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB0.yaml @@ -0,0 +1,133 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: EfficientNetB0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler 
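+# DistributedBatchSampler shards samples across trainers, so batch_size below is per card; the effective global batch grows with the number of GPUs.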
+ batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB1.yaml b/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB1.yaml new file mode 100644 index 0000000..b23030f --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB1.yaml @@ -0,0 +1,133 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 240, 240] + save_inference_dir: ./inference + +# model architecture +Arch: + name: EfficientNetB1 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 240 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 272 + - CropImage: + size: 240 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 272 + - CropImage: + size: 240 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB2.yaml b/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB2.yaml new file mode 100644 index 0000000..de48d03 --- /dev/null +++ 
b/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB2.yaml @@ -0,0 +1,133 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 260, 260] + save_inference_dir: ./inference + +# model architecture +Arch: + name: EfficientNetB2 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 260 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + - CropImage: + size: 260 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + - CropImage: + size: 260 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB3.yaml b/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB3.yaml new file mode 100644 index 0000000..3f0b559 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB3.yaml @@ -0,0 +1,133 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 300, 300] + save_inference_dir: ./inference + +# model architecture +Arch: + name: EfficientNetB3 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train 
and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 300 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 332 + - CropImage: + size: 300 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 332 + - CropImage: + size: 300 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB4.yaml b/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB4.yaml new file mode 100644 index 0000000..e3a009a --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB4.yaml @@ -0,0 +1,133 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 380, 380] + save_inference_dir: ./inference + +# model architecture +Arch: + name: EfficientNetB4 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 380 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 412 + - CropImage: + size: 380 + - 
NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 412 + - CropImage: + size: 380 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB5.yaml b/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB5.yaml new file mode 100644 index 0000000..795dfa1 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB5.yaml @@ -0,0 +1,133 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 456, 456] + save_inference_dir: ./inference + +# model architecture +Arch: + name: EfficientNetB5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 456 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 488 + - CropImage: + size: 456 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 488 + - CropImage: + size: 456 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB6.yaml b/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB6.yaml new file mode 100644 index 0000000..f86dd04 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB6.yaml @@ -0,0 +1,133 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 528, 528] + save_inference_dir: ./inference + +# model architecture +Arch: + name: EfficientNetB6 + class_num: 1000 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 528 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 560 + - CropImage: + size: 528 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 560 + - CropImage: + size: 528 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB7.yaml b/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB7.yaml new file mode 100644 index 0000000..d57d841 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/EfficientNet/EfficientNetB7.yaml @@ -0,0 +1,133 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 600, 600] + save_inference_dir: ./inference + +# model architecture +Arch: + name: EfficientNetB7 + class_num: 1000 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + 
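+# The Optimizer below repeats the RMSProp + cosine-with-warmup recipe shared by EfficientNetB2-B6 above; only the input resolution (600 for B7) and the matching resize_short (632) change. +# A minimal launch sketch, assuming the standard PaddleClas tools/ layout: +#   python3 tools/train.py -c ppcls/configs/ImageNet/EfficientNet/EfficientNetB7.yaml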
+Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 600 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 632 + - CropImage: + size: 600 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 632 + - CropImage: + size: 600 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/GhostNet/GhostNet_x0_5.yaml b/src/PaddleClas/ppcls/configs/ImageNet/GhostNet/GhostNet_x0_5.yaml new file mode 100644 index 0000000..ba44691 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/GhostNet/GhostNet_x0_5.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: GhostNet_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + 
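# each line of val_list.txt is expected to hold "<relative image path> <integer label>", resolved against image_root + 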
transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_0.yaml b/src/PaddleClas/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_0.yaml new file mode 100644 index 0000000..a4e6e37 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_0.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: GhostNet_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 
5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_3.yaml b/src/PaddleClas/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_3.yaml new file mode 100644 index 0000000..69921be --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_3.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: GhostNet_x1_3 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W18_C.yaml b/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W18_C.yaml new file mode 100644 index 0000000..935b0b5 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W18_C.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: HRNet_W18_C + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + 
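# step schedule: the four `values` cover the intervals cut at `decay_epochs`, e.g. epochs [0, 30) run at 0.1 and epochs [90, 120) at 0.0001 + 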
decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W30_C.yaml b/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W30_C.yaml new file mode 100644 index 0000000..5f7067c --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W30_C.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: HRNet_W30_C + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - 
ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W32_C.yaml b/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W32_C.yaml new file mode 100644 index 0000000..fcc6dc1 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W32_C.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: HRNet_W32_C + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W40_C.yaml b/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W40_C.yaml new file mode 100644 index 0000000..a709677 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W40_C.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: HRNet_W40_C + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W44_C.yaml b/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W44_C.yaml new file mode 100644 index 0000000..f530cc2 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W44_C.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: HRNet_W44_C + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + 
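# Piecewise expects len(values) == len(decay_epochs) + 1; the schedule is identical across the HRNet widths in this patch + 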
decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W48_C.yaml b/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W48_C.yaml new file mode 100644 index 0000000..1c7ffc9 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W48_C.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: HRNet_W48_C + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - 
ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W64_C.yaml b/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W64_C.yaml new file mode 100644 index 0000000..e72b0b3 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/HRNet/HRNet_W64_C.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: HRNet_W64_C + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/src/PaddleClas/ppcls/configs/ImageNet/HarDNet/HarDNet39_ds.yaml b/src/PaddleClas/ppcls/configs/ImageNet/HarDNet/HarDNet39_ds.yaml new file mode 100644 index 0000000..2aa8e68 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/HarDNet/HarDNet39_ds.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: HarDNet39_ds + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/HarDNet/HarDNet68.yaml b/src/PaddleClas/ppcls/configs/ImageNet/HarDNet/HarDNet68.yaml new file mode 100644 index 0000000..2f0ef12 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/HarDNet/HarDNet68.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: HarDNet68 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 
0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/HarDNet/HarDNet68_ds.yaml b/src/PaddleClas/ppcls/configs/ImageNet/HarDNet/HarDNet68_ds.yaml new file mode 100644 index 0000000..cf8f2ed --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/HarDNet/HarDNet68_ds.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: HarDNet68_ds + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + 
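# decode to HWC RGB here; the resize/crop ops that follow assume this layout + 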
channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/HarDNet/HarDNet85.yaml b/src/PaddleClas/ppcls/configs/ImageNet/HarDNet/HarDNet85.yaml new file mode 100644 index 0000000..8512859 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/HarDNet/HarDNet85.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: HarDNet85 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/src/PaddleClas/ppcls/configs/ImageNet/Inception/GoogLeNet.yaml b/src/PaddleClas/ppcls/configs/ImageNet/Inception/GoogLeNet.yaml new file mode 100644 index 0000000..5bc3c9e --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Inception/GoogLeNet.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: GoogLeNet + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - GoogLeNetLoss: + weight: 1.0 + Eval: + - GoogLeNetLoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - GoogLeNetTopkAcc: + topk: [1, 5] + Eval: + - GoogLeNetTopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/Inception/InceptionV3.yaml b/src/PaddleClas/ppcls/configs/ImageNet/Inception/InceptionV3.yaml new file mode 100644 index 0000000..3749ed8 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Inception/InceptionV3.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + +# model architecture +Arch: + name: InceptionV3 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + 
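# no warmup_epoch here (cf. the Cosine schedules with warmup_epoch: 5 elsewhere in this patch), so decay starts immediately from 0.045 + 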
learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/Inception/InceptionV4.yaml b/src/PaddleClas/ppcls/configs/ImageNet/Inception/InceptionV4.yaml new file mode 100644 index 0000000..7df00cc --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Inception/InceptionV4.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + +# model architecture +Arch: + name: InceptionV4 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - 
ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/LeViT/LeViT_128.yaml b/src/PaddleClas/ppcls/configs/ImageNet/LeViT/LeViT_128.yaml new file mode 100644 index 0000000..a1a4f73 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/LeViT/LeViT_128.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: LeViT_128 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/LeViT/LeViT_128S.yaml 
b/src/PaddleClas/ppcls/configs/ImageNet/LeViT/LeViT_128S.yaml new file mode 100644 index 0000000..bfc6eb4 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/LeViT/LeViT_128S.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: LeViT_128S + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/LeViT/LeViT_192.yaml b/src/PaddleClas/ppcls/configs/ImageNet/LeViT/LeViT_192.yaml new file mode 100644 index 0000000..9596e86 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/LeViT/LeViT_192.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: LeViT_192 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + 
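# this Momentum + step-decay recipe matches the generic CNN template used across this patch; the original LeViT training setup (AdamW with distillation) differs, so these values look like template defaults + 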
regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/LeViT/LeViT_256.yaml b/src/PaddleClas/ppcls/configs/ImageNet/LeViT/LeViT_256.yaml new file mode 100644 index 0000000..fb42700 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/LeViT/LeViT_256.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: LeViT_256 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - 
NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/LeViT/LeViT_384.yaml b/src/PaddleClas/ppcls/configs/ImageNet/LeViT/LeViT_384.yaml new file mode 100644 index 0000000..8347c4a --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/LeViT/LeViT_384.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: LeViT_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MixNet/MixNet_L.yaml 
b/src/PaddleClas/ppcls/configs/ImageNet/MixNet/MixNet_L.yaml new file mode 100644 index 0000000..54bb18d --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MixNet/MixNet_L.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: MixNet_L + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MixNet/MixNet_M.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MixNet/MixNet_M.yaml new file mode 100644 index 0000000..2c2a18d --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MixNet/MixNet_M.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: MixNet_M + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: 
Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MixNet/MixNet_S.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MixNet/MixNet_S.yaml new file mode 100644 index 0000000..e0f5c6a --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MixNet/MixNet_S.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: MixNet_S + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: 
./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml new file mode 100644 index 0000000..281015d --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: MobileNetV1 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + 
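The Topk post-process configured here turns raw logits into the five most likely classes and, via class_id_map_file, into readable names. A hedged sketch: load_label_map assumes each line of imagenet1k_label_list.txt reads "<class id> <class name>", and both helpers are hypothetical rather than the ppcls implementation:

    import numpy as np

    def load_label_map(path="ppcls/utils/imagenet1k_label_list.txt"):
        # Assumed line format: "<class id> <class name>".
        with open(path) as f:
            return {int(l.split(" ", 1)[0]): l.split(" ", 1)[1].strip()
                    for l in f if l.strip()}

    def topk_postprocess(logits, label_map, k=5):
        # Indices of the k largest logits, highest first.
        ids = np.argsort(-np.asarray(logits))[:k]
        return [(int(i), label_map.get(int(i), str(i))) for i in ids]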
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_25.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_25.yaml new file mode 100644 index 0000000..86324cf --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_25.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV1_x0_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_5.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_5.yaml new file mode 100644 index 0000000..1693e78 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_5.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV1_x0_5 + 
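The MobileNetV1_x0_25/x0_5/x0_75 variants differ from the base model only in the width multiplier encoded in the Arch name: every layer's channel count is scaled by that factor and rounded to a hardware-friendly multiple. The rounding rule commonly used for MobileNet-style networks is sketched below; make_divisible is the conventional name, and the exact rounding in this repo may differ:

    def make_divisible(v, divisor=8, min_value=None):
        # Round to the nearest multiple of divisor, never dropping more
        # than 10% of the original value.
        if min_value is None:
            min_value = divisor
        new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
        if new_v < 0.9 * v:
            new_v += divisor
        return new_v

    assert make_divisible(64 * 0.25) == 16   # a 64-channel layer at x0_25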
class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_75.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_75.yaml new file mode 100644 index 0000000..b8b0477 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_75.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV1_x0_75 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: 
False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml new file mode 100644 index 0000000..2fe1f5c --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: MobileNetV2 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + 
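Unlike the LeViT and MixNet recipes, the MobileNetV2 family anneals the learning rate with a Cosine schedule from 0.045 toward zero across the full 240 epochs instead of stepping it. A sketch of the per-epoch value; cosine_lr is illustrative, and paddle's CosineAnnealingDecay may advance per iteration rather than per epoch:

    import math

    def cosine_lr(epoch, base_lr=0.045, total_epochs=240):
        # Half-cosine from base_lr at epoch 0 down to 0 at the final epoch.
        return 0.5 * base_lr * (1 + math.cos(math.pi * epoch / total_epochs))

    assert cosine_lr(0) == 0.045
    assert abs(cosine_lr(240)) < 1e-12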
std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_25.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_25.yaml new file mode 100644 index 0000000..d9f30fd --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_25.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV2_x0_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_5.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_5.yaml new file mode 100644 index 0000000..7abddd4 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_5.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + 
name: MobileNetV2_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_75.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_75.yaml new file mode 100644 index 0000000..e620d70 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_75.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV2_x0_75 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: 
ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x1_5.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x1_5.yaml new file mode 100644 index 0000000..f9d6abc --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x1_5.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV2_x1_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: 
ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x2_0.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x2_0.yaml new file mode 100644 index 0000000..fa5bf68 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x2_0.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV2_x2_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_35.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_35.yaml new file mode 100644 index 0000000..0c81ebc --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_35.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV3_large_x0_35 + class_num: 1000 + +# loss function config for traing/eval process 
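All of these files are plain YAML, so they can be inspected or scripted against directly; the MobileNetV2 variants differ only in Arch.name and the regularizer coefficient. One caveat worth knowing: a value like scale: 1.0/255.0 parses as the string "1.0/255.0", which the transform implementation is left to evaluate. A quick look with PyYAML:

    import yaml  # PyYAML

    with open("ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x2_0.yaml") as f:
        config = yaml.safe_load(f)

    print(config["Arch"]["name"])             # MobileNetV2_x2_0
    print(config["Optimizer"]["lr"]["name"])  # Cosine
    print(config["Global"]["epochs"])         # 240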
+Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_5.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_5.yaml new file mode 100644 index 0000000..76c7028 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_5.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV3_large_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset 
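The MobileNetV3 recipes add epsilon: 0.1 to the training CELoss, i.e. label smoothing: the one-hot target is mixed with a uniform distribution so the model is never pushed toward fully saturated logits (eval keeps the plain loss). A hedged paddle sketch; smoothed_ce is illustrative, not the ppcls loss class:

    import paddle.nn.functional as F

    def smoothed_ce(logits, label, epsilon=0.1):
        # label: int64 tensor of class ids; logits: [batch, num_classes].
        num_classes = logits.shape[-1]
        one_hot = F.one_hot(label, num_classes)
        # Mix the hard target with a uniform distribution.
        soft = one_hot * (1.0 - epsilon) + epsilon / num_classes
        return -(soft * F.log_softmax(logits, axis=-1)).sum(axis=-1).mean()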
+ image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_75.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_75.yaml new file mode 100644 index 0000000..a1e9126 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_75.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV3_large_x0_75 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + 
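These configs also pair the Cosine schedule with warmup_epoch: 5 and a large base rate (1.3, matched to the 512 per-card batch): the learning rate ramps linearly from zero over the first five epochs, then follows the cosine curve for the remainder. A sketch under the assumption that the schedule is stepped per epoch; PaddleClas may step it per iteration:

    import math

    def warmup_cosine_lr(epoch, base_lr=1.3, warmup_epoch=5, total_epochs=360):
        if epoch < warmup_epoch:
            return base_lr * epoch / warmup_epoch  # linear ramp from 0
        t = (epoch - warmup_epoch) / (total_epochs - warmup_epoch)
        return 0.5 * base_lr * (1 + math.cos(math.pi * t))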
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_0.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_0.yaml new file mode 100644 index 0000000..3e3ad70 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_0.yaml @@ -0,0 +1,131 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV3_large_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.65 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_25.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_25.yaml new file mode 100644 index 0000000..097c41e --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_25.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: 
MobileNetV3_large_x1_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_35.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_35.yaml new file mode 100644 index 0000000..30ea2eb --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_35.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV3_small_x0_35 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + 
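The sampler/loader split in these configs maps directly onto paddle.io: DistributedBatchSampler shards the dataset across trainers, and its batch_size is per device, so the global batch for these 512-per-card MobileNetV3 runs is 512 times the number of cards. A minimal sketch, assuming dataset is any paddle.io.Dataset; this is not the ppcls dataloader builder itself:

    from paddle.io import DataLoader, DistributedBatchSampler

    def build_loader(dataset, batch_size=512, shuffle=True):
        # batch_size is per device; global batch = batch_size * num cards.
        sampler = DistributedBatchSampler(dataset, batch_size=batch_size,
                                          shuffle=shuffle, drop_last=False)
        return DataLoader(dataset, batch_sampler=sampler,
                          num_workers=4, use_shared_memory=True)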
loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_5.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_5.yaml new file mode 100644 index 0000000..3c13bbb --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_5.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV3_small_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + 
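For reference, the NormalizeImage step being spelled out in these transform lists multiplies the decoded uint8 image by scale, then standardizes each channel with the ImageNet mean/std; order: '' leaves the array HWC for the later ToCHWImage transpose. A numpy sketch, with normalize_image as an illustrative stand-in for the ppcls operator:

    import numpy as np

    def normalize_image(img, scale=1.0 / 255.0,
                        mean=(0.485, 0.456, 0.406),
                        std=(0.229, 0.224, 0.225)):
        img = img.astype("float32") * scale          # uint8 [0,255] -> [0,1]
        mean = np.array(mean, "float32").reshape((1, 1, 3))
        std = np.array(std, "float32").reshape((1, 1, 3))
        return (img - mean) / std                    # per-channel standardize

    chw = normalize_image(np.zeros((224, 224, 3), np.uint8)).transpose(2, 0, 1)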
std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_75.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_75.yaml new file mode 100644 index 0000000..45608df --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_75.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV3_small_x0_75 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0.yaml new file mode 100644 index 0000000..02a3949 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 
224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV3_small_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_25.yaml b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_25.yaml new file mode 100644 index 0000000..eeae690 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_25.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV3_small_x1_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: 
DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_25.yaml b/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_25.yaml new file mode 100644 index 0000000..8b0924c --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_25.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference +# model architecture +Arch: + name: PPLCNet_x0_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + 
mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_35.yaml b/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_35.yaml new file mode 100644 index 0000000..ed2501e --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_35.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference +# model architecture +Arch: + name: PPLCNet_x0_35 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_5.yaml b/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_5.yaml new file mode 100644 index 0000000..0f01d58 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_5.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference +# model architecture +Arch: 
+ name: PPLCNet_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_75.yaml b/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_75.yaml new file mode 100644 index 0000000..7857882 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_75.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference +# model architecture +Arch: + name: PPLCNet_x0_75 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + 
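NormalizeImage, as used in the transform stacks above, first multiplies the decoded HWC uint8 image by scale (written as the string '1.0/255.0', i.e. 1/255 once evaluated) and then standardizes each channel with the ImageNet mean/std. A NumPy sketch of the equivalent arithmetic, assuming HWC float32 output:

import numpy as np

IMAGENET_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
IMAGENET_STD  = np.array([0.229, 0.224, 0.225], dtype=np.float32)

def normalize_image(img_u8, scale=1.0 / 255.0):
    # img_u8: HWC uint8 (channel_first: False); returns HWC float32.
    x = img_u8.astype(np.float32) * scale       # map [0, 255] -> [0, 1]
    return (x - IMAGENET_MEAN) / IMAGENET_STD   # per-channel standardization

The ToCHWImage step that appears in the Infer pipelines then transposes HWC to CHW, matching image_shape: [3, 224, 224].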
Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml b/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml new file mode 100644 index 0000000..f55a044 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + 
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_5.yaml b/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_5.yaml new file mode 100644 index 0000000..d654d42 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_5.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference +# model architecture +Arch: + name: PPLCNet_x1_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x2_0.yaml b/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x2_0.yaml new file mode 100644 index 0000000..50b19aa --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x2_0.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference +# model architecture +Arch: + name: PPLCNet_x2_0 + class_num: 1000 +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 
+ epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x2_5.yaml b/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x2_5.yaml new file mode 100644 index 0000000..4f677e5 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x2_5.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference +# model architecture +Arch: + name: PPLCNet_x2_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: 
./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B0.yaml b/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B0.yaml new file mode 100644 index 0000000..6c0854c --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B0.yaml @@ -0,0 +1,161 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: PVT_V2_B0 + class_num: 1000 + drop_path_rate: 0.1 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 5e-6 + warmup_epoch: 20 + warmup_start_lr: 5e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] 
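(Referring back to the Optimizer block of this PVT_V2_B0 config.) AdamW here exempts from the 0.05 weight decay any parameter whose name matches a token in no_weight_decay_name and, via one_dim_param_no_weight_decay: True, every 1-D tensor (biases, norm scales). A hedged sketch of how such parameter grouping is typically assembled; split_decay_groups is an illustrative helper, not the ppcls API:

def split_decay_groups(model,
                       skip_names=("pos_embed1", "pos_embed2", "pos_embed3",
                                   "pos_embed4", "cls_token"),
                       weight_decay=0.05):
    # Assumption: model exposes named_parameters() as paddle.nn.Layer does.
    decay, no_decay = [], []
    for name, p in model.named_parameters():
        if p.ndim == 1 or any(s in name for s in skip_names):
            no_decay.append(p)   # biases, norms, positional embeddings, cls token
        else:
            decay.append(p)
    return [{"params": decay, "weight_decay": weight_decay},
            {"params": no_decay, "weight_decay": 0.0}]

Decaying positional embeddings and normalization parameters tends to hurt transformer training, which is why these ViT-style configs carve them out while the CNN configs rely on a plain L2 regularizer instead.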
+ order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B1.yaml b/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B1.yaml new file mode 100644 index 0000000..42134c7 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B1.yaml @@ -0,0 +1,161 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: PVT_V2_B1 + class_num: 1000 + drop_path_rate: 0.1 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 5e-6 + warmup_epoch: 20 + warmup_start_lr: 5e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + 
- DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2.yaml b/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2.yaml new file mode 100644 index 0000000..4d0d5a4 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2.yaml @@ -0,0 +1,161 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: PVT_V2_B2 + class_num: 1000 + drop_path_rate: 0.1 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 5e-6 + warmup_epoch: 20 + warmup_start_lr: 5e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 
'' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2_Linear.yaml b/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2_Linear.yaml new file mode 100644 index 0000000..a5feb26 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2_Linear.yaml @@ -0,0 +1,161 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: PVT_V2_B2_Linear + class_num: 1000 + drop_path_rate: 0.1 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 5e-6 + warmup_epoch: 20 + warmup_start_lr: 5e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B3.yaml 
b/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B3.yaml new file mode 100644 index 0000000..be300ac --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B3.yaml @@ -0,0 +1,162 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: PVT_V2_B3 + class_num: 1000 + drop_path_rate: 0.3 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: 1.0 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 5e-6 + warmup_epoch: 20 + warmup_start_lr: 5e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B4.yaml b/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B4.yaml new file mode 100644 index 0000000..b6a8953 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B4.yaml @@ -0,0 +1,162 @@ +# global configs +Global: + checkpoints: 
null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: PVT_V2_B4 + class_num: 1000 + drop_path_rate: 0.3 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: 1.0 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 5e-6 + warmup_epoch: 20 + warmup_start_lr: 5e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B5.yaml b/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B5.yaml new file mode 100644 index 0000000..9d36b28 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/PVTV2/PVT_V2_B5.yaml @@ -0,0 +1,162 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: 
[3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: PVT_V2_B5 + class_num: 1000 + drop_path_rate: 0.3 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: 1.0 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 5e-6 + warmup_epoch: 20 + warmup_start_lr: 5e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ReXNet/ReXNet_1_0.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ReXNet/ReXNet_1_0.yaml new file mode 100644 index 0000000..709d72f --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ReXNet/ReXNet_1_0.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: ReXNet_1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - 
CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ReXNet/ReXNet_1_3.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ReXNet/ReXNet_1_3.yaml new file mode 100644 index 0000000..18607c6 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ReXNet/ReXNet_1_3.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: ReXNet_1_3 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + 
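Unlike the cosine policy in the configs above, the ReXNet family uses a Piecewise schedule: the rate holds at each entry of values until the next boundary in decay_epochs is crossed, so len(values) == len(decay_epochs) + 1. A small sketch of the lookup, assuming the new value takes effect at the boundary epoch itself:

import bisect

def piecewise_lr(epoch, decay_epochs=(30, 60, 90),
                 values=(0.1, 0.01, 0.001, 0.0001)):
    # values[i] applies while decay_epochs[i-1] <= epoch < decay_epochs[i].
    return values[bisect.bisect_right(decay_epochs, epoch)]

# piecewise_lr(0) -> 0.1, piecewise_lr(30) -> 0.01, piecewise_lr(100) -> 0.0001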
Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ReXNet/ReXNet_1_5.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ReXNet/ReXNet_1_5.yaml new file mode 100644 index 0000000..99dca8b --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ReXNet/ReXNet_1_5.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: ReXNet_1_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + 
order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ReXNet/ReXNet_2_0.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ReXNet/ReXNet_2_0.yaml new file mode 100644 index 0000000..285b8df --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ReXNet/ReXNet_2_0.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: ReXNet_2_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ReXNet/ReXNet_3_0.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ReXNet/ReXNet_3_0.yaml new file mode 100644 index 0000000..a44294e --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ReXNet/ReXNet_3_0.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model 
under @to_static + to_static: False + +# model architecture +Arch: + name: ReXNet_3_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/RedNet/RedNet101.yaml b/src/PaddleClas/ppcls/configs/ImageNet/RedNet/RedNet101.yaml new file mode 100644 index 0000000..95ea518 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/RedNet/RedNet101.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: RedNet101 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + + sampler: + name: 
DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/RedNet/RedNet152.yaml b/src/PaddleClas/ppcls/configs/ImageNet/RedNet/RedNet152.yaml new file mode 100644 index 0000000..7d5cc03 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/RedNet/RedNet152.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: RedNet152 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 
1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/RedNet/RedNet26.yaml b/src/PaddleClas/ppcls/configs/ImageNet/RedNet/RedNet26.yaml new file mode 100644 index 0000000..089db6f --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/RedNet/RedNet26.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: RedNet26 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/RedNet/RedNet38.yaml b/src/PaddleClas/ppcls/configs/ImageNet/RedNet/RedNet38.yaml new file mode 100644 index 0000000..c2fb863 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/RedNet/RedNet38.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model 
architecture +Arch: + name: RedNet38 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/RedNet/RedNet50.yaml b/src/PaddleClas/ppcls/configs/ImageNet/RedNet/RedNet50.yaml new file mode 100644 index 0000000..02e045a --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/RedNet/RedNet50.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: RedNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: 
True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/Res2Net/Res2Net101_vd_26w_4s.yaml b/src/PaddleClas/ppcls/configs/ImageNet/Res2Net/Res2Net101_vd_26w_4s.yaml new file mode 100644 index 0000000..ed16b03 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Res2Net/Res2Net101_vd_26w_4s.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: Res2Net101_vd_26w_4s + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: 
[0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/Res2Net/Res2Net200_vd_26w_4s.yaml b/src/PaddleClas/ppcls/configs/ImageNet/Res2Net/Res2Net200_vd_26w_4s.yaml new file mode 100644 index 0000000..af1f438 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Res2Net/Res2Net200_vd_26w_4s.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: Res2Net200_vd_26w_4s + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/Res2Net/Res2Net50_14w_8s.yaml b/src/PaddleClas/ppcls/configs/ImageNet/Res2Net/Res2Net50_14w_8s.yaml new file mode 100644 index 0000000..7824052 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Res2Net/Res2Net50_14w_8s.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model 
architecture +Arch: + name: Res2Net50_14w_8s + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/Res2Net/Res2Net50_26w_4s.yaml b/src/PaddleClas/ppcls/configs/ImageNet/Res2Net/Res2Net50_26w_4s.yaml new file mode 100644 index 0000000..60767ba --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Res2Net/Res2Net50_26w_4s.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: Res2Net50_26w_4s + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: 
False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/Res2Net/Res2Net50_vd_26w_4s.yaml b/src/PaddleClas/ppcls/configs/ImageNet/Res2Net/Res2Net50_vd_26w_4s.yaml new file mode 100644 index 0000000..977c144 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Res2Net/Res2Net50_vd_26w_4s.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: Res2Net50_vd_26w_4s + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 
0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNeSt/ResNeSt101.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNeSt/ResNeSt101.yaml new file mode 100644 index 0000000..d99e885 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNeSt/ResNeSt101.yaml @@ -0,0 +1,131 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNeSt101 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNeSt/ResNeSt50.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNeSt/ResNeSt50.yaml new file mode 100644 index 0000000..d822c8b --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNeSt/ResNeSt50.yaml @@ -0,0 +1,131 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNeSt50 + class_num: 1000 + +# 
loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNeSt/ResNeSt50_fast_1s1x64d.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNeSt/ResNeSt50_fast_1s1x64d.yaml new file mode 100644 index 0000000..eb973af --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNeSt/ResNeSt50_fast_1s1x64d.yaml @@ -0,0 +1,131 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNeSt50_fast_1s1x64d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + 
shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_32x4d.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_32x4d.yaml new file mode 100644 index 0000000..e0d0a5b --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_32x4d.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNeXt101_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 
0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_64x4d.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_64x4d.yaml new file mode 100644 index 0000000..d68f5f7 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_64x4d.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNeXt101_64x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.00015 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_vd_32x4d.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_vd_32x4d.yaml new file mode 100644 index 0000000..eadd9ee --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_vd_32x4d.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + 
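Where the ReXNet, RedNet, and non-vd ResNeXt configs use a plain CELoss, the Res2Net, ResNeSt, and vd-variant configs add epsilon: 0.1 to the train loss, i.e. label smoothing. A sketch of the assumed semantics (standard smoothed cross entropy in NumPy; the actual implementation lives in ppcls's loss module):

import numpy as np

def smoothed_ce(logits, label, epsilon=0.1):
    n = logits.shape[-1]
    # mix the one-hot target with a uniform distribution over classes
    target = np.full(n, epsilon / n)
    target[label] += 1.0 - epsilon
    logits = logits - logits.max()                     # numerical stability
    log_probs = logits - np.log(np.exp(logits).sum())  # log-softmax
    return float(-(target * log_probs).sum())

With epsilon: 0.1 and class_num: 1000, the true class keeps target mass 0.9001 and every other class gets 1e-4.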
+# model architecture +Arch: + name: ResNeXt101_vd_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_vd_64x4d.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_vd_64x4d.yaml new file mode 100644 index 0000000..5c59e5a --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_vd_64x4d.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNeXt101_vd_64x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + 
batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_32x4d.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_32x4d.yaml new file mode 100644 index 0000000..8bad3f6 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_32x4d.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNeXt152_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: 
[0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_64x4d.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_64x4d.yaml new file mode 100644 index 0000000..104f37a --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_64x4d.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNeXt152_64x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.00018 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_vd_32x4d.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_vd_32x4d.yaml new file mode 100644 index 0000000..638feef --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_vd_32x4d.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] 
+ save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNeXt152_vd_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_vd_64x4d.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_vd_64x4d.yaml new file mode 100644 index 0000000..7c05197 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_vd_64x4d.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNeXt152_vd_64x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + 
name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_32x4d.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_32x4d.yaml new file mode 100644 index 0000000..ef78f60 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_32x4d.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNeXt50_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + 
scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_64x4d.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_64x4d.yaml new file mode 100644 index 0000000..b750357 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_64x4d.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNeXt50_64x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_vd_32x4d.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_vd_32x4d.yaml new file mode 100644 index 0000000..baf38e3 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_vd_32x4d.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + 
image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNeXt50_vd_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_vd_64x4d.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_vd_64x4d.yaml new file mode 100644 index 0000000..dba5f86 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_vd_64x4d.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNeXt50_vd_64x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + 
alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x16d_wsl.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x16d_wsl.yaml new file mode 100644 index 0000000..71193aa --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x16d_wsl.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNeXt101_32x16d_wsl + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: 
+ resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x32d_wsl.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x32d_wsl.yaml new file mode 100644 index 0000000..346d2ea --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x32d_wsl.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNeXt101_32x32d_wsl + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x48d_wsl.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x48d_wsl.yaml new file mode 100644 index 0000000..2db3bd6 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x48d_wsl.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + 
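The ResNeXt101_wsl configs above all train for 120 epochs with the same Piecewise schedule: the learning rate starts at 0.1 and is stepped down at epochs 30, 60 and 90 to the listed values. A minimal sketch of that lookup in plain Python (the function name is ours, not the ppcls optimizer builder):

import bisect

def piecewise_lr(epoch, decay_epochs=(30, 60, 90),
                 values=(0.1, 0.01, 0.001, 0.0001)):
    # values has one more entry than decay_epochs; pick the segment
    # the current epoch falls into
    return values[bisect.bisect_right(decay_epochs, epoch)]

assert piecewise_lr(0) == 0.1
assert piecewise_lr(30) == 0.01
assert piecewise_lr(119) == 0.0001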
eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNeXt101_32x48d_wsl + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x8d_wsl.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x8d_wsl.yaml new file mode 100644 index 0000000..bed3cc2 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x8d_wsl.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNeXt101_32x8d_wsl + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + 
to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet101.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet101.yaml new file mode 100644 index 0000000..2c98acf --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet101.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: ResNet101 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + 
num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet101_vd.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet101_vd.yaml new file mode 100644 index 0000000..d62b7bc --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet101_vd.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet101_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet152.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet152.yaml new file mode 100644 index 0000000..0dbbaf8 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet152.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + 
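The _vd configs such as ResNet101_vd above differ from the plain variants in two ways: they switch to Cosine learning-rate decay over 200 epochs, and they add epsilon: 0.1 under CELoss, which enables label smoothing: the one-hot target is blended with a uniform distribution before the cross entropy is taken. A small numpy sketch of the smoothed target (the helper name is illustrative, not a ppcls API):

import numpy as np

def smooth_labels(labels, class_num=1000, epsilon=0.1):
    # one-hot targets softened towards the uniform distribution
    one_hot = np.eye(class_num)[labels]
    return one_hot * (1.0 - epsilon) + epsilon / class_num

t = smooth_labels(np.array([3]))
# the true class keeps 0.9001, every other class gets 0.0001,
# and each row still sums to 1
assert np.isclose(t.sum(), 1.0)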
output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: ResNet152 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet152_vd.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet152_vd.yaml new file mode 100644 index 0000000..735c84b --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet152_vd.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet152_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: 
True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet18.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet18.yaml new file mode 100644 index 0000000..4e0e460 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet18.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet18 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + 
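MixupOperator (alpha: 0.2) in the train pipeline above is a batch-level transform: each decoded image is blended with a partner drawn from the same batch, and the loss is later taken against both partners' labels with the same blend weight. A rough numpy sketch of the idea, with illustrative names rather than the actual ppcls operator:

import numpy as np

def mixup_batch(images, labels, alpha=0.2, rng=np.random.default_rng()):
    lam = rng.beta(alpha, alpha)        # blend weight ~ Beta(alpha, alpha)
    idx = rng.permutation(len(images))  # random partner for every sample
    mixed = lam * images + (1.0 - lam) * images[idx]
    # training then combines the two terms as
    # lam * CE(pred, labels) + (1 - lam) * CE(pred, labels[idx])
    return mixed, labels, labels[idx], lam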
+Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet18_vd.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet18_vd.yaml new file mode 100644 index 0000000..0150633 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet18_vd.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet18_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet200_vd.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet200_vd.yaml new file mode 100644 index 0000000..c9209f1 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet200_vd.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + 
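Every Eval and Infer pipeline in these files follows the same recipe: resize so the short side is 256, center-crop 224x224, scale pixels by 1/255, normalize with the ImageNet mean/std, and move channels first. An approximation of those operators with Pillow and numpy (a sketch, not the actual ppcls preprocessing code):

import numpy as np
from PIL import Image

def eval_preprocess(path, resize_short=256, crop=224):
    img = Image.open(path).convert("RGB")
    w, h = img.size
    scale = resize_short / min(w, h)               # ResizeImage: resize_short
    img = img.resize((round(w * scale), round(h * scale)))
    w, h = img.size
    left, top = (w - crop) // 2, (h - crop) // 2   # CropImage: center crop
    img = img.crop((left, top, left + crop, top + crop))
    x = np.asarray(img, dtype="float32") / 255.0   # NormalizeImage: scale
    x = (x - [0.485, 0.456, 0.406]) / [0.229, 0.224, 0.225]
    return x.transpose(2, 0, 1)                    # ToCHWImage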
save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet200_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet34.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet34.yaml new file mode 100644 index 0000000..5b90cf0 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet34.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet34 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + 
flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet34_vd.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet34_vd.yaml new file mode 100644 index 0000000..a894ea4 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet34_vd.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet34_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + 
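The Topk post-process named by these Infer sections reduces the model output to the five highest-scoring class ids and maps them to names via imagenet1k_label_list.txt. The core is a partial sort; a numpy sketch of just that part (the ppcls class additionally handles the label file and batching):

import numpy as np

def topk_ids(scores, k=5):
    # indices of the k largest scores, best first
    idx = np.argpartition(scores, -k)[-k:]
    return idx[np.argsort(scores[idx])[::-1]]

probs = np.random.rand(1000)
print(topk_ids(probs))  # five class ids, highest score first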
transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet50.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet50.yaml new file mode 100644 index 0000000..c2da23f --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet50.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml new file mode 100644 index 0000000..53e9ae2 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml @@ -0,0 +1,147 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: 
True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + image_channel: &image_channel 4 + # used for static mode and model export + image_shape: [*image_channel, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: True + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O1 + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + input_image_channel: *image_channel + data_format: "NHWC" + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + multi_precision: True + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + channel_num: *image_channel + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + channel_num: *image_channel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + channel_num: *image_channel + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O2.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O2.yaml new file mode 100644 index 0000000..6a4425b --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O2.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + image_channel: &image_channel 4 + # used for static mode and model export + image_shape: [*image_channel, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: True + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O2: pure fp16 + level: O2 + +# model architecture +Arch: + name: ResNet50 + 
class_num: 1000 + input_image_channel: *image_channel + data_format: "NHWC" + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + multi_precision: True + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + output_fp16: True + channel_num: *image_channel + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + output_fp16: True + channel_num: *image_channel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + output_fp16: True + channel_num: *image_channel + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet50_vd.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet50_vd.yaml new file mode 100644 index 0000000..be7b2d9 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ResNet/ResNet50_vd.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: 
[0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/SENet/SENet154_vd.yaml b/src/PaddleClas/ppcls/configs/ImageNet/SENet/SENet154_vd.yaml new file mode 100644 index 0000000..6545cbf --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/SENet/SENet154_vd.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: SENet154_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: 
True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d.yaml b/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d.yaml new file mode 100644 index 0000000..f97430e --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: SE_ResNeXt101_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_amp_O2.yaml b/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_amp_O2.yaml new file mode 100644 index 0000000..da005d3 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_amp_O2.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + 
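Every Infer section above ends with the same `Topk` PostProcess block. A rough NumPy sketch of what that step computes — the function name `topk_postprocess` is illustrative, not the ppcls API, and the real implementation additionally maps class ids to human-readable names via `class_id_map_file` (ppcls/utils/imagenet1k_label_list.txt):

    import numpy as np

    def topk_postprocess(logits, k=5):
        # Softmax over the class axis, then the k highest-scoring ids,
        # mirroring what the `Topk` PostProcess (topk: 5) reports.
        probs = np.exp(logits - logits.max(axis=-1, keepdims=True))
        probs /= probs.sum(axis=-1, keepdims=True)
        ids = np.argsort(-probs, axis=-1)[..., :k]
        scores = np.take_along_axis(probs, ids, axis=-1)
        return ids, scores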
print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_channel: &image_channel 4 + image_shape: [*image_channel, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: SE_ResNeXt101_32x4d + class_num: 1000 + input_image_channel: *image_channel + data_format: "NHWC" + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O2: pure fp16 + level: O2 + +Optimizer: + name: Momentum + momentum: 0.9 + multi_precision: True + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + output_fp16: True + channel_num: *image_channel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + output_fp16: True + channel_num: *image_channel + sampler: + name: BatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + output_fp16: True + channel_num: *image_channel + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNeXt50_32x4d.yaml b/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNeXt50_32x4d.yaml new file mode 100644 index 0000000..b31250b --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNeXt50_32x4d.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: SE_ResNeXt50_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval 
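A note on the `_amp_O2` config above: `image_channel: &image_channel 4` declares a standard YAML anchor, and every `*image_channel` alias reuses that value, so the 4-channel fp16/NHWC layout is stated once and stays consistent across `Global.image_shape`, `Arch.input_image_channel`, and the `channel_num` of each `NormalizeImage` transform. A minimal sketch of how the anchor resolves, assuming PyYAML is installed; PaddleClas's own config reader under `ppcls/utils` is the authoritative loader:

    import yaml

    snippet = """
    Global:
      image_channel: &image_channel 4
      image_shape: [*image_channel, 224, 224]
    Arch:
      input_image_channel: *image_channel
    """

    cfg = yaml.safe_load(snippet)
    # Anchors and aliases resolve at parse time, so editing the single
    # anchored value updates every alias at once.
    assert cfg["Global"]["image_shape"] == [4, 224, 224]
    assert cfg["Arch"]["input_image_channel"] == 4

This is why switching such a config back to 3-channel input only requires changing the one anchored value.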
+DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNeXt50_vd_32x4d.yaml b/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNeXt50_vd_32x4d.yaml new file mode 100644 index 0000000..292b52d --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNeXt50_vd_32x4d.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: SE_ResNeXt50_vd_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - 
NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNet18_vd.yaml b/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNet18_vd.yaml new file mode 100644 index 0000000..47d1754 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNet18_vd.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: SE_ResNet18_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNet34_vd.yaml b/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNet34_vd.yaml new file mode 
100644 index 0000000..174c181 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNet34_vd.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: SE_ResNet34_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNet50_vd.yaml b/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNet50_vd.yaml new file mode 100644 index 0000000..f503ea6 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/SENet/SE_ResNet50_vd.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: SE_ResNet50_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset 
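Each of the SENet/ResNet `_vd` training pipelines above enables mixup through `batch_transform_ops` with `MixupOperator: alpha: 0.2`. As a rough NumPy sketch of the underlying idea only — the real operator is implemented in ppcls's data module, and in the usual formulation returns the label pair plus `lam` so the loss is mixed rather than the one-hot targets:

    import numpy as np

    def mixup_batch(images, one_hot_labels, alpha=0.2,
                    rng=np.random.default_rng(0)):
        # lam ~ Beta(alpha, alpha); alpha=0.2, as in the configs above,
        # concentrates lam near 0 or 1, so most mixed samples stay close
        # to one of the two source images.
        lam = rng.beta(alpha, alpha)
        perm = rng.permutation(len(images))
        mixed_images = lam * images + (1.0 - lam) * images[perm]
        mixed_labels = lam * one_hot_labels + (1.0 - lam) * one_hot_labels[perm]
        return mixed_images, mixed_labels

Note also that these mixup configs leave `Metric.Train` empty: once labels are mixed, batch top-k accuracy is no longer well defined, so only `Eval` reports TopkAcc.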
+ image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_swish.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_swish.yaml new file mode 100644 index 0000000..e01891e --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_swish.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ShuffleNetV2_swish + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + 
name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_25.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_25.yaml new file mode 100644 index 0000000..c2e9805 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_25.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ShuffleNetV2_x0_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_33.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_33.yaml new file mode 100644 index 0000000..dc7a5ef --- /dev/null +++ 
b/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_33.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ShuffleNetV2_x0_33 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_5.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_5.yaml new file mode 100644 index 0000000..796fb7a --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_5.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ShuffleNetV2_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: 
./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml new file mode 100644 index 0000000..809fb2a --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ShuffleNetV2_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 
64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_5.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_5.yaml new file mode 100644 index 0000000..eb3e013 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_5.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ShuffleNetV2_x1_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.25 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x2_0.yaml b/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x2_0.yaml new file mode 100644 index 0000000..730cf43 --- /dev/null +++ 
b/src/PaddleClas/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x2_0.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ShuffleNetV2_x2_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.25 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/SqueezeNet/SqueezeNet1_0.yaml b/src/PaddleClas/ppcls/configs/ImageNet/SqueezeNet/SqueezeNet1_0.yaml new file mode 100644 index 0000000..28eba49 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/SqueezeNet/SqueezeNet1_0.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: SqueezeNet1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: 
./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/SqueezeNet/SqueezeNet1_1.yaml b/src/PaddleClas/ppcls/configs/ImageNet/SqueezeNet/SqueezeNet1_1.yaml new file mode 100644 index 0000000..b61a28c --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/SqueezeNet/SqueezeNet1_1.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: SqueezeNet1_1 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + 
use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window12_384.yaml b/src/PaddleClas/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window12_384.yaml new file mode 100644 index 0000000..4dd0ac4 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window12_384.yaml @@ -0,0 +1,159 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: SwinTransformer_base_patch4_window12_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + 
to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window7_224.yaml b/src/PaddleClas/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window7_224.yaml new file mode 100644 index 0000000..a42dea1 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window7_224.yaml @@ -0,0 +1,159 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: SwinTransformer_base_patch4_window7_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - 
NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window12_384.yaml b/src/PaddleClas/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window12_384.yaml new file mode 100644 index 0000000..36b5e5e --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window12_384.yaml @@ -0,0 +1,159 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: SwinTransformer_large_patch4_window12_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + 
name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window7_224.yaml b/src/PaddleClas/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window7_224.yaml new file mode 100644 index 0000000..96a9bef --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window7_224.yaml @@ -0,0 +1,159 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: SwinTransformer_large_patch4_window7_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/src/PaddleClas/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_small_patch4_window7_224.yaml b/src/PaddleClas/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_small_patch4_window7_224.yaml new file mode 100644 index 0000000..ffbbcf0 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_small_patch4_window7_224.yaml @@ -0,0 +1,159 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: SwinTransformer_small_patch4_window7_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml 
b/src/PaddleClas/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100644 index 0000000..066db71 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,159 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: SwinTransformer_tiny_patch4_window7_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/TNT/TNT_small.yaml b/src/PaddleClas/ppcls/configs/ImageNet/TNT/TNT_small.yaml new file mode 100644 index 0000000..1eab423 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/TNT/TNT_small.yaml @@ -0,0 
+1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: TNT_small + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/Twins/alt_gvt_base.yaml b/src/PaddleClas/ppcls/configs/ImageNet/Twins/alt_gvt_base.yaml new file mode 100644 index 0000000..74c402e --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Twins/alt_gvt_base.yaml @@ -0,0 +1,161 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: alt_gvt_base + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.3 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + 
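
The transformer recipes in this batch all drive AdamW with a warmed-up cosine schedule: the learning rate climbs linearly from warmup_start_lr to learning_rate, then decays to eta_min. A minimal sketch of that schedule in plain Python, assuming epoch-level updates and the values from the surrounding Twins config (this mirrors the Optimizer.lr block, not PaddleClas's actual Cosine/warmup classes):

    import math

    def lr_at_epoch(epoch, base_lr=5e-4, eta_min=1e-5, warmup_epochs=5,
                    warmup_start_lr=1e-6, total_epochs=300):
        # linear warmup from warmup_start_lr up to base_lr
        if epoch < warmup_epochs:
            return warmup_start_lr + (base_lr - warmup_start_lr) * epoch / warmup_epochs
        # cosine decay from base_lr down to eta_min over the remaining epochs
        t = (epoch - warmup_epochs) / (total_epochs - warmup_epochs)
        return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * t))

With warmup_epoch 5 and 300 epochs total, the peak 5e-4 is reached at epoch 5 and falls smoothly to 1e-5 by the final epoch.
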
eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/Twins/alt_gvt_large.yaml b/src/PaddleClas/ppcls/configs/ImageNet/Twins/alt_gvt_large.yaml new file mode 100644 index 0000000..ca66e9a --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Twins/alt_gvt_large.yaml @@ -0,0 +1,161 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: alt_gvt_large + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.5 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: 
./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/Twins/alt_gvt_small.yaml b/src/PaddleClas/ppcls/configs/ImageNet/Twins/alt_gvt_small.yaml new file mode 100644 index 0000000..9e97c0f --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Twins/alt_gvt_small.yaml @@ -0,0 +1,161 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: alt_gvt_small + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.2 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - 
TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/Twins/pcpvt_base.yaml b/src/PaddleClas/ppcls/configs/ImageNet/Twins/pcpvt_base.yaml new file mode 100644 index 0000000..7831e92 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Twins/pcpvt_base.yaml @@ -0,0 +1,161 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: pcpvt_base + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.3 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 
0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/Twins/pcpvt_large.yaml b/src/PaddleClas/ppcls/configs/ImageNet/Twins/pcpvt_large.yaml new file mode 100644 index 0000000..8e160b3 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Twins/pcpvt_large.yaml @@ -0,0 +1,161 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: pcpvt_large + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.5 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + 
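
These configs wrap Mixup and Cutmix in an OpSampler under batch_transform_ops, so each batch receives at most one of the two. A rough sketch of that per-batch choice given the prob weights above (a hypothetical helper, not the actual ppcls OpSampler implementation):

    import random

    def sample_batch_op(weighted_ops):
        # weighted_ops: list of (batch_transform_fn, prob); if the probs sum to
        # less than 1, the remaining mass leaves the batch unchanged.
        r, acc = random.random(), 0.0
        for fn, prob in weighted_ops:
            acc += prob
            if r < acc:
                return fn
        return lambda batch: batch  # identity: apply neither op

With MixupOperator and CutmixOperator both at prob 0.5, every batch gets exactly one of the two operators, never both.
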
name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/Twins/pcpvt_small.yaml b/src/PaddleClas/ppcls/configs/ImageNet/Twins/pcpvt_small.yaml new file mode 100644 index 0000000..582382d --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Twins/pcpvt_small.yaml @@ -0,0 +1,161 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: pcpvt_small + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.2 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + 
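
ImageNetDataset locates its samples through image_root plus cls_label_path; each line of train_list.txt / val_list.txt is assumed here to hold a relative image path and an integer class id separated by whitespace (an assumption worth checking against the repo's data documentation). A tiny parser sketch under that assumption:

    import os

    def load_label_list(list_path, image_root):
        # yields (image_path, class_id) pairs from an assumed "path label" list file
        samples = []
        with open(list_path) as f:
            for line in f:
                rel_path, label = line.split()  # assumes exactly two fields per line
                samples.append((os.path.join(image_root, rel_path), int(label)))
        return samples
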
cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/VGG/VGG11.yaml b/src/PaddleClas/ppcls/configs/ImageNet/VGG/VGG11.yaml new file mode 100644 index 0000000..e55c4d0 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/VGG/VGG11.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 90 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: VGG11 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - 
TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/VGG/VGG13.yaml b/src/PaddleClas/ppcls/configs/ImageNet/VGG/VGG13.yaml new file mode 100644 index 0000000..b4a0ee3 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/VGG/VGG13.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 90 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: VGG13 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + regularizer: + name: 'L2' + coeff: 0.0003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/VGG/VGG16.yaml b/src/PaddleClas/ppcls/configs/ImageNet/VGG/VGG16.yaml new file mode 100644 index 0000000..154c468 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/VGG/VGG16.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 90 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: VGG16 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + regularizer: + name: 'L2' + coeff: 0.0004 + + +# data loader for train and eval +DataLoader: + 
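
Unlike the transformer configs, the VGG recipes use plain SGD with momentum plus an L2 regularizer, rather than AdamW with decoupled weight decay. As a reference point, a sketch of one such update in NumPy terms (illustrative only, not Paddle's Momentum optimizer):

    import numpy as np

    def momentum_step(w, grad, velocity, lr=0.01, momentum=0.9, l2_coeff=0.0004):
        # the L2 regularizer folds coeff * w into the gradient before the update
        grad = grad + l2_coeff * w
        velocity = momentum * velocity + grad
        return w - lr * velocity, velocity

    # e.g. one step on a toy 10-dim weight vector
    w, v = np.zeros(10), np.zeros(10)
    w, v = momentum_step(w, np.ones(10), v)
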
Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/VGG/VGG19.yaml b/src/PaddleClas/ppcls/configs/ImageNet/VGG/VGG19.yaml new file mode 100644 index 0000000..0a7022e --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/VGG/VGG19.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 150 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: VGG19 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + regularizer: + name: 'L2' + coeff: 0.0004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: 
False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch16_224.yaml b/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch16_224.yaml new file mode 100644 index 0000000..6d5857d --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch16_224.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ViT_base_patch16_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch16_384.yaml b/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch16_384.yaml new file mode 100644 index 0000000..925d827 --- /dev/null +++ 
b/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch16_384.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ViT_base_patch16_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch32_384.yaml b/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch32_384.yaml new file mode 100644 index 0000000..fc4747b --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch32_384.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ViT_base_patch32_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data 
loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch16_224.yaml b/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch16_224.yaml new file mode 100644 index 0000000..3882c55 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch16_224.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ViT_large_patch16_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 
1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch16_384.yaml b/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch16_384.yaml new file mode 100644 index 0000000..3bdb387 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch16_384.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ViT_large_patch16_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch32_384.yaml 
b/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch32_384.yaml new file mode 100644 index 0000000..25212dd --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch32_384.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ViT_large_patch32_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_small_patch16_224.yaml b/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_small_patch16_224.yaml new file mode 100644 index 0000000..0a956b4 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/VisionTransformer/ViT_small_patch16_224.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ViT_small_patch16_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: 
Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/Xception/Xception41.yaml b/src/PaddleClas/ppcls/configs/ImageNet/Xception/Xception41.yaml new file mode 100644 index 0000000..45e64a1 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Xception/Xception41.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + +# model architecture +Arch: + name: Xception41 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + 
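
The Xception configs evaluate at 299x299 after resizing the shorter image side to 320, rather than the 256/224 pair used by most other configs in this series. A small PIL sketch of that geometry (illustrative; the real ops are ppcls preprocessing transforms):

    from PIL import Image

    def resize_short_then_center_crop(img, resize_short=320, crop=299):
        w, h = img.size
        scale = resize_short / min(w, h)      # shorter side becomes resize_short
        img = img.resize((round(w * scale), round(h * scale)))
        w, h = img.size
        left, top = (w - crop) // 2, (h - crop) // 2
        return img.crop((left, top, left + crop, top + crop))
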
size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/Xception/Xception41_deeplab.yaml b/src/PaddleClas/ppcls/configs/ImageNet/Xception/Xception41_deeplab.yaml new file mode 100644 index 0000000..daf0598 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Xception/Xception41_deeplab.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + +# model architecture +Arch: + name: Xception41_deeplab + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/Xception/Xception65.yaml 
b/src/PaddleClas/ppcls/configs/ImageNet/Xception/Xception65.yaml new file mode 100644 index 0000000..c6bb529 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Xception/Xception65.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + +# model architecture +Arch: + name: Xception65 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/Xception/Xception65_deeplab.yaml b/src/PaddleClas/ppcls/configs/ImageNet/Xception/Xception65_deeplab.yaml new file mode 100644 index 0000000..1248a29 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Xception/Xception65_deeplab.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + +# model architecture +Arch: + name: Xception65_deeplab + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 
0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/ImageNet/Xception/Xception71.yaml b/src/PaddleClas/ppcls/configs/ImageNet/Xception/Xception71.yaml new file mode 100644 index 0000000..7f714cc --- /dev/null +++ b/src/PaddleClas/ppcls/configs/ImageNet/Xception/Xception71.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + +# model architecture +Arch: + name: Xception71 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.0225 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 
1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/Logo/ResNet50_ReID.yaml b/src/PaddleClas/ppcls/configs/Logo/ResNet50_ReID.yaml new file mode 100644 index 0000000..0949add --- /dev/null +++ b/src/PaddleClas/ppcls/configs/Logo/ResNet50_ReID.yaml @@ -0,0 +1,151 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + eval_mode: "retrieval" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "CircleMargin" + margin: 0.35 + scale: 64 + embedding_size: 512 + class_num: 3000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - PairwiseCosface: + margin: 0.35 + gamma: 64 + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: 'L2' + coeff: 0.0001 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "dataset/LogoDet-3K-crop/train/" + cls_label_path: "dataset/LogoDet-3K-crop/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sampler: + name: PKSampler + batch_size: 128 + sample_per_id: 2 + drop_last: True + + loader: + num_workers: 6 + use_shared_memory: True + Eval: + Query: + dataset: + name: ImageNetDataset + image_root: "dataset/LogoDet-3K-crop/val/" + cls_label_path: "dataset/LogoDet-3K-crop/query_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + + Gallery: + dataset: + name: ImageNetDataset + image_root: "dataset/LogoDet-3K-crop/train/" + cls_label_path: "dataset/LogoDet-3K-crop/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: 
[0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} + diff --git a/src/PaddleClas/ppcls/configs/Products/MV3_Large_1x_Aliproduct_DLBHC.yaml b/src/PaddleClas/ppcls/configs/Products/MV3_Large_1x_Aliproduct_DLBHC.yaml new file mode 100644 index 0000000..ad77ea9 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/Products/MV3_Large_1x_Aliproduct_DLBHC.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output_dlbhc/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + #eval_mode: "retrieval" + print_batch_step: 10 + use_visualdl: False + + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + #feature postprocess + feature_normalize: False + feature_binarize: "round" + +# model architecture +Arch: + name: "RecModel" + Backbone: + name: "MobileNetV3_large_x1_0" + pretrained: True + class_num: 512 + Head: + name: "FC" + class_num: 50030 + embedding_size: 512 + + infer_output_key: "features" + infer_add_softmax: "false" + +# loss function config for train/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [50, 150] + values: [0.1, 0.01, 0.001] + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 256 + - RandCropImage: + size: 227 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2023, 0.1994, 0.2010] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 227 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2023, 0.1994, 0.2010] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 227 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2023, 0.1994, 0.2010] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + +# switch to metric below when eval by retrieval +# - Recallk: +# topk: [1] +# - mAP: +# - Precisionk: +# topk: [1] + diff --git a/src/PaddleClas/ppcls/configs/Products/ResNet50_vd_Aliproduct.yaml b/src/PaddleClas/ppcls/configs/Products/ResNet50_vd_Aliproduct.yaml new file mode 100644 index 0000000..70f8056 --- /dev/null +++ 
b/src/PaddleClas/ppcls/configs/Products/ResNet50_vd_Aliproduct.yaml @@ -0,0 +1,119 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 10 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: classification + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: ResNet50_vd + pretrained: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 2048 + class_num: 512 + Head: + name: FC + embedding_size: 512 + class_num: 50030 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.05 + regularizer: + name: 'L2' + coeff: 0.00007 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + diff --git a/src/PaddleClas/ppcls/configs/Products/ResNet50_vd_Inshop.yaml b/src/PaddleClas/ppcls/configs/Products/ResNet50_vd_Inshop.yaml new file mode 100644 index 0000000..18ddfa3 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/Products/ResNet50_vd_Inshop.yaml @@ -0,0 +1,157 @@ +# global configs +Global: + checkpoints: null + pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/product_ResNet50_vd_Aliproduct_v1.0_pretrained.pdparams" + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 10 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: ResNet50_vd + pretrained: False + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 2048 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 3997 + margin: 0.15 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - TripletLossV2: + weight: 1.0 + margin: 0.5 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + 
momentum: 0.9 + lr: + name: MultiStepDecay + learning_rate: 0.04 + milestones: [30, 60, 70, 80, 90, 100] + gamma: 0.5 + verbose: False + last_epoch: -1 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/Inshop/ + cls_label_path: ./dataset/Inshop/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] + sampler: + name: PKSampler + batch_size: 64 + sample_per_id: 2 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: ImageNetDataset + image_root: ./dataset/Inshop/ + cls_label_path: ./dataset/Inshop/query_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: ImageNetDataset + image_root: ./dataset/Inshop/ + cls_label_path: ./dataset/Inshop/gallery_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + diff --git a/src/PaddleClas/ppcls/configs/Products/ResNet50_vd_SOP.yaml b/src/PaddleClas/ppcls/configs/Products/ResNet50_vd_SOP.yaml new file mode 100644 index 0000000..7728a66 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/Products/ResNet50_vd_SOP.yaml @@ -0,0 +1,156 @@ +# global configs +Global: + checkpoints: null + pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/product_ResNet50_vd_Aliproduct_v1.0_pretrained.pdparams" + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 10 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + +# model architecture +Arch: + name: RecModel + Backbone: + name: ResNet50_vd + pretrained: False + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 2048 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 11319 + margin: 0.15 + scale: 30 + infer_output_key: features + infer_add_softmax: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - TripletLossV2: + weight: 1.0 + margin: 0.5 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: MultiStepDecay + learning_rate: 0.01 + milestones: [30, 60, 70, 80, 90, 100] + gamma: 0.5 + verbose: False + last_epoch: -1 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: VeriWild + image_root: ./dataset/SOP/ + cls_label_path: 
./dataset/SOP/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] + + sampler: + name: PKSampler + batch_size: 64 + sample_per_id: 2 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/SOP/ + cls_label_path: ./dataset/SOP/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/SOP/ + cls_label_path: ./dataset/SOP/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/src/PaddleClas/ppcls/configs/Vehicle/PPLCNet_2.5x_ReID.yaml b/src/PaddleClas/ppcls/configs/Vehicle/PPLCNet_2.5x_ReID.yaml new file mode 100644 index 0000000..eb9f145 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/Vehicle/PPLCNet_2.5x_ReID.yaml @@ -0,0 +1,158 @@ +# global configs +# pretrained_model: https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/vehicle_reid_PPLCNet2.5x_VERIWild_v1.0_pretrained.pdparams +# VeriWild v1 small: recall1: 0.93736, recall5: 0.98427, mAP: 0.82125 +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output_reid/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 160 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + eval_mode: "retrieval" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "PPLCNet_x2_5" + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: "FC" + embedding_size: 1280 + class_num: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 30671 + margin: 0.15 + scale: 32 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images/" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/train_list_start0.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 
0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] + + sampler: + name: PKSampler + batch_size: 128 + sample_per_id: 2 + drop_last: True + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + Eval: + Query: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id_query.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + + Gallery: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} + diff --git a/src/PaddleClas/ppcls/configs/Vehicle/ResNet50.yaml b/src/PaddleClas/ppcls/configs/Vehicle/ResNet50.yaml new file mode 100644 index 0000000..6b61724 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/Vehicle/ResNet50.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 160 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 431 + margin: 0.15 + scale: 32 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + regularizer: + name: 'L2' + coeff: 0.0005 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "CompCars" + image_root: "./dataset/CompCars/image/" + label_root: "./dataset/CompCars/label/" + bbox_crop: True + cls_label_path: "./dataset/CompCars/train_test_split/classification/train_label.txt" + transform_ops: + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] 
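As context for the sampler that follows: PKSampler builds each batch from P identities with K images each (with batch_size: 128 and sample_per_id: 2, that is 64 ids x 2 views), which is what pair-based losses such as SupConLoss with views: 2 rely on for in-batch positives. A minimal sketch of the scheme, with hypothetical helper names, assuming integer identity labels:

import random
from collections import defaultdict

def pk_batches(labels, batch_size=128, sample_per_id=2):
    # Group sample indices by identity, then emit batches of
    # (batch_size // sample_per_id) identities x sample_per_id images each.
    by_id = defaultdict(list)
    for idx, label in enumerate(labels):
        by_id[label].append(idx)
    pool = [pid for pid, idxs in by_id.items() if len(idxs) >= sample_per_id]
    random.shuffle(pool)
    p = batch_size // sample_per_id
    for i in range(0, len(pool) - p + 1, p):
        batch = []
        for pid in pool[i:i + p]:
            batch.extend(random.sample(by_id[pid], sample_per_id))
        yield batch

Configs that set drop_last: True under PKSampler do so because a short trailing batch could leave identities without a positive pair.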
+ + sampler: + name: PKSampler + batch_size: 128 + sample_per_id: 2 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: "CompCars" + image_root: "./dataset/CompCars/image/" + label_root: "./dataset/CompCars/label/" + cls_label_path: "./dataset/CompCars/train_test_split/classification/test_label.txt" + bbox_crop: True + transform_ops: + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + diff --git a/src/PaddleClas/ppcls/configs/Vehicle/ResNet50_ReID.yaml b/src/PaddleClas/ppcls/configs/Vehicle/ResNet50_ReID.yaml new file mode 100644 index 0000000..c13d59a --- /dev/null +++ b/src/PaddleClas/ppcls/configs/Vehicle/ResNet50_ReID.yaml @@ -0,0 +1,155 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 160 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + eval_mode: "retrieval" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 30671 + margin: 0.15 + scale: 32 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images/" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/train_list_start0.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] 
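For reference, the ArcMargin head configured above corresponds to the additive angular-margin logit s * cos(theta_y + m) on the target class (here m = 0.15, s = 32, 512-d embeddings). A numpy sketch of the forward computation, not the ppcls implementation itself:

import numpy as np

def arc_margin_logits(feat, weight, label, margin=0.15, scale=32.0):
    # feat: (N, 512) embeddings; weight: (num_classes, 512); label: (N,) ints.
    feat = feat / np.linalg.norm(feat, axis=1, keepdims=True)
    weight = weight / np.linalg.norm(weight, axis=1, keepdims=True)
    cos = np.clip(feat @ weight.T, -1.0, 1.0)          # cos(theta)
    theta = np.arccos(cos)
    logits = cos.copy()
    rows = np.arange(feat.shape[0])
    logits[rows, label] = np.cos(theta[rows, label] + margin)
    return scale * logits                              # fed to softmax CE

The margin pulls same-identity features toward their class center on the unit hypersphere, which is why retrieval can later compare the raw embeddings by plain cosine distance.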
+ + sampler: + name: PKSampler + batch_size: 128 + sample_per_id: 2 + drop_last: True + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + Eval: + Query: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id_query.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + + Gallery: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} + diff --git a/src/PaddleClas/ppcls/configs/quick_start/MobileNetV1_retrieval.yaml b/src/PaddleClas/ppcls/configs/quick_start/MobileNetV1_retrieval.yaml new file mode 100644 index 0000000..f088e1c --- /dev/null +++ b/src/PaddleClas/ppcls/configs/quick_start/MobileNetV1_retrieval.yaml @@ -0,0 +1,158 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 50 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: MobileNetV1 + pretrained: False + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1024 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 101 + margin: 0.15 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - TripletLossV2: + weight: 1.0 + margin: 0.5 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: MultiStepDecay + learning_rate: 0.01 + milestones: [20, 30, 40] + gamma: 0.5 + verbose: False + last_epoch: -1 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: VeriWild + image_root: ./dataset/CUB_200_2011/ + cls_label_path: ./dataset/CUB_200_2011/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] 
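RandomErasing, as parameterized in the transform lists above, blanks one rectangle per image with probability EPSILON, drawing the erased area from [sl, sh] of the image area and the aspect ratio from [r1, 1/r1]. A compact sketch of that sampling loop (the usual Zhong et al. formulation; the helper name is hypothetical):

import math
import random
import numpy as np

def random_erase(img, epsilon=0.5, sl=0.02, sh=0.4, r1=0.3, mean=(0., 0., 0.)):
    # img: HxWxC float array; one box is erased in place with probability epsilon.
    if random.random() > epsilon:
        return img
    h, w = img.shape[:2]
    for _ in range(100):                        # retry until a box fits
        area = random.uniform(sl, sh) * h * w
        ratio = random.uniform(r1, 1.0 / r1)
        eh, ew = int(math.sqrt(area * ratio)), int(math.sqrt(area / ratio))
        if 0 < eh < h and 0 < ew < w:
            y, x = random.randint(0, h - eh), random.randint(0, w - ew)
            img[y:y + eh, x:x + ew, :] = np.asarray(mean)
            break
    return img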
+ sampler: + name: DistributedRandomIdentitySampler + batch_size: 64 + num_instances: 2 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/CUB_200_2011/ + cls_label_path: ./dataset/CUB_200_2011/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/CUB_200_2011/ + cls_label_path: ./dataset/CUB_200_2011/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} + diff --git a/src/PaddleClas/ppcls/configs/quick_start/MobileNetV3_large_x1_0.yaml b/src/PaddleClas/ppcls/configs/quick_start/MobileNetV3_large_x1_0.yaml new file mode 100644 index 0000000..d87dc09 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/quick_start/MobileNetV3_large_x1_0.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV3_large_x1_0 + class_num: 102 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.00375 + warmup_epoch: 5 + last_epoch: -1 + regularizer: + name: 'L2' + coeff: 0.000001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/flowers102/ + cls_label_path: ./dataset/flowers102/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/flowers102/ + cls_label_path: ./dataset/flowers102/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 
256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ./dataset/flowers102/flowers102_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/quick_start/ResNet50_vd.yaml b/src/PaddleClas/ppcls/configs/quick_start/ResNet50_vd.yaml new file mode 100644 index 0000000..90b2c88 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/quick_start/ResNet50_vd.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50_vd + class_num: 102 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.0125 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/flowers102/ + cls_label_path: ./dataset/flowers102/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/flowers102/ + cls_label_path: ./dataset/flowers102/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ./dataset/flowers102/flowers102_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/quick_start/kunlun/HRNet_W18_C_finetune_kunlun.yaml b/src/PaddleClas/ppcls/configs/quick_start/kunlun/HRNet_W18_C_finetune_kunlun.yaml new file mode 100644 index 0000000..6a461cc --- /dev/null +++ b/src/PaddleClas/ppcls/configs/quick_start/kunlun/HRNet_W18_C_finetune_kunlun.yaml @@ -0,0 +1,68 @@ +mode: 'train' +ARCHITECTURE: + name: 'HRNet_W18_C' +pretrained_model: "./pretrained/HRNet_W18_C_pretrained" +model_save_dir: "./output/" +classes_num: 102 +total_images: 1020 +save_interval: 1 +validate: True +valid_interval: 1 +epochs: 10 +topk: 5 +image_shape: 
[3, 224, 224] + +LEARNING_RATE: + function: 'Cosine' + params: + lr: 0.00375 + +OPTIMIZER: + function: 'Momentum' + params: + momentum: 0.9 + regularizer: + function: 'L2' + factor: 0.000001 + +TRAIN: + batch_size: 20 + num_workers: 0 + file_list: "./dataset/flowers102/train_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + +VALID: + batch_size: 20 + num_workers: 0 + file_list: "./dataset/flowers102/val_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: diff --git a/src/PaddleClas/ppcls/configs/quick_start/kunlun/ResNet50_vd_finetune_kunlun.yaml b/src/PaddleClas/ppcls/configs/quick_start/kunlun/ResNet50_vd_finetune_kunlun.yaml new file mode 100644 index 0000000..7fad5ee --- /dev/null +++ b/src/PaddleClas/ppcls/configs/quick_start/kunlun/ResNet50_vd_finetune_kunlun.yaml @@ -0,0 +1,69 @@ +mode: 'train' +ARCHITECTURE: + name: 'ResNet50_vd' +pretrained_model: "./pretrained/ResNet50_vd_pretrained" +load_static_weights: true +model_save_dir: "./output/" +classes_num: 102 +total_images: 1020 +save_interval: 1 +validate: True +valid_interval: 1 +epochs: 20 +topk: 5 +image_shape: [3, 224, 224] + +LEARNING_RATE: + function: 'Cosine' + params: + lr: 0.00375 + +OPTIMIZER: + function: 'Momentum' + params: + momentum: 0.9 + regularizer: + function: 'L2' + factor: 0.000001 + +TRAIN: + batch_size: 20 + num_workers: 1 + file_list: "./dataset/flowers102/train_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + +VALID: + batch_size: 20 + num_workers: 1 + file_list: "./dataset/flowers102/val_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: diff --git a/src/PaddleClas/ppcls/configs/quick_start/kunlun/VGG16_finetune_kunlun.yaml b/src/PaddleClas/ppcls/configs/quick_start/kunlun/VGG16_finetune_kunlun.yaml new file mode 100644 index 0000000..389a5f3 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/quick_start/kunlun/VGG16_finetune_kunlun.yaml @@ -0,0 +1,70 @@ +mode: 'train' +ARCHITECTURE: + name: 'VGG16' + params: + stop_grad_layers: 5 +pretrained_model: "./pretrained/VGG16_pretrained" +model_save_dir: "./output/" +classes_num: 102 +total_images: 1020 +save_interval: 1 +validate: True +valid_interval: 1 +epochs: 20 +topk: 5 +image_shape: [3, 224, 224] + +LEARNING_RATE: + function: 'Cosine' + params: + lr: 0.0005 + +OPTIMIZER: + function: 'Momentum' + params: + momentum: 0.9 + regularizer: + function: 'L2' + factor: 0.00001 + +TRAIN: + batch_size: 20 + num_workers: 0 + file_list: "./dataset/flowers102/train_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + +VALID: + batch_size: 20 + num_workers: 0 + file_list: "./dataset/flowers102/val_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: diff --git a/src/PaddleClas/ppcls/configs/quick_start/kunlun/VGG19_finetune_kunlun.yaml b/src/PaddleClas/ppcls/configs/quick_start/kunlun/VGG19_finetune_kunlun.yaml new file mode 100644 index 0000000..6ba38b9 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/quick_start/kunlun/VGG19_finetune_kunlun.yaml @@ -0,0 +1,70 @@ +mode: 'train' +ARCHITECTURE: + name: 'VGG19' + params: + stop_grad_layers: 5 +pretrained_model: "./pretrained/VGG19_pretrained" +model_save_dir: "./output/" +classes_num: 102 +total_images: 1020 +save_interval: 1 +validate: True +valid_interval: 1 +epochs: 20 +topk: 5 +image_shape: [3, 224, 224] + +LEARNING_RATE: + function: 'Cosine' + params: + lr: 0.0005 + +OPTIMIZER: + function: 'Momentum' + params: + momentum: 0.9 + regularizer: + function: 'L2' + factor: 0.00001 + +TRAIN: + batch_size: 20 + num_workers: 0 + file_list: "./dataset/flowers102/train_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + +VALID: + batch_size: 20 + num_workers: 0 + file_list: "./dataset/flowers102/val_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: diff --git a/src/PaddleClas/ppcls/configs/quick_start/new_user/ShuffleNetV2_x0_25.yaml b/src/PaddleClas/ppcls/configs/quick_start/new_user/ShuffleNetV2_x0_25.yaml new file mode 100644 index 0000000..1246366 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/quick_start/new_user/ShuffleNetV2_x0_25.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: cpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ShuffleNetV2_x0_25 + class_num: 102 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.0125 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/flowers102/ + cls_label_path: ./dataset/flowers102/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/flowers102/ + cls_label_path: ./dataset/flowers102/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ./dataset/flowers102/flowers102_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/quick_start/professional/MobileNetV1_multilabel.yaml b/src/PaddleClas/ppcls/configs/quick_start/professional/MobileNetV1_multilabel.yaml new file mode 100644 index 0000000..6838710 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/quick_start/professional/MobileNetV1_multilabel.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + 
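A note on the two adjacent Global keys here, which every config in this diff carries: checkpoints resumes a full training run (weights plus optimizer state), while pretrained_model only initializes weights for fine-tuning. A hedged sketch of that dispatch; the helper name is hypothetical, while paddle.load and set_state_dict are real Paddle APIs and .pdparams/.pdopt are the usual checkpoint suffixes:

import paddle

def init_from_config(model, optimizer, checkpoints=None, pretrained_model=None):
    if checkpoints:                         # resume: weights + optimizer state
        model.set_state_dict(paddle.load(checkpoints + ".pdparams"))
        optimizer.set_state_dict(paddle.load(checkpoints + ".pdopt"))
    elif pretrained_model:                  # fine-tune: weights only
        model.set_state_dict(paddle.load(pretrained_model + ".pdparams"))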
pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 10 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + use_multilabel: True +# model architecture +Arch: + name: MobileNetV1 + class_num: 33 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + Eval: + - MultiLabelLoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: ./dataset/NUS-WIDE-SCENE/NUS-SCENE-dataset/images/ + cls_label_path: ./dataset/NUS-WIDE-SCENE/NUS-SCENE-dataset/multilabel_train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: MultiLabelDataset + image_root: ./dataset/NUS-WIDE-SCENE/NUS-SCENE-dataset/images/ + cls_label_path: ./dataset/NUS-WIDE-SCENE/NUS-SCENE-dataset/multilabel_test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: ./deploy/images/0517_2715693311.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelTopk + topk: 5 + class_id_map_file: None + +Metric: + Train: + - HammingDistance: + - AccuracyScore: + Eval: + - HammingDistance: + - AccuracyScore: diff --git a/src/PaddleClas/ppcls/configs/quick_start/professional/MobileNetV3_large_x1_0_CIFAR100_finetune.yaml b/src/PaddleClas/ppcls/configs/quick_start/professional/MobileNetV3_large_x1_0_CIFAR100_finetune.yaml new file mode 100644 index 0000000..423a453 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/quick_start/professional/MobileNetV3_large_x1_0_CIFAR100_finetune.yaml @@ -0,0 +1,127 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 32, 32] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV3_large_x1_0 + class_num: 100 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval 
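Before the data-loader block, a note on the Cosine schedule in the Optimizer section above: it anneals the rate to zero over the whole run as lr_t = 0.5 * lr_0 * (1 + cos(pi * t / T)), whereas the Piecewise schedule used elsewhere in this diff steps through fixed values at the decay_epochs boundaries. A small sketch of both rules, assuming per-epoch stepping:

import math

def cosine_lr(base_lr, epoch, total_epochs):
    # lr_t = 0.5 * lr_0 * (1 + cos(pi * t / T))
    return 0.5 * base_lr * (1.0 + math.cos(math.pi * epoch / total_epochs))

def piecewise_lr(values, decay_epochs, epoch):
    # len(values) == len(decay_epochs) + 1; step down at each boundary
    for i, boundary in enumerate(decay_epochs):
        if epoch < boundary:
            return values[i]
    return values[-1]

# cosine_lr(0.04, 50, 100) -> 0.02
# piecewise_lr([0.1, 0.01, 0.001, 0.0001], [30, 60, 90], 45) -> 0.01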
+DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR100/ + cls_label_path: ./dataset/CIFAR100/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 32 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR100/ + cls_label_path: ./dataset/CIFAR100/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/quick_start/professional/R50_vd_distill_MV3_large_x1_0_CIFAR100.yaml b/src/PaddleClas/ppcls/configs/quick_start/professional/R50_vd_distill_MV3_large_x1_0_CIFAR100.yaml new file mode 100644 index 0000000..a27068d --- /dev/null +++ b/src/PaddleClas/ppcls/configs/quick_start/professional/R50_vd_distill_MV3_large_x1_0_CIFAR100.yaml @@ -0,0 +1,151 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 32, 32] + save_inference_dir: "./inference" + +# model architecture +Arch: + name: "DistillationModel" + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + models: + - Teacher: + name: ResNet50_vd + class_num: 100 + pretrained: "./pretrained/best_model" + - Student: + name: MobileNetV3_large_x1_0 + class_num: 100 + pretrained: True + + infer_model_name: "Student" + + +# loss function config for traing/eval process +Loss: + Train: + - DistillationCELoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/CIFAR100/" + cls_label_path: "./dataset/CIFAR100/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 32 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: 
DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: "./dataset/CIFAR100/" + cls_label_path: "./dataset/CIFAR100/test_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: DistillationPostProcess + func: Topk + topk: 5 + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/quick_start/professional/ResNet50_vd_CIFAR100.yaml b/src/PaddleClas/ppcls/configs/quick_start/professional/ResNet50_vd_CIFAR100.yaml new file mode 100644 index 0000000..ca0794f --- /dev/null +++ b/src/PaddleClas/ppcls/configs/quick_start/professional/ResNet50_vd_CIFAR100.yaml @@ -0,0 +1,127 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: cpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 32, 32] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50_vd + class_num: 100 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR100/ + cls_label_path: ./dataset/CIFAR100/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 32 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR100/ + cls_label_path: ./dataset/CIFAR100/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - 
NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/quick_start/professional/ResNet50_vd_mixup_CIFAR100_finetune.yaml b/src/PaddleClas/ppcls/configs/quick_start/professional/ResNet50_vd_mixup_CIFAR100_finetune.yaml new file mode 100644 index 0000000..d8ff817 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/quick_start/professional/ResNet50_vd_mixup_CIFAR100_finetune.yaml @@ -0,0 +1,127 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 32, 32] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50_vd + class_num: 100 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR100/ + cls_label_path: ./dataset/CIFAR100/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 32 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR100/ + cls_label_path: ./dataset/CIFAR100/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/quick_start/professional/VGG19_CIFAR10_DeepHash.yaml b/src/PaddleClas/ppcls/configs/quick_start/professional/VGG19_CIFAR10_DeepHash.yaml new file mode 100644 index 0000000..9722882 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/quick_start/professional/VGG19_CIFAR10_DeepHash.yaml @@ -0,0 +1,147 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + eval_mode: "retrieval" + epochs: 128 + print_batch_step: 10 + use_visualdl: False + + # used for static mode and model export + image_shape: [3, 224, 224] + 
save_inference_dir: ./inference + + #feature postprocess + feature_normalize: False + feature_binarize: "round" + +# model architecture +Arch: + name: "RecModel" + Backbone: + name: "VGG19Sigmoid" + pretrained: True + class_num: 48 + Head: + name: "FC" + class_num: 10 + embedding_size: 48 + + infer_output_key: "features" + infer_add_softmax: "false" + +# loss function config for train/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.01 + decay_epochs: [200] + values: [0.01, 0.001] + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/cifar10/ + cls_label_path: ./dataset/cifar10/cifar10-2/train.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 256 + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2023, 0.1994, 0.2010] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: ImageNetDataset + image_root: ./dataset/cifar10/ + cls_label_path: ./dataset/cifar10/cifar10-2/test.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2023, 0.1994, 0.2010] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: ImageNetDataset + image_root: ./dataset/cifar10/ + cls_label_path: ./dataset/cifar10/cifar10-2/database.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2023, 0.1994, 0.2010] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - mAP: + - Precisionk: + topk: [1, 5] + diff --git a/src/PaddleClas/ppcls/configs/slim/GeneralRecognition_PPLCNet_x2_5_quantization.yaml b/src/PaddleClas/ppcls/configs/slim/GeneralRecognition_PPLCNet_x2_5_quantization.yaml new file mode 100644 index 0000000..7b21d0b --- /dev/null +++ b/src/PaddleClas/ppcls/configs/slim/GeneralRecognition_PPLCNet_x2_5_quantization.yaml @@ -0,0 +1,154 @@ +# global configs +Global: + checkpoints: null + pretrained_model: https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/general_PPLCNet_x2_5_pretrained_v1.0.pdparams + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + +# for quantizaiton or prune model +Slim: + ## for prune + quant: + name: pact + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: PPLCNet_x2_5 + pretrained: False + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + 
embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.002 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_reg_all_data.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/slim/MobileNetV3_large_x1_0_prune.yaml b/src/PaddleClas/ppcls/configs/slim/MobileNetV3_large_x1_0_prune.yaml new file mode 100644 index 0000000..6655c3a --- /dev/null +++ b/src/PaddleClas/ppcls/configs/slim/MobileNetV3_large_x1_0_prune.yaml @@ -0,0 +1,139 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# for quantization or prune model +Slim: + ## for prune + prune: + name: fpgm + pruned_ratio: 0.3 + +# model architecture +Arch: + name: MobileNetV3_large_x1_0 + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.65 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 
0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/slim/MobileNetV3_large_x1_0_quantization.yaml b/src/PaddleClas/ppcls/configs/slim/MobileNetV3_large_x1_0_quantization.yaml new file mode 100644 index 0000000..517c467 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/slim/MobileNetV3_large_x1_0_quantization.yaml @@ -0,0 +1,138 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# for quantalization or prune model +Slim: + ## for quantization + quant: + name: pact + +# model architecture +Arch: + name: MobileNetV3_large_x1_0 + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.065 + warmup_epoch: 0 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: 
docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/src/PaddleClas/ppcls/configs/slim/PPLCNet_x1_0_quantization.yaml b/src/PaddleClas/ppcls/configs/slim/PPLCNet_x1_0_quantization.yaml
new file mode 100644
index 0000000..40111a0
--- /dev/null
+++ b/src/PaddleClas/ppcls/configs/slim/PPLCNet_x1_0_quantization.yaml
@@ -0,0 +1,138 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 60
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+
+# for quantization or prune model
+Slim:
+  ## for quantization
+  quant:
+    name: pact
+
+# model architecture
+Arch:
+  name: PPLCNet_x1_0
+  class_num: 1000
+  pretrained: True
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 0.02
+    warmup_epoch: 0
+  regularizer:
+    name: 'L2'
+    coeff: 0.00003
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - AutoAugment:
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/src/PaddleClas/ppcls/configs/slim/ResNet50_vd_prune.yaml b/src/PaddleClas/ppcls/configs/slim/ResNet50_vd_prune.yaml
new file mode 100644
index 0000000..7bfc537
--- /dev/null
+++ b/src/PaddleClas/ppcls/configs/slim/ResNet50_vd_prune.yaml
@@ -0,0 +1,138 @@
+# global configs
+Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# for quantization or prune model +Slim: + ## for prune + prune: + name: fpgm + pruned_ratio: 0.3 + +# model architecture +Arch: + name: ResNet50_vd + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/slim/ResNet50_vd_quantization.yaml b/src/PaddleClas/ppcls/configs/slim/ResNet50_vd_quantization.yaml new file mode 100644 index 0000000..f9db410 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/slim/ResNet50_vd_quantization.yaml @@ -0,0 +1,137 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# for quantalization or prune model +Slim: + ## for quantization + quant: + name: pact + +# model architecture +Arch: + name: ResNet50_vd + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for 
train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/src/PaddleClas/ppcls/configs/slim/ResNet50_vehicle_cls_prune.yaml b/src/PaddleClas/ppcls/configs/slim/ResNet50_vehicle_cls_prune.yaml new file mode 100644 index 0000000..1f6fea8 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/slim/ResNet50_vehicle_cls_prune.yaml @@ -0,0 +1,135 @@ +# global configs +Global: + checkpoints: null + pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/vehicle_cls_ResNet50_CompCars_v1.2_pretrained.pdparams" + output_dir: "./output_vehicle_cls_prune/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 160 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + +Slim: + prune: + name: fpgm + pruned_ratio: 0.3 + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 431 + margin: 0.15 + scale: 32 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + regularizer: + name: 'L2' + coeff: 0.0005 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "CompCars" + image_root: "./dataset/CompCars/image/" + label_root: "./dataset/CompCars/label/" + bbox_crop: True + cls_label_path: "./dataset/CompCars/train_test_split/classification/train_label.txt" + transform_ops: + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - 
NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] + + sampler: + name: PKSampler + batch_size: 128 + sample_per_id: 2 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: "CompCars" + image_root: "./dataset/CompCars/image/" + label_root: "./dataset/CompCars/label/" + cls_label_path: "./dataset/CompCars/train_test_split/classification/test_label.txt" + bbox_crop: True + transform_ops: + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + diff --git a/src/PaddleClas/ppcls/configs/slim/ResNet50_vehicle_cls_quantization.yaml b/src/PaddleClas/ppcls/configs/slim/ResNet50_vehicle_cls_quantization.yaml new file mode 100644 index 0000000..026b865 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/slim/ResNet50_vehicle_cls_quantization.yaml @@ -0,0 +1,134 @@ +# global configs +Global: + checkpoints: null + pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/vehicle_cls_ResNet50_CompCars_v1.2_pretrained.pdparams" + output_dir: "./output_vehicle_cls_pact/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 80 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + +Slim: + quant: + name: pact + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 431 + margin: 0.15 + scale: 32 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: 'L2' + coeff: 0.0005 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "CompCars" + image_root: "./dataset/CompCars/image/" + label_root: "./dataset/CompCars/label/" + bbox_crop: True + cls_label_path: "./dataset/CompCars/train_test_split/classification/train_label.txt" + transform_ops: + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] 
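The RandomErasing entries in these vehicle recipes (EPSILON is the apply probability, sl/sh bound the erased-area fraction, r1 bounds the aspect ratio, and mean is the fill value) follow the usual Zhong et al. formulation. As a minimal illustrative sketch of what those parameters control, assuming an HWC NumPy image, and not the exact PaddleClas operator:

import math
import random

def random_erasing(img, epsilon=0.5, sl=0.02, sh=0.4, r1=0.3,
                   mean=(0.0, 0.0, 0.0)):
    """Erase a random patch of an HWC NumPy image with probability epsilon."""
    if random.random() > epsilon:
        return img
    h, w = img.shape[:2]
    for _ in range(100):  # retry until a patch fits inside the image
        area = random.uniform(sl, sh) * h * w   # erased area, a fraction of h*w
        ratio = random.uniform(r1, 1.0 / r1)    # aspect ratio of the patch
        eh = int(round(math.sqrt(area * ratio)))
        ew = int(round(math.sqrt(area / ratio)))
        if 0 < eh < h and 0 < ew < w:
            top = random.randint(0, h - eh)
            left = random.randint(0, w - ew)
            img[top:top + eh, left:left + ew, :] = mean  # fill with `mean`
            return img
    return img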
+ + sampler: + name: PKSampler + batch_size: 64 + sample_per_id: 2 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: "CompCars" + image_root: "./dataset/CompCars/image/" + label_root: "./dataset/CompCars/label/" + cls_label_path: "./dataset/CompCars/train_test_split/classification/test_label.txt" + bbox_crop: True + transform_ops: + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + diff --git a/src/PaddleClas/ppcls/configs/slim/ResNet50_vehicle_reid_prune.yaml b/src/PaddleClas/ppcls/configs/slim/ResNet50_vehicle_reid_prune.yaml new file mode 100644 index 0000000..63b87f1 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/slim/ResNet50_vehicle_reid_prune.yaml @@ -0,0 +1,162 @@ +# global configs +Global: + checkpoints: null + pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/vehicle_reid_ResNet50_VERIWild_v1.1_pretrained.pdparams" + output_dir: "./output_vehicle_reid_prune/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 160 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + eval_mode: "retrieval" + +# for quantizaiton or prune model +Slim: + ## for prune + prune: + name: fpgm + pruned_ratio: 0.3 + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 30671 + margin: 0.15 + scale: 32 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images/" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/train_list_start0.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] 
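The PKSampler used by these recipes builds each batch from batch_size // sample_per_id identities with sample_per_id images each, which is what SupConLoss (views: 2) needs to form positive pairs. A simplified single-process sketch of that batching logic, assuming integer labels and a hypothetical helper name:

import random
from collections import defaultdict

def pk_batches(labels, batch_size=128, sample_per_id=2):
    """Yield index batches holding batch_size // sample_per_id identities,
    with sample_per_id images per identity (single-process sketch)."""
    ids_per_batch = batch_size // sample_per_id
    buckets = defaultdict(list)
    for idx, pid in enumerate(labels):
        buckets[pid].append(idx)
    pids = list(buckets)
    random.shuffle(pids)
    for i in range(0, len(pids) - ids_per_batch + 1, ids_per_batch):
        batch = []
        for pid in pids[i:i + ids_per_batch]:
            # sample with replacement when an identity has < sample_per_id images
            batch.extend(random.choices(buckets[pid], k=sample_per_id))
        yield batch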
+ + sampler: + name: PKSampler + batch_size: 128 + sample_per_id: 2 + drop_last: False + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + Eval: + Query: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id_query.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + + Gallery: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} + diff --git a/src/PaddleClas/ppcls/configs/slim/ResNet50_vehicle_reid_quantization.yaml b/src/PaddleClas/ppcls/configs/slim/ResNet50_vehicle_reid_quantization.yaml new file mode 100644 index 0000000..cca9915 --- /dev/null +++ b/src/PaddleClas/ppcls/configs/slim/ResNet50_vehicle_reid_quantization.yaml @@ -0,0 +1,161 @@ +# global configs +Global: + checkpoints: null + pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/vehicle_reid_ResNet50_VERIWild_v1.1_pretrained.pdparams" + output_dir: "./output_vehicle_reid_pact/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + eval_mode: "retrieval" + +# for quantizaiton or prune model +Slim: + ## for prune + quant: + name: pact + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 30671 + margin: 0.15 + scale: 32 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images/" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/train_list_start0.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] 
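With eval_mode: "retrieval", the Query split is embedded and ranked against the Gallery split, and the Recallk/mAP metrics above are computed over that ranking. A rough NumPy sketch of Recall@K under cosine similarity, as a hypothetical helper rather than the PaddleClas metric code:

import numpy as np

def recall_at_k(q_feats, q_labels, g_feats, g_labels, k=1):
    """Fraction of queries whose top-k gallery matches contain the query label."""
    q = q_feats / np.linalg.norm(q_feats, axis=1, keepdims=True)
    g = g_feats / np.linalg.norm(g_feats, axis=1, keepdims=True)
    sim = q @ g.T                           # cosine similarity, (num_q, num_g)
    topk = np.argsort(-sim, axis=1)[:, :k]  # indices of the k best matches
    hits = (g_labels[topk] == q_labels[:, None]).any(axis=1)
    return float(hits.mean())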
+ + sampler: + name: PKSampler + batch_size: 64 + sample_per_id: 2 + drop_last: False + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + Eval: + Query: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id_query.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + + Gallery: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} + diff --git a/src/PaddleClas/ppcls/data/__init__.py b/src/PaddleClas/ppcls/data/__init__.py new file mode 100644 index 0000000..cffac81 --- /dev/null +++ b/src/PaddleClas/ppcls/data/__init__.py @@ -0,0 +1,144 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
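# NOTE: create_operators() below turns the transform_ops lists from the YAML
# configs into operator instances: each single-key dict names a class in
# ppcls.data.preprocess and carries its constructor kwargs. A hedged usage
# sketch (values mirror the configs above; raw_jpeg_bytes is a placeholder):
#
#     ops = create_operators([
#         {"DecodeImage": {"to_rgb": True, "channel_first": False}},
#         {"ResizeImage": {"resize_short": 256}},
#         {"CropImage": {"size": 224}},
#         {"NormalizeImage": {"scale": 1.0 / 255.0,
#                             "mean": [0.485, 0.456, 0.406],
#                             "std": [0.229, 0.224, 0.225]}},
#     ])
#     img = transform(raw_jpeg_bytes, ops)  # apply each op in order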
+ +import inspect +import copy +import paddle +import numpy as np +from paddle.io import DistributedBatchSampler, BatchSampler, DataLoader +from ppcls.utils import logger + +from ppcls.data import dataloader +# dataset +from ppcls.data.dataloader.imagenet_dataset import ImageNetDataset +from ppcls.data.dataloader.multilabel_dataset import MultiLabelDataset +from ppcls.data.dataloader.common_dataset import create_operators +from ppcls.data.dataloader.vehicle_dataset import CompCars, VeriWild +from ppcls.data.dataloader.logo_dataset import LogoDataset +from ppcls.data.dataloader.icartoon_dataset import ICartoonDataset +from ppcls.data.dataloader.mix_dataset import MixDataset + +# sampler +from ppcls.data.dataloader.DistributedRandomIdentitySampler import DistributedRandomIdentitySampler +from ppcls.data.dataloader.pk_sampler import PKSampler +from ppcls.data.dataloader.mix_sampler import MixSampler +from ppcls.data import preprocess +from ppcls.data.preprocess import transform + + +def create_operators(params, class_num=None): + """ + create operators based on the config + + Args: + params(list): a dict list, used to create some operators + """ + assert isinstance(params, list), ('operator config should be a list') + ops = [] + for operator in params: + assert isinstance(operator, + dict) and len(operator) == 1, "yaml format error" + op_name = list(operator)[0] + param = {} if operator[op_name] is None else operator[op_name] + op_func = getattr(preprocess, op_name) + if "class_num" in inspect.getfullargspec(op_func).args: + param.update({"class_num": class_num}) + op = op_func(**param) + ops.append(op) + + return ops + + +def build_dataloader(config, mode, device, use_dali=False, seed=None): + assert mode in [ + 'Train', 'Eval', 'Test', 'Gallery', 'Query' + ], "Dataset mode should be Train, Eval, Test, Gallery, Query" + # build dataset + if use_dali: + from ppcls.data.dataloader.dali import dali_dataloader + return dali_dataloader(config, mode, paddle.device.get_device(), seed) + + class_num = config.get("class_num", None) + config_dataset = config[mode]['dataset'] + config_dataset = copy.deepcopy(config_dataset) + dataset_name = config_dataset.pop('name') + if 'batch_transform_ops' in config_dataset: + batch_transform = config_dataset.pop('batch_transform_ops') + else: + batch_transform = None + + dataset = eval(dataset_name)(**config_dataset) + + logger.debug("build dataset({}) success...".format(dataset)) + + # build sampler + config_sampler = config[mode]['sampler'] + if "name" not in config_sampler: + batch_sampler = None + batch_size = config_sampler["batch_size"] + drop_last = config_sampler["drop_last"] + shuffle = config_sampler["shuffle"] + else: + sampler_name = config_sampler.pop("name") + batch_sampler = eval(sampler_name)(dataset, **config_sampler) + + logger.debug("build batch_sampler({}) success...".format(batch_sampler)) + + # build batch operator + def mix_collate_fn(batch): + batch = transform(batch, batch_ops) + # batch each field + slots = [] + for items in batch: + for i, item in enumerate(items): + if len(slots) < len(items): + slots.append([item]) + else: + slots[i].append(item) + return [np.stack(slot, axis=0) for slot in slots] + + if isinstance(batch_transform, list): + batch_ops = create_operators(batch_transform, class_num) + batch_collate_fn = mix_collate_fn + else: + batch_collate_fn = None + + # build dataloader + config_loader = config[mode]['loader'] + num_workers = config_loader["num_workers"] + use_shared_memory = config_loader["use_shared_memory"] + + if 
batch_sampler is None: + data_loader = DataLoader( + dataset=dataset, + places=device, + num_workers=num_workers, + return_list=True, + use_shared_memory=use_shared_memory, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last, + collate_fn=batch_collate_fn) + else: + data_loader = DataLoader( + dataset=dataset, + places=device, + num_workers=num_workers, + return_list=True, + use_shared_memory=use_shared_memory, + batch_sampler=batch_sampler, + collate_fn=batch_collate_fn) + + logger.debug("build data_loader({}) success...".format(data_loader)) + return data_loader diff --git a/src/PaddleClas/ppcls/data/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/data/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..a93f9c4 Binary files /dev/null and b/src/PaddleClas/ppcls/data/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/dataloader/.imagenet_dataset.py.un~ b/src/PaddleClas/ppcls/data/dataloader/.imagenet_dataset.py.un~ new file mode 100644 index 0000000..f7f916d Binary files /dev/null and b/src/PaddleClas/ppcls/data/dataloader/.imagenet_dataset.py.un~ differ diff --git a/src/PaddleClas/ppcls/data/dataloader/DistributedRandomIdentitySampler.py b/src/PaddleClas/ppcls/data/dataloader/DistributedRandomIdentitySampler.py new file mode 100644 index 0000000..1203803 --- /dev/null +++ b/src/PaddleClas/ppcls/data/dataloader/DistributedRandomIdentitySampler.py @@ -0,0 +1,90 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from collections import defaultdict +import numpy as np +import copy +import random +from paddle.io import DistributedBatchSampler, Sampler + + +class DistributedRandomIdentitySampler(DistributedBatchSampler): + """ + Randomly sample N identities, then for each identity, + randomly sample K instances, therefore batch size is N*K. + Args: + - data_source (list): list of (img_path, pid, camid). + - num_instances (int): number of instances per identity in a batch. + - batch_size (int): number of examples in a batch. 
+ """ + + def __init__(self, dataset, batch_size, num_instances, drop_last, **args): + self.dataset = dataset + self.batch_size = batch_size + self.num_instances = num_instances + self.drop_last = drop_last + self.num_pids_per_batch = self.batch_size // self.num_instances + self.index_dic = defaultdict(list) + for index, pid in enumerate(self.dataset.labels): + self.index_dic[pid].append(index) + self.pids = list(self.index_dic.keys()) + # estimate number of examples in an epoch + self.length = 0 + for pid in self.pids: + idxs = self.index_dic[pid] + num = len(idxs) + if num < self.num_instances: + num = self.num_instances + self.length += num - num % self.num_instances + + def __iter__(self): + batch_idxs_dict = defaultdict(list) + for pid in self.pids: + idxs = copy.deepcopy(self.index_dic[pid]) + if len(idxs) < self.num_instances: + idxs = np.random.choice( + idxs, size=self.num_instances, replace=True) + random.shuffle(idxs) + batch_idxs = [] + for idx in idxs: + batch_idxs.append(idx) + if len(batch_idxs) == self.num_instances: + batch_idxs_dict[pid].append(batch_idxs) + batch_idxs = [] + avai_pids = copy.deepcopy(self.pids) + final_idxs = [] + while len(avai_pids) >= self.num_pids_per_batch: + selected_pids = random.sample(avai_pids, self.num_pids_per_batch) + for pid in selected_pids: + batch_idxs = batch_idxs_dict[pid].pop(0) + final_idxs.extend(batch_idxs) + if len(batch_idxs_dict[pid]) == 0: + avai_pids.remove(pid) + _sample_iter = iter(final_idxs) + batch_indices = [] + for idx in _sample_iter: + batch_indices.append(idx) + if len(batch_indices) == self.batch_size: + yield batch_indices + batch_indices = [] + if not self.drop_last and len(batch_indices) > 0: + yield batch_indices + + def __len__(self): + if self.drop_last: + return self.length // self.batch_size + else: + return (self.length + self.batch_size - 1) // self.batch_size diff --git a/src/PaddleClas/ppcls/data/dataloader/__init__.py b/src/PaddleClas/ppcls/data/dataloader/__init__.py new file mode 100644 index 0000000..8f81921 --- /dev/null +++ b/src/PaddleClas/ppcls/data/dataloader/__init__.py @@ -0,0 +1,9 @@ +from ppcls.data.dataloader.imagenet_dataset import ImageNetDataset +from ppcls.data.dataloader.multilabel_dataset import MultiLabelDataset +from ppcls.data.dataloader.common_dataset import create_operators +from ppcls.data.dataloader.vehicle_dataset import CompCars, VeriWild +from ppcls.data.dataloader.logo_dataset import LogoDataset +from ppcls.data.dataloader.icartoon_dataset import ICartoonDataset +from ppcls.data.dataloader.mix_dataset import MixDataset +from ppcls.data.dataloader.mix_sampler import MixSampler +from ppcls.data.dataloader.pk_sampler import PKSampler diff --git a/src/PaddleClas/ppcls/data/dataloader/__pycache__/DistributedRandomIdentitySampler.cpython-39.pyc b/src/PaddleClas/ppcls/data/dataloader/__pycache__/DistributedRandomIdentitySampler.cpython-39.pyc new file mode 100644 index 0000000..297a252 Binary files /dev/null and b/src/PaddleClas/ppcls/data/dataloader/__pycache__/DistributedRandomIdentitySampler.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/dataloader/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/data/dataloader/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..94af5ef Binary files /dev/null and b/src/PaddleClas/ppcls/data/dataloader/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/dataloader/__pycache__/common_dataset.cpython-39.pyc 
b/src/PaddleClas/ppcls/data/dataloader/__pycache__/common_dataset.cpython-39.pyc new file mode 100644 index 0000000..a35db70 Binary files /dev/null and b/src/PaddleClas/ppcls/data/dataloader/__pycache__/common_dataset.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/dataloader/__pycache__/icartoon_dataset.cpython-39.pyc b/src/PaddleClas/ppcls/data/dataloader/__pycache__/icartoon_dataset.cpython-39.pyc new file mode 100644 index 0000000..4ce7f8f Binary files /dev/null and b/src/PaddleClas/ppcls/data/dataloader/__pycache__/icartoon_dataset.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/dataloader/__pycache__/imagenet_dataset.cpython-39.pyc b/src/PaddleClas/ppcls/data/dataloader/__pycache__/imagenet_dataset.cpython-39.pyc new file mode 100644 index 0000000..669c776 Binary files /dev/null and b/src/PaddleClas/ppcls/data/dataloader/__pycache__/imagenet_dataset.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/dataloader/__pycache__/logo_dataset.cpython-39.pyc b/src/PaddleClas/ppcls/data/dataloader/__pycache__/logo_dataset.cpython-39.pyc new file mode 100644 index 0000000..4044dcd Binary files /dev/null and b/src/PaddleClas/ppcls/data/dataloader/__pycache__/logo_dataset.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/dataloader/__pycache__/mix_dataset.cpython-39.pyc b/src/PaddleClas/ppcls/data/dataloader/__pycache__/mix_dataset.cpython-39.pyc new file mode 100644 index 0000000..cb07e0a Binary files /dev/null and b/src/PaddleClas/ppcls/data/dataloader/__pycache__/mix_dataset.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/dataloader/__pycache__/mix_sampler.cpython-39.pyc b/src/PaddleClas/ppcls/data/dataloader/__pycache__/mix_sampler.cpython-39.pyc new file mode 100644 index 0000000..06d9c7f Binary files /dev/null and b/src/PaddleClas/ppcls/data/dataloader/__pycache__/mix_sampler.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/dataloader/__pycache__/multilabel_dataset.cpython-39.pyc b/src/PaddleClas/ppcls/data/dataloader/__pycache__/multilabel_dataset.cpython-39.pyc new file mode 100644 index 0000000..7d7b90f Binary files /dev/null and b/src/PaddleClas/ppcls/data/dataloader/__pycache__/multilabel_dataset.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/dataloader/__pycache__/pk_sampler.cpython-39.pyc b/src/PaddleClas/ppcls/data/dataloader/__pycache__/pk_sampler.cpython-39.pyc new file mode 100644 index 0000000..70d4598 Binary files /dev/null and b/src/PaddleClas/ppcls/data/dataloader/__pycache__/pk_sampler.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/dataloader/__pycache__/vehicle_dataset.cpython-39.pyc b/src/PaddleClas/ppcls/data/dataloader/__pycache__/vehicle_dataset.cpython-39.pyc new file mode 100644 index 0000000..fbc8c11 Binary files /dev/null and b/src/PaddleClas/ppcls/data/dataloader/__pycache__/vehicle_dataset.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/dataloader/common_dataset.py b/src/PaddleClas/ppcls/data/dataloader/common_dataset.py new file mode 100644 index 0000000..b7b03d8 --- /dev/null +++ b/src/PaddleClas/ppcls/data/dataloader/common_dataset.py @@ -0,0 +1,84 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+
+from paddle.io import Dataset
+import cv2
+
+from ppcls.data import preprocess
+from ppcls.data.preprocess import transform
+from ppcls.utils import logger
+
+
+def create_operators(params):
+    """
+    create operators based on the config
+    Args:
+        params(list): a dict list, used to create some operators
+    """
+    assert isinstance(params, list), ('operator config should be a list')
+    ops = []
+    for operator in params:
+        assert isinstance(operator,
+                          dict) and len(operator) == 1, "yaml format error"
+        op_name = list(operator)[0]
+        param = {} if operator[op_name] is None else operator[op_name]
+        op = getattr(preprocess, op_name)(**param)
+        ops.append(op)
+
+    return ops
+
+
+class CommonDataset(Dataset):
+    def __init__(
+            self,
+            image_root,
+            cls_label_path,
+            transform_ops=None, ):
+        self._img_root = image_root
+        self._cls_path = cls_label_path
+        # default to None so __getitem__ does not raise an AttributeError
+        # (and retry forever) when no transform_ops are configured
+        self._transform_ops = None
+        if transform_ops:
+            self._transform_ops = create_operators(transform_ops)
+
+        self.images = []
+        self.labels = []
+        self._load_anno()
+
+    def _load_anno(self):
+        pass
+
+    def __getitem__(self, idx):
+        try:
+            with open(self.images[idx], 'rb') as f:
+                img = f.read()
+            if self._transform_ops:
+                img = transform(img, self._transform_ops)
+            img = img.transpose((2, 0, 1))
+            return (img, self.labels[idx])
+
+        except Exception as ex:
+            logger.error("Exception occurred when parsing line: {} with msg: {}".
+                         format(self.images[idx], ex))
+            rnd_idx = np.random.randint(self.__len__())
+            return self.__getitem__(rnd_idx)
+
+    def __len__(self):
+        return len(self.images)
+
+    @property
+    def class_num(self):
+        return len(set(self.labels))
diff --git a/src/PaddleClas/ppcls/data/dataloader/dali.py b/src/PaddleClas/ppcls/data/dataloader/dali.py
new file mode 100644
index 0000000..a15c231
--- /dev/null
+++ b/src/PaddleClas/ppcls/data/dataloader/dali.py
@@ -0,0 +1,319 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
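# NOTE: DALI's CropMirrorNormalize consumes raw pixels in [0, 255], while the
# YAML configs express mean/std for inputs already multiplied by
# scale = 1/255. dali_dataloader() below therefore rescales them back to
# pixel units before building the pipelines:
#
#     mean = [v / scale for v in mean]  # e.g. 0.485 / (1 / 255) = 123.675
#     std = [v / scale for v in std]    # e.g. 0.229 / (1 / 255) = 58.395
#
# so (x - 123.675) / 58.395 on uint8 pixels equals (x / 255 - 0.485) / 0.229.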
+ +from __future__ import division + +import copy +import os + +import numpy as np +import nvidia.dali.ops as ops +import nvidia.dali.types as types +import paddle +from nvidia.dali import fn +from nvidia.dali.pipeline import Pipeline +from nvidia.dali.plugin.base_iterator import LastBatchPolicy +from nvidia.dali.plugin.paddle import DALIGenericIterator + + +class HybridTrainPipe(Pipeline): + def __init__(self, + file_root, + file_list, + batch_size, + resize_shorter, + crop, + min_area, + lower, + upper, + interp, + mean, + std, + device_id, + shard_id=0, + num_shards=1, + random_shuffle=True, + num_threads=4, + seed=42, + pad_output=False, + output_dtype=types.FLOAT, + dataset='Train'): + super(HybridTrainPipe, self).__init__( + batch_size, num_threads, device_id, seed=seed) + self.input = ops.readers.File( + file_root=file_root, + file_list=file_list, + shard_id=shard_id, + num_shards=num_shards, + random_shuffle=random_shuffle) + # set internal nvJPEG buffers size to handle full-sized ImageNet images + # without additional reallocations + device_memory_padding = 211025920 + host_memory_padding = 140544512 + self.decode = ops.decoders.ImageRandomCrop( + device='mixed', + output_type=types.DALIImageType.RGB, + device_memory_padding=device_memory_padding, + host_memory_padding=host_memory_padding, + random_aspect_ratio=[lower, upper], + random_area=[min_area, 1.0], + num_attempts=100) + self.res = ops.Resize( + device='gpu', resize_x=crop, resize_y=crop, interp_type=interp) + self.cmnp = ops.CropMirrorNormalize( + device="gpu", + dtype=output_dtype, + output_layout='CHW', + crop=(crop, crop), + mean=mean, + std=std, + pad_output=pad_output) + self.coin = ops.random.CoinFlip(probability=0.5) + self.to_int64 = ops.Cast(dtype=types.DALIDataType.INT64, device="gpu") + + def define_graph(self): + rng = self.coin() + jpegs, labels = self.input(name="Reader") + images = self.decode(jpegs) + images = self.res(images) + output = self.cmnp(images.gpu(), mirror=rng) + return [output, self.to_int64(labels.gpu())] + + def __len__(self): + return self.epoch_size("Reader") + + +class HybridValPipe(Pipeline): + def __init__(self, + file_root, + file_list, + batch_size, + resize_shorter, + crop, + interp, + mean, + std, + device_id, + shard_id=0, + num_shards=1, + random_shuffle=False, + num_threads=4, + seed=42, + pad_output=False, + output_dtype=types.FLOAT): + super(HybridValPipe, self).__init__( + batch_size, num_threads, device_id, seed=seed) + self.input = ops.readers.File( + file_root=file_root, + file_list=file_list, + shard_id=shard_id, + num_shards=num_shards, + random_shuffle=random_shuffle) + self.decode = ops.decoders.Image(device="mixed") + self.res = ops.Resize( + device="gpu", resize_shorter=resize_shorter, interp_type=interp) + self.cmnp = ops.CropMirrorNormalize( + device="gpu", + dtype=output_dtype, + output_layout='CHW', + crop=(crop, crop), + mean=mean, + std=std, + pad_output=pad_output) + self.to_int64 = ops.Cast(dtype=types.DALIDataType.INT64, device="gpu") + + def define_graph(self): + jpegs, labels = self.input(name="Reader") + images = self.decode(jpegs) + images = self.res(images) + output = self.cmnp(images) + return [output, self.to_int64(labels.gpu())] + + def __len__(self): + return self.epoch_size("Reader") + + +def dali_dataloader(config, mode, device, seed=None): + assert "gpu" in device, "gpu training is required for DALI" + device_id = int(device.split(':')[1]) + config_dataloader = config[mode] + seed = 42 if seed is None else seed + ops = [ + list(x.keys())[0] + for 
x in config_dataloader["dataset"]["transform_ops"] + ] + support_ops_train = [ + "DecodeImage", "NormalizeImage", "RandFlipImage", "RandCropImage" + ] + support_ops_eval = [ + "DecodeImage", "ResizeImage", "CropImage", "NormalizeImage" + ] + + if mode.lower() == 'train': + assert set(ops) == set( + support_ops_train + ), "The supported trasform_ops for train_dataset in dali is : {}".format( + ",".join(support_ops_train)) + else: + assert set(ops) == set( + support_ops_eval + ), "The supported trasform_ops for eval_dataset in dali is : {}".format( + ",".join(support_ops_eval)) + + normalize_ops = [ + op for op in config_dataloader["dataset"]["transform_ops"] + if "NormalizeImage" in op + ][0]["NormalizeImage"] + channel_num = normalize_ops.get("channel_num", 3) + output_dtype = types.FLOAT16 if normalize_ops.get("output_fp16", + False) else types.FLOAT + + env = os.environ + # assert float(env.get('FLAGS_fraction_of_gpu_memory_to_use', 0.92)) < 0.9, \ + # "Please leave enough GPU memory for DALI workspace, e.g., by setting" \ + # " `export FLAGS_fraction_of_gpu_memory_to_use=0.8`" + + gpu_num = paddle.distributed.get_world_size() + + batch_size = config_dataloader["sampler"]["batch_size"] + + file_root = config_dataloader["dataset"]["image_root"] + file_list = config_dataloader["dataset"]["cls_label_path"] + + interp = 1 # settings.interpolation or 1 # default to linear + interp_map = { + 0: types.DALIInterpType.INTERP_NN, # cv2.INTER_NEAREST + 1: types.DALIInterpType.INTERP_LINEAR, # cv2.INTER_LINEAR + 2: types.DALIInterpType.INTERP_CUBIC, # cv2.INTER_CUBIC + 3: types.DALIInterpType. + INTERP_LANCZOS3, # XXX use LANCZOS3 for cv2.INTER_LANCZOS4 + } + + assert interp in interp_map, "interpolation method not supported by DALI" + interp = interp_map[interp] + pad_output = channel_num == 4 + + transforms = { + k: v + for d in config_dataloader["dataset"]["transform_ops"] + for k, v in d.items() + } + + scale = transforms["NormalizeImage"].get("scale", 1.0 / 255) + scale = eval(scale) if isinstance(scale, str) else scale + mean = transforms["NormalizeImage"].get("mean", [0.485, 0.456, 0.406]) + std = transforms["NormalizeImage"].get("std", [0.229, 0.224, 0.225]) + mean = [v / scale for v in mean] + std = [v / scale for v in std] + + sampler_name = config_dataloader["sampler"].get("name", + "DistributedBatchSampler") + assert sampler_name in ["DistributedBatchSampler", "BatchSampler"] + + if mode.lower() == "train": + resize_shorter = 256 + crop = transforms["RandCropImage"]["size"] + scale = transforms["RandCropImage"].get("scale", [0.08, 1.]) + ratio = transforms["RandCropImage"].get("ratio", [3.0 / 4, 4.0 / 3]) + min_area = scale[0] + lower = ratio[0] + upper = ratio[1] + + if 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env: + shard_id = int(env['PADDLE_TRAINER_ID']) + num_shards = int(env['PADDLE_TRAINERS_NUM']) + device_id = int(env['FLAGS_selected_gpus']) + pipe = HybridTrainPipe( + file_root, + file_list, + batch_size, + resize_shorter, + crop, + min_area, + lower, + upper, + interp, + mean, + std, + device_id, + shard_id, + num_shards, + seed=seed + shard_id, + pad_output=pad_output, + output_dtype=output_dtype) + pipe.build() + pipelines = [pipe] + # sample_per_shard = len(pipe) // num_shards + else: + pipe = HybridTrainPipe( + file_root, + file_list, + batch_size, + resize_shorter, + crop, + min_area, + lower, + upper, + interp, + mean, + std, + device_id=device_id, + shard_id=0, + num_shards=1, + seed=seed, + pad_output=pad_output, + output_dtype=output_dtype) + 
pipe.build() + pipelines = [pipe] + # sample_per_shard = len(pipelines[0]) + return DALIGenericIterator( + pipelines, ['data', 'label'], reader_name='Reader') + else: + resize_shorter = transforms["ResizeImage"].get("resize_short", 256) + crop = transforms["CropImage"]["size"] + if 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env and sampler_name == "DistributedBatchSampler": + shard_id = int(env['PADDLE_TRAINER_ID']) + num_shards = int(env['PADDLE_TRAINERS_NUM']) + device_id = int(env['FLAGS_selected_gpus']) + + pipe = HybridValPipe( + file_root, + file_list, + batch_size, + resize_shorter, + crop, + interp, + mean, + std, + device_id=device_id, + shard_id=shard_id, + num_shards=num_shards, + pad_output=pad_output, + output_dtype=output_dtype) + else: + pipe = HybridValPipe( + file_root, + file_list, + batch_size, + resize_shorter, + crop, + interp, + mean, + std, + device_id=device_id, + pad_output=pad_output, + output_dtype=output_dtype) + pipe.build() + return DALIGenericIterator( + [pipe], ['data', 'label'], reader_name="Reader") diff --git a/src/PaddleClas/ppcls/data/dataloader/icartoon_dataset.py b/src/PaddleClas/ppcls/data/dataloader/icartoon_dataset.py new file mode 100644 index 0000000..18e3b4b --- /dev/null +++ b/src/PaddleClas/ppcls/data/dataloader/icartoon_dataset.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import os + +from .common_dataset import CommonDataset + + +class ICartoonDataset(CommonDataset): + def _load_anno(self, seed=None): + assert os.path.exists(self._cls_path) + assert os.path.exists(self._img_root) + self.images = [] + self.labels = [] + + with open(self._cls_path) as fd: + lines = fd.readlines() + for l in lines: + l = l.strip().split("\t") + self.images.append(os.path.join(self._img_root, l[0])) + self.labels.append(np.int64(l[1])) + assert os.path.exists(self.images[-1]) diff --git a/src/PaddleClas/ppcls/data/dataloader/imagenet_dataset.py b/src/PaddleClas/ppcls/data/dataloader/imagenet_dataset.py new file mode 100644 index 0000000..1166ab3 --- /dev/null +++ b/src/PaddleClas/ppcls/data/dataloader/imagenet_dataset.py @@ -0,0 +1,38 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
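
For reference, a minimal config that satisfies the assertions in `dali_dataloader` above for `mode="Train"` might look like the hedged sketch below; the paths and batch size are placeholders, and the keys mirror the PaddleClas YAML layout. Note that `mean` and `std` are divided by `scale` inside `dali_dataloader` because DALI's `CropMirrorNormalize` normalizes raw 0-255 pixel values, while the config expresses them for images already scaled to [0, 1].

```python
# Hypothetical config sketch for dali_dataloader() above (mode="Train").
# The train branch asserts transform_ops contains exactly DecodeImage,
# RandCropImage, RandFlipImage and NormalizeImage.
config = {
    "Train": {
        "dataset": {
            "image_root": "./dataset/ILSVRC2012/",         # placeholder path
            "cls_label_path": "./dataset/train_list.txt",  # placeholder path
            "transform_ops": [
                {"DecodeImage": {"to_rgb": True}},
                {"RandCropImage": {"size": 224,
                                   "scale": [0.08, 1.0],
                                   "ratio": [0.75, 1.3333]}},
                {"RandFlipImage": {"flip_code": 1}},
                {"NormalizeImage": {"scale": "1.0/255.0",  # eval()-ed string
                                    "mean": [0.485, 0.456, 0.406],
                                    "std": [0.229, 0.224, 0.225]}},
            ],
        },
        "sampler": {"name": "DistributedBatchSampler", "batch_size": 64},
    },
}

# loader = dali_dataloader(config, "Train", "gpu:0", seed=42)
# for batch in loader:
#     data, label = batch[0]["data"], batch[0]["label"]
```
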
+ +from __future__ import print_function + +import numpy as np +import os + +from .common_dataset import CommonDataset + + +class ImageNetDataset(CommonDataset): + def _load_anno(self, seed=None): + assert os.path.exists(self._cls_path) + assert os.path.exists(self._img_root) + self.images = [] + self.labels = [] + + with open(self._cls_path) as fd: + lines = fd.readlines() + if seed is not None: + np.random.RandomState(seed).shuffle(lines) + for l in lines: + l = l.strip().split(" ") + self.images.append(os.path.join(self._img_root, l[0])) + self.labels.append(np.int64(l[1])) + assert os.path.exists(self.images[-1]) diff --git a/src/PaddleClas/ppcls/data/dataloader/logo_dataset.py b/src/PaddleClas/ppcls/data/dataloader/logo_dataset.py new file mode 100644 index 0000000..132ead9 --- /dev/null +++ b/src/PaddleClas/ppcls/data/dataloader/logo_dataset.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import io +import tarfile +import numpy as np +from PIL import Image #all use default backend + +import paddle +from paddle.io import Dataset +import pickle +import os +import cv2 +import random + +from .common_dataset import CommonDataset + + +class LogoDataset(CommonDataset): + def _load_anno(self): + assert os.path.exists(self._cls_path) + assert os.path.exists(self._img_root) + self.images = [] + self.labels = [] + with open(self._cls_path) as fd: + lines = fd.readlines() + for l in lines: + l = l.strip().split("\t") + if l[0] == 'image_id': + continue + self.images.append(os.path.join(self._img_root, l[3])) + self.labels.append(np.int64(l[1]) - 1) + assert os.path.exists(self.images[-1]) diff --git a/src/PaddleClas/ppcls/data/dataloader/mix_dataset.py b/src/PaddleClas/ppcls/data/dataloader/mix_dataset.py new file mode 100644 index 0000000..cbf4b40 --- /dev/null +++ b/src/PaddleClas/ppcls/data/dataloader/mix_dataset.py @@ -0,0 +1,49 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import os + +from paddle.io import Dataset +from .. 
import dataloader + + +class MixDataset(Dataset): + def __init__(self, datasets_config): + super().__init__() + self.dataset_list = [] + start_idx = 0 + end_idx = 0 + for config_i in datasets_config: + dataset_name = config_i.pop('name') + dataset = getattr(dataloader, dataset_name)(**config_i) + end_idx += len(dataset) + self.dataset_list.append([end_idx, start_idx, dataset]) + start_idx = end_idx + + self.length = end_idx + + def __getitem__(self, idx): + for dataset_i in self.dataset_list: + if dataset_i[0] > idx: + dataset_i_idx = idx - dataset_i[1] + return dataset_i[2][dataset_i_idx] + + def __len__(self): + return self.length + + def get_dataset_list(self): + return self.dataset_list diff --git a/src/PaddleClas/ppcls/data/dataloader/mix_sampler.py b/src/PaddleClas/ppcls/data/dataloader/mix_sampler.py new file mode 100644 index 0000000..2df3109 --- /dev/null +++ b/src/PaddleClas/ppcls/data/dataloader/mix_sampler.py @@ -0,0 +1,79 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division + +from paddle.io import DistributedBatchSampler, Sampler + +from ppcls.utils import logger +from ppcls.data.dataloader.mix_dataset import MixDataset +from ppcls.data import dataloader + + +class MixSampler(DistributedBatchSampler): + def __init__(self, dataset, batch_size, sample_configs, iter_per_epoch): + super().__init__(dataset, batch_size) + assert isinstance(dataset, + MixDataset), "MixSampler only support MixDataset" + self.sampler_list = [] + self.batch_size = batch_size + self.start_list = [] + self.length = iter_per_epoch + dataset_list = dataset.get_dataset_list() + batch_size_left = self.batch_size + self.iter_list = [] + for i, config_i in enumerate(sample_configs): + self.start_list.append(dataset_list[i][1]) + sample_method = config_i.pop("name") + ratio_i = config_i.pop("ratio") + if i < len(sample_configs) - 1: + batch_size_i = int(self.batch_size * ratio_i) + batch_size_left -= batch_size_i + else: + batch_size_i = batch_size_left + assert batch_size_i <= len(dataset_list[i][2]) + config_i["batch_size"] = batch_size_i + if sample_method == "DistributedBatchSampler": + sampler_i = DistributedBatchSampler(dataset_list[i][2], + **config_i) + else: + sampler_i = getattr(dataloader, sample_method)( + dataset_list[i][2], **config_i) + self.sampler_list.append(sampler_i) + self.iter_list.append(iter(sampler_i)) + self.length += len(dataset_list[i][2]) * ratio_i + self.iter_counter = 0 + + def __iter__(self): + while self.iter_counter < self.length: + batch = [] + for i, iter_i in enumerate(self.iter_list): + batch_i = next(iter_i, None) + if batch_i is None: + iter_i = iter(self.sampler_list[i]) + self.iter_list[i] = iter_i + batch_i = next(iter_i, None) + assert batch_i is not None, "dataset {} return None".format( + i) + batch += [idx + self.start_list[i] for idx in batch_i] + if len(batch) == self.batch_size: + self.iter_counter += 1 + yield batch + else: + 
logger.info("Some dataset reaches end") + self.iter_counter = 0 + + def __len__(self): + return self.length diff --git a/src/PaddleClas/ppcls/data/dataloader/multilabel_dataset.py b/src/PaddleClas/ppcls/data/dataloader/multilabel_dataset.py new file mode 100644 index 0000000..2c1ed77 --- /dev/null +++ b/src/PaddleClas/ppcls/data/dataloader/multilabel_dataset.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import os +import cv2 + +from ppcls.data.preprocess import transform +from ppcls.utils import logger + +from .common_dataset import CommonDataset + + +class MultiLabelDataset(CommonDataset): + def _load_anno(self): + assert os.path.exists(self._cls_path) + assert os.path.exists(self._img_root) + self.images = [] + self.labels = [] + with open(self._cls_path) as fd: + lines = fd.readlines() + for l in lines: + l = l.strip().split("\t") + self.images.append(os.path.join(self._img_root, l[0])) + + labels = l[1].split(',') + labels = [np.int64(i) for i in labels] + + self.labels.append(labels) + assert os.path.exists(self.images[-1]) + + def __getitem__(self, idx): + try: + with open(self.images[idx], 'rb') as f: + img = f.read() + if self._transform_ops: + img = transform(img, self._transform_ops) + img = img.transpose((2, 0, 1)) + label = np.array(self.labels[idx]).astype("float32") + return (img, label) + + except Exception as ex: + logger.error("Exception occured when parse line: {} with msg: {}". + format(self.images[idx], ex)) + rnd_idx = np.random.randint(self.__len__()) + return self.__getitem__(rnd_idx) diff --git a/src/PaddleClas/ppcls/data/dataloader/pk_sampler.py b/src/PaddleClas/ppcls/data/dataloader/pk_sampler.py new file mode 100644 index 0000000..bf563a6 --- /dev/null +++ b/src/PaddleClas/ppcls/data/dataloader/pk_sampler.py @@ -0,0 +1,105 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from collections import defaultdict +import numpy as np +import random +from paddle.io import DistributedBatchSampler + +from ppcls.utils import logger + + +class PKSampler(DistributedBatchSampler): + """ + First, randomly sample P identities. + Then for each identity randomly sample K instances. + Therefore batch size is P*K, and the sampler called PKSampler. 
+ Args: + dataset (paddle.io.Dataset): list of (img_path, pid, cam_id). + sample_per_id(int): number of instances per identity in a batch. + batch_size (int): number of examples in a batch. + shuffle(bool): whether to shuffle indices order before generating + batch indices. Default True. + """ + + def __init__(self, + dataset, + batch_size, + sample_per_id, + shuffle=True, + drop_last=True, + sample_method="sample_avg_prob"): + super().__init__( + dataset, batch_size, shuffle=shuffle, drop_last=drop_last) + assert batch_size % sample_per_id == 0, \
 "PKSampler config error: sample_per_id must be a divisor of batch_size." + assert hasattr(self.dataset, + "labels"), "Dataset must have labels attribute." + self.sample_per_label = sample_per_id + self.label_dict = defaultdict(list) + self.sample_method = sample_method + for idx, label in enumerate(self.dataset.labels): + self.label_dict[label].append(idx) + self.label_list = list(self.label_dict) + assert len(self.label_list) * self.sample_per_label > self.batch_size, \
 "batch size should be smaller than the number of labels times sample_per_id." + if self.sample_method == "id_avg_prob": + self.prob_list = np.array([1 / len(self.label_list)] * + len(self.label_list)) + elif self.sample_method == "sample_avg_prob": + counter = [] + for label_i in self.label_list: + counter.append(len(self.label_dict[label_i])) + self.prob_list = np.array(counter) / sum(counter) + else: + logger.error( + "PKSampler only supports the id_avg_prob and sample_avg_prob sample methods, " + "but received {}.".format(self.sample_method)) + diff = np.abs(sum(self.prob_list) - 1) + if diff > 0.00000001: + self.prob_list[-1] = 1 - sum(self.prob_list[:-1]) + if self.prob_list[-1] > 1 or self.prob_list[-1] < 0: + logger.error("PKSampler prob list error") + else: + logger.info( + "PKSampler: sum of prob list is not equal to 1, diff is {}; adjusting the last prob".format(diff) + ) + + def __iter__(self): + label_per_batch = self.batch_size // self.sample_per_label + for _ in range(len(self)): + batch_index = [] + batch_label_list = np.random.choice( + self.label_list, + size=label_per_batch, + replace=False, + p=self.prob_list) + for label_i in batch_label_list: + label_i_indexes = self.label_dict[label_i] + if self.sample_per_label <= len(label_i_indexes): + batch_index.extend( + np.random.choice( + label_i_indexes, + size=self.sample_per_label, + replace=False)) + else: + batch_index.extend( + np.random.choice( + label_i_indexes, + size=self.sample_per_label, + replace=True)) + if not self.drop_last or len(batch_index) == self.batch_size: + yield batch_index diff --git a/src/PaddleClas/ppcls/data/dataloader/vehicle_dataset.py b/src/PaddleClas/ppcls/data/dataloader/vehicle_dataset.py new file mode 100644 index 0000000..2981a57 --- /dev/null +++ b/src/PaddleClas/ppcls/data/dataloader/vehicle_dataset.py @@ -0,0 +1,138 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
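
A minimal single-card sketch of how `PKSampler` above composes identity-balanced batches. `ToyRecDataset` is a hypothetical stand-in for a real recognition dataset; it exists only to satisfy the `labels`-attribute assertion.

```python
# Toy usage sketch for PKSampler (defined above); nothing here is from the
# PaddleClas configs -- the dataset and numbers are illustrative only.
import numpy as np
from paddle.io import Dataset


class ToyRecDataset(Dataset):
    """Hypothetical dataset exposing the `labels` attribute PKSampler requires."""

    def __init__(self, labels):
        super().__init__()
        self.labels = labels

    def __getitem__(self, idx):
        # dummy image tensor; a real dataset would decode and transform here
        return np.zeros((3, 224, 224), dtype="float32"), self.labels[idx]

    def __len__(self):
        return len(self.labels)


labels = [i // 10 for i in range(200)]        # 20 identities, 10 images each
sampler = PKSampler(ToyRecDataset(labels), batch_size=32, sample_per_id=4)
batch = next(iter(sampler))                   # 32 dataset indices
```

With `batch_size=32` and `sample_per_id=4`, each batch draws P=8 identities and K=4 instances per identity, which is the P*K layout metric-learning losses such as triplet loss expect.
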
+ +from __future__ import print_function + +import numpy as np +import paddle +from paddle.io import Dataset +import os +import cv2 + +from ppcls.data import preprocess +from ppcls.data.preprocess import transform +from ppcls.utils import logger +from .common_dataset import create_operators + + +class CompCars(Dataset): + def __init__(self, + image_root, + cls_label_path, + label_root=None, + transform_ops=None, + bbox_crop=False): + self._img_root = image_root + self._cls_path = cls_label_path + self._label_root = label_root + if transform_ops: + self._transform_ops = create_operators(transform_ops) + self._bbox_crop = bbox_crop + self._dtype = paddle.get_default_dtype() + self._load_anno() + + def _load_anno(self): + assert os.path.exists(self._cls_path) + assert os.path.exists(self._img_root) + if self._bbox_crop: + assert os.path.exists(self._label_root) + self.images = [] + self.labels = [] + self.bboxes = [] + with open(self._cls_path) as fd: + lines = fd.readlines() + for l in lines: + l = l.strip().split() + if not self._bbox_crop: + self.images.append(os.path.join(self._img_root, l[0])) + self.labels.append(int(l[1])) + else: + label_path = os.path.join(self._label_root, + l[0].split('.')[0] + '.txt') + assert os.path.exists(label_path) + with open(label_path) as f: + bbox = f.readlines()[-1].strip().split() + bbox = [int(x) for x in bbox] + self.images.append(os.path.join(self._img_root, l[0])) + self.labels.append(int(l[1])) + self.bboxes.append(bbox) + assert os.path.exists(self.images[-1]) + + def __getitem__(self, idx): + img = cv2.imread(self.images[idx]) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + if self._bbox_crop: + bbox = self.bboxes[idx] + img = img[bbox[1]:bbox[3], bbox[0]:bbox[2], :] + if self._transform_ops: + img = transform(img, self._transform_ops) + img = img.transpose((2, 0, 1)) + return (img, self.labels[idx]) + + def __len__(self): + return len(self.images) + + @property + def class_num(self): + return len(set(self.labels)) + + +class VeriWild(Dataset): + def __init__( + self, + image_root, + cls_label_path, + transform_ops=None, ): + self._img_root = image_root + self._cls_path = cls_label_path + if transform_ops: + self._transform_ops = create_operators(transform_ops) + self._dtype = paddle.get_default_dtype() + self._load_anno() + + def _load_anno(self): + assert os.path.exists(self._cls_path) + assert os.path.exists(self._img_root) + self.images = [] + self.labels = [] + self.cameras = [] + with open(self._cls_path) as fd: + lines = fd.readlines() + for l in lines: + l = l.strip().split() + self.images.append(os.path.join(self._img_root, l[0])) + self.labels.append(np.int64(l[1])) + self.cameras.append(np.int64(l[2])) + assert os.path.exists(self.images[-1]) + + def __getitem__(self, idx): + try: + with open(self.images[idx], 'rb') as f: + img = f.read() + if self._transform_ops: + img = transform(img, self._transform_ops) + img = img.transpose((2, 0, 1)) + return (img, self.labels[idx], self.cameras[idx]) + except Exception as ex: + logger.error("Exception occured when parse line: {} with msg: {}". 
+ format(self.images[idx], ex)) + rnd_idx = np.random.randint(self.__len__()) + return self.__getitem__(rnd_idx) + + def __len__(self): + return len(self.images) + + @property + def class_num(self): + return len(set(self.labels)) diff --git a/src/PaddleClas/ppcls/data/postprocess/__init__.py b/src/PaddleClas/ppcls/data/postprocess/__init__.py new file mode 100644 index 0000000..831a4da --- /dev/null +++ b/src/PaddleClas/ppcls/data/postprocess/__init__.py @@ -0,0 +1,41 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import importlib + +from . import topk + +from .topk import Topk, MultiLabelTopk + + +def build_postprocess(config): + config = copy.deepcopy(config) + model_name = config.pop("name") + mod = importlib.import_module(__name__) + postprocess_func = getattr(mod, model_name)(**config) + return postprocess_func + + +class DistillationPostProcess(object): + def __init__(self, model_name="Student", key=None, func="Topk", **kargs): + super().__init__() + self.func = eval(func)(**kargs) + self.model_name = model_name + self.key = key + + def __call__(self, x, file_names=None): + x = x[self.model_name] + if self.key is not None: + x = x[self.key] + return self.func(x, file_names=file_names) diff --git a/src/PaddleClas/ppcls/data/postprocess/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/data/postprocess/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..ff25208 Binary files /dev/null and b/src/PaddleClas/ppcls/data/postprocess/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/postprocess/__pycache__/topk.cpython-39.pyc b/src/PaddleClas/ppcls/data/postprocess/__pycache__/topk.cpython-39.pyc new file mode 100644 index 0000000..a88a23e Binary files /dev/null and b/src/PaddleClas/ppcls/data/postprocess/__pycache__/topk.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/postprocess/topk.py b/src/PaddleClas/ppcls/data/postprocess/topk.py new file mode 100644 index 0000000..9c1371b --- /dev/null +++ b/src/PaddleClas/ppcls/data/postprocess/topk.py @@ -0,0 +1,85 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
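
A hedged usage sketch for `build_postprocess` above; the config follows the same pop-the-`name`-key convention used elsewhere in this patch. One quirk worth noting: when no `class_id_map_file` is configured, `Topk` (defined in `topk.py` below) still emits `label_names` as an empty list, because its `is not None` check never fails for a list.

```python
# Assumed usage sketch, not from the patch itself.
import paddle

postprocess = build_postprocess({"name": "Topk", "topk": 5})
logits = paddle.randn([2, 1000])              # stand-in for model output
results = postprocess(logits, file_names=["a.jpg", "b.jpg"])
# each entry looks like:
#   {"class_ids": [...5 ids...], "scores": [...], "file_name": "a.jpg",
#    "label_names": []}   # empty without a class_id_map_file
```

For distillation outputs, `DistillationPostProcess` simply selects one sub-model's result dict (e.g. `x["Student"]`) before delegating to the wrapped `Topk`.
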
+ +import os +import numpy as np +import paddle +import paddle.nn.functional as F + + +class Topk(object): + def __init__(self, topk=1, class_id_map_file=None): + assert isinstance(topk, (int, )) + self.class_id_map = self.parse_class_id_map(class_id_map_file) + self.topk = topk + + def parse_class_id_map(self, class_id_map_file): + if class_id_map_file is None: + return None + if not os.path.exists(class_id_map_file): + print( + "Warning: If want to use your own label_dict, please input legal path!\nOtherwise label_names will be empty!" + ) + return None + + try: + class_id_map = {} + with open(class_id_map_file, "r") as fin: + lines = fin.readlines() + for line in lines: + partition = line.split("\n")[0].partition(" ") + class_id_map[int(partition[0])] = str(partition[-1]) + except Exception as ex: + print(ex) + class_id_map = None + return class_id_map + + def __call__(self, x, file_names=None, multilabel=False): + assert isinstance(x, paddle.Tensor) + if file_names is not None: + assert x.shape[0] == len(file_names) + x = F.softmax(x, axis=-1) if not multilabel else F.sigmoid(x) + x = x.numpy() + y = [] + for idx, probs in enumerate(x): + index = probs.argsort(axis=0)[-self.topk:][::-1].astype( + "int32") if not multilabel else np.where( + probs >= 0.5)[0].astype("int32") + clas_id_list = [] + score_list = [] + label_name_list = [] + for i in index: + clas_id_list.append(i.item()) + score_list.append(probs[i].item()) + if self.class_id_map is not None: + label_name_list.append(self.class_id_map[i.item()]) + result = { + "class_ids": clas_id_list, + "scores": np.around( + score_list, decimals=5).tolist(), + } + if file_names is not None: + result["file_name"] = file_names[idx] + if label_name_list is not None: + result["label_names"] = label_name_list + y.append(result) + return y + + +class MultiLabelTopk(Topk): + def __init__(self, topk=1, class_id_map_file=None): + super().__init__() + + def __call__(self, x, file_names=None): + return super().__call__(x, file_names, multilabel=True) diff --git a/src/PaddleClas/ppcls/data/preprocess/__init__.py b/src/PaddleClas/ppcls/data/preprocess/__init__.py new file mode 100644 index 0000000..075ee89 --- /dev/null +++ b/src/PaddleClas/ppcls/data/preprocess/__init__.py @@ -0,0 +1,100 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
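
`parse_class_id_map` above partitions each line on the first space, so the label-map file is expected to look like the hypothetical snippet below. Note also that `MultiLabelTopk.__init__` calls `super().__init__()` without forwarding its arguments, so `topk` and `class_id_map_file` passed to it fall back to the defaults.

```python
# Assumed label-map layout: "<class id><space><label name>" per line,
# e.g. the file ./imagenet1k_label_list.txt (hypothetical path) containing:
#
#   0 tench
#   1 goldfish
#   2 great white shark
#
topk = Topk(topk=5, class_id_map_file="./imagenet1k_label_list.txt")
```
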
+ +from ppcls.data.preprocess.ops.autoaugment import ImageNetPolicy as RawImageNetPolicy +from ppcls.data.preprocess.ops.randaugment import RandAugment as RawRandAugment +from ppcls.data.preprocess.ops.timm_autoaugment import RawTimmAutoAugment +from ppcls.data.preprocess.ops.cutout import Cutout + +from ppcls.data.preprocess.ops.hide_and_seek import HideAndSeek +from ppcls.data.preprocess.ops.random_erasing import RandomErasing +from ppcls.data.preprocess.ops.grid import GridMask + +from ppcls.data.preprocess.ops.operators import DecodeImage +from ppcls.data.preprocess.ops.operators import ResizeImage +from ppcls.data.preprocess.ops.operators import CropImage +from ppcls.data.preprocess.ops.operators import RandCropImage +from ppcls.data.preprocess.ops.operators import RandFlipImage +from ppcls.data.preprocess.ops.operators import NormalizeImage +from ppcls.data.preprocess.ops.operators import ToCHWImage +from ppcls.data.preprocess.ops.operators import AugMix + +from ppcls.data.preprocess.batch_ops.batch_operators import MixupOperator, CutmixOperator, OpSampler, FmixOperator + +import numpy as np +from PIL import Image + + +def transform(data, ops=[]): + """ transform """ + for op in ops: + data = op(data) + return data + + +class AutoAugment(RawImageNetPolicy): + """ ImageNetPolicy wrapper to auto fit different img types """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __call__(self, img): + if not isinstance(img, Image.Image): + img = np.ascontiguousarray(img) + img = Image.fromarray(img) + + img = super().__call__(img) + + if isinstance(img, Image.Image): + img = np.asarray(img) + + return img + + +class RandAugment(RawRandAugment): + """ RandAugment wrapper to auto fit different img types """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __call__(self, img): + if not isinstance(img, Image.Image): + img = np.ascontiguousarray(img) + img = Image.fromarray(img) + + img = super().__call__(img) + + if isinstance(img, Image.Image): + img = np.asarray(img) + + return img + + +class TimmAutoAugment(RawTimmAutoAugment): + """ TimmAutoAugment wrapper to auto fit different img tyeps. 
""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __call__(self, img): + if not isinstance(img, Image.Image): + img = np.ascontiguousarray(img) + img = Image.fromarray(img) + + img = super().__call__(img) + + if isinstance(img, Image.Image): + img = np.asarray(img) + + return img diff --git a/src/PaddleClas/ppcls/data/preprocess/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/data/preprocess/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..aad9c48 Binary files /dev/null and b/src/PaddleClas/ppcls/data/preprocess/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/preprocess/batch_ops/__init__.py b/src/PaddleClas/ppcls/data/preprocess/batch_ops/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/PaddleClas/ppcls/data/preprocess/batch_ops/__init__.py @@ -0,0 +1 @@ + diff --git a/src/PaddleClas/ppcls/data/preprocess/batch_ops/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/data/preprocess/batch_ops/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..b641a17 Binary files /dev/null and b/src/PaddleClas/ppcls/data/preprocess/batch_ops/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/preprocess/batch_ops/__pycache__/batch_operators.cpython-39.pyc b/src/PaddleClas/ppcls/data/preprocess/batch_ops/__pycache__/batch_operators.cpython-39.pyc new file mode 100644 index 0000000..fb04253 Binary files /dev/null and b/src/PaddleClas/ppcls/data/preprocess/batch_ops/__pycache__/batch_operators.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/preprocess/batch_ops/batch_operators.py b/src/PaddleClas/ppcls/data/preprocess/batch_ops/batch_operators.py new file mode 100644 index 0000000..6f0abb8 --- /dev/null +++ b/src/PaddleClas/ppcls/data/preprocess/batch_ops/batch_operators.py @@ -0,0 +1,231 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +import random + +import numpy as np + +from ppcls.utils import logger +from ppcls.data.preprocess.ops.fmix import sample_mask + + +class BatchOperator(object): + """ BatchOperator """ + + def __init__(self, *args, **kwargs): + pass + + def _unpack(self, batch): + """ _unpack """ + assert isinstance(batch, list), \ + 'batch should be a list filled with tuples (img, label)' + bs = len(batch) + assert bs > 0, 'size of the batch data should > 0' + #imgs, labels = list(zip(*batch)) + imgs = [] + labels = [] + for item in batch: + imgs.append(item[0]) + labels.append(item[1]) + return np.array(imgs), np.array(labels), bs + + def _one_hot(self, targets): + return np.eye(self.class_num, dtype="float32")[targets] + + def _mix_target(self, targets0, targets1, lam): + one_hots0 = self._one_hot(targets0) + one_hots1 = self._one_hot(targets1) + return one_hots0 * lam + one_hots1 * (1 - lam) + + def __call__(self, batch): + return batch + + +class MixupOperator(BatchOperator): + """ Mixup operator + reference: https://arxiv.org/abs/1710.09412 + + """ + + def __init__(self, class_num, alpha: float=1.): + """Build Mixup operator + + Args: + alpha (float, optional): The parameter alpha of mixup. Defaults to 1.. + + Raises: + Exception: The value of parameter is illegal. + """ + if alpha <= 0: + raise Exception( + f"Parameter \"alpha\" of Mixup should be greater than 0. \"alpha\": {alpha}." + ) + if not class_num: + msg = "Please set \"Arch.class_num\" in config if use \"MixupOperator\"." + logger.error(Exception(msg)) + raise Exception(msg) + + self._alpha = alpha + self.class_num = class_num + + def __call__(self, batch): + imgs, labels, bs = self._unpack(batch) + idx = np.random.permutation(bs) + lam = np.random.beta(self._alpha, self._alpha) + imgs = lam * imgs + (1 - lam) * imgs[idx] + targets = self._mix_target(labels, labels[idx], lam) + return list(zip(imgs, targets)) + + +class CutmixOperator(BatchOperator): + """ Cutmix operator + reference: https://arxiv.org/abs/1905.04899 + + """ + + def __init__(self, class_num, alpha=0.2): + """Build Cutmix operator + + Args: + alpha (float, optional): The parameter alpha of cutmix. Defaults to 0.2. + + Raises: + Exception: The value of parameter is illegal. + """ + if alpha <= 0: + raise Exception( + f"Parameter \"alpha\" of Cutmix should be greater than 0. \"alpha\": {alpha}." + ) + if not class_num: + msg = "Please set \"Arch.class_num\" in config if use \"CutmixOperator\"." + logger.error(Exception(msg)) + raise Exception(msg) + + self._alpha = alpha + self.class_num = class_num + + def _rand_bbox(self, size, lam): + """ _rand_bbox """ + w = size[2] + h = size[3] + cut_rat = np.sqrt(1. 
- lam) + cut_w = int(w * cut_rat) + cut_h = int(h * cut_rat) + + # uniform + cx = np.random.randint(w) + cy = np.random.randint(h) + + bbx1 = np.clip(cx - cut_w // 2, 0, w) + bby1 = np.clip(cy - cut_h // 2, 0, h) + bbx2 = np.clip(cx + cut_w // 2, 0, w) + bby2 = np.clip(cy + cut_h // 2, 0, h) + + return bbx1, bby1, bbx2, bby2 + + def __call__(self, batch): + imgs, labels, bs = self._unpack(batch) + idx = np.random.permutation(bs) + lam = np.random.beta(self._alpha, self._alpha) + + bbx1, bby1, bbx2, bby2 = self._rand_bbox(imgs.shape, lam) + imgs[:, :, bbx1:bbx2, bby1:bby2] = imgs[idx, :, bbx1:bbx2, bby1:bby2] + lam = 1 - (float(bbx2 - bbx1) * (bby2 - bby1) / + (imgs.shape[-2] * imgs.shape[-1])) + targets = self._mix_target(labels, labels[idx], lam) + return list(zip(imgs, targets)) + + +class FmixOperator(BatchOperator): + """ Fmix operator + reference: https://arxiv.org/abs/2002.12047 + + """ + + def __init__(self, + class_num, + alpha=1, + decay_power=3, + max_soft=0., + reformulate=False): + if not class_num: + msg = "Please set \"Arch.class_num\" in config if use \"FmixOperator\"." + logger.error(Exception(msg)) + raise Exception(msg) + + self._alpha = alpha + self._decay_power = decay_power + self._max_soft = max_soft + self._reformulate = reformulate + self.class_num = class_num + + def __call__(self, batch): + imgs, labels, bs = self._unpack(batch) + idx = np.random.permutation(bs) + size = (imgs.shape[2], imgs.shape[3]) + lam, mask = sample_mask(self._alpha, self._decay_power, \ + size, self._max_soft, self._reformulate) + imgs = mask * imgs + (1 - mask) * imgs[idx] + targets = self._mix_target(labels, labels[idx], lam) + return list(zip(imgs, targets)) + + +class OpSampler(object): + """ Sample a operator from """ + + def __init__(self, class_num, **op_dict): + """Build OpSampler + + Raises: + Exception: The parameter \"prob\" of operator(s) are be set error. + """ + if not class_num: + msg = "Please set \"Arch.class_num\" in config if use \"OpSampler\"." + logger.error(Exception(msg)) + raise Exception(msg) + + if len(op_dict) < 1: + msg = f"ConfigWarning: No operator in \"OpSampler\". \"OpSampler\" has been skipped." + logger.warning(msg) + + self.ops = {} + total_prob = 0 + for op_name in op_dict: + param = op_dict[op_name] + if "prob" not in param: + msg = f"ConfigWarning: Parameter \"prob\" should be set when use operator in \"OpSampler\". The operator \"{op_name}\"'s prob has been set \"0\"." + logger.warning(msg) + prob = param.pop("prob", 0) + total_prob += prob + param.update({"class_num": class_num}) + op = eval(op_name)(**param) + self.ops.update({op: prob}) + + if total_prob > 1: + msg = f"ConfigError: The total prob of operators in \"OpSampler\" should be less 1." 
+ logger.error(Exception(msg)) + raise Exception(msg) + + # add "None Op" when total_prob < 1, "None Op" do nothing + self.ops[None] = 1 - total_prob + + def __call__(self, batch): + op = random.choices( + list(self.ops.keys()), weights=list(self.ops.values()), k=1)[0] + # return batch directly when None Op + return op(batch) if op else batch diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/__init__.py b/src/PaddleClas/ppcls/data/preprocess/ops/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/PaddleClas/ppcls/data/preprocess/ops/__init__.py @@ -0,0 +1 @@ + diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..44a6c65 Binary files /dev/null and b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/autoaugment.cpython-39.pyc b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/autoaugment.cpython-39.pyc new file mode 100644 index 0000000..6bc1a31 Binary files /dev/null and b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/autoaugment.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/cutout.cpython-39.pyc b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/cutout.cpython-39.pyc new file mode 100644 index 0000000..087e231 Binary files /dev/null and b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/cutout.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/fmix.cpython-39.pyc b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/fmix.cpython-39.pyc new file mode 100644 index 0000000..a0d5b32 Binary files /dev/null and b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/fmix.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/functional.cpython-39.pyc b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/functional.cpython-39.pyc new file mode 100644 index 0000000..b8c9e26 Binary files /dev/null and b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/functional.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/grid.cpython-39.pyc b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/grid.cpython-39.pyc new file mode 100644 index 0000000..b45f8b6 Binary files /dev/null and b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/grid.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/hide_and_seek.cpython-39.pyc b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/hide_and_seek.cpython-39.pyc new file mode 100644 index 0000000..3fd0a2d Binary files /dev/null and b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/hide_and_seek.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/operators.cpython-39.pyc b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/operators.cpython-39.pyc new file mode 100644 index 0000000..5f3acbd Binary files /dev/null and b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/operators.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/randaugment.cpython-39.pyc b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/randaugment.cpython-39.pyc new file mode 100644 index 0000000..e599957 Binary files /dev/null and b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/randaugment.cpython-39.pyc differ diff 
--git a/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/random_erasing.cpython-39.pyc b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/random_erasing.cpython-39.pyc new file mode 100644 index 0000000..a86492f Binary files /dev/null and b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/random_erasing.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/timm_autoaugment.cpython-39.pyc b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/timm_autoaugment.cpython-39.pyc new file mode 100644 index 0000000..5f8e988 Binary files /dev/null and b/src/PaddleClas/ppcls/data/preprocess/ops/__pycache__/timm_autoaugment.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/autoaugment.py b/src/PaddleClas/ppcls/data/preprocess/ops/autoaugment.py new file mode 100644 index 0000000..330220a --- /dev/null +++ b/src/PaddleClas/ppcls/data/preprocess/ops/autoaugment.py @@ -0,0 +1,264 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is based on https://github.com/DeepVoltaire/AutoAugment/blob/master/autoaugment.py + +from PIL import Image, ImageEnhance, ImageOps +import numpy as np +import random + + +class ImageNetPolicy(object): + """ Randomly choose one of the best 24 Sub-policies on ImageNet. 
+ + Example: + >>> policy = ImageNetPolicy() + >>> transformed = policy(image) + + Example as a PyTorch Transform: + >>> transform=transforms.Compose([ + >>> transforms.Resize(256), + >>> ImageNetPolicy(), + >>> transforms.ToTensor()]) + """ + + def __init__(self, fillcolor=(128, 128, 128)): + self.policies = [ + SubPolicy(0.4, "posterize", 8, 0.6, "rotate", 9, fillcolor), + SubPolicy(0.6, "solarize", 5, 0.6, "autocontrast", 5, fillcolor), + SubPolicy(0.8, "equalize", 8, 0.6, "equalize", 3, fillcolor), + SubPolicy(0.6, "posterize", 7, 0.6, "posterize", 6, fillcolor), + SubPolicy(0.4, "equalize", 7, 0.2, "solarize", 4, fillcolor), + SubPolicy(0.4, "equalize", 4, 0.8, "rotate", 8, fillcolor), + SubPolicy(0.6, "solarize", 3, 0.6, "equalize", 7, fillcolor), + SubPolicy(0.8, "posterize", 5, 1.0, "equalize", 2, fillcolor), + SubPolicy(0.2, "rotate", 3, 0.6, "solarize", 8, fillcolor), + SubPolicy(0.6, "equalize", 8, 0.4, "posterize", 6, fillcolor), + SubPolicy(0.8, "rotate", 8, 0.4, "color", 0, fillcolor), + SubPolicy(0.4, "rotate", 9, 0.6, "equalize", 2, fillcolor), + SubPolicy(0.0, "equalize", 7, 0.8, "equalize", 8, fillcolor), + SubPolicy(0.6, "invert", 4, 1.0, "equalize", 8, fillcolor), + SubPolicy(0.6, "color", 4, 1.0, "contrast", 8, fillcolor), + SubPolicy(0.8, "rotate", 8, 1.0, "color", 2, fillcolor), + SubPolicy(0.8, "color", 8, 0.8, "solarize", 7, fillcolor), + SubPolicy(0.4, "sharpness", 7, 0.6, "invert", 8, fillcolor), + SubPolicy(0.6, "shearX", 5, 1.0, "equalize", 9, fillcolor), + SubPolicy(0.4, "color", 0, 0.6, "equalize", 3, fillcolor), + SubPolicy(0.4, "equalize", 7, 0.2, "solarize", 4, fillcolor), + SubPolicy(0.6, "solarize", 5, 0.6, "autocontrast", 5, fillcolor), + SubPolicy(0.6, "invert", 4, 1.0, "equalize", 8, fillcolor), + SubPolicy(0.6, "color", 4, 1.0, "contrast", 8, fillcolor), + SubPolicy(0.8, "equalize", 8, 0.6, "equalize", 3, fillcolor) + ] + + def __call__(self, img, policy_idx=None): + if policy_idx is None or not isinstance(policy_idx, int): + policy_idx = random.randint(0, len(self.policies) - 1) + else: + policy_idx = policy_idx % len(self.policies) + return self.policies[policy_idx](img) + + def __repr__(self): + return "AutoAugment ImageNet Policy" + + +class CIFAR10Policy(object): + """ Randomly choose one of the best 25 Sub-policies on CIFAR10. 
+ + Example: + >>> policy = CIFAR10Policy() + >>> transformed = policy(image) + + Example as a PyTorch Transform: + >>> transform=transforms.Compose([ + >>> transforms.Resize(256), + >>> CIFAR10Policy(), + >>> transforms.ToTensor()]) + """ + + def __init__(self, fillcolor=(128, 128, 128)): + self.policies = [ + SubPolicy(0.1, "invert", 7, 0.2, "contrast", 6, fillcolor), + SubPolicy(0.7, "rotate", 2, 0.3, "translateX", 9, fillcolor), + SubPolicy(0.8, "sharpness", 1, 0.9, "sharpness", 3, fillcolor), + SubPolicy(0.5, "shearY", 8, 0.7, "translateY", 9, fillcolor), + SubPolicy(0.5, "autocontrast", 8, 0.9, "equalize", 2, fillcolor), + SubPolicy(0.2, "shearY", 7, 0.3, "posterize", 7, fillcolor), + SubPolicy(0.4, "color", 3, 0.6, "brightness", 7, fillcolor), + SubPolicy(0.3, "sharpness", 9, 0.7, "brightness", 9, fillcolor), + SubPolicy(0.6, "equalize", 5, 0.5, "equalize", 1, fillcolor), + SubPolicy(0.6, "contrast", 7, 0.6, "sharpness", 5, fillcolor), + SubPolicy(0.7, "color", 7, 0.5, "translateX", 8, fillcolor), + SubPolicy(0.3, "equalize", 7, 0.4, "autocontrast", 8, fillcolor), + SubPolicy(0.4, "translateY", 3, 0.2, "sharpness", 6, fillcolor), + SubPolicy(0.9, "brightness", 6, 0.2, "color", 8, fillcolor), + SubPolicy(0.5, "solarize", 2, 0.0, "invert", 3, fillcolor), + SubPolicy(0.2, "equalize", 0, 0.6, "autocontrast", 0, fillcolor), + SubPolicy(0.2, "equalize", 8, 0.8, "equalize", 4, fillcolor), + SubPolicy(0.9, "color", 9, 0.6, "equalize", 6, fillcolor), + SubPolicy(0.8, "autocontrast", 4, 0.2, "solarize", 8, fillcolor), + SubPolicy(0.1, "brightness", 3, 0.7, "color", 0, fillcolor), + SubPolicy(0.4, "solarize", 5, 0.9, "autocontrast", 3, fillcolor), + SubPolicy(0.9, "translateY", 9, 0.7, "translateY", 9, fillcolor), + SubPolicy(0.9, "autocontrast", 2, 0.8, "solarize", 3, fillcolor), + SubPolicy(0.8, "equalize", 8, 0.1, "invert", 3, fillcolor), + SubPolicy(0.7, "translateY", 9, 0.9, "autocontrast", 1, fillcolor) + ] + + def __call__(self, img, policy_idx=None): + if policy_idx is None or not isinstance(policy_idx, int): + policy_idx = random.randint(0, len(self.policies) - 1) + else: + policy_idx = policy_idx % len(self.policies) + return self.policies[policy_idx](img) + + def __repr__(self): + return "AutoAugment CIFAR10 Policy" + + +class SVHNPolicy(object): + """ Randomly choose one of the best 25 Sub-policies on SVHN. 
+ + Example: + >>> policy = SVHNPolicy() + >>> transformed = policy(image) + + Example as a PyTorch Transform: + >>> transform=transforms.Compose([ + >>> transforms.Resize(256), + >>> SVHNPolicy(), + >>> transforms.ToTensor()]) + """ + + def __init__(self, fillcolor=(128, 128, 128)): + self.policies = [ + SubPolicy(0.9, "shearX", 4, 0.2, "invert", 3, fillcolor), + SubPolicy(0.9, "shearY", 8, 0.7, "invert", 5, fillcolor), + SubPolicy(0.6, "equalize", 5, 0.6, "solarize", 6, fillcolor), + SubPolicy(0.9, "invert", 3, 0.6, "equalize", 3, fillcolor), + SubPolicy(0.6, "equalize", 1, 0.9, "rotate", 3, fillcolor), + SubPolicy(0.9, "shearX", 4, 0.8, "autocontrast", 3, fillcolor), + SubPolicy(0.9, "shearY", 8, 0.4, "invert", 5, fillcolor), + SubPolicy(0.9, "shearY", 5, 0.2, "solarize", 6, fillcolor), + SubPolicy(0.9, "invert", 6, 0.8, "autocontrast", 1, fillcolor), + SubPolicy(0.6, "equalize", 3, 0.9, "rotate", 3, fillcolor), + SubPolicy(0.9, "shearX", 4, 0.3, "solarize", 3, fillcolor), + SubPolicy(0.8, "shearY", 8, 0.7, "invert", 4, fillcolor), + SubPolicy(0.9, "equalize", 5, 0.6, "translateY", 6, fillcolor), + SubPolicy(0.9, "invert", 4, 0.6, "equalize", 7, fillcolor), + SubPolicy(0.3, "contrast", 3, 0.8, "rotate", 4, fillcolor), + SubPolicy(0.8, "invert", 5, 0.0, "translateY", 2, fillcolor), + SubPolicy(0.7, "shearY", 6, 0.4, "solarize", 8, fillcolor), + SubPolicy(0.6, "invert", 4, 0.8, "rotate", 4, fillcolor), + SubPolicy( + 0.3, "shearY", 7, 0.9, "translateX", 3, fillcolor), SubPolicy( + 0.1, "shearX", 6, 0.6, "invert", 5, fillcolor), SubPolicy( + 0.7, "solarize", 2, 0.6, "translateY", 7, + fillcolor), SubPolicy(0.8, "shearY", 4, 0.8, "invert", + 8, fillcolor), SubPolicy( + 0.7, "shearX", 9, 0.8, + "translateY", 3, + fillcolor), SubPolicy( + 0.8, "shearY", 5, 0.7, + "autocontrast", 3, + fillcolor), + SubPolicy(0.7, "shearX", 2, 0.1, "invert", 5, fillcolor) + ] + + def __call__(self, img, policy_idx=None): + if policy_idx is None or not isinstance(policy_idx, int): + policy_idx = random.randint(0, len(self.policies) - 1) + else: + policy_idx = policy_idx % len(self.policies) + return self.policies[policy_idx](img) + + def __repr__(self): + return "AutoAugment SVHN Policy" + + +class SubPolicy(object): + def __init__(self, + p1, + operation1, + magnitude_idx1, + p2, + operation2, + magnitude_idx2, + fillcolor=(128, 128, 128)): + ranges = { + "shearX": np.linspace(0, 0.3, 10), + "shearY": np.linspace(0, 0.3, 10), + "translateX": np.linspace(0, 150 / 331, 10), + "translateY": np.linspace(0, 150 / 331, 10), + "rotate": np.linspace(0, 30, 10), + "color": np.linspace(0.0, 0.9, 10), + "posterize": np.round(np.linspace(8, 4, 10), 0).astype(np.int), + "solarize": np.linspace(256, 0, 10), + "contrast": np.linspace(0.0, 0.9, 10), + "sharpness": np.linspace(0.0, 0.9, 10), + "brightness": np.linspace(0.0, 0.9, 10), + "autocontrast": [0] * 10, + "equalize": [0] * 10, + "invert": [0] * 10 + } + + # from https://stackoverflow.com/questions/5252170/specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand + def rotate_with_fill(img, magnitude): + rot = img.convert("RGBA").rotate(magnitude) + return Image.composite(rot, + Image.new("RGBA", rot.size, (128, ) * 4), + rot).convert(img.mode) + + func = { + "shearX": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, magnitude * random.choice([-1, 1]), 0, 0, 1, 0), + Image.BICUBIC, fillcolor=fillcolor), + "shearY": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, 0, 0, magnitude * random.choice([-1, 1]), 1, 0), 
+ Image.BICUBIC, fillcolor=fillcolor), + "translateX": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, 0, magnitude * img.size[0] * random.choice([-1, 1]), 0, 1, 0), + fillcolor=fillcolor), + "translateY": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude * img.size[1] * random.choice([-1, 1])), + fillcolor=fillcolor), + "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude), + # "rotate": lambda img, magnitude: img.rotate(magnitude * random.choice([-1, 1])), + "color": lambda img, magnitude: ImageEnhance.Color(img).enhance(1 + magnitude * random.choice([-1, 1])), + "posterize": lambda img, magnitude: ImageOps.posterize(img, magnitude), + "solarize": lambda img, magnitude: ImageOps.solarize(img, magnitude), + "contrast": lambda img, magnitude: ImageEnhance.Contrast(img).enhance( + 1 + magnitude * random.choice([-1, 1])), + "sharpness": lambda img, magnitude: ImageEnhance.Sharpness(img).enhance( + 1 + magnitude * random.choice([-1, 1])), + "brightness": lambda img, magnitude: ImageEnhance.Brightness(img).enhance( + 1 + magnitude * random.choice([-1, 1])), + "autocontrast": lambda img, magnitude: ImageOps.autocontrast(img), + "equalize": lambda img, magnitude: ImageOps.equalize(img), + "invert": lambda img, magnitude: ImageOps.invert(img) + } + + self.p1 = p1 + self.operation1 = func[operation1] + self.magnitude1 = ranges[operation1][magnitude_idx1] + self.p2 = p2 + self.operation2 = func[operation2] + self.magnitude2 = ranges[operation2][magnitude_idx2] + + def __call__(self, img): + if random.random() < self.p1: + img = self.operation1(img, self.magnitude1) + if random.random() < self.p2: + img = self.operation2(img, self.magnitude2) + return img diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/cutout.py b/src/PaddleClas/ppcls/data/preprocess/ops/cutout.py new file mode 100644 index 0000000..b906e14 --- /dev/null +++ b/src/PaddleClas/ppcls/data/preprocess/ops/cutout.py @@ -0,0 +1,41 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is based on https://github.com/uoguelph-mlrg/Cutout + +import numpy as np +import random + + +class Cutout(object): + def __init__(self, n_holes=1, length=112): + self.n_holes = n_holes + self.length = length + + def __call__(self, img): + """ cutout_image """ + h, w = img.shape[:2] + mask = np.ones((h, w), np.float32) + + for n in range(self.n_holes): + y = np.random.randint(h) + x = np.random.randint(w) + + y1 = np.clip(y - self.length // 2, 0, h) + y2 = np.clip(y + self.length // 2, 0, h) + x1 = np.clip(x - self.length // 2, 0, w) + x2 = np.clip(x + self.length // 2, 0, w) + + img[y1:y2, x1:x2] = 0 + return img diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/fmix.py b/src/PaddleClas/ppcls/data/preprocess/ops/fmix.py new file mode 100644 index 0000000..dc2ef91 --- /dev/null +++ b/src/PaddleClas/ppcls/data/preprocess/ops/fmix.py @@ -0,0 +1,217 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import random + +import numpy as np +from scipy.stats import beta + + +def fftfreqnd(h, w=None, z=None): + """ Get bin values for discrete fourier transform of size (h, w, z) + + :param h: Required, first dimension size + :param w: Optional, second dimension size + :param z: Optional, third dimension size + """ + fz = fx = 0 + fy = np.fft.fftfreq(h) + + if w is not None: + fy = np.expand_dims(fy, -1) + + if w % 2 == 1: + fx = np.fft.fftfreq(w)[:w // 2 + 2] + else: + fx = np.fft.fftfreq(w)[:w // 2 + 1] + + if z is not None: + fy = np.expand_dims(fy, -1) + if z % 2 == 1: + fz = np.fft.fftfreq(z)[:, None] + else: + fz = np.fft.fftfreq(z)[:, None] + + return np.sqrt(fx * fx + fy * fy + fz * fz) + + +def get_spectrum(freqs, decay_power, ch, h, w=0, z=0): + """ Samples a fourier image with given size and frequencies decayed by decay power + + :param freqs: Bin values for the discrete fourier transform + :param decay_power: Decay power for frequency decay prop 1/f**d + :param ch: Number of channels for the resulting mask + :param h: Required, first dimension size + :param w: Optional, second dimension size + :param z: Optional, third dimension size + """ + scale = np.ones(1) / (np.maximum(freqs, np.array([1. / max(w, h, z)])) + **decay_power) + + param_size = [ch] + list(freqs.shape) + [2] + param = np.random.randn(*param_size) + + scale = np.expand_dims(scale, -1)[None, :] + + return scale * param + + +def make_low_freq_image(decay, shape, ch=1): + """ Sample a low frequency image from fourier space + + :param decay_power: Decay power for frequency decay prop 1/f**d + :param shape: Shape of desired mask, list up to 3 dims + :param ch: Number of channels for desired mask + """ + freqs = fftfreqnd(*shape) + spectrum = get_spectrum(freqs, decay, ch, + *shape) #.reshape((1, *shape[:-1], -1)) + spectrum = spectrum[:, 0] + 1j * spectrum[:, 1] + mask = np.real(np.fft.irfftn(spectrum, shape)) + + if len(shape) == 1: + mask = mask[:1, :shape[0]] + if len(shape) == 2: + mask = mask[:1, :shape[0], :shape[1]] + if len(shape) == 3: + mask = mask[:1, :shape[0], :shape[1], :shape[2]] + + mask = mask + mask = (mask - mask.min()) + mask = mask / mask.max() + return mask + + +def sample_lam(alpha, reformulate=False): + """ Sample a lambda from symmetric beta distribution with given alpha + + :param alpha: Alpha value for beta distribution + :param reformulate: If True, uses the reformulation of [1]. + """ + if reformulate: + lam = beta.rvs(alpha + 1, alpha) + else: + lam = beta.rvs(alpha, alpha) + + return lam + + +def binarise_mask(mask, lam, in_shape, max_soft=0.0): + """ Binarises a given low frequency image such that it has mean lambda. + + :param mask: Low frequency image, usually the result of `make_low_freq_image` + :param lam: Mean value of final mask + :param in_shape: Shape of inputs + :param max_soft: Softening value between 0 and 0.5 which smooths hard edges in the mask. 
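+    :note: pixels are ranked by low-frequency mask intensity; roughly the top
+        ``lam`` fraction is set to 1 and the rest to 0, and when ``max_soft`` > 0
+        a linear ramp of ``2 * soft`` pixels smooths the boundary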
+ :return: + """ + idx = mask.reshape(-1).argsort()[::-1] + mask = mask.reshape(-1) + num = math.ceil(lam * mask.size) if random.random() > 0.5 else math.floor( + lam * mask.size) + + eff_soft = max_soft + if max_soft > lam or max_soft > (1 - lam): + eff_soft = min(lam, 1 - lam) + + soft = int(mask.size * eff_soft) + num_low = int(num - soft) + num_high = int(num + soft) + + mask[idx[:num_high]] = 1 + mask[idx[num_low:]] = 0 + mask[idx[num_low:num_high]] = np.linspace(1, 0, (num_high - num_low)) + + mask = mask.reshape((1, 1, in_shape[0], in_shape[1])) + return mask + + +def sample_mask(alpha, decay_power, shape, max_soft=0.0, reformulate=False): + """ Samples a mean lambda from beta distribution parametrised by alpha, creates a low frequency image and binarises + it based on this lambda + + :param alpha: Alpha value for beta distribution from which to sample mean of mask + :param decay_power: Decay power for frequency decay prop 1/f**d + :param shape: Shape of desired mask, list up to 3 dims + :param max_soft: Softening value between 0 and 0.5 which smooths hard edges in the mask. + :param reformulate: If True, uses the reformulation of [1]. + """ + if isinstance(shape, int): + shape = (shape, ) + + # Choose lambda + lam = sample_lam(alpha, reformulate) + + # Make mask, get mean / std + mask = make_low_freq_image(decay_power, shape) + mask = binarise_mask(mask, lam, shape, max_soft) + + return float(lam), mask + + +def sample_and_apply(x, + alpha, + decay_power, + shape, + max_soft=0.0, + reformulate=False): + """ + + :param x: Image batch on which to apply fmix of shape [b, c, shape*] + :param alpha: Alpha value for beta distribution from which to sample mean of mask + :param decay_power: Decay power for frequency decay prop 1/f**d + :param shape: Shape of desired mask, list up to 3 dims + :param max_soft: Softening value between 0 and 0.5 which smooths hard edges in the mask. + :param reformulate: If True, uses the reformulation of [1]. + :return: mixed input, permutation indices, lambda value of mix, + """ + lam, mask = sample_mask(alpha, decay_power, shape, max_soft, reformulate) + index = np.random.permutation(x.shape[0]) + + x1, x2 = x * mask, x[index] * (1 - mask) + return x1 + x2, index, lam + + +class FMixBase: + """ FMix augmentation + + Args: + decay_power (float): Decay power for frequency decay prop 1/f**d + alpha (float): Alpha value for beta distribution from which to sample mean of mask + size ([int] | [int, int] | [int, int, int]): Shape of desired mask, list up to 3 dims + max_soft (float): Softening value between 0 and 0.5 which smooths hard edges in the mask. + reformulate (bool): If True, uses the reformulation of [1]. + """ + + def __init__(self, + decay_power=3, + alpha=1, + size=(32, 32), + max_soft=0.0, + reformulate=False): + super().__init__() + self.decay_power = decay_power + self.reformulate = reformulate + self.size = size + self.alpha = alpha + self.max_soft = max_soft + self.index = None + self.lam = None + + def __call__(self, x): + raise NotImplementedError + + def loss(self, *args, **kwargs): + raise NotImplementedError diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/functional.py b/src/PaddleClas/ppcls/data/preprocess/ops/functional.py new file mode 100644 index 0000000..9f1369e --- /dev/null +++ b/src/PaddleClas/ppcls/data/preprocess/ops/functional.py @@ -0,0 +1,138 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# encoding: utf-8 + +import numpy as np +from PIL import Image, ImageOps, ImageEnhance + + + +def int_parameter(level, maxval): + """Helper function to scale `val` between 0 and maxval . + Args: + level: Level of the operation that will be between [0, `PARAMETER_MAX`]. + maxval: Maximum value that the operation can have. This will be scaled to + level/PARAMETER_MAX. + Returns: + An int that results from scaling `maxval` according to `level`. + """ + return int(level * maxval / 10) + + +def float_parameter(level, maxval): + """Helper function to scale `val` between 0 and maxval. + Args: + level: Level of the operation that will be between [0, `PARAMETER_MAX`]. + maxval: Maximum value that the operation can have. This will be scaled to + level/PARAMETER_MAX. + Returns: + A float that results from scaling `maxval` according to `level`. + """ + return float(level) * maxval / 10. + + +def sample_level(n): + return np.random.uniform(low=0.1, high=n) + + +def autocontrast(pil_img, *args): + return ImageOps.autocontrast(pil_img) + + +def equalize(pil_img, *args): + return ImageOps.equalize(pil_img) + + +def posterize(pil_img, level, *args): + level = int_parameter(sample_level(level), 4) + return ImageOps.posterize(pil_img, 4 - level) + + +def rotate(pil_img, level, *args): + degrees = int_parameter(sample_level(level), 30) + if np.random.uniform() > 0.5: + degrees = -degrees + return pil_img.rotate(degrees, resample=Image.BILINEAR) + + +def solarize(pil_img, level, *args): + level = int_parameter(sample_level(level), 256) + return ImageOps.solarize(pil_img, 256 - level) + + +def shear_x(pil_img, level): + level = float_parameter(sample_level(level), 0.3) + if np.random.uniform() > 0.5: + level = -level + return pil_img.transform(pil_img.size, + Image.AFFINE, (1, level, 0, 0, 1, 0), + resample=Image.BILINEAR) + + +def shear_y(pil_img, level): + level = float_parameter(sample_level(level), 0.3) + if np.random.uniform() > 0.5: + level = -level + return pil_img.transform(pil_img.size, + Image.AFFINE, (1, 0, 0, level, 1, 0), + resample=Image.BILINEAR) + + +def translate_x(pil_img, level): + level = int_parameter(sample_level(level), pil_img.size[0] / 3) + if np.random.random() > 0.5: + level = -level + return pil_img.transform(pil_img.size, + Image.AFFINE, (1, 0, level, 0, 1, 0), + resample=Image.BILINEAR) + + +def translate_y(pil_img, level): + level = int_parameter(sample_level(level), pil_img.size[1] / 3) + if np.random.random() > 0.5: + level = -level + return pil_img.transform(pil_img.size, + Image.AFFINE, (1, 0, 0, 0, 1, level), + resample=Image.BILINEAR) + + +# operation that overlaps with ImageNet-C's test set +def color(pil_img, level, *args): + level = float_parameter(sample_level(level), 1.8) + 0.1 + return ImageEnhance.Color(pil_img).enhance(level) + + +# operation that overlaps with ImageNet-C's test set +def contrast(pil_img, level, *args): + level = float_parameter(sample_level(level), 1.8) + 0.1 + return 
ImageEnhance.Contrast(pil_img).enhance(level) + + +# operation that overlaps with ImageNet-C's test set +def brightness(pil_img, level, *args): + level = float_parameter(sample_level(level), 1.8) + 0.1 + return ImageEnhance.Brightness(pil_img).enhance(level) + + +# operation that overlaps with ImageNet-C's test set +def sharpness(pil_img, level, *args): + level = float_parameter(sample_level(level), 1.8) + 0.1 + return ImageEnhance.Sharpness(pil_img).enhance(level) + + +augmentations = [ + autocontrast, equalize, posterize, rotate, solarize, shear_x, shear_y, + translate_x, translate_y +] diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/grid.py b/src/PaddleClas/ppcls/data/preprocess/ops/grid.py new file mode 100644 index 0000000..6f0b2dc --- /dev/null +++ b/src/PaddleClas/ppcls/data/preprocess/ops/grid.py @@ -0,0 +1,89 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is based on https://github.com/akuxcw/GridMask + +import numpy as np +from PIL import Image +import pdb + +# curr +CURR_EPOCH = 0 +# epoch for the prob to be the upper limit +NUM_EPOCHS = 240 + + +class GridMask(object): + def __init__(self, d1=96, d2=224, rotate=1, ratio=0.5, mode=0, prob=1.): + self.d1 = d1 + self.d2 = d2 + self.rotate = rotate + self.ratio = ratio + self.mode = mode + self.st_prob = prob + self.prob = prob + self.last_prob = -1 + + def set_prob(self): + global CURR_EPOCH + global NUM_EPOCHS + self.prob = self.st_prob * min(1, 1.0 * CURR_EPOCH / NUM_EPOCHS) + + def __call__(self, img): + self.set_prob() + if abs(self.last_prob - self.prob) > 1e-10: + global CURR_EPOCH + global NUM_EPOCHS + print( + "self.prob is updated, self.prob={}, CURR_EPOCH: {}, NUM_EPOCHS: {}". 
+ format(self.prob, CURR_EPOCH, NUM_EPOCHS)) + self.last_prob = self.prob + # print("CURR_EPOCH: {}, NUM_EPOCHS: {}, self.prob is set as: {}".format(CURR_EPOCH, NUM_EPOCHS, self.prob) ) + if np.random.rand() > self.prob: + return img + _, h, w = img.shape + hh = int(1.5 * h) + ww = int(1.5 * w) + d = np.random.randint(self.d1, self.d2) + #d = self.d + self.l = int(d * self.ratio + 0.5) + mask = np.ones((hh, ww), np.float32) + st_h = np.random.randint(d) + st_w = np.random.randint(d) + for i in range(-1, hh // d + 1): + s = d * i + st_h + t = s + self.l + s = max(min(s, hh), 0) + t = max(min(t, hh), 0) + mask[s:t, :] *= 0 + for i in range(-1, ww // d + 1): + s = d * i + st_w + t = s + self.l + s = max(min(s, ww), 0) + t = max(min(t, ww), 0) + mask[:, s:t] *= 0 + r = np.random.randint(self.rotate) + mask = Image.fromarray(np.uint8(mask)) + mask = mask.rotate(r) + mask = np.asarray(mask) + mask = mask[(hh - h) // 2:(hh - h) // 2 + h, (ww - w) // 2:(ww - w) // + 2 + w] + + if self.mode == 1: + mask = 1 - mask + + mask = np.expand_dims(mask, axis=0) + img = (img * mask).astype(img.dtype) + + return img diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/hide_and_seek.py b/src/PaddleClas/ppcls/data/preprocess/ops/hide_and_seek.py new file mode 100644 index 0000000..33f25f2 --- /dev/null +++ b/src/PaddleClas/ppcls/data/preprocess/ops/hide_and_seek.py @@ -0,0 +1,44 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is based on https://github.com/kkanshul/Hide-and-Seek + +import numpy as np +import random + + +class HideAndSeek(object): + def __init__(self): + # possible grid size, 0 means no hiding + self.grid_sizes = [0, 16, 32, 44, 56] + # hiding probability + self.hide_prob = 0.5 + + def __call__(self, img): + # randomly choose one grid size + grid_size = np.random.choice(self.grid_sizes) + + _, h, w = img.shape + + # hide the patches + if grid_size == 0: + return img + for x in range(0, w, grid_size): + for y in range(0, h, grid_size): + x_end = min(w, x + grid_size) + y_end = min(h, y + grid_size) + if (random.random() <= self.hide_prob): + img[:, x:x_end, y:y_end] = 0 + + return img diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/operators.py b/src/PaddleClas/ppcls/data/preprocess/ops/operators.py new file mode 100644 index 0000000..9cdc58b --- /dev/null +++ b/src/PaddleClas/ppcls/data/preprocess/ops/operators.py @@ -0,0 +1,384 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from functools import partial
+import six
+import math
+import random
+import cv2
+import numpy as np
+from PIL import Image
+from paddle.vision.transforms import ColorJitter as RawColorJitter
+
+from .autoaugment import ImageNetPolicy
+from .functional import augmentations
+from ppcls.utils import logger
+
+
+class UnifiedResize(object):
+    def __init__(self, interpolation=None, backend="cv2"):
+        _cv2_interp_from_str = {
+            'nearest': cv2.INTER_NEAREST,
+            'bilinear': cv2.INTER_LINEAR,
+            'area': cv2.INTER_AREA,
+            'bicubic': cv2.INTER_CUBIC,
+            'lanczos': cv2.INTER_LANCZOS4
+        }
+        _pil_interp_from_str = {
+            'nearest': Image.NEAREST,
+            'bilinear': Image.BILINEAR,
+            'bicubic': Image.BICUBIC,
+            'box': Image.BOX,
+            'lanczos': Image.LANCZOS,
+            'hamming': Image.HAMMING
+        }
+
+        def _pil_resize(src, size, resample):
+            pil_img = Image.fromarray(src)
+            pil_img = pil_img.resize(size, resample)
+            return np.asarray(pil_img)
+
+        if backend.lower() == "cv2":
+            if isinstance(interpolation, str):
+                interpolation = _cv2_interp_from_str[interpolation.lower()]
+            # compatible with opencv < version 4.4.0
+            elif interpolation is None:
+                interpolation = cv2.INTER_LINEAR
+            self.resize_func = partial(cv2.resize, interpolation=interpolation)
+        elif backend.lower() == "pil":
+            if isinstance(interpolation, str):
+                interpolation = _pil_interp_from_str[interpolation.lower()]
+            self.resize_func = partial(_pil_resize, resample=interpolation)
+        else:
+            logger.warning(
+                f"The backend of Resize only supports \"cv2\" or \"PIL\". \"{backend}\" is unavailable. Use \"cv2\" instead."
+ ) + self.resize_func = cv2.resize + + def __call__(self, src, size): + return self.resize_func(src, size) + + +class OperatorParamError(ValueError): + """ OperatorParamError + """ + pass + + +class DecodeImage(object): + """ decode image """ + + def __init__(self, to_rgb=True, to_np=False, channel_first=False): + self.to_rgb = to_rgb + self.to_np = to_np # to numpy + self.channel_first = channel_first # only enabled when to_np is True + + def __call__(self, img): + if six.PY2: + assert type(img) is str and len( + img) > 0, "invalid input 'img' in DecodeImage" + else: + assert type(img) is bytes and len( + img) > 0, "invalid input 'img' in DecodeImage" + data = np.frombuffer(img, dtype='uint8') + img = cv2.imdecode(data, 1) + if self.to_rgb: + assert img.shape[2] == 3, 'invalid shape of image[%s]' % ( + img.shape) + img = img[:, :, ::-1] + + if self.channel_first: + img = img.transpose((2, 0, 1)) + + return img + + +class ResizeImage(object): + """ resize image """ + + def __init__(self, + size=None, + resize_short=None, + interpolation=None, + backend="cv2"): + if resize_short is not None and resize_short > 0: + self.resize_short = resize_short + self.w = None + self.h = None + elif size is not None: + self.resize_short = None + self.w = size if type(size) is int else size[0] + self.h = size if type(size) is int else size[1] + else: + raise OperatorParamError("invalid params for ReisizeImage for '\ + 'both 'size' and 'resize_short' are None") + + self._resize_func = UnifiedResize( + interpolation=interpolation, backend=backend) + + def __call__(self, img): + img_h, img_w = img.shape[:2] + if self.resize_short is not None: + percent = float(self.resize_short) / min(img_w, img_h) + w = int(round(img_w * percent)) + h = int(round(img_h * percent)) + else: + w = self.w + h = self.h + return self._resize_func(img, (w, h)) + + +class CropImage(object): + """ crop image """ + + def __init__(self, size): + if type(size) is int: + self.size = (size, size) + else: + self.size = size # (h, w) + + def __call__(self, img): + w, h = self.size + img_h, img_w = img.shape[:2] + w_start = (img_w - w) // 2 + h_start = (img_h - h) // 2 + + w_end = w_start + w + h_end = h_start + h + return img[h_start:h_end, w_start:w_end, :] + + +class RandCropImage(object): + """ random crop image """ + + def __init__(self, + size, + scale=None, + ratio=None, + interpolation=None, + backend="cv2"): + if type(size) is int: + self.size = (size, size) # (h, w) + else: + self.size = size + + self.scale = [0.08, 1.0] if scale is None else scale + self.ratio = [3. / 4., 4. / 3.] if ratio is None else ratio + + self._resize_func = UnifiedResize( + interpolation=interpolation, backend=backend) + + def __call__(self, img): + size = self.size + scale = self.scale + ratio = self.ratio + + aspect_ratio = math.sqrt(random.uniform(*ratio)) + w = 1. * aspect_ratio + h = 1. 
/ aspect_ratio
+
+        img_h, img_w = img.shape[:2]
+
+        bound = min((float(img_w) / img_h) / (w**2),
+                    (float(img_h) / img_w) / (h**2))
+        scale_max = min(scale[1], bound)
+        scale_min = min(scale[0], bound)
+
+        target_area = img_w * img_h * random.uniform(scale_min, scale_max)
+        target_size = math.sqrt(target_area)
+        w = int(target_size * w)
+        h = int(target_size * h)
+
+        i = random.randint(0, img_w - w)
+        j = random.randint(0, img_h - h)
+
+        img = img[j:j + h, i:i + w, :]
+
+        return self._resize_func(img, size)
+
+
+class RandFlipImage(object):
+    """ random flip image
+        flip_code:
+            1: Flipped Horizontally
+            0: Flipped Vertically
+            -1: Flipped Horizontally & Vertically
+    """
+
+    def __init__(self, flip_code=1):
+        assert flip_code in [-1, 0, 1], "flip_code should be a value in [-1, 0, 1]"
+        self.flip_code = flip_code
+
+    def __call__(self, img):
+        if random.randint(0, 1) == 1:
+            return cv2.flip(img, self.flip_code)
+        else:
+            return img
+
+
+class AutoAugment(object):
+    def __init__(self):
+        self.policy = ImageNetPolicy()
+
+    def __call__(self, img):
+        from PIL import Image
+        img = np.ascontiguousarray(img)
+        img = Image.fromarray(img)
+        img = self.policy(img)
+        img = np.asarray(img)
+        # the converted array must be returned; without this the op yields None
+        return img
+
+
+class NormalizeImage(object):
+    """ normalize image, i.e. subtract mean and divide by std
+    """
+
+    def __init__(self,
+                 scale=None,
+                 mean=None,
+                 std=None,
+                 order='chw',
+                 output_fp16=False,
+                 channel_num=3):
+        if isinstance(scale, str):
+            scale = eval(scale)
+        assert channel_num in [3, 4], "channel number of input image should be set to 3 or 4."
+        self.channel_num = channel_num
+        self.output_dtype = 'float16' if output_fp16 else 'float32'
+        self.scale = np.float32(scale if scale is not None else 1.0 / 255.0)
+        self.order = order
+        mean = mean if mean is not None else [0.485, 0.456, 0.406]
+        std = std if std is not None else [0.229, 0.224, 0.225]
+
+        shape = (3, 1, 1) if self.order == 'chw' else (1, 1, 3)
+        self.mean = np.array(mean).reshape(shape).astype('float32')
+        self.std = np.array(std).reshape(shape).astype('float32')
+
+    def __call__(self, img):
+        from PIL import Image
+        if isinstance(img, Image.Image):
+            img = np.array(img)
+
+        assert isinstance(img, np.ndarray), "invalid input 'img' in NormalizeImage"
+
+        img = (img.astype('float32') * self.scale - self.mean) / self.std
+
+        if self.channel_num == 4:
+            img_h = img.shape[1] if self.order == 'chw' else img.shape[0]
+            img_w = img.shape[2] if self.order == 'chw' else img.shape[1]
+            pad_zeros = np.zeros(
+                (1, img_h, img_w)) if self.order == 'chw' else np.zeros(
+                    (img_h, img_w, 1))
+            img = (np.concatenate(
+                (img, pad_zeros), axis=0)
+                   if self.order == 'chw' else np.concatenate(
+                       (img, pad_zeros), axis=2))
+        return img.astype(self.output_dtype)
+
+
+class ToCHWImage(object):
+    """ convert hwc image to chw image
+    """
+
+    def __init__(self):
+        pass
+
+    def __call__(self, img):
+        from PIL import Image
+        if isinstance(img, Image.Image):
+            img = np.array(img)
+
+        return img.transpose((2, 0, 1))
+
+
+class AugMix(object):
+    """ Perform AugMix augmentation and compute mixture.
+    """
+
+    def __init__(self,
+                 prob=0.5,
+                 aug_prob_coeff=0.1,
+                 mixture_width=3,
+                 mixture_depth=1,
+                 aug_severity=1):
+        """
+        Args:
+            prob: Probability of taking AugMix.
+            aug_prob_coeff: Probability distribution coefficients.
+            mixture_width: Number of augmentation chains to mix per augmented example.
+            mixture_depth: Depth of augmentation chains; -1 denotes stochastic depth in [1, 3].
+            aug_severity: Severity of underlying augmentation operators (between 1 and 10).
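+            The final output is ``(1 - m) * image + m * mix``, where ``m`` is drawn
+            from Beta(aug_prob_coeff, aug_prob_coeff) and the per-chain weights
+            come from a Dirichlet with the same coefficient.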
+ """ + # fmt: off + self.prob = prob + self.aug_prob_coeff = aug_prob_coeff + self.mixture_width = mixture_width + self.mixture_depth = mixture_depth + self.aug_severity = aug_severity + self.augmentations = augmentations + # fmt: on + + def __call__(self, image): + """Perform AugMix augmentations and compute mixture. + Returns: + mixed: Augmented and mixed image. + """ + if random.random() > self.prob: + # Avoid the warning: the given NumPy array is not writeable + return np.asarray(image).copy() + + ws = np.float32( + np.random.dirichlet([self.aug_prob_coeff] * self.mixture_width)) + m = np.float32( + np.random.beta(self.aug_prob_coeff, self.aug_prob_coeff)) + + # image = Image.fromarray(image) + mix = np.zeros(image.shape) + for i in range(self.mixture_width): + image_aug = image.copy() + image_aug = Image.fromarray(image_aug) + depth = self.mixture_depth if self.mixture_depth > 0 else np.random.randint( + 1, 4) + for _ in range(depth): + op = np.random.choice(self.augmentations) + image_aug = op(image_aug, self.aug_severity) + mix += ws[i] * np.asarray(image_aug) + + mixed = (1 - m) * image + m * mix + return mixed.astype(np.uint8) + + +class ColorJitter(RawColorJitter): + """ColorJitter. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __call__(self, img): + if not isinstance(img, Image.Image): + img = np.ascontiguousarray(img) + img = Image.fromarray(img) + img = super()._apply_image(img) + if isinstance(img, Image.Image): + img = np.asarray(img) + return img diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/randaugment.py b/src/PaddleClas/ppcls/data/preprocess/ops/randaugment.py new file mode 100644 index 0000000..cca59da --- /dev/null +++ b/src/PaddleClas/ppcls/data/preprocess/ops/randaugment.py @@ -0,0 +1,106 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
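+
+# A minimal usage sketch (assuming an input PIL.Image named `img`):
+#
+#     aug = RandAugment(num_layers=2, magnitude=5)
+#     img = aug(img)  # applies num_layers randomly chosen ops at the fixed magnitude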
+ +# This code is based on https://github.com/heartInsert/randaugment + +from PIL import Image, ImageEnhance, ImageOps +import numpy as np +import random + + +class RandAugment(object): + def __init__(self, num_layers=2, magnitude=5, fillcolor=(128, 128, 128)): + self.num_layers = num_layers + self.magnitude = magnitude + self.max_level = 10 + + abso_level = self.magnitude / self.max_level + self.level_map = { + "shearX": 0.3 * abso_level, + "shearY": 0.3 * abso_level, + "translateX": 150.0 / 331 * abso_level, + "translateY": 150.0 / 331 * abso_level, + "rotate": 30 * abso_level, + "color": 0.9 * abso_level, + "posterize": int(4.0 * abso_level), + "solarize": 256.0 * abso_level, + "contrast": 0.9 * abso_level, + "sharpness": 0.9 * abso_level, + "brightness": 0.9 * abso_level, + "autocontrast": 0, + "equalize": 0, + "invert": 0 + } + + # from https://stackoverflow.com/questions/5252170/ + # specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand + def rotate_with_fill(img, magnitude): + rot = img.convert("RGBA").rotate(magnitude) + return Image.composite(rot, + Image.new("RGBA", rot.size, (128, ) * 4), + rot).convert(img.mode) + + rnd_ch_op = random.choice + + self.func = { + "shearX": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, magnitude * rnd_ch_op([-1, 1]), 0, 0, 1, 0), + Image.BICUBIC, + fillcolor=fillcolor), + "shearY": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, 0, magnitude * rnd_ch_op([-1, 1]), 1, 0), + Image.BICUBIC, + fillcolor=fillcolor), + "translateX": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, magnitude * img.size[0] * rnd_ch_op([-1, 1]), 0, 1, 0), + fillcolor=fillcolor), + "translateY": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, 0, 0, 1, magnitude * img.size[1] * rnd_ch_op([-1, 1])), + fillcolor=fillcolor), + "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude), + "color": lambda img, magnitude: ImageEnhance.Color(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "posterize": lambda img, magnitude: + ImageOps.posterize(img, magnitude), + "solarize": lambda img, magnitude: + ImageOps.solarize(img, magnitude), + "contrast": lambda img, magnitude: + ImageEnhance.Contrast(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "sharpness": lambda img, magnitude: + ImageEnhance.Sharpness(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "brightness": lambda img, magnitude: + ImageEnhance.Brightness(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "autocontrast": lambda img, magnitude: + ImageOps.autocontrast(img), + "equalize": lambda img, magnitude: ImageOps.equalize(img), + "invert": lambda img, magnitude: ImageOps.invert(img) + } + + def __call__(self, img): + avaiable_op_names = list(self.level_map.keys()) + for layer_num in range(self.num_layers): + op_name = np.random.choice(avaiable_op_names) + img = self.func[op_name](img, self.level_map[op_name]) + return img diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/random_erasing.py b/src/PaddleClas/ppcls/data/preprocess/ops/random_erasing.py new file mode 100644 index 0000000..f234abb --- /dev/null +++ b/src/PaddleClas/ppcls/data/preprocess/ops/random_erasing.py @@ -0,0 +1,90 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#This code is adapted from https://github.com/zhunzhong07/Random-Erasing, and refer to Timm. + +from functools import partial + +import math +import random + +import numpy as np + + +class Pixels(object): + def __init__(self, mode="const", mean=[0., 0., 0.]): + self._mode = mode + self._mean = mean + + def __call__(self, h=224, w=224, c=3): + if self._mode == "rand": + return np.random.normal(size=(1, 1, 3)) + elif self._mode == "pixel": + return np.random.normal(size=(h, w, c)) + elif self._mode == "const": + return self._mean + else: + raise Exception( + "Invalid mode in RandomErasing, only support \"const\", \"rand\", \"pixel\"" + ) + + +class RandomErasing(object): + """RandomErasing. + """ + + def __init__(self, + EPSILON=0.5, + sl=0.02, + sh=0.4, + r1=0.3, + mean=[0., 0., 0.], + attempt=100, + use_log_aspect=False, + mode='const'): + self.EPSILON = eval(EPSILON) if isinstance(EPSILON, str) else EPSILON + self.sl = eval(sl) if isinstance(sl, str) else sl + self.sh = eval(sh) if isinstance(sh, str) else sh + r1 = eval(r1) if isinstance(r1, str) else r1 + self.r1 = (math.log(r1), math.log(1 / r1)) if use_log_aspect else ( + r1, 1 / r1) + self.use_log_aspect = use_log_aspect + self.attempt = attempt + self.get_pixels = Pixels(mode, mean) + + def __call__(self, img): + if random.random() > self.EPSILON: + return img + + for _ in range(self.attempt): + area = img.shape[0] * img.shape[1] + + target_area = random.uniform(self.sl, self.sh) * area + aspect_ratio = random.uniform(*self.r1) + if self.use_log_aspect: + aspect_ratio = math.exp(aspect_ratio) + + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + + if w < img.shape[1] and h < img.shape[0]: + pixels = self.get_pixels(h, w, img.shape[2]) + x1 = random.randint(0, img.shape[0] - h) + y1 = random.randint(0, img.shape[1] - w) + if img.shape[2] == 3: + img[x1:x1 + h, y1:y1 + w, :] = pixels + else: + img[x1:x1 + h, y1:y1 + w, 0] = pixels[0] + return img + return img diff --git a/src/PaddleClas/ppcls/data/preprocess/ops/timm_autoaugment.py b/src/PaddleClas/ppcls/data/preprocess/ops/timm_autoaugment.py new file mode 100644 index 0000000..dd2994d --- /dev/null +++ b/src/PaddleClas/ppcls/data/preprocess/ops/timm_autoaugment.py @@ -0,0 +1,877 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
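+
+# A minimal usage sketch (assuming an input PIL.Image named `img`); the
+# RawTimmAutoAugment class at the bottom of this file is the main entry point:
+#
+#     aug = RawTimmAutoAugment(config_str="rand-m9-mstd0.5-inc1", img_size=224)
+#     img = aug(img)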
+ +# Code was heavily based on https://github.com/rwightman/pytorch-image-models + +import random +import math +import re +from PIL import Image, ImageOps, ImageEnhance, ImageChops +import PIL +import numpy as np + +IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) + +_PIL_VER = tuple([int(x) for x in PIL.__version__.split('.')[:2]]) + +_FILL = (128, 128, 128) + +# This signifies the max integer that the controller RNN could predict for the +# augmentation scheme. +_MAX_LEVEL = 10. + +_HPARAMS_DEFAULT = dict( + translate_const=250, + img_mean=_FILL, ) + +_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC) + + +def _pil_interp(method): + if method == 'bicubic': + return Image.BICUBIC + elif method == 'lanczos': + return Image.LANCZOS + elif method == 'hamming': + return Image.HAMMING + else: + # default bilinear, do we want to allow nearest? + return Image.BILINEAR + + +def _interpolation(kwargs): + interpolation = kwargs.pop('resample', Image.BILINEAR) + if isinstance(interpolation, (list, tuple)): + return random.choice(interpolation) + else: + return interpolation + + +def _check_args_tf(kwargs): + if 'fillcolor' in kwargs and _PIL_VER < (5, 0): + kwargs.pop('fillcolor') + kwargs['resample'] = _interpolation(kwargs) + + +def shear_x(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), + **kwargs) + + +def shear_y(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), + **kwargs) + + +def translate_x_rel(img, pct, **kwargs): + pixels = pct * img.size[0] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), + **kwargs) + + +def translate_y_rel(img, pct, **kwargs): + pixels = pct * img.size[1] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), + **kwargs) + + +def translate_x_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), + **kwargs) + + +def translate_y_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), + **kwargs) + + +def rotate(img, degrees, **kwargs): + _check_args_tf(kwargs) + if _PIL_VER >= (5, 2): + return img.rotate(degrees, **kwargs) + elif _PIL_VER >= (5, 0): + w, h = img.size + post_trans = (0, 0) + rotn_center = (w / 2.0, h / 2.0) + angle = -math.radians(degrees) + matrix = [ + round(math.cos(angle), 15), + round(math.sin(angle), 15), + 0.0, + round(-math.sin(angle), 15), + round(math.cos(angle), 15), + 0.0, + ] + + def transform(x, y, matrix): + (a, b, c, d, e, f) = matrix + return a * x + b * y + c, d * x + e * y + f + + matrix[2], matrix[5] = transform(-rotn_center[0] - post_trans[0], + -rotn_center[1] - post_trans[1], + matrix) + matrix[2] += rotn_center[0] + matrix[5] += rotn_center[1] + return img.transform(img.size, Image.AFFINE, matrix, **kwargs) + else: + return img.rotate(degrees, resample=kwargs['resample']) + + +def auto_contrast(img, **__): + return ImageOps.autocontrast(img) + + +def invert(img, **__): + return ImageOps.invert(img) + + +def equalize(img, **__): + return ImageOps.equalize(img) + + +def solarize(img, thresh, **__): + return ImageOps.solarize(img, thresh) + + +def solarize_add(img, add, thresh=128, **__): + lut = [] + for i in range(256): + if i < thresh: + lut.append(min(255, i + add)) + else: + lut.append(i) + if img.mode in ("L", "RGB"): + if img.mode == "RGB" and len(lut) == 
256: + lut = lut + lut + lut + return img.point(lut) + else: + return img + + +def posterize(img, bits_to_keep, **__): + if bits_to_keep >= 8: + return img + return ImageOps.posterize(img, bits_to_keep) + + +def contrast(img, factor, **__): + return ImageEnhance.Contrast(img).enhance(factor) + + +def color(img, factor, **__): + return ImageEnhance.Color(img).enhance(factor) + + +def brightness(img, factor, **__): + return ImageEnhance.Brightness(img).enhance(factor) + + +def sharpness(img, factor, **__): + return ImageEnhance.Sharpness(img).enhance(factor) + + +def _randomly_negate(v): + """With 50% prob, negate the value""" + return -v if random.random() > 0.5 else v + + +def _rotate_level_to_arg(level, _hparams): + # range [-30, 30] + level = (level / _MAX_LEVEL) * 30. + level = _randomly_negate(level) + return level, + + +def _enhance_level_to_arg(level, _hparams): + # range [0.1, 1.9] + return (level / _MAX_LEVEL) * 1.8 + 0.1, + + +def _enhance_increasing_level_to_arg(level, _hparams): + # the 'no change' level is 1.0, moving away from that towards 0. or 2.0 increases the enhancement blend + # range [0.1, 1.9] + level = (level / _MAX_LEVEL) * .9 + level = 1.0 + _randomly_negate(level) + return level, + + +def _shear_level_to_arg(level, _hparams): + # range [-0.3, 0.3] + level = (level / _MAX_LEVEL) * 0.3 + level = _randomly_negate(level) + return level, + + +def _translate_abs_level_to_arg(level, hparams): + translate_const = hparams['translate_const'] + level = (level / _MAX_LEVEL) * float(translate_const) + level = _randomly_negate(level) + return level, + + +def _translate_rel_level_to_arg(level, hparams): + # default range [-0.45, 0.45] + translate_pct = hparams.get('translate_pct', 0.45) + level = (level / _MAX_LEVEL) * translate_pct + level = _randomly_negate(level) + return level, + + +def _posterize_level_to_arg(level, _hparams): + # As per Tensorflow TPU EfficientNet impl + # range [0, 4], 'keep 0 up to 4 MSB of original image' + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 4), + + +def _posterize_increasing_level_to_arg(level, hparams): + # As per Tensorflow models research and UDA impl + # range [4, 0], 'keep 4 down to 0 MSB of original image', + # intensity/severity of augmentation increases with level + return 4 - _posterize_level_to_arg(level, hparams)[0], + + +def _posterize_original_level_to_arg(level, _hparams): + # As per original AutoAugment paper description + # range [4, 8], 'keep 4 up to 8 MSB of image' + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 4) + 4, + + +def _solarize_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 256), + + +def _solarize_increasing_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation increases with level + return 256 - _solarize_level_to_arg(level, _hparams)[0], + + +def _solarize_add_level_to_arg(level, _hparams): + # range [0, 110] + return int((level / _MAX_LEVEL) * 110), + + +LEVEL_TO_ARG = { + 'AutoContrast': None, + 'Equalize': None, + 'Invert': None, + 'Rotate': _rotate_level_to_arg, + # There are several variations of the posterize level scaling in various Tensorflow/Google repositories/papers + 'Posterize': _posterize_level_to_arg, + 'PosterizeIncreasing': _posterize_increasing_level_to_arg, + 'PosterizeOriginal': _posterize_original_level_to_arg, + 'Solarize': _solarize_level_to_arg, + 
'SolarizeIncreasing': _solarize_increasing_level_to_arg, + 'SolarizeAdd': _solarize_add_level_to_arg, + 'Color': _enhance_level_to_arg, + 'ColorIncreasing': _enhance_increasing_level_to_arg, + 'Contrast': _enhance_level_to_arg, + 'ContrastIncreasing': _enhance_increasing_level_to_arg, + 'Brightness': _enhance_level_to_arg, + 'BrightnessIncreasing': _enhance_increasing_level_to_arg, + 'Sharpness': _enhance_level_to_arg, + 'SharpnessIncreasing': _enhance_increasing_level_to_arg, + 'ShearX': _shear_level_to_arg, + 'ShearY': _shear_level_to_arg, + 'TranslateX': _translate_abs_level_to_arg, + 'TranslateY': _translate_abs_level_to_arg, + 'TranslateXRel': _translate_rel_level_to_arg, + 'TranslateYRel': _translate_rel_level_to_arg, +} + +NAME_TO_OP = { + 'AutoContrast': auto_contrast, + 'Equalize': equalize, + 'Invert': invert, + 'Rotate': rotate, + 'Posterize': posterize, + 'PosterizeIncreasing': posterize, + 'PosterizeOriginal': posterize, + 'Solarize': solarize, + 'SolarizeIncreasing': solarize, + 'SolarizeAdd': solarize_add, + 'Color': color, + 'ColorIncreasing': color, + 'Contrast': contrast, + 'ContrastIncreasing': contrast, + 'Brightness': brightness, + 'BrightnessIncreasing': brightness, + 'Sharpness': sharpness, + 'SharpnessIncreasing': sharpness, + 'ShearX': shear_x, + 'ShearY': shear_y, + 'TranslateX': translate_x_abs, + 'TranslateY': translate_y_abs, + 'TranslateXRel': translate_x_rel, + 'TranslateYRel': translate_y_rel, +} + + +class AugmentOp(object): + def __init__(self, name, prob=0.5, magnitude=10, hparams=None): + hparams = hparams or _HPARAMS_DEFAULT + self.aug_fn = NAME_TO_OP[name] + self.level_fn = LEVEL_TO_ARG[name] + self.prob = prob + self.magnitude = magnitude + self.hparams = hparams.copy() + self.kwargs = dict( + fillcolor=hparams['img_mean'] if 'img_mean' in hparams else _FILL, + resample=hparams['interpolation'] + if 'interpolation' in hparams else _RANDOM_INTERPOLATION, ) + + # If magnitude_std is > 0, we introduce some randomness + # in the usually fixed policy and sample magnitude from a normal distribution + # with mean `magnitude` and std-dev of `magnitude_std`. + # NOTE This is my own hack, being tested, not in papers or reference impls. + self.magnitude_std = self.hparams.get('magnitude_std', 0) + + def __call__(self, img): + if self.prob < 1.0 and random.random() > self.prob: + return img + magnitude = self.magnitude + if self.magnitude_std and self.magnitude_std > 0: + magnitude = random.gauss(magnitude, self.magnitude_std) + magnitude = min(_MAX_LEVEL, max(0, magnitude)) # clip to valid range + level_args = self.level_fn( + magnitude, self.hparams) if self.level_fn is not None else tuple() + return self.aug_fn(img, *level_args, **self.kwargs) + + +def auto_augment_policy_v0(hparams): + # ImageNet v0 policy from TPU EfficientNet impl, cannot find a paper reference. 
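+    # Each sub-policy is a pair of ('OpName', probability, magnitude 0-10)
+    # tuples; one sub-policy is sampled per image and both ops are applied in
+    # order (see the AutoAugment class below). Ops mapped to None in
+    # LEVEL_TO_ARG ignore the magnitude.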
+ policy = [ + [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)], + [('Color', 0.4, 9), ('Equalize', 0.6, 3)], + [('Color', 0.4, 1), ('Rotate', 0.6, 8)], + [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], + [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], + [('Color', 0.2, 0), ('Equalize', 0.8, 8)], + [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], + [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)], + [('Color', 0.6, 1), ('Equalize', 1.0, 2)], + [('Invert', 0.4, 9), ('Rotate', 0.6, 0)], + [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)], + [('Color', 0.4, 7), ('Equalize', 0.6, 0)], + [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)], + [('Solarize', 0.6, 8), ('Color', 0.6, 9)], + [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)], + [('Rotate', 1.0, 7), ('TranslateYRel', 0.8, 9)], + [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)], + [('ShearY', 0.8, 0), ('Color', 0.6, 4)], + [('Color', 1.0, 0), ('Rotate', 0.6, 2)], + [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], + [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], + [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)], + [('Posterize', 0.8, 2), ('Solarize', 0.6, 10) + ], # This results in black image with Tpu posterize + [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], + [('Color', 0.8, 6), ('Rotate', 0.4, 5)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_v0r(hparams): + # ImageNet v0 policy from TPU EfficientNet impl, with variation of Posterize used + # in Google research implementation (number of bits discarded increases with magnitude) + policy = [ + [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)], + [('Color', 0.4, 9), ('Equalize', 0.6, 3)], + [('Color', 0.4, 1), ('Rotate', 0.6, 8)], + [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], + [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], + [('Color', 0.2, 0), ('Equalize', 0.8, 8)], + [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], + [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)], + [('Color', 0.6, 1), ('Equalize', 1.0, 2)], + [('Invert', 0.4, 9), ('Rotate', 0.6, 0)], + [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)], + [('Color', 0.4, 7), ('Equalize', 0.6, 0)], + [('PosterizeIncreasing', 0.4, 6), ('AutoContrast', 0.4, 7)], + [('Solarize', 0.6, 8), ('Color', 0.6, 9)], + [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)], + [('Rotate', 1.0, 7), ('TranslateYRel', 0.8, 9)], + [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)], + [('ShearY', 0.8, 0), ('Color', 0.6, 4)], + [('Color', 1.0, 0), ('Rotate', 0.6, 2)], + [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], + [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], + [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)], + [('PosterizeIncreasing', 0.8, 2), ('Solarize', 0.6, 10)], + [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], + [('Color', 0.8, 6), ('Rotate', 0.4, 5)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_original(hparams): + # ImageNet policy from https://arxiv.org/abs/1805.09501 + policy = [ + [('PosterizeOriginal', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('PosterizeOriginal', 0.6, 7), ('PosterizeOriginal', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('PosterizeOriginal', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('PosterizeOriginal', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], 
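+        # NOTE: several sub-policies in this 25-entry list repeat verbatim;
+        # duplicates simply receive a proportionally higher sampling weight.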
+ [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_originalr(hparams): + # ImageNet policy from https://arxiv.org/abs/1805.09501 with research posterize variation + policy = [ + [('PosterizeIncreasing', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('PosterizeIncreasing', 0.6, 7), ('PosterizeIncreasing', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('PosterizeIncreasing', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('PosterizeIncreasing', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], + [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy(name='v0', hparams=None): + hparams = hparams or _HPARAMS_DEFAULT + if name == 'original': + return auto_augment_policy_original(hparams) + elif name == 'originalr': + return auto_augment_policy_originalr(hparams) + elif name == 'v0': + return auto_augment_policy_v0(hparams) + elif name == 'v0r': + return auto_augment_policy_v0r(hparams) + else: + assert False, 'Unknown AA policy (%s)' % name + + +class AutoAugment(object): + def __init__(self, policy): + self.policy = policy + + def __call__(self, img): + sub_policy = random.choice(self.policy) + for op in sub_policy: + img = op(img) + return img + + +def auto_augment_transform(config_str, hparams): + """ + Create a AutoAugment transform + + :param config_str: String defining configuration of auto augmentation. Consists of multiple sections separated by + dashes ('-'). The first section defines the AutoAugment policy (one of 'v0', 'v0r', 'original', 'originalr'). 
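+        (the 'r' variants 'v0r'/'originalr' swap in the Posterize scaling used in the Google research implementation).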
+        The remaining sections, which are not order specific, determine
+        'mstd' - float std deviation of magnitude noise applied
+        Ex 'original-mstd0.5' results in AutoAugment with original policy, magnitude_std 0.5
+
+    :param hparams: Other hparams (kwargs) for the AutoAugmentation scheme
+
+    :return: A callable Transform Op
+    """
+    config = config_str.split('-')
+    policy_name = config[0]
+    config = config[1:]
+    for c in config:
+        cs = re.split(r'(\d.*)', c)
+        if len(cs) < 2:
+            continue
+        key, val = cs[:2]
+        if key == 'mstd':
+            # noise param injected via hparams for now
+            hparams.setdefault('magnitude_std', float(val))
+        else:
+            assert False, 'Unknown AutoAugment config section'
+    aa_policy = auto_augment_policy(policy_name, hparams=hparams)
+    return AutoAugment(aa_policy)
+
+
+_RAND_TRANSFORMS = [
+    'AutoContrast',
+    'Equalize',
+    'Invert',
+    'Rotate',
+    'Posterize',
+    'Solarize',
+    'SolarizeAdd',
+    'Color',
+    'Contrast',
+    'Brightness',
+    'Sharpness',
+    'ShearX',
+    'ShearY',
+    'TranslateXRel',
+    'TranslateYRel',
+    #'Cutout'  # NOTE I've implemented this as random erasing separately
+]
+
+_RAND_INCREASING_TRANSFORMS = [
+    'AutoContrast',
+    'Equalize',
+    'Invert',
+    'Rotate',
+    'PosterizeIncreasing',
+    'SolarizeIncreasing',
+    'SolarizeAdd',
+    'ColorIncreasing',
+    'ContrastIncreasing',
+    'BrightnessIncreasing',
+    'SharpnessIncreasing',
+    'ShearX',
+    'ShearY',
+    'TranslateXRel',
+    'TranslateYRel',
+    #'Cutout'  # NOTE I've implemented this as random erasing separately
+]
+
+# These experimental weights are based loosely on the relative improvements mentioned in the paper.
+# They may not result in increased performance, but could likely be tuned to do so.
+_RAND_CHOICE_WEIGHTS_0 = {
+    'Rotate': 0.3,
+    'ShearX': 0.2,
+    'ShearY': 0.2,
+    'TranslateXRel': 0.1,
+    'TranslateYRel': 0.1,
+    'Color': .025,
+    'Sharpness': 0.025,
+    'AutoContrast': 0.025,
+    'Solarize': .005,
+    'SolarizeAdd': .005,
+    'Contrast': .005,
+    'Brightness': .005,
+    'Equalize': .005,
+    'Posterize': 0,
+    'Invert': 0,
+}
+
+
+def _select_rand_weights(weight_idx=0, transforms=None):
+    transforms = transforms or _RAND_TRANSFORMS
+    assert weight_idx == 0  # only one set of weights currently
+    rand_weights = _RAND_CHOICE_WEIGHTS_0
+    probs = [rand_weights[k] for k in transforms]
+    probs /= np.sum(probs)
+    return probs
+
+
+def rand_augment_ops(magnitude=10, hparams=None, transforms=None):
+    hparams = hparams or _HPARAMS_DEFAULT
+    transforms = transforms or _RAND_TRANSFORMS
+    return [
+        AugmentOp(
+            name, prob=0.5, magnitude=magnitude, hparams=hparams)
+        for name in transforms
+    ]
+
+
+class RandAugment(object):
+    def __init__(self, ops, num_layers=2, choice_weights=None):
+        self.ops = ops
+        self.num_layers = num_layers
+        self.choice_weights = choice_weights
+
+    def __call__(self, img):
+        # no replacement when using weighted choice
+        ops = np.random.choice(
+            self.ops,
+            self.num_layers,
+            replace=self.choice_weights is None,
+            p=self.choice_weights)
+        for op in ops:
+            img = op(img)
+        return img
+
+
+def rand_augment_transform(config_str, hparams):
+    """
+    Create a RandAugment transform
+
+    :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by
+        dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand').
+        The remaining sections, which are not order specific, determine
+        'm' - integer magnitude of rand augment
+        'n' - integer num layers (number of transform ops selected per image)
+        'w' - integer probability weight index (index of a set of weights to influence choice of op)
+        'mstd' - float std deviation of magnitude noise applied
+        'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0)
+        Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5
+        'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2
+
+    :param hparams: Other hparams (kwargs) for the RandAugmentation scheme
+
+    :return: A callable Transform Op
+    """
+    magnitude = _MAX_LEVEL  # default to _MAX_LEVEL for magnitude (currently 10)
+    num_layers = 2  # default to 2 ops per image
+    weight_idx = None  # default to no probability weights for op choice
+    transforms = _RAND_TRANSFORMS
+    config = config_str.split('-')
+    assert config[0] == 'rand'
+    config = config[1:]
+    for c in config:
+        cs = re.split(r'(\d.*)', c)
+        if len(cs) < 2:
+            continue
+        key, val = cs[:2]
+        if key == 'mstd':
+            # noise param injected via hparams for now
+            hparams.setdefault('magnitude_std', float(val))
+        elif key == 'inc':
+            if bool(val):
+                transforms = _RAND_INCREASING_TRANSFORMS
+        elif key == 'm':
+            magnitude = int(val)
+        elif key == 'n':
+            num_layers = int(val)
+        elif key == 'w':
+            weight_idx = int(val)
+        else:
+            assert False, 'Unknown RandAugment config section'
+    ra_ops = rand_augment_ops(
+        magnitude=magnitude, hparams=hparams, transforms=transforms)
+    choice_weights = None if weight_idx is None else _select_rand_weights(
+        weight_idx)
+    return RandAugment(ra_ops, num_layers, choice_weights=choice_weights)
+
+
+_AUGMIX_TRANSFORMS = [
+    'AutoContrast',
+    'ColorIncreasing',  # not in paper
+    'ContrastIncreasing',  # not in paper
+    'BrightnessIncreasing',  # not in paper
+    'SharpnessIncreasing',  # not in paper
+    'Equalize',
+    'Rotate',
+    'PosterizeIncreasing',
+    'SolarizeIncreasing',
+    'ShearX',
+    'ShearY',
+    'TranslateXRel',
+    'TranslateYRel',
+]
+
+
+def augmix_ops(magnitude=10, hparams=None, transforms=None):
+    hparams = hparams or _HPARAMS_DEFAULT
+    transforms = transforms or _AUGMIX_TRANSFORMS
+    return [
+        AugmentOp(
+            name, prob=1.0, magnitude=magnitude, hparams=hparams)
+        for name in transforms
+    ]
+
+
+class AugMixAugment(object):
+    """ AugMix Transform
+    Adapted and improved from impl here: https://github.com/google-research/augmix/blob/master/imagenet.py
+    From paper: AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty -
+    https://arxiv.org/abs/1912.02781
+    """
+
+    def __init__(self, ops, alpha=1., width=3, depth=-1, blended=False):
+        self.ops = ops
+        self.alpha = alpha
+        self.width = width
+        self.depth = depth
+        self.blended = blended  # blended mode is faster but not well tested
+
+    def _calc_blended_weights(self, ws, m):
+        ws = ws * m
+        cump = 1.
+        rws = []
+        for w in ws[::-1]:
+            alpha = w / cump
+            cump *= (1 - alpha)
+            rws.append(alpha)
+        return np.array(rws[::-1], dtype=np.float32)
+
+    def _apply_blended(self, img, mixing_weights, m):
+        # This is my first crack at implementing a slightly faster mixed augmentation. Instead
+        # of accumulating the mix for each chain in a Numpy array and then blending with original,
+        # it recomputes the blending coefficients and applies one PIL image blend per chain.
+        # TODO the results appear in the right ballpark but they differ by more than rounding.
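+        # _calc_blended_weights converts the Dirichlet chain weights (pre-scaled
+        # by m) into sequential alphas chosen so that the chained Image.blend
+        # calls below reproduce the weighted sum (1 - m)*img + sum_i w_i*chain_i(img).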
+ img_orig = img.copy() + ws = self._calc_blended_weights(mixing_weights, m) + for w in ws: + depth = self.depth if self.depth > 0 else np.random.randint(1, 4) + ops = np.random.choice(self.ops, depth, replace=True) + img_aug = img_orig # no ops are in-place, deep copy not necessary + for op in ops: + img_aug = op(img_aug) + img = Image.blend(img, img_aug, w) + return img + + def _apply_basic(self, img, mixing_weights, m): + # This is a literal adaptation of the paper/official implementation without normalizations and + # PIL <-> Numpy conversions between every op. It is still quite CPU compute heavy compared to the + # typical augmentation transforms, and could use a GPU / Kornia implementation. + img_shape = img.size[0], img.size[1], len(img.getbands()) + mixed = np.zeros(img_shape, dtype=np.float32) + for mw in mixing_weights: + depth = self.depth if self.depth > 0 else np.random.randint(1, 4) + ops = np.random.choice(self.ops, depth, replace=True) + img_aug = img # no ops are in-place, deep copy not necessary + for op in ops: + img_aug = op(img_aug) + mixed += mw * np.asarray(img_aug, dtype=np.float32) + np.clip(mixed, 0, 255., out=mixed) + mixed = Image.fromarray(mixed.astype(np.uint8)) + return Image.blend(img, mixed, m) + + def __call__(self, img): + mixing_weights = np.float32( + np.random.dirichlet([self.alpha] * self.width)) + m = np.float32(np.random.beta(self.alpha, self.alpha)) + if self.blended: + mixed = self._apply_blended(img, mixing_weights, m) + else: + mixed = self._apply_basic(img, mixing_weights, m) + return mixed + + +def augment_and_mix_transform(config_str, hparams): + """ Create AugMix transform + + :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by + dashes ('-'). The first section defines the specific variant (currently only 'augmix'). The remaining + sections, which are not order specific, determine + 'm' - integer magnitude (severity) of augmentation mix (default: 3) + 'w' - integer width of augmentation chain (default: 3) + 'd' - integer depth of augmentation chain (-1 is random [1, 3], default: -1) + 'b' - integer (bool), blend each branch of chain into end result without a final blend, less CPU (default: 0) + 'mstd' - float std deviation of magnitude noise applied (default: 0) + Ex 'augmix-m5-w4-d2' results in AugMix with severity 5, chain width 4, chain depth 2 + + :param hparams: Other hparams (kwargs) for the Augmentation transforms + + :return: A callable Transform Op + """ + magnitude = 3 + width = 3 + depth = -1 + alpha = 1.
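+ # editorial note: 'alpha' is used twice in AugMixAugment.__call__, as the Dirichlet + # concentration over chain weights and as the Beta concentration of the final blend factor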
+ blended = False + config = config_str.split('-') + assert config[0] == 'augmix' + config = config[1:] + for c in config: + cs = re.split(r'(\d.*)', c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == 'mstd': + # noise param injected via hparams for now + hparams.setdefault('magnitude_std', float(val)) + elif key == 'm': + magnitude = int(val) + elif key == 'w': + width = int(val) + elif key == 'd': + depth = int(val) + elif key == 'a': + alpha = float(val) + elif key == 'b': + blended = bool(int(val)) # parse as int so that 'b0' stays False + else: + assert False, 'Unknown AugMix config section' + ops = augmix_ops(magnitude=magnitude, hparams=hparams) + return AugMixAugment( + ops, alpha=alpha, width=width, depth=depth, blended=blended) + + +class RawTimmAutoAugment(object): + """TimmAutoAugment API for PaddleClas.""" + + def __init__(self, + config_str="rand-m9-mstd0.5-inc1", + interpolation="bicubic", + img_size=224, + mean=IMAGENET_DEFAULT_MEAN): + if isinstance(img_size, (tuple, list)): + img_size_min = min(img_size) + else: + img_size_min = img_size + + aa_params = dict( + translate_const=int(img_size_min * 0.45), + img_mean=tuple([min(255, round(255 * x)) for x in mean]), ) + if interpolation and interpolation != 'random': + aa_params['interpolation'] = _pil_interp(interpolation) + if config_str.startswith('rand'): + self.augment_func = rand_augment_transform(config_str, aa_params) + elif config_str.startswith('augmix'): + aa_params['translate_pct'] = 0.3 + self.augment_func = augment_and_mix_transform(config_str, + aa_params) + elif config_str.startswith('auto'): + self.augment_func = auto_augment_transform(config_str, aa_params) + else: + raise Exception( + "ConfigError: The TimmAutoAugment Op only supports RandAugment, AutoAugment and AugMix, so the config_str must start with \"rand\", \"augmix\" or \"auto\"." + ) + + def __call__(self, img): + return self.augment_func(img) diff --git a/src/PaddleClas/ppcls/data/utils/__init__.py b/src/PaddleClas/ppcls/data/utils/__init__.py new file mode 100644 index 0000000..61d5aa2 --- /dev/null +++ b/src/PaddleClas/ppcls/data/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
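A minimal usage sketch for the RawTimmAutoAugment op above (editor's illustration; the image file name is hypothetical, and in PaddleClas this class is normally invoked through the ppcls.data.preprocess pipeline rather than directly):

    from PIL import Image

    # build the op from a RandAugment-style config string
    aug = RawTimmAutoAugment(config_str="rand-m9-n3-mstd0.5", img_size=224)
    img = Image.open("demo.jpg").convert("RGB")  # hypothetical input image
    img_aug = aug(img)  # returns the augmented PIL.Image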
\ No newline at end of file diff --git a/src/PaddleClas/ppcls/data/utils/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/data/utils/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..55982e8 Binary files /dev/null and b/src/PaddleClas/ppcls/data/utils/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/utils/__pycache__/get_image_list.cpython-39.pyc b/src/PaddleClas/ppcls/data/utils/__pycache__/get_image_list.cpython-39.pyc new file mode 100644 index 0000000..7a158c4 Binary files /dev/null and b/src/PaddleClas/ppcls/data/utils/__pycache__/get_image_list.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/data/utils/get_image_list.py b/src/PaddleClas/ppcls/data/utils/get_image_list.py new file mode 100644 index 0000000..6f10935 --- /dev/null +++ b/src/PaddleClas/ppcls/data/utils/get_image_list.py @@ -0,0 +1,49 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import argparse +import base64 +import numpy as np + + +def get_image_list(img_file): + imgs_lists = [] + if img_file is None or not os.path.exists(img_file): + raise Exception("no image file found in {}".format(img_file)) + + img_end = ['jpg', 'png', 'jpeg', 'JPEG', 'JPG', 'bmp'] + if os.path.isfile(img_file) and img_file.split('.')[-1] in img_end: + imgs_lists.append(img_file) + elif os.path.isdir(img_file): + for single_file in os.listdir(img_file): + if single_file.split('.')[-1] in img_end: + imgs_lists.append(os.path.join(img_file, single_file)) + if len(imgs_lists) == 0: + raise Exception("no image file found in {}".format(img_file)) + imgs_lists = sorted(imgs_lists) + return imgs_lists + + +def get_image_list_from_label_file(image_path, label_file_path): + imgs_lists = [] + gt_labels = [] + with open(label_file_path, "r") as fin: + lines = fin.readlines() + for line in lines: + image_name, label = line.strip("\n").split() + label = int(label) + imgs_lists.append(os.path.join(image_path, image_name)) + gt_labels.append(label) + return imgs_lists, gt_labels diff --git a/src/PaddleClas/ppcls/engine/__init__.py b/src/PaddleClas/ppcls/engine/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/PaddleClas/ppcls/engine/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/engine/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..22fdcbb Binary files /dev/null and b/src/PaddleClas/ppcls/engine/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/engine/__pycache__/engine.cpython-39.pyc b/src/PaddleClas/ppcls/engine/__pycache__/engine.cpython-39.pyc new file mode 100644 index 0000000..3281cbb Binary files /dev/null and b/src/PaddleClas/ppcls/engine/__pycache__/engine.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/engine/engine.py b/src/PaddleClas/ppcls/engine/engine.py new file mode 100644 index 0000000..61d09ff --- /dev/null +++ b/src/PaddleClas/ppcls/engine/engine.py @@ -0,0 +1,465 @@ +# Copyright (c) 2021
PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import platform +import paddle +import paddle.distributed as dist +from visualdl import LogWriter +from paddle import nn +import numpy as np +import random + +from ppcls.utils.check import check_gpu +from ppcls.utils.misc import AverageMeter +from ppcls.utils import logger +from ppcls.utils.logger import init_logger +from ppcls.utils.config import print_config +from ppcls.data import build_dataloader +from ppcls.arch import build_model, RecModel, DistillationModel, TheseusLayer +from ppcls.arch import apply_to_static +from ppcls.loss import build_loss +from ppcls.metric import build_metrics +from ppcls.optimizer import build_optimizer +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url +from ppcls.utils.save_load import init_model +from ppcls.utils import save_load + +from ppcls.data.utils.get_image_list import get_image_list +from ppcls.data.postprocess import build_postprocess +from ppcls.data import create_operators +from ppcls.engine.train import train_epoch +from ppcls.engine import evaluation +from ppcls.arch.gears.identity_head import IdentityHead + + +class Engine(object): + def __init__(self, config, mode="train"): + assert mode in ["train", "eval", "infer", "export"] + self.mode = mode + self.config = config + self.eval_mode = self.config["Global"].get("eval_mode", + "classification") + if "Head" in self.config["Arch"] or self.config["Arch"].get("is_rec", + False): + self.is_rec = True + else: + self.is_rec = False + + # set seed + seed = self.config["Global"].get("seed", False) + if seed or seed == 0: + assert isinstance(seed, int), "The 'seed' must be an integer!"
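+ # editorial note: all three RNG sources used below (paddle, numpy, random) are seeded so runs are reproducible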
+ paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + + # init logger + self.output_dir = self.config['Global']['output_dir'] + log_file = os.path.join(self.output_dir, self.config["Arch"]["name"], + f"{mode}.log") + init_logger(name='root', log_file=log_file) + print_config(config) + + # init train_func and eval_func + assert self.eval_mode in ["classification", "retrieval"], logger.error( + "Invalid eval mode: {}".format(self.eval_mode)) + self.train_epoch_func = train_epoch + self.eval_func = getattr(evaluation, self.eval_mode + "_eval") + + self.use_dali = self.config['Global'].get("use_dali", False) + + # for visualdl + self.vdl_writer = None + if self.config['Global'][ + 'use_visualdl'] and mode == "train" and dist.get_rank() == 0: + vdl_writer_path = os.path.join(self.output_dir, "vdl") + if not os.path.exists(vdl_writer_path): + os.makedirs(vdl_writer_path) + self.vdl_writer = LogWriter(logdir=vdl_writer_path) + + # set device + assert self.config["Global"]["device"] in ["cpu", "gpu", "xpu", "npu"] + self.device = paddle.set_device(self.config["Global"]["device"]) + logger.info('train with paddle {} and device {}'.format( + paddle.__version__, self.device)) + + # AMP training + self.amp = True if "AMP" in self.config and self.mode == "train" else False + if self.amp and self.config["AMP"] is not None: + self.scale_loss = self.config["AMP"].get("scale_loss", 1.0) + self.use_dynamic_loss_scaling = self.config["AMP"].get( + "use_dynamic_loss_scaling", False) + else: + self.scale_loss = 1.0 + self.use_dynamic_loss_scaling = False + if self.amp: + AMP_RELATED_FLAGS_SETTING = { + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, + 'FLAGS_max_inplace_grad_add': 8, + } + paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) + + if "class_num" in config["Global"]: + global_class_num = config["Global"]["class_num"] + if "class_num" not in config["Arch"]: + config["Arch"]["class_num"] = global_class_num + msg = f"The Global.class_num will be deprecated. Please use Arch.class_num instead. Arch.class_num has been set to {global_class_num}." + else: + msg = "The Global.class_num will be deprecated. Please use Arch.class_num instead. The Global.class_num has been ignored." 
+ logger.warning(msg) + #TODO(gaotingquan): support rec + class_num = config["Arch"].get("class_num", None) + self.config["DataLoader"].update({"class_num": class_num}) + # build dataloader + if self.mode == 'train': + self.train_dataloader = build_dataloader( + self.config["DataLoader"], "Train", self.device, self.use_dali) + if self.mode == "eval" or (self.mode == "train" and + self.config["Global"]["eval_during_train"]): + if self.eval_mode == "classification": + self.eval_dataloader = build_dataloader( + self.config["DataLoader"], "Eval", self.device, + self.use_dali) + elif self.eval_mode == "retrieval": + self.gallery_query_dataloader = None + if len(self.config["DataLoader"]["Eval"].keys()) == 1: + key = list(self.config["DataLoader"]["Eval"].keys())[0] + self.gallery_query_dataloader = build_dataloader( + self.config["DataLoader"]["Eval"], key, self.device, + self.use_dali) + else: + self.gallery_dataloader = build_dataloader( + self.config["DataLoader"]["Eval"], "Gallery", + self.device, self.use_dali) + self.query_dataloader = build_dataloader( + self.config["DataLoader"]["Eval"], "Query", + self.device, self.use_dali) + + # build loss + if self.mode == "train": + loss_info = self.config["Loss"]["Train"] + self.train_loss_func = build_loss(loss_info) + if self.mode == "eval" or (self.mode == "train" and + self.config["Global"]["eval_during_train"]): + loss_config = self.config.get("Loss", None) + if loss_config is not None: + loss_config = loss_config.get("Eval") + if loss_config is not None: + self.eval_loss_func = build_loss(loss_config) + else: + self.eval_loss_func = None + else: + self.eval_loss_func = None + + # build metric + if self.mode == 'train': + metric_config = self.config.get("Metric") + if metric_config is not None: + metric_config = metric_config.get("Train") + if metric_config is not None: + if hasattr( + self.train_dataloader, "collate_fn" + ) and self.train_dataloader.collate_fn is not None: + for m_idx, m in enumerate(metric_config): + if "TopkAcc" in m: + msg = f"'TopkAcc' metric can not be used when setting 'batch_transform_ops' in config. The 'TopkAcc' metric has been removed." + logger.warning(msg) + break + metric_config.pop(m_idx) + self.train_metric_func = build_metrics(metric_config) + else: + self.train_metric_func = None + else: + self.train_metric_func = None + + if self.mode == "eval" or (self.mode == "train" and + self.config["Global"]["eval_during_train"]): + metric_config = self.config.get("Metric") + if self.eval_mode == "classification": + if metric_config is not None: + metric_config = metric_config.get("Eval") + if metric_config is not None: + self.eval_metric_func = build_metrics(metric_config) + elif self.eval_mode == "retrieval": + if metric_config is None: + metric_config = [{"name": "Recallk", "topk": (1, 5)}] + else: + metric_config = metric_config["Eval"] + self.eval_metric_func = build_metrics(metric_config) + else: + self.eval_metric_func = None + + # build model + self.model = build_model(self.config) + # set @to_static for benchmark, skip this by default. 
+ apply_to_static(self.config, self.model) + + # load_pretrain + if self.config["Global"]["pretrained_model"] is not None: + if self.config["Global"]["pretrained_model"].startswith("http"): + load_dygraph_pretrain_from_url( + self.model, self.config["Global"]["pretrained_model"]) + else: + load_dygraph_pretrain( + self.model, self.config["Global"]["pretrained_model"]) + + # build optimizer + if self.mode == 'train': + self.optimizer, self.lr_sch = build_optimizer( + self.config["Optimizer"], self.config["Global"]["epochs"], + len(self.train_dataloader), [self.model]) + + # for amp training + if self.amp: + self.scaler = paddle.amp.GradScaler( + init_loss_scaling=self.scale_loss, + use_dynamic_loss_scaling=self.use_dynamic_loss_scaling) + amp_level = self.config['AMP'].get("level", "O1") + if amp_level not in ["O1", "O2"]: + msg = "[Parameter Error]: The optimization level of AMP only supports 'O1' and 'O2'. The level has been set to 'O1'." + logger.warning(msg) + self.config['AMP']["level"] = "O1" + amp_level = "O1" + self.model, self.optimizer = paddle.amp.decorate( + models=self.model, + optimizers=self.optimizer, + level=amp_level, + save_dtype='float32') + + # for distributed + world_size = dist.get_world_size() + self.config["Global"]["distributed"] = world_size != 1 + if world_size != 4 and self.mode == "train": + msg = f"The training strategy in config files provided by PaddleClas is based on 4 gpus. But the number of gpus is {world_size} in the current training. Please modify the strategy (learning rate, batch size and so on) if you use config files in PaddleClas to train." + logger.warning(msg) + if self.config["Global"]["distributed"]: + dist.init_parallel_env() + self.model = paddle.DataParallel(self.model) + + # build postprocess for infer + if self.mode == 'infer': + self.preprocess_func = create_operators(self.config["Infer"][ + "transforms"]) + self.postprocess_func = build_postprocess(self.config["Infer"][ + "PostProcess"]) + + def train(self): + assert self.mode == "train" + print_batch_step = self.config['Global']['print_batch_step'] + save_interval = self.config["Global"]["save_interval"] + best_metric = { + "metric": 0.0, + "epoch": 0, + } + # key: metric name, val: AverageMeter for that metric + self.output_info = dict() + self.time_info = { + "batch_cost": AverageMeter( + "batch_cost", '.5f', postfix=" s,"), + "reader_cost": AverageMeter( + "reader_cost", ".5f", postfix=" s,"), + } + # global iter counter + self.global_step = 0 + + if self.config["Global"]["checkpoints"] is not None: + metric_info = init_model(self.config["Global"], self.model, + self.optimizer) + if metric_info is not None: + best_metric.update(metric_info) + + self.max_iter = len(self.train_dataloader) - 1 if platform.system( + ) == "Windows" else len(self.train_dataloader) + for epoch_id in range(best_metric["epoch"] + 1, + self.config["Global"]["epochs"] + 1): + acc = 0.0 + # train for one epoch + self.train_epoch_func(self, epoch_id, print_batch_step) + + if self.use_dali: + self.train_dataloader.reset() + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, self.output_info[key].avg) + for key in self.output_info + ]) + logger.info("[Train][Epoch {}/{}][Avg]{}".format( + epoch_id, self.config["Global"]["epochs"], metric_msg)) + self.output_info.clear() + + # eval model and save model if possible + if self.config["Global"][ + "eval_during_train"] and epoch_id % self.config["Global"][ + "eval_interval"] == 0: + acc = self.eval(epoch_id) + if acc > best_metric["metric"]: + best_metric["metric"] = acc + best_metric["epoch"] = epoch_id +
save_load.save_model( + self.model, + self.optimizer, + best_metric, + self.output_dir, + model_name=self.config["Arch"]["name"], + prefix="best_model") + logger.info("[Eval][Epoch {}][best metric: {}]".format( + epoch_id, best_metric["metric"])) + logger.scaler( + name="eval_acc", + value=acc, + step=epoch_id, + writer=self.vdl_writer) + + self.model.train() + + # save model + if epoch_id % save_interval == 0: + save_load.save_model( + self.model, + self.optimizer, {"metric": acc, + "epoch": epoch_id}, + self.output_dir, + model_name=self.config["Arch"]["name"], + prefix="epoch_{}".format(epoch_id)) + # save the latest model + save_load.save_model( + self.model, + self.optimizer, {"metric": acc, + "epoch": epoch_id}, + self.output_dir, + model_name=self.config["Arch"]["name"], + prefix="latest") + + if self.vdl_writer is not None: + self.vdl_writer.close() + + @paddle.no_grad() + def eval(self, epoch_id=0): + assert self.mode in ["train", "eval"] + self.model.eval() + eval_result = self.eval_func(self, epoch_id) + self.model.train() + return eval_result + + @paddle.no_grad() + def infer(self): + assert self.mode == "infer" and self.eval_mode == "classification" + total_trainer = dist.get_world_size() + local_rank = dist.get_rank() + image_list = get_image_list(self.config["Infer"]["infer_imgs"]) + # data split + image_list = image_list[local_rank::total_trainer] + + batch_size = self.config["Infer"]["batch_size"] + self.model.eval() + batch_data = [] + image_file_list = [] + for idx, image_file in enumerate(image_list): + with open(image_file, 'rb') as f: + x = f.read() + for process in self.preprocess_func: + x = process(x) + batch_data.append(x) + image_file_list.append(image_file) + if len(batch_data) >= batch_size or idx == len(image_list) - 1: + batch_tensor = paddle.to_tensor(batch_data) + out = self.model(batch_tensor) + if isinstance(out, list): + out = out[0] + if isinstance(out, dict) and "logits" in out: + out = out["logits"] + if isinstance(out, dict) and "output" in out: + out = out["output"] + result = self.postprocess_func(out, image_file_list) + print(result) + batch_data.clear() + image_file_list.clear() + + def export(self): + assert self.mode == "export" + use_multilabel = self.config["Global"].get("use_multilabel", False) + model = ExportModel(self.config["Arch"], self.model, use_multilabel) + if self.config["Global"]["pretrained_model"] is not None: + load_dygraph_pretrain(model.base_model, + self.config["Global"]["pretrained_model"]) + + model.eval() + save_path = os.path.join(self.config["Global"]["save_inference_dir"], + "inference") + if model.quanter: + model.quanter.save_quantized_model( + model.base_model, + save_path, + input_spec=[ + paddle.static.InputSpec( + shape=[None] + self.config["Global"]["image_shape"], + dtype='float32') + ]) + else: + model = paddle.jit.to_static( + model, + input_spec=[ + paddle.static.InputSpec( + shape=[None] + self.config["Global"]["image_shape"], + dtype='float32') + ]) + paddle.jit.save(model, save_path) + + +class ExportModel(TheseusLayer): + """ + ExportModel: add softmax onto the model + """ + + def __init__(self, config, model, use_multilabel): + super().__init__() + self.base_model = model + # we should choose a final model to export + if isinstance(self.base_model, DistillationModel): + self.infer_model_name = config["infer_model_name"] + else: + self.infer_model_name = None + + self.infer_output_key = config.get("infer_output_key", None) + if self.infer_output_key == "features" and isinstance(self.base_model, + 
RecModel): + self.base_model.head = IdentityHead() + if use_multilabel: + self.out_act = nn.Sigmoid() + else: + if config.get("infer_add_softmax", True): + self.out_act = nn.Softmax(axis=-1) + else: + self.out_act = None + + def eval(self): + self.training = False + for layer in self.sublayers(): + layer.training = False + layer.eval() + + def forward(self, x): + x = self.base_model(x) + if isinstance(x, list): + x = x[0] + if self.infer_model_name is not None: + x = x[self.infer_model_name] + if self.infer_output_key is not None: + x = x[self.infer_output_key] + if self.out_act is not None: + x = self.out_act(x) + return x diff --git a/src/PaddleClas/ppcls/engine/evaluation/__init__.py b/src/PaddleClas/ppcls/engine/evaluation/__init__.py new file mode 100644 index 0000000..e0cd778 --- /dev/null +++ b/src/PaddleClas/ppcls/engine/evaluation/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ppcls.engine.evaluation.classification import classification_eval +from ppcls.engine.evaluation.retrieval import retrieval_eval diff --git a/src/PaddleClas/ppcls/engine/evaluation/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/engine/evaluation/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..11ecf66 Binary files /dev/null and b/src/PaddleClas/ppcls/engine/evaluation/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/engine/evaluation/__pycache__/classification.cpython-39.pyc b/src/PaddleClas/ppcls/engine/evaluation/__pycache__/classification.cpython-39.pyc new file mode 100644 index 0000000..089bcd4 Binary files /dev/null and b/src/PaddleClas/ppcls/engine/evaluation/__pycache__/classification.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/engine/evaluation/__pycache__/retrieval.cpython-39.pyc b/src/PaddleClas/ppcls/engine/evaluation/__pycache__/retrieval.cpython-39.pyc new file mode 100644 index 0000000..61d40e3 Binary files /dev/null and b/src/PaddleClas/ppcls/engine/evaluation/__pycache__/retrieval.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/engine/evaluation/classification.py b/src/PaddleClas/ppcls/engine/evaluation/classification.py new file mode 100644 index 0000000..d7b5c47 --- /dev/null +++ b/src/PaddleClas/ppcls/engine/evaluation/classification.py @@ -0,0 +1,169 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
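Engine.export above drives paddle.jit.to_static with a dynamic-batch InputSpec before saving. A minimal standalone sketch of the same mechanism (editor's illustration on a toy layer, not PaddleClas code):

    import paddle

    layer = paddle.nn.Linear(8, 2)  # stand-in for the trained model
    static_layer = paddle.jit.to_static(
        layer,
        input_spec=[paddle.static.InputSpec(shape=[None, 8], dtype="float32")])
    paddle.jit.save(static_layer, "output/inference")  # writes *.pdmodel / *.pdiparams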
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import time +import platform +import paddle + +from ppcls.utils.misc import AverageMeter +from ppcls.utils import logger + + +def classification_eval(engine, epoch_id=0): + output_info = dict() + time_info = { + "batch_cost": AverageMeter( + "batch_cost", '.5f', postfix=" s,"), + "reader_cost": AverageMeter( + "reader_cost", ".5f", postfix=" s,"), + } + print_batch_step = engine.config["Global"]["print_batch_step"] + + metric_key = None + tic = time.time() + accum_samples = 0 + total_samples = len( + engine.eval_dataloader. + dataset) if not engine.use_dali else engine.eval_dataloader.size + max_iter = len(engine.eval_dataloader) - 1 if platform.system( + ) == "Windows" else len(engine.eval_dataloader) + for iter_id, batch in enumerate(engine.eval_dataloader): + if iter_id >= max_iter: + break + if iter_id == 5: + for key in time_info: + time_info[key].reset() + if engine.use_dali: + batch = [ + paddle.to_tensor(batch[0]['data']), + paddle.to_tensor(batch[0]['label']) + ] + time_info["reader_cost"].update(time.time() - tic) + batch_size = batch[0].shape[0] + batch[0] = paddle.to_tensor(batch[0]).astype("float32") + if not engine.config["Global"].get("use_multilabel", False): + batch[1] = batch[1].reshape([-1, 1]).astype("int64") + + # image input + if engine.amp: + amp_level = engine.config['AMP'].get("level", "O1").upper() + with paddle.amp.auto_cast( + custom_black_list={ + "flatten_contiguous_range", "greater_than" + }, + level=amp_level): + out = engine.model(batch[0]) + # calc loss + if engine.eval_loss_func is not None: + loss_dict = engine.eval_loss_func(out, batch[1]) + for key in loss_dict: + if key not in output_info: + output_info[key] = AverageMeter(key, '7.5f') + output_info[key].update(loss_dict[key].numpy()[0], + batch_size) + else: + out = engine.model(batch[0]) + # calc loss + if engine.eval_loss_func is not None: + loss_dict = engine.eval_loss_func(out, batch[1]) + for key in loss_dict: + if key not in output_info: + output_info[key] = AverageMeter(key, '7.5f') + output_info[key].update(loss_dict[key].numpy()[0], + batch_size) + + # just for DistributedBatchSampler issue: repeat sampling + current_samples = batch_size * paddle.distributed.get_world_size() + accum_samples += current_samples + + # calc metric + if engine.eval_metric_func is not None: + if paddle.distributed.get_world_size() > 1: + label_list = [] + paddle.distributed.all_gather(label_list, batch[1]) + labels = paddle.concat(label_list, 0) + + if isinstance(out, dict): + if "Student" in out: + out = out["Student"] + elif "logits" in out: + out = out["logits"] + else: + msg = "Error: Wrong key in out!" 
+ raise Exception(msg) + if isinstance(out, list): + pred = [] + for x in out: + pred_list = [] + paddle.distributed.all_gather(pred_list, x) + pred_x = paddle.concat(pred_list, 0) + pred.append(pred_x) + else: + pred_list = [] + paddle.distributed.all_gather(pred_list, out) + pred = paddle.concat(pred_list, 0) + + if accum_samples > total_samples and not engine.use_dali: + pred = pred[:total_samples + current_samples - + accum_samples] + labels = labels[:total_samples + current_samples - + accum_samples] + current_samples = total_samples + current_samples - accum_samples + metric_dict = engine.eval_metric_func(pred, labels) + else: + metric_dict = engine.eval_metric_func(out, batch[1]) + + for key in metric_dict: + if metric_key is None: + metric_key = key + if key not in output_info: + output_info[key] = AverageMeter(key, '7.5f') + + output_info[key].update(metric_dict[key].numpy()[0], + current_samples) + + time_info["batch_cost"].update(time.time() - tic) + + if iter_id % print_batch_step == 0: + time_msg = "s, ".join([ + "{}: {:.5f}".format(key, time_info[key].avg) + for key in time_info + ]) + + ips_msg = "ips: {:.5f} images/sec".format( + batch_size / time_info["batch_cost"].avg) + + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, output_info[key].val) + for key in output_info + ]) + logger.info("[Eval][Epoch {}][Iter: {}/{}]{}, {}, {}".format( + epoch_id, iter_id, + len(engine.eval_dataloader), metric_msg, time_msg, ips_msg)) + + tic = time.time() + if engine.use_dali: + engine.eval_dataloader.reset() + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, output_info[key].avg) for key in output_info + ]) + logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg)) + + # do not try to save best eval.model + if engine.eval_metric_func is None: + return -1 + # return 1st metric in the dict + return output_info[metric_key].avg diff --git a/src/PaddleClas/ppcls/engine/evaluation/retrieval.py b/src/PaddleClas/ppcls/engine/evaluation/retrieval.py new file mode 100644 index 0000000..8471a42 --- /dev/null +++ b/src/PaddleClas/ppcls/engine/evaluation/retrieval.py @@ -0,0 +1,171 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import platform +import paddle +from ppcls.utils import logger + + +def retrieval_eval(engine, epoch_id=0): + engine.model.eval() + # step1. build gallery + if engine.gallery_query_dataloader is not None: + gallery_feas, gallery_img_id, gallery_unique_id = cal_feature( + engine, name='gallery_query') + query_feas, query_img_id, query_query_id = gallery_feas, gallery_img_id, gallery_unique_id + else: + gallery_feas, gallery_img_id, gallery_unique_id = cal_feature( + engine, name='gallery') + query_feas, query_img_id, query_query_id = cal_feature( + engine, name='query') + + # step2. 
do evaluation + sim_block_size = engine.config["Global"].get("sim_block_size", 64) + sections = [sim_block_size] * (len(query_feas) // sim_block_size) + if len(query_feas) % sim_block_size: + sections.append(len(query_feas) % sim_block_size) + fea_blocks = paddle.split(query_feas, num_or_sections=sections) + if query_query_id is not None: + query_id_blocks = paddle.split( + query_query_id, num_or_sections=sections) + image_id_blocks = paddle.split(query_img_id, num_or_sections=sections) + metric_key = None + + if engine.eval_loss_func is None: + metric_dict = {metric_key: 0.} + else: + metric_dict = dict() + for block_idx, block_fea in enumerate(fea_blocks): + similarity_matrix = paddle.matmul( + block_fea, gallery_feas, transpose_y=True) + if query_query_id is not None: + query_id_block = query_id_blocks[block_idx] + query_id_mask = (query_id_block != gallery_unique_id.t()) + + image_id_block = image_id_blocks[block_idx] + image_id_mask = (image_id_block != gallery_img_id.t()) + + keep_mask = paddle.logical_or(query_id_mask, image_id_mask) + similarity_matrix = similarity_matrix * keep_mask.astype( + "float32") + else: + keep_mask = None + + metric_tmp = engine.eval_metric_func(similarity_matrix, + image_id_blocks[block_idx], + gallery_img_id, keep_mask) + + for key in metric_tmp: + if key not in metric_dict: + metric_dict[key] = metric_tmp[key] * block_fea.shape[ + 0] / len(query_feas) + else: + metric_dict[key] += metric_tmp[key] * block_fea.shape[ + 0] / len(query_feas) + + metric_info_list = [] + for key in metric_dict: + if metric_key is None: + metric_key = key + metric_info_list.append("{}: {:.5f}".format(key, metric_dict[key])) + metric_msg = ", ".join(metric_info_list) + logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg)) + + return metric_dict[metric_key] + + +def cal_feature(engine, name='gallery'): + all_feas = None + all_image_id = None + all_unique_id = None + has_unique_id = False + + if name == 'gallery': + dataloader = engine.gallery_dataloader + elif name == 'query': + dataloader = engine.query_dataloader + elif name == 'gallery_query': + dataloader = engine.gallery_query_dataloader + else: + raise RuntimeError("Only support gallery or query dataset") + + max_iter = len(dataloader) - 1 if platform.system() == "Windows" else len( + dataloader) + for idx, batch in enumerate(dataloader): # load is very time-consuming + if idx >= max_iter: + break + if idx % engine.config["Global"]["print_batch_step"] == 0: + logger.info( + f"{name} feature calculation process: [{idx}/{len(dataloader)}]" + ) + if engine.use_dali: + batch = [ + paddle.to_tensor(batch[0]['data']), + paddle.to_tensor(batch[0]['label']) + ] + batch = [paddle.to_tensor(x) for x in batch] + batch[1] = batch[1].reshape([-1, 1]).astype("int64") + if len(batch) == 3: + has_unique_id = True + batch[2] = batch[2].reshape([-1, 1]).astype("int64") + out = engine.model(batch[0], batch[1]) + if "Student" in out: + out = out["Student"] + batch_feas = out["features"] + + # do norm + if engine.config["Global"].get("feature_normalize", True): + feas_norm = paddle.sqrt( + paddle.sum(paddle.square(batch_feas), axis=1, keepdim=True)) + batch_feas = paddle.divide(batch_feas, feas_norm) + + # do binarize + if engine.config["Global"].get("feature_binarize") == "round": + batch_feas = paddle.round(batch_feas).astype("float32") * 2.0 - 1.0 + + if engine.config["Global"].get("feature_binarize") == "sign": + batch_feas = paddle.sign(batch_feas).astype("float32") + + if all_feas is None: + all_feas = batch_feas + if 
has_unique_id: + all_unique_id = batch[2] + all_image_id = batch[1] + else: + all_feas = paddle.concat([all_feas, batch_feas]) + all_image_id = paddle.concat([all_image_id, batch[1]]) + if has_unique_id: + all_unique_id = paddle.concat([all_unique_id, batch[2]]) + + if engine.use_dali: + dataloader.reset() + + if paddle.distributed.get_world_size() > 1: + feat_list = [] + img_id_list = [] + unique_id_list = [] + paddle.distributed.all_gather(feat_list, all_feas) + paddle.distributed.all_gather(img_id_list, all_image_id) + all_feas = paddle.concat(feat_list, axis=0) + all_image_id = paddle.concat(img_id_list, axis=0) + if has_unique_id: + paddle.distributed.all_gather(unique_id_list, all_unique_id) + all_unique_id = paddle.concat(unique_id_list, axis=0) + + logger.info("Build {} done, all feat shape: {}, begin to eval..".format( + name, all_feas.shape)) + return all_feas, all_image_id, all_unique_id diff --git a/src/PaddleClas/ppcls/engine/train/__init__.py b/src/PaddleClas/ppcls/engine/train/__init__.py new file mode 100644 index 0000000..800d3a4 --- /dev/null +++ b/src/PaddleClas/ppcls/engine/train/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ppcls.engine.train.train import train_epoch diff --git a/src/PaddleClas/ppcls/engine/train/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/engine/train/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..01cb834 Binary files /dev/null and b/src/PaddleClas/ppcls/engine/train/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/engine/train/__pycache__/train.cpython-39.pyc b/src/PaddleClas/ppcls/engine/train/__pycache__/train.cpython-39.pyc new file mode 100644 index 0000000..ea695c6 Binary files /dev/null and b/src/PaddleClas/ppcls/engine/train/__pycache__/train.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/engine/train/__pycache__/utils.cpython-39.pyc b/src/PaddleClas/ppcls/engine/train/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000..ac0f4e0 Binary files /dev/null and b/src/PaddleClas/ppcls/engine/train/__pycache__/utils.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/engine/train/train.py b/src/PaddleClas/ppcls/engine/train/train.py new file mode 100644 index 0000000..3b02bac --- /dev/null +++ b/src/PaddleClas/ppcls/engine/train/train.py @@ -0,0 +1,83 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import, division, print_function + +import time +import paddle +from ppcls.engine.train.utils import update_loss, update_metric, log_info +from ppcls.utils import profiler + + +def train_epoch(engine, epoch_id, print_batch_step): + tic = time.time() + v_current = [int(i) for i in paddle.__version__.split(".")] + for iter_id, batch in enumerate(engine.train_dataloader): + if iter_id >= engine.max_iter: + break + profiler.add_profiler_step(engine.config["profiler_options"]) + if iter_id == 5: + for key in engine.time_info: + engine.time_info[key].reset() + engine.time_info["reader_cost"].update(time.time() - tic) + if engine.use_dali: + batch = [ + paddle.to_tensor(batch[0]['data']), + paddle.to_tensor(batch[0]['label']) + ] + batch_size = batch[0].shape[0] + if not engine.config["Global"].get("use_multilabel", False): + batch[1] = batch[1].reshape([batch_size, -1]) + engine.global_step += 1 + + # image input + if engine.amp: + amp_level = engine.config['AMP'].get("level", "O1").upper() + with paddle.amp.auto_cast( + custom_black_list={ + "flatten_contiguous_range", "greater_than" + }, + level=amp_level): + out = forward(engine, batch) + loss_dict = engine.train_loss_func(out, batch[1]) + else: + out = forward(engine, batch) + loss_dict = engine.train_loss_func(out, batch[1]) + + # step opt and lr + if engine.amp: + scaled = engine.scaler.scale(loss_dict["loss"]) + scaled.backward() + engine.scaler.minimize(engine.optimizer, scaled) + else: + loss_dict["loss"].backward() + engine.optimizer.step() + engine.optimizer.clear_grad() + engine.lr_sch.step() + + # below code just for logging + # update metric_for_logger + update_metric(engine, out, batch, batch_size) + # update_loss_for_logger + update_loss(engine, loss_dict, batch_size) + engine.time_info["batch_cost"].update(time.time() - tic) + if iter_id % print_batch_step == 0: + log_info(engine, batch_size, epoch_id, iter_id) + tic = time.time() + + +def forward(engine, batch): + if not engine.is_rec: + return engine.model(batch[0]) + else: + return engine.model(batch[0], batch[1]) diff --git a/src/PaddleClas/ppcls/engine/train/utils.py b/src/PaddleClas/ppcls/engine/train/utils.py new file mode 100644 index 0000000..92eb35d --- /dev/null +++ b/src/PaddleClas/ppcls/engine/train/utils.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
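The AMP branch in train_epoch above follows Paddle's scale-then-minimize pattern. A condensed sketch (editor's illustration; model, loss_fn, optimizer and the batch tensors x, y are assumed to exist):

    import paddle

    scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
    with paddle.amp.auto_cast(level="O1"):
        out = model(x)           # forward runs under auto mixed precision
        loss = loss_fn(out, y)
    scaled = scaler.scale(loss)  # scale the loss to avoid fp16 underflow
    scaled.backward()
    scaler.minimize(optimizer, scaled)  # unscales grads, then steps the optimizer
    optimizer.clear_grad()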
+from __future__ import absolute_import, division, print_function + +import datetime +from ppcls.utils import logger +from ppcls.utils.misc import AverageMeter + + +def update_metric(trainer, out, batch, batch_size): + # calc metric + if trainer.train_metric_func is not None: + metric_dict = trainer.train_metric_func(out, batch[-1]) + for key in metric_dict: + if key not in trainer.output_info: + trainer.output_info[key] = AverageMeter(key, '7.5f') + trainer.output_info[key].update(metric_dict[key].numpy()[0], + batch_size) + + +def update_loss(trainer, loss_dict, batch_size): + # update_output_info + for key in loss_dict: + if key not in trainer.output_info: + trainer.output_info[key] = AverageMeter(key, '7.5f') + trainer.output_info[key].update(loss_dict[key].numpy()[0], batch_size) + + +def log_info(trainer, batch_size, epoch_id, iter_id): + lr_msg = "lr: {:.5f}".format(trainer.lr_sch.get_lr()) + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, trainer.output_info[key].avg) + for key in trainer.output_info + ]) + time_msg = "s, ".join([ + "{}: {:.5f}".format(key, trainer.time_info[key].avg) + for key in trainer.time_info + ]) + + ips_msg = "ips: {:.5f} images/sec".format( + batch_size / trainer.time_info["batch_cost"].avg) + eta_sec = ((trainer.config["Global"]["epochs"] - epoch_id + 1 + ) * len(trainer.train_dataloader) - iter_id + ) * trainer.time_info["batch_cost"].avg + eta_msg = "eta: {:s}".format(str(datetime.timedelta(seconds=int(eta_sec)))) + logger.info("[Train][Epoch {}/{}][Iter: {}/{}]{}, {}, {}, {}, {}".format( + epoch_id, trainer.config["Global"]["epochs"], iter_id, + len(trainer.train_dataloader), lr_msg, metric_msg, time_msg, ips_msg, + eta_msg)) + + logger.scaler( + name="lr", + value=trainer.lr_sch.get_lr(), + step=trainer.global_step, + writer=trainer.vdl_writer) + for key in trainer.output_info: + logger.scaler( + name="train_{}".format(key), + value=trainer.output_info[key].avg, + step=trainer.global_step, + writer=trainer.vdl_writer) diff --git a/src/PaddleClas/ppcls/loss/__init__.py b/src/PaddleClas/ppcls/loss/__init__.py new file mode 100644 index 0000000..d15dab9 --- /dev/null +++ b/src/PaddleClas/ppcls/loss/__init__.py @@ -0,0 +1,67 @@ +import copy + +import paddle +import paddle.nn as nn +from ppcls.utils import logger + +from .celoss import CELoss, MixCELoss +from .googlenetloss import GoogLeNetLoss +from .centerloss import CenterLoss +from .emlloss import EmlLoss +from .msmloss import MSMLoss +from .npairsloss import NpairsLoss +from .trihardloss import TriHardLoss +from .triplet import TripletLoss, TripletLossV2 +from .supconloss import SupConLoss +from .pairwisecosface import PairwiseCosface +from .dmlloss import DMLLoss +from .distanceloss import DistanceLoss + +from .distillationloss import DistillationCELoss +from .distillationloss import DistillationGTCELoss +from .distillationloss import DistillationDMLLoss +from .distillationloss import DistillationDistanceLoss +from .distillationloss import DistillationRKDLoss +from .multilabelloss import MultiLabelLoss + +from .deephashloss import DSHSDLoss, LCDSHLoss + + +class CombinedLoss(nn.Layer): + def __init__(self, config_list): + super().__init__() + self.loss_func = [] + self.loss_weight = [] + assert isinstance(config_list, list), ( + 'operator config should be a list') + for config in config_list: + assert isinstance(config, + dict) and len(config) == 1, "yaml format error" + name = list(config)[0] + param = config[name] + assert "weight" in param, "weight must be in param, but param just contains 
{}".format( + param.keys()) + self.loss_weight.append(param.pop("weight")) + self.loss_func.append(eval(name)(**param)) + + def __call__(self, input, batch): + loss_dict = {} + # just for accelerate classification traing speed + if len(self.loss_func) == 1: + loss = self.loss_func[0](input, batch) + loss_dict.update(loss) + loss_dict["loss"] = list(loss.values())[0] + else: + for idx, loss_func in enumerate(self.loss_func): + loss = loss_func(input, batch) + weight = self.loss_weight[idx] + loss = {key: loss[key] * weight for key in loss} + loss_dict.update(loss) + loss_dict["loss"] = paddle.add_n(list(loss_dict.values())) + return loss_dict + + +def build_loss(config): + module_class = CombinedLoss(copy.deepcopy(config)) + logger.debug("build loss {} success.".format(module_class)) + return module_class diff --git a/src/PaddleClas/ppcls/loss/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/loss/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..cc3358b Binary files /dev/null and b/src/PaddleClas/ppcls/loss/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/loss/__pycache__/celoss.cpython-39.pyc b/src/PaddleClas/ppcls/loss/__pycache__/celoss.cpython-39.pyc new file mode 100644 index 0000000..794a040 Binary files /dev/null and b/src/PaddleClas/ppcls/loss/__pycache__/celoss.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/loss/__pycache__/centerloss.cpython-39.pyc b/src/PaddleClas/ppcls/loss/__pycache__/centerloss.cpython-39.pyc new file mode 100644 index 0000000..43e5df7 Binary files /dev/null and b/src/PaddleClas/ppcls/loss/__pycache__/centerloss.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/loss/__pycache__/comfunc.cpython-39.pyc b/src/PaddleClas/ppcls/loss/__pycache__/comfunc.cpython-39.pyc new file mode 100644 index 0000000..0baf00e Binary files /dev/null and b/src/PaddleClas/ppcls/loss/__pycache__/comfunc.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/loss/__pycache__/deephashloss.cpython-39.pyc b/src/PaddleClas/ppcls/loss/__pycache__/deephashloss.cpython-39.pyc new file mode 100644 index 0000000..9260c26 Binary files /dev/null and b/src/PaddleClas/ppcls/loss/__pycache__/deephashloss.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/loss/__pycache__/distanceloss.cpython-39.pyc b/src/PaddleClas/ppcls/loss/__pycache__/distanceloss.cpython-39.pyc new file mode 100644 index 0000000..20e799b Binary files /dev/null and b/src/PaddleClas/ppcls/loss/__pycache__/distanceloss.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/loss/__pycache__/distillationloss.cpython-39.pyc b/src/PaddleClas/ppcls/loss/__pycache__/distillationloss.cpython-39.pyc new file mode 100644 index 0000000..9a669ee Binary files /dev/null and b/src/PaddleClas/ppcls/loss/__pycache__/distillationloss.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/loss/__pycache__/dmlloss.cpython-39.pyc b/src/PaddleClas/ppcls/loss/__pycache__/dmlloss.cpython-39.pyc new file mode 100644 index 0000000..79557bd Binary files /dev/null and b/src/PaddleClas/ppcls/loss/__pycache__/dmlloss.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/loss/__pycache__/emlloss.cpython-39.pyc b/src/PaddleClas/ppcls/loss/__pycache__/emlloss.cpython-39.pyc new file mode 100644 index 0000000..ac8b739 Binary files /dev/null and b/src/PaddleClas/ppcls/loss/__pycache__/emlloss.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/loss/__pycache__/googlenetloss.cpython-39.pyc b/src/PaddleClas/ppcls/loss/__pycache__/googlenetloss.cpython-39.pyc new file mode 100644 
index 0000000..cf905df Binary files /dev/null and b/src/PaddleClas/ppcls/loss/__pycache__/googlenetloss.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/loss/__pycache__/msmloss.cpython-39.pyc b/src/PaddleClas/ppcls/loss/__pycache__/msmloss.cpython-39.pyc new file mode 100644 index 0000000..4591ee7 Binary files /dev/null and b/src/PaddleClas/ppcls/loss/__pycache__/msmloss.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/loss/__pycache__/multilabelloss.cpython-39.pyc b/src/PaddleClas/ppcls/loss/__pycache__/multilabelloss.cpython-39.pyc new file mode 100644 index 0000000..385d408 Binary files /dev/null and b/src/PaddleClas/ppcls/loss/__pycache__/multilabelloss.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/loss/__pycache__/npairsloss.cpython-39.pyc b/src/PaddleClas/ppcls/loss/__pycache__/npairsloss.cpython-39.pyc new file mode 100644 index 0000000..a31cb1c Binary files /dev/null and b/src/PaddleClas/ppcls/loss/__pycache__/npairsloss.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/loss/__pycache__/pairwisecosface.cpython-39.pyc b/src/PaddleClas/ppcls/loss/__pycache__/pairwisecosface.cpython-39.pyc new file mode 100644 index 0000000..86cee45 Binary files /dev/null and b/src/PaddleClas/ppcls/loss/__pycache__/pairwisecosface.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/loss/__pycache__/rkdloss.cpython-39.pyc b/src/PaddleClas/ppcls/loss/__pycache__/rkdloss.cpython-39.pyc new file mode 100644 index 0000000..3ef7b9f Binary files /dev/null and b/src/PaddleClas/ppcls/loss/__pycache__/rkdloss.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/loss/__pycache__/supconloss.cpython-39.pyc b/src/PaddleClas/ppcls/loss/__pycache__/supconloss.cpython-39.pyc new file mode 100644 index 0000000..e537144 Binary files /dev/null and b/src/PaddleClas/ppcls/loss/__pycache__/supconloss.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/loss/__pycache__/trihardloss.cpython-39.pyc b/src/PaddleClas/ppcls/loss/__pycache__/trihardloss.cpython-39.pyc new file mode 100644 index 0000000..56e2651 Binary files /dev/null and b/src/PaddleClas/ppcls/loss/__pycache__/trihardloss.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/loss/__pycache__/triplet.cpython-39.pyc b/src/PaddleClas/ppcls/loss/__pycache__/triplet.cpython-39.pyc new file mode 100644 index 0000000..8554871 Binary files /dev/null and b/src/PaddleClas/ppcls/loss/__pycache__/triplet.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/loss/celoss.py b/src/PaddleClas/ppcls/loss/celoss.py new file mode 100644 index 0000000..a789261 --- /dev/null +++ b/src/PaddleClas/ppcls/loss/celoss.py @@ -0,0 +1,67 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
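The CombinedLoss/build_loss pair above consumes the YAML "Loss" section as a list of single-key dicts, each carrying a mandatory "weight". A hedged in-Python equivalent of such a config (editor's illustration; the placeholder tensors are arbitrary):

    loss_config = [
        {"CELoss": {"weight": 1.0, "epsilon": 0.1}},  # cross entropy with label smoothing
    ]
    loss_fn = build_loss(loss_config)  # returns a CombinedLoss instance
    # loss_dict = loss_fn(model_output, labels)
    # loss_dict["loss"] holds the weighted sum that gets backpropagated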
+ +import warnings + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppcls.utils import logger + + +class CELoss(nn.Layer): + """ + Cross entropy loss + """ + + def __init__(self, epsilon=None): + super().__init__() + if epsilon is not None and (epsilon <= 0 or epsilon >= 1): + epsilon = None + self.epsilon = epsilon + + def _labelsmoothing(self, target, class_num): + if len(target.shape) == 1 or target.shape[-1] != class_num: + one_hot_target = F.one_hot(target, class_num) + else: + one_hot_target = target + soft_target = F.label_smooth(one_hot_target, epsilon=self.epsilon) + soft_target = paddle.reshape(soft_target, shape=[-1, class_num]) + return soft_target + + def forward(self, x, label): + if isinstance(x, dict): + x = x["logits"] + if self.epsilon is not None: + class_num = x.shape[-1] + label = self._labelsmoothing(label, class_num) + x = -F.log_softmax(x, axis=-1) + loss = paddle.sum(x * label, axis=-1) + else: + if label.shape[-1] == x.shape[-1]: + label = F.softmax(label, axis=-1) + soft_label = True + else: + soft_label = False + loss = F.cross_entropy(x, label=label, soft_label=soft_label) + loss = loss.mean() + return {"CELoss": loss} + + +class MixCELoss(object): + def __init__(self, *args, **kwargs): + msg = "\"MixCELoss\" is deprecated, please use \"CELoss\" instead." + logger.error(DeprecationWarning(msg)) + raise DeprecationWarning(msg) diff --git a/src/PaddleClas/ppcls/loss/centerloss.py b/src/PaddleClas/ppcls/loss/centerloss.py new file mode 100644 index 0000000..d85b3f2 --- /dev/null +++ b/src/PaddleClas/ppcls/loss/centerloss.py @@ -0,0 +1,54 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class CenterLoss(nn.Layer): + def __init__(self, num_classes=5013, feat_dim=2048): + super(CenterLoss, self).__init__() + self.num_classes = num_classes + self.feat_dim = feat_dim + self.centers = paddle.randn( + shape=[self.num_classes, self.feat_dim]).astype( + "float64") # random init of class centers + + def __call__(self, input, target): + """ + input: network output dict: {"features": xxx, "logits": xxx} + target: image label + """ + feats = input["features"] + labels = target + batch_size = feats.shape[0] + + # calc feat * feat + dist1 = paddle.sum(paddle.square(feats), axis=1, keepdim=True) + dist1 = paddle.expand(dist1, [batch_size, self.num_classes]) + + # dist2 of centers + dist2 = paddle.sum(paddle.square(self.centers), axis=1, + keepdim=True) # num_classes + dist2 = paddle.expand(dist2, + [self.num_classes, batch_size]).astype("float64") + dist2 = paddle.transpose(dist2, [1, 0]) + + # first x * x + y * y + distmat = paddle.add(dist1, dist2) + tmp = paddle.matmul(feats, paddle.transpose(self.centers, [1, 0])) + distmat = distmat - 2.0 * tmp + + # generate the mask + classes = paddle.arange(self.num_classes).astype("int64") + labels = paddle.expand( + paddle.unsqueeze(labels, 1), (batch_size, self.num_classes)) + mask = paddle.equal( + paddle.expand(classes, [batch_size, self.num_classes]), + labels).astype("float64") # get mask + + dist = paddle.multiply(distmat, mask) + loss = paddle.sum(paddle.clip(dist, min=1e-12, max=1e+12)) / batch_size + + return {'CenterLoss': loss} diff --git a/src/PaddleClas/ppcls/loss/comfunc.py b/src/PaddleClas/ppcls/loss/comfunc.py new file mode 100644 index 0000000..277bdd6 --- /dev/null +++ b/src/PaddleClas/ppcls/loss/comfunc.py @@ -0,0 +1,45 @@ +# Copyright (c) 2018 PaddlePaddle Authors.
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + + +def rerange_index(batch_size, samples_each_class): + tmp = np.arange(0, batch_size * batch_size) + tmp = tmp.reshape(-1, batch_size) + rerange_index = [] + + for i in range(batch_size): + step = i // samples_each_class + start = step * samples_each_class + end = (step + 1) * samples_each_class + + pos_idx = [] + neg_idx = [] + for j, k in enumerate(tmp[i]): + if j >= start and j < end: + if j == i: + pos_idx.insert(0, k) + else: + pos_idx.append(k) + else: + neg_idx.append(k) + rerange_index += (pos_idx + neg_idx) + + rerange_index = np.array(rerange_index).astype(np.int32) + return rerange_index diff --git a/src/PaddleClas/ppcls/loss/deephashloss.py b/src/PaddleClas/ppcls/loss/deephashloss.py new file mode 100644 index 0000000..c9a58dc --- /dev/null +++ b/src/PaddleClas/ppcls/loss/deephashloss.py @@ -0,0 +1,92 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
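The rerange_index helper above permutes a flattened batch_size x batch_size distance matrix so that each row leads with the anchor's own entry, then the other samples of its class, then all negatives. A minimal sketch of what it produces, assuming batch_size=4 and samples_each_class=2 (values chosen purely for illustration):

import numpy as np
from ppcls.loss.comfunc import rerange_index  # module path as added in this diff

idx = rerange_index(batch_size=4, samples_each_class=2)
print(idx.reshape(4, 4))
# row 0 -> [ 0,  1,  2,  3]  (self, positive, then negatives)
# row 1 -> [ 5,  4,  6,  7]  (entry 5 = d(1,1) is the self term)
# row 2 -> [10, 11,  8,  9]
# row 3 -> [15, 14, 12, 13]

The EML, MSM and TriHard losses below gather a flattened distance matrix with this index, so a single paddle.split can slice off the self column, the positives, and the negatives.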
+ +import paddle +import paddle.nn as nn + +class DSHSDLoss(nn.Layer): + """ + # DSHSD(IEEE ACCESS 2019) + # paper [Deep Supervised Hashing Based on Stable Distribution](https://ieeexplore.ieee.org/document/8648432/) + # [DSHSD] epoch:70, bit:48, dataset:cifar10-1, MAP:0.809, Best MAP: 0.809 + # [DSHSD] epoch:250, bit:48, dataset:nuswide_21, MAP:0.809, Best MAP: 0.815 + # [DSHSD] epoch:135, bit:48, dataset:imagenet, MAP:0.647, Best MAP: 0.647 + """ + def __init__(self, alpha, multi_label=False): + super(DSHSDLoss, self).__init__() + self.alpha = alpha + self.multi_label = multi_label + + def forward(self, input, label): + feature = input["features"] + logits = input["logits"] + + dist = paddle.sum(paddle.square( + (paddle.unsqueeze(feature, 1) - paddle.unsqueeze(feature, 0))), + axis=2) + + # convert label to one-hot + label = paddle.flatten(label) + n_class = logits.shape[1] + label = paddle.nn.functional.one_hot(label, n_class).astype("float32") + + s = (paddle.matmul( + label, label, transpose_y=True) == 0).astype("float32") + margin = 2 * feature.shape[1] + Ld = (1 - s) / 2 * dist + s / 2 * (margin - dist).clip(min=0) + Ld = Ld.mean() + + if self.multi_label: + # multi-label classification loss + Lc = (logits - label * logits + ( + (1 + (-logits).exp()).log())).sum(axis=1).mean() + else: + # single-label classification loss + Lc = (-paddle.nn.functional.softmax(logits).log() * label).sum( + axis=1).mean() + + return {"dshsdloss": Lc + Ld * self.alpha} + + +class LCDSHLoss(nn.Layer): + """ + # paper [Locality-Constrained Deep Supervised Hashing for Image Retrieval](https://www.ijcai.org/Proceedings/2017/0499.pdf) + # [LCDSH] epoch:145, bit:48, dataset:cifar10-1, MAP:0.798, Best MAP: 0.798 + # [LCDSH] epoch:183, bit:48, dataset:nuswide_21, MAP:0.833, Best MAP: 0.834 + """ + def __init__(self, n_class, _lambda): + super(LCDSHLoss, self).__init__() + self._lambda = _lambda + self.n_class = n_class + + def forward(self, input, label): + feature = input["features"] + + # convert label to one-hot + label = paddle.flatten(label) + label = paddle.nn.functional.one_hot(label, self.n_class).astype("float32") + + s = 2 * (paddle.matmul(label, label, transpose_y=True) > 0).astype("float32") - 1 + inner_product = paddle.matmul(feature, feature, transpose_y=True) * 0.5 + + inner_product = inner_product.clip(min=-50, max=50) + L1 = paddle.log(1 + paddle.exp(-s * inner_product)).mean() + + b = feature.sign() + inner_product_ = paddle.matmul(b, b, transpose_y=True) * 0.5 + sigmoid = paddle.nn.Sigmoid() + L2 = (sigmoid(inner_product) - sigmoid(inner_product_)).pow(2).mean() + + return {"lcdshloss": L1 + self._lambda * L2} +
diff --git a/src/PaddleClas/ppcls/loss/distanceloss.py b/src/PaddleClas/ppcls/loss/distanceloss.py new file mode 100644 index 0000000..0a09f0c --- /dev/null +++ b/src/PaddleClas/ppcls/loss/distanceloss.py @@ -0,0 +1,43 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License.
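As a quick sanity check of the DSHSDLoss just shown, one can feed it random tensors in the {"features", "logits"} format used throughout this loss package. A minimal sketch, with batch size 8, 48-bit hash features and 10 classes as illustrative assumptions:

import paddle
from ppcls.loss.deephashloss import DSHSDLoss

feats = paddle.randn([8, 48])           # hash features; margin becomes 2 * 48
logits = paddle.randn([8, 10])          # classification head output
labels = paddle.randint(0, 10, [8, 1])  # integer labels, flattened internally
loss_fn = DSHSDLoss(alpha=0.05)         # alpha value is an arbitrary choice here
print(loss_fn({"features": feats, "logits": logits}, labels)["dshsdloss"])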
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddle.nn import L1Loss +from paddle.nn import MSELoss as L2Loss +from paddle.nn import SmoothL1Loss + + +class DistanceLoss(nn.Layer): + """ + DistanceLoss: + mode: loss mode + """ + + def __init__(self, mode="l2", **kargs): + super().__init__() + assert mode in ["l1", "l2", "smooth_l1"] + if mode == "l1": + self.loss_func = nn.L1Loss(**kargs) + elif mode == "l2": + self.loss_func = nn.MSELoss(**kargs) + elif mode == "smooth_l1": + self.loss_func = nn.SmoothL1Loss(**kargs) + self.mode = mode + + def forward(self, x, y): + loss = self.loss_func(x, y) + return {"loss_{}".format(self.mode): loss} diff --git a/src/PaddleClas/ppcls/loss/distillationloss.py b/src/PaddleClas/ppcls/loss/distillationloss.py new file mode 100644 index 0000000..0340234 --- /dev/null +++ b/src/PaddleClas/ppcls/loss/distillationloss.py @@ -0,0 +1,174 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import paddle +import paddle.nn as nn + +from .celoss import CELoss +from .dmlloss import DMLLoss +from .distanceloss import DistanceLoss +from .rkdloss import RKdAngle, RkdDistance + + +class DistillationCELoss(CELoss): + """ + DistillationCELoss + """ + + def __init__(self, + model_name_pairs=[], + epsilon=None, + key=None, + name="loss_ce"): + super().__init__(epsilon=epsilon) + assert isinstance(model_name_pairs, list) + self.key = key + self.model_name_pairs = model_name_pairs + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + loss = super().forward(out1, out2) + for key in loss: + loss_dict["{}_{}_{}".format(key, pair[0], pair[1])] = loss[key] + return loss_dict + + +class DistillationGTCELoss(CELoss): + """ + DistillationGTCELoss + """ + + def __init__(self, + model_names=[], + epsilon=None, + key=None, + name="loss_gt_ce"): + super().__init__(epsilon=epsilon) + assert isinstance(model_names, list) + self.key = key + self.model_names = model_names + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for name in self.model_names: + out = predicts[name] + if self.key is not None: + out = out[self.key] + loss = super().forward(out, batch) + for key in loss: + loss_dict["{}_{}".format(key, name)] = loss[key] + return loss_dict + + +class DistillationDMLLoss(DMLLoss): + """ + """ + + def __init__(self, + model_name_pairs=[], + act="softmax", + key=None, + name="loss_dml"): + super().__init__(act=act) + assert isinstance(model_name_pairs, list) + self.key = key + self.model_name_pairs = model_name_pairs + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = 
out2[self.key] + loss = super().forward(out1, out2) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}_{}".format(key, pair[0], pair[1], + idx)] = loss[key] + else: + loss_dict["{}_{}".format(self.name, idx)] = loss + return loss_dict + + +class DistillationDistanceLoss(DistanceLoss): + """ + """ + + def __init__(self, + mode="l2", + model_name_pairs=[], + key=None, + name="loss_", + **kargs): + super().__init__(mode=mode, **kargs) + assert isinstance(model_name_pairs, list) + self.key = key + self.model_name_pairs = model_name_pairs + self.name = name + mode + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + loss = super().forward(out1, out2) + for key in loss: + loss_dict["{}_{}_{}".format(self.name, key, idx)] = loss[key] + return loss_dict + + +class DistillationRKDLoss(nn.Layer): + def __init__(self, + target_size=None, + model_name_pairs=(["Student", "Teacher"], ), + student_keepkeys=[], + teacher_keepkeys=[]): + super().__init__() + self.student_keepkeys = student_keepkeys + self.teacher_keepkeys = teacher_keepkeys + self.model_name_pairs = model_name_pairs + assert len(self.student_keepkeys) == len(self.teacher_keepkeys) + + self.rkd_angle_loss = RKdAngle(target_size=target_size) + self.rkd_dist_loss = RkdDistance(target_size=target_size) + + def __call__(self, predicts, batch): + loss_dict = {} + for m1, m2 in self.model_name_pairs: + for idx, ( + student_name, teacher_name + ) in enumerate(zip(self.student_keepkeys, self.teacher_keepkeys)): + student_out = predicts[m1][student_name] + teacher_out = predicts[m2][teacher_name] + + loss_dict[f"loss_angle_{idx}_{m1}_{m2}"] = self.rkd_angle_loss( + student_out, teacher_out) + loss_dict[f"loss_dist_{idx}_{m1}_{m2}"] = self.rkd_dist_loss( + student_out, teacher_out) + + return loss_dict diff --git a/src/PaddleClas/ppcls/loss/dmlloss.py b/src/PaddleClas/ppcls/loss/dmlloss.py new file mode 100644 index 0000000..48bf6c0 --- /dev/null +++ b/src/PaddleClas/ppcls/loss/dmlloss.py @@ -0,0 +1,50 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
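The distillation losses above all follow the same pattern: they receive the dict returned by DistillationModel (one entry per sub-model), optionally descend into a sub-key, and delegate to the wrapped loss for each model pair. A minimal sketch with a hand-built predicts dict (the "Student"/"Teacher" names and shapes are illustrative):

import paddle
from ppcls.loss.distillationloss import DistillationDMLLoss

predicts = {
    "Student": {"logits": paddle.randn([4, 10])},
    "Teacher": {"logits": paddle.randn([4, 10])},
}
loss_fn = DistillationDMLLoss(
    model_name_pairs=[["Student", "Teacher"]], key="logits")
print(loss_fn(predicts, batch=None))
# -> {"DMLLoss_Student_Teacher_0": Tensor(...)}, since DMLLoss returns a dict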
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class DMLLoss(nn.Layer): + """ + DMLLoss + """ + + def __init__(self, act="softmax", eps=1e-12): + super().__init__() + if act is not None: + assert act in ["softmax", "sigmoid"] + if act == "softmax": + self.act = nn.Softmax(axis=-1) + elif act == "sigmoid": + self.act = nn.Sigmoid() + else: + self.act = None + self.eps = eps + + def _kldiv(self, x, target): + class_num = x.shape[-1] + cost = target * paddle.log( + (target + self.eps) / (x + self.eps)) * class_num + return cost + + def forward(self, x, target): + if self.act is not None: + x = self.act(x) + target = self.act(target) + loss = self._kldiv(x, target) + self._kldiv(target, x) + loss = loss / 2 + loss = paddle.mean(loss) + return {"DMLLoss": loss} diff --git a/src/PaddleClas/ppcls/loss/emlloss.py b/src/PaddleClas/ppcls/loss/emlloss.py new file mode 100644 index 0000000..9735703 --- /dev/null +++ b/src/PaddleClas/ppcls/loss/emlloss.py @@ -0,0 +1,97 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +import numpy as np +from .comfunc import rerange_index + + +class EmlLoss(paddle.nn.Layer): + def __init__(self, batch_size=40, samples_each_class=2): + super(EmlLoss, self).__init__() + assert (batch_size % samples_each_class == 0) + self.samples_each_class = samples_each_class + self.batch_size = batch_size + self.rerange_index = rerange_index(batch_size, samples_each_class) + self.thresh = 20.0 + self.beta = 100000 + + def surrogate_function(self, beta, theta, bias): + x = theta * paddle.exp(bias) + output = paddle.log(1 + beta * x) / math.log(1 + beta) + return output + + def surrogate_function_approximate(self, beta, theta, bias): + output = ( + paddle.log(theta) + bias + math.log(beta)) / math.log(1 + beta) + return output + + def surrogate_function_stable(self, beta, theta, target, thresh): + max_gap = paddle.to_tensor(thresh, dtype='float32') + max_gap.stop_gradient = True + + target_max = paddle.maximum(target, max_gap) + target_min = paddle.minimum(target, max_gap) + + loss1 = self.surrogate_function(beta, theta, target_min) + loss2 = self.surrogate_function_approximate(beta, theta, target_max) + bias = self.surrogate_function(beta, theta, max_gap) + loss = loss1 + loss2 - bias + return loss + + def forward(self, input, target=None): + features = input["features"] + samples_each_class = self.samples_each_class + batch_size = self.batch_size + rerange_index = self.rerange_index + + #calc distance + diffs = paddle.unsqueeze( + features, axis=1) - paddle.unsqueeze( + features, axis=0) + similary_matrix = paddle.sum(paddle.square(diffs), axis=-1) + + tmp = paddle.reshape(similary_matrix, shape=[-1, 1]) + rerange_index = paddle.to_tensor(rerange_index) + tmp = paddle.gather(tmp, index=rerange_index) + similary_matrix = paddle.reshape(tmp, 
shape=[-1, batch_size]) + + ignore, pos, neg = paddle.split( + similary_matrix, + num_or_sections=[ + 1, samples_each_class - 1, batch_size - samples_each_class + ], + axis=1) + ignore.stop_gradient = True + + pos_max = paddle.max(pos, axis=1, keepdim=True) + pos = paddle.exp(pos - pos_max) + pos_mean = paddle.mean(pos, axis=1, keepdim=True) + + neg_min = paddle.min(neg, axis=1, keepdim=True) + neg = paddle.exp(neg_min - neg) + neg_mean = paddle.mean(neg, axis=1, keepdim=True) + + bias = pos_max - neg_min + theta = paddle.multiply(neg_mean, pos_mean) + + loss = self.surrogate_function_stable(self.beta, theta, bias, + self.thresh) + loss = paddle.mean(loss) + return {"emlloss": loss}
diff --git a/src/PaddleClas/ppcls/loss/googlenetloss.py b/src/PaddleClas/ppcls/loss/googlenetloss.py new file mode 100644 index 0000000..c580aa6 --- /dev/null +++ b/src/PaddleClas/ppcls/loss/googlenetloss.py @@ -0,0 +1,41 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class GoogLeNetLoss(nn.Layer): + """ + Cross entropy loss used after GoogLeNet + """ + def __init__(self, epsilon=None): + super().__init__() + assert (epsilon is None or epsilon <= 0 or epsilon >= 1), "GoogLeNetLoss does not support label smoothing" + + + def forward(self, inputs, label): + input0, input1, input2 = inputs + if isinstance(input0, dict): + input0 = input0["logits"] + if isinstance(input1, dict): + input1 = input1["logits"] + if isinstance(input2, dict): + input2 = input2["logits"] + + loss0 = F.cross_entropy(input0, label=label, soft_label=False) + loss1 = F.cross_entropy(input1, label=label, soft_label=False) + loss2 = F.cross_entropy(input2, label=label, soft_label=False) + loss = loss0 + 0.3 * loss1 + 0.3 * loss2  # auxiliary heads weighted as in the GoogLeNet paper + loss = loss.mean() + return {"GoogLeNetLoss": loss}
diff --git a/src/PaddleClas/ppcls/loss/msmloss.py b/src/PaddleClas/ppcls/loss/msmloss.py new file mode 100644 index 0000000..3aa0dd8 --- /dev/null +++ b/src/PaddleClas/ppcls/loss/msmloss.py @@ -0,0 +1,78 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import paddle +from .comfunc import rerange_index + + +class MSMLoss(paddle.nn.Layer): + """ + MSM loss, built on triplet loss. Uses P * K samples per batch. + The batch size is fixed at batch_size = P * K, while K may vary between batches. + Samples sharing a label are assumed to be grouped together in the batch. + + supported_metrics = [ + 'euclidean', + 'sqeuclidean', + 'cityblock', + ] + only samples_each_class = 2 is considered + """ + + def __init__(self, batch_size=120, samples_each_class=2, margin=0.1): + super(MSMLoss, self).__init__() + self.margin = margin + self.samples_each_class = samples_each_class + self.batch_size = batch_size + self.rerange_index = rerange_index(batch_size, samples_each_class) + + def forward(self, input, target=None): + # normalize features + features = input["features"] + features = self._normalize(features) + samples_each_class = self.samples_each_class + rerange_index = paddle.to_tensor(self.rerange_index) + + # pairwise squared-distance matrix + diffs = paddle.unsqueeze( + features, axis=1) - paddle.unsqueeze( + features, axis=0) + similary_matrix = paddle.sum(paddle.square(diffs), axis=-1) + + # rerange so positives lead each row + tmp = paddle.reshape(similary_matrix, shape=[-1, 1]) + tmp = paddle.gather(tmp, index=rerange_index) + similary_matrix = paddle.reshape(tmp, shape=[-1, self.batch_size]) + + # split into self, positives and negatives + ignore, pos, neg = paddle.split( + similary_matrix, + num_or_sections=[1, samples_each_class - 1, -1], + axis=1) + ignore.stop_gradient = True + + hard_pos = paddle.max(pos) + hard_neg = paddle.min(neg) + + loss = hard_pos + self.margin - hard_neg + loss = paddle.nn.ReLU()(loss) + return {"msmloss": loss} + + def _normalize(self, input): + input_norm = paddle.sqrt( + paddle.sum(paddle.square(input), axis=1, keepdim=True)) + return paddle.divide(input, input_norm)
diff --git a/src/PaddleClas/ppcls/loss/multilabelloss.py b/src/PaddleClas/ppcls/loss/multilabelloss.py new file mode 100644 index 0000000..d30d5b8 --- /dev/null +++ b/src/PaddleClas/ppcls/loss/multilabelloss.py @@ -0,0 +1,43 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class MultiLabelLoss(nn.Layer): + """ + Multi-label loss + """ + + def __init__(self, epsilon=None): + super().__init__() + if epsilon is not None and (epsilon <= 0 or epsilon >= 1): + epsilon = None + self.epsilon = epsilon + + def _labelsmoothing(self, target, class_num): + if target.ndim == 1 or target.shape[-1] != class_num: + one_hot_target = F.one_hot(target, class_num) + else: + one_hot_target = target + soft_target = F.label_smooth(one_hot_target, epsilon=self.epsilon) + soft_target = paddle.reshape(soft_target, shape=[-1, class_num]) + return soft_target + + def _binary_crossentropy(self, input, target, class_num): + if self.epsilon is not None: + target = self._labelsmoothing(target, class_num) + cost = F.binary_cross_entropy_with_logits( + logit=input, label=target) + else: + cost = F.binary_cross_entropy_with_logits( + logit=input, label=target) + + return cost + + def forward(self, x, target): + if isinstance(x, dict): + x = x["logits"] + class_num = x.shape[-1] + loss = self._binary_crossentropy(x, target, class_num) + loss = loss.mean() + return {"MultiLabelLoss": loss}
diff --git a/src/PaddleClas/ppcls/loss/npairsloss.py b/src/PaddleClas/ppcls/loss/npairsloss.py new file mode 100644 index 0000000..d4b359e --- /dev/null +++ b/src/PaddleClas/ppcls/loss/npairsloss.py @@ -0,0 +1,38 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import paddle + + +class NpairsLoss(paddle.nn.Layer): + def __init__(self, reg_lambda=0.01): + super(NpairsLoss, self).__init__() + self.reg_lambda = reg_lambda + + def forward(self, input, target=None): + """ + batch is arranged as anchor/positive pairs (labels are implied by ordering) + """ + features = input["features"] + reg_lambda = self.reg_lambda +
batch_size = features.shape[0] + fea_dim = features.shape[1] + num_class = batch_size // 2 + + #reshape + out_feas = paddle.reshape(features, shape=[-1, 2, fea_dim]) + anc_feas, pos_feas = paddle.split(out_feas, num_or_sections=2, axis=1) + anc_feas = paddle.squeeze(anc_feas, axis=1) + pos_feas = paddle.squeeze(pos_feas, axis=1) + + #get simi matrix + similarity_matrix = paddle.matmul( + anc_feas, pos_feas, transpose_y=True) #get similarity matrix + sparse_labels = paddle.arange(0, num_class, dtype='int64') + xentloss = paddle.nn.CrossEntropyLoss()( + similarity_matrix, sparse_labels) #by default: mean + + #l2 norm + reg = paddle.mean(paddle.sum(paddle.square(features), axis=1)) + l2loss = 0.5 * reg_lambda * reg + return {"npairsloss": xentloss + l2loss} diff --git a/src/PaddleClas/ppcls/loss/pairwisecosface.py b/src/PaddleClas/ppcls/loss/pairwisecosface.py new file mode 100644 index 0000000..beb8068 --- /dev/null +++ b/src/PaddleClas/ppcls/loss/pairwisecosface.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class PairwiseCosface(nn.Layer): + def __init__(self, margin, gamma): + super(PairwiseCosface, self).__init__() + self.margin = margin + self.gamma = gamma + + def forward(self, embedding, targets): + if isinstance(embedding, dict): + embedding = embedding['features'] + # Normalize embedding features + embedding = F.normalize(embedding, axis=1) + dist_mat = paddle.matmul(embedding, embedding, transpose_y=True) + + N = dist_mat.shape[0] + is_pos = targets.reshape([N,1]).expand([N,N]).equal(paddle.t(targets.reshape([N,1]).expand([N,N]))).astype('float') + is_neg = targets.reshape([N,1]).expand([N,N]).not_equal(paddle.t(targets.reshape([N,1]).expand([N,N]))).astype('float') + + # Mask scores related to itself + is_pos = is_pos - paddle.eye(N, N) + + s_p = dist_mat * is_pos + s_n = dist_mat * is_neg + + logit_p = -self.gamma * s_p + (-99999999.) * (1 - is_pos) + logit_n = self.gamma * (s_n + self.margin) + (-99999999.) * (1 - is_neg) + + loss = F.softplus(paddle.logsumexp(logit_p, axis=1) + paddle.logsumexp(logit_n, axis=1)).mean() + + return {"PairwiseCosface": loss} + + diff --git a/src/PaddleClas/ppcls/loss/rkdloss.py b/src/PaddleClas/ppcls/loss/rkdloss.py new file mode 100644 index 0000000..e6ffea2 --- /dev/null +++ b/src/PaddleClas/ppcls/loss/rkdloss.py @@ -0,0 +1,97 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def pdist(e, squared=False, eps=1e-12): + e_square = e.pow(2).sum(axis=1) + prod = paddle.mm(e, e.t()) + res = (e_square.unsqueeze(1) + e_square.unsqueeze(0) - 2 * prod).clip( + min=eps) + + if not squared: + res = res.sqrt() + return res + + +class RKdAngle(nn.Layer): + # reference: https://github.com/lenscloth/RKD/blob/master/metric/loss.py + def __init__(self, target_size=None): + super().__init__() + if target_size is not None: + self.avgpool = paddle.nn.AdaptiveAvgPool2D(target_size) + else: + self.avgpool = None + + def forward(self, student, teacher): + # GAP to reduce memory + if self.avgpool is not None: + # NxC1xH1xW1 -> NxC1x1x1 + student = self.avgpool(student) + # NxC2xH2xW2 -> NxC2x1x1 + teacher = self.avgpool(teacher) + + # reshape for feature map distillation + bs = student.shape[0] + student = student.reshape([bs, -1]) + teacher = teacher.reshape([bs, -1]) + + td = (teacher.unsqueeze(0) - teacher.unsqueeze(1)) + norm_td = F.normalize(td, p=2, axis=2) + t_angle = paddle.bmm(norm_td, norm_td.transpose([0, 2, 1])).reshape( + [-1, 1]) + + sd = (student.unsqueeze(0) - student.unsqueeze(1)) + norm_sd = F.normalize(sd, p=2, axis=2) + s_angle = paddle.bmm(norm_sd, norm_sd.transpose([0, 2, 1])).reshape( + [-1, 1]) + loss = F.smooth_l1_loss(s_angle, t_angle, reduction='mean') + return loss + + +class RkdDistance(nn.Layer): + # reference: https://github.com/lenscloth/RKD/blob/master/metric/loss.py + def __init__(self, eps=1e-12, target_size=1): + super().__init__() + self.eps = eps + if target_size is not None: + self.avgpool = paddle.nn.AdaptiveAvgPool2D(target_size) + else: + self.avgpool = None + + def forward(self, student, teacher): + # GAP to reduce memory + if self.avgpool is not None: + # NxC1xH1xW1 -> NxC1x1x1 + student = self.avgpool(student) + # NxC2xH2xW2 -> NxC2x1x1 + teacher = self.avgpool(teacher) + + bs = student.shape[0] + student = student.reshape([bs, -1]) + teacher = teacher.reshape([bs, -1]) + + t_d = pdist(teacher, squared=False) + mean_td = t_d.mean() + t_d = t_d / (mean_td + self.eps) + + d = pdist(student, squared=False) + mean_d = d.mean() + d = d / (mean_d + self.eps) + + loss = F.smooth_l1_loss(d, t_d, reduction="mean") + return loss diff --git a/src/PaddleClas/ppcls/loss/supconloss.py b/src/PaddleClas/ppcls/loss/supconloss.py new file mode 100644 index 0000000..3dd33bc --- /dev/null +++ b/src/PaddleClas/ppcls/loss/supconloss.py @@ -0,0 +1,108 @@ +import paddle +from paddle import nn + + +class SupConLoss(nn.Layer): + """Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf. 
+ It also supports the unsupervised contrastive loss in SimCLR""" + + def __init__(self, + views=16, + temperature=0.07, + contrast_mode='all', + base_temperature=0.07, + normalize_feature=True): + super(SupConLoss, self).__init__() + self.temperature = paddle.to_tensor(temperature) + self.contrast_mode = contrast_mode + self.base_temperature = paddle.to_tensor(base_temperature) + self.num_ids = None + self.views = views + self.normalize_feature = normalize_feature + + def forward(self, features, labels, mask=None): + """Compute loss for model. If both `labels` and `mask` are None, + it degenerates to SimCLR unsupervised loss: + https://arxiv.org/pdf/2002.05709.pdf + Args: + features: hidden vector of shape [bsz, n_views, ...]. + labels: ground truth of shape [bsz]. + mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j + has the same class as sample i. Can be asymmetric. + Returns: + A loss scalar. + """ + features = features["features"] + if self.num_ids is None: + self.num_ids = int(features.shape[0] / self.views) + + if self.normalize_feature: + features = 1. * features / (paddle.expand_as( + paddle.norm( + features, p=2, axis=-1, keepdim=True), features) + 1e-12) + features = features.reshape([self.num_ids, self.views, -1]) + labels = labels.reshape([self.num_ids, self.views])[:, 0] + + if len(features.shape) < 3: + raise ValueError('`features` needs to be [bsz, n_views, ...],' + 'at least 3 dimensions are required') + if len(features.shape) > 3: + features = features.reshape( + [features.shape[0], features.shape[1], -1]) + + batch_size = features.shape[0] + if labels is not None and mask is not None: + raise ValueError('Cannot define both `labels` and `mask`') + elif labels is None and mask is None: + mask = paddle.eye(batch_size, dtype='float32') + elif labels is not None: + labels = labels.reshape([-1, 1]) + if labels.shape[0] != batch_size: + raise ValueError( + 'Num of labels does not match num of features') + mask = paddle.cast( + paddle.equal(labels, paddle.t(labels)), 'float32') + else: + mask = paddle.cast(mask, 'float32') + + contrast_count = features.shape[1] + contrast_feature = paddle.concat( + paddle.unbind( + features, axis=1), axis=0) + if self.contrast_mode == 'one': + anchor_feature = features[:, 0] + anchor_count = 1 + elif self.contrast_mode == 'all': + anchor_feature = contrast_feature + anchor_count = contrast_count + else: + raise ValueError('Unknown mode: {}'.format(self.contrast_mode)) + + # compute logits + anchor_dot_contrast = paddle.divide( + paddle.matmul(anchor_feature, paddle.t(contrast_feature)), + self.temperature) + # for numerical stability + logits_max = paddle.max(anchor_dot_contrast, axis=1, keepdim=True) + logits = anchor_dot_contrast - logits_max.detach() + + # tile mask + mask = paddle.tile(mask, [anchor_count, contrast_count]) + + logits_mask = 1 - paddle.eye(batch_size * anchor_count) + mask = mask * logits_mask + + # compute log_prob + exp_logits = paddle.exp(logits) * logits_mask + log_prob = logits - paddle.log( + paddle.sum(exp_logits, axis=1, keepdim=True)) + + # compute mean of log-likelihood over positive + mean_log_prob_pos = paddle.sum((mask * log_prob), + axis=1) / paddle.sum(mask, axis=1) + + # loss + loss = -(self.temperature / self.base_temperature) * mean_log_prob_pos + loss = paddle.mean(loss.reshape([anchor_count, batch_size])) + + return {"SupConLoss": loss} diff --git a/src/PaddleClas/ppcls/loss/trihardloss.py b/src/PaddleClas/ppcls/loss/trihardloss.py new file mode 100644 index 0000000..132c604 --- 
/dev/null +++ b/src/PaddleClas/ppcls/loss/trihardloss.py @@ -0,0 +1,82 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from .comfunc import rerange_index + + +class TriHardLoss(paddle.nn.Layer): + """ + TriHard loss, built on triplet loss. Uses P * K samples per batch. + The batch size is fixed at batch_size = P * K, while K may vary between batches. + Samples sharing a label are assumed to be grouped together in the batch. + + supported_metrics = [ + 'euclidean', + 'sqeuclidean', + 'cityblock', + ] + only samples_each_class = 2 is considered + """ + + def __init__(self, batch_size=120, samples_each_class=2, margin=0.1): + super(TriHardLoss, self).__init__() + self.margin = margin + self.samples_each_class = samples_each_class + self.batch_size = batch_size + self.rerange_index = rerange_index(batch_size, samples_each_class) + + def forward(self, input, target=None): + features = input["features"] + assert (self.batch_size == features.shape[0]) + + # normalize features + features = self._normalize(features) + samples_each_class = self.samples_each_class + rerange_index = paddle.to_tensor(self.rerange_index) + + # pairwise squared-distance matrix + diffs = paddle.unsqueeze( + features, axis=1) - paddle.unsqueeze( + features, axis=0) + similary_matrix = paddle.sum(paddle.square(diffs), axis=-1) + + # rerange so positives lead each row + tmp = paddle.reshape(similary_matrix, shape=[-1, 1]) + tmp = paddle.gather(tmp, index=rerange_index) + similary_matrix = paddle.reshape(tmp, shape=[-1, self.batch_size]) + + # split into self, positives and negatives + ignore, pos, neg = paddle.split( + similary_matrix, + num_or_sections=[1, samples_each_class - 1, -1], + axis=1) + + ignore.stop_gradient = True + hard_pos = paddle.max(pos, axis=1) + hard_neg = paddle.min(neg, axis=1) + + loss = hard_pos + self.margin - hard_neg + loss = paddle.nn.ReLU()(loss) + loss = paddle.mean(loss) + return {"trihardloss": loss} + + def _normalize(self, input): + input_norm = paddle.sqrt( + paddle.sum(paddle.square(input), axis=1, keepdim=True)) + return paddle.divide(input, input_norm)
diff --git a/src/PaddleClas/ppcls/loss/triplet.py b/src/PaddleClas/ppcls/loss/triplet.py new file mode 100644 index 0000000..d1c7eec --- /dev/null +++ b/src/PaddleClas/ppcls/loss/triplet.py @@ -0,0 +1,137 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn + + +class TripletLossV2(nn.Layer): + """Triplet loss with hard positive/negative mining. + Args: + margin (float): margin for triplet.
+ """ + + def __init__(self, margin=0.5, normalize_feature=True): + super(TripletLossV2, self).__init__() + self.margin = margin + self.ranking_loss = paddle.nn.loss.MarginRankingLoss(margin=margin) + self.normalize_feature = normalize_feature + + def forward(self, input, target): + """ + Args: + inputs: feature matrix with shape (batch_size, feat_dim) + target: ground truth labels with shape (num_classes) + """ + inputs = input["features"] + + if self.normalize_feature: + inputs = 1. * inputs / (paddle.expand_as( + paddle.norm( + inputs, p=2, axis=-1, keepdim=True), inputs) + 1e-12) + + bs = inputs.shape[0] + + # compute distance + dist = paddle.pow(inputs, 2).sum(axis=1, keepdim=True).expand([bs, bs]) + dist = dist + dist.t() + dist = paddle.addmm( + input=dist, x=inputs, y=inputs.t(), alpha=-2.0, beta=1.0) + dist = paddle.clip(dist, min=1e-12).sqrt() + + # hard negative mining + is_pos = paddle.expand(target, ( + bs, bs)).equal(paddle.expand(target, (bs, bs)).t()) + is_neg = paddle.expand(target, ( + bs, bs)).not_equal(paddle.expand(target, (bs, bs)).t()) + + # `dist_ap` means distance(anchor, positive) + ## both `dist_ap` and `relative_p_inds` with shape [N, 1] + ''' + dist_ap, relative_p_inds = paddle.max( + paddle.reshape(dist[is_pos], (bs, -1)), axis=1, keepdim=True) + # `dist_an` means distance(anchor, negative) + # both `dist_an` and `relative_n_inds` with shape [N, 1] + dist_an, relative_n_inds = paddle.min( + paddle.reshape(dist[is_neg], (bs, -1)), axis=1, keepdim=True) + ''' + dist_ap = paddle.max(paddle.reshape( + paddle.masked_select(dist, is_pos), (bs, -1)), + axis=1, + keepdim=True) + # `dist_an` means distance(anchor, negative) + # both `dist_an` and `relative_n_inds` with shape [N, 1] + dist_an = paddle.min(paddle.reshape( + paddle.masked_select(dist, is_neg), (bs, -1)), + axis=1, + keepdim=True) + # shape [N] + dist_ap = paddle.squeeze(dist_ap, axis=1) + dist_an = paddle.squeeze(dist_an, axis=1) + + # Compute ranking hinge loss + y = paddle.ones_like(dist_an) + loss = self.ranking_loss(dist_an, dist_ap, y) + return {"TripletLossV2": loss} + + +class TripletLoss(nn.Layer): + """Triplet loss with hard positive/negative mining. + Reference: + Hermans et al. In Defense of the Triplet Loss for Person Re-Identification. arXiv:1703.07737. + Code imported from https://github.com/Cysu/open-reid/blob/master/reid/loss/triplet.py. + Args: + margin (float): margin for triplet. 
+ """ + + def __init__(self, margin=1.0): + super(TripletLoss, self).__init__() + self.margin = margin + self.ranking_loss = paddle.nn.loss.MarginRankingLoss(margin=margin) + + def forward(self, input, target): + """ + Args: + inputs: feature matrix with shape (batch_size, feat_dim) + target: ground truth labels with shape (num_classes) + """ + inputs = input["features"] + + bs = inputs.shape[0] + # Compute pairwise distance, replace by the official when merged + dist = paddle.pow(inputs, 2).sum(axis=1, keepdim=True).expand([bs, bs]) + dist = dist + dist.t() + dist = paddle.addmm( + input=dist, x=inputs, y=inputs.t(), alpha=-2.0, beta=1.0) + dist = paddle.clip(dist, min=1e-12).sqrt() + + mask = paddle.equal( + target.expand([bs, bs]), target.expand([bs, bs]).t()) + mask_numpy_idx = mask.numpy() + dist_ap, dist_an = [], [] + for i in range(bs): + # dist_ap_i = paddle.to_tensor(dist[i].numpy()[mask_numpy_idx[i]].max(),dtype='float64').unsqueeze(0) + # dist_ap_i.stop_gradient = False + # dist_ap.append(dist_ap_i) + dist_ap.append( + max([ + dist[i][j] if mask_numpy_idx[i][j] == True else float( + "-inf") for j in range(bs) + ]).unsqueeze(0)) + # dist_an_i = paddle.to_tensor(dist[i].numpy()[mask_numpy_idx[i] == False].min(), dtype='float64').unsqueeze(0) + # dist_an_i.stop_gradient = False + # dist_an.append(dist_an_i) + dist_an.append( + min([ + dist[i][k] if mask_numpy_idx[i][k] == False else float( + "inf") for k in range(bs) + ]).unsqueeze(0)) + + dist_ap = paddle.concat(dist_ap, axis=0) + dist_an = paddle.concat(dist_an, axis=0) + + # Compute ranking hinge loss + y = paddle.ones_like(dist_an) + loss = self.ranking_loss(dist_an, dist_ap, y) + return {"TripletLoss": loss} diff --git a/src/PaddleClas/ppcls/metric/__init__.py b/src/PaddleClas/ppcls/metric/__init__.py new file mode 100644 index 0000000..9472123 --- /dev/null +++ b/src/PaddleClas/ppcls/metric/__init__.py @@ -0,0 +1,51 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+ +from paddle import nn +import copy +from collections import OrderedDict + +from .metrics import TopkAcc, mAP, mINP, Recallk, Precisionk +from .metrics import DistillationTopkAcc +from .metrics import GoogLeNetTopkAcc +from .metrics import HammingDistance, AccuracyScore + + +class CombinedMetrics(nn.Layer): + def __init__(self, config_list): + super().__init__() + self.metric_func_list = [] + assert isinstance(config_list, list), ( + 'operator config should be a list') + for config in config_list: + assert isinstance(config, + dict) and len(config) == 1, "yaml format error" + metric_name = list(config)[0] + metric_params = config[metric_name] + if metric_params is not None: + self.metric_func_list.append( + eval(metric_name)(**metric_params)) + else: + self.metric_func_list.append(eval(metric_name)()) + + def __call__(self, *args, **kwargs): + metric_dict = OrderedDict() + for idx, metric_func in enumerate(self.metric_func_list): + metric_dict.update(metric_func(*args, **kwargs)) + return metric_dict + + +def build_metrics(config): + metrics_list = CombinedMetrics(copy.deepcopy(config)) + return metrics_list diff --git a/src/PaddleClas/ppcls/metric/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/metric/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..c075b76 Binary files /dev/null and b/src/PaddleClas/ppcls/metric/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/metric/__pycache__/metrics.cpython-39.pyc b/src/PaddleClas/ppcls/metric/__pycache__/metrics.cpython-39.pyc new file mode 100644 index 0000000..d7ce7f4 Binary files /dev/null and b/src/PaddleClas/ppcls/metric/__pycache__/metrics.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/metric/metrics.py b/src/PaddleClas/ppcls/metric/metrics.py new file mode 100644 index 0000000..7c6407e --- /dev/null +++ b/src/PaddleClas/ppcls/metric/metrics.py @@ -0,0 +1,309 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
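CombinedMetrics above resolves each single-key dict in the config list to one of the metric classes defined in metrics.py below, instantiates it with the given parameters, and merges all outputs into one OrderedDict. A minimal sketch of the expected config shape, mirroring the YAML lists PaddleClas configs use (values illustrative):

import paddle
from ppcls.metric import build_metrics

metrics = build_metrics([{"TopkAcc": {"topk": [1, 5]}}])
logits = paddle.randn([4, 10])
labels = paddle.randint(0, 10, [4, 1])  # int64 column vector, as paddle.metric.accuracy expects
print(metrics(logits, labels))          # e.g. {"top1": Tensor(...), "top5": Tensor(...)}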
+ +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from sklearn.metrics import hamming_loss +from sklearn.metrics import accuracy_score as accuracy_metric +from sklearn.metrics import multilabel_confusion_matrix +from sklearn.preprocessing import binarize + + +class TopkAcc(nn.Layer): + def __init__(self, topk=(1, 5)): + super().__init__() + assert isinstance(topk, (int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + + def forward(self, x, label): + if isinstance(x, dict): + x = x["logits"] + + metric_dict = dict() + for k in self.topk: + metric_dict["top{}".format(k)] = paddle.metric.accuracy( + x, label, k=k) + return metric_dict + + +class mAP(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=True) + gallery_labels_transpose = paddle.transpose(gallery_img_id, [1, 0]) + gallery_labels_transpose = paddle.broadcast_to( + gallery_labels_transpose, + shape=[ + choosen_indices.shape[0], gallery_labels_transpose.shape[1] + ]) + choosen_label = paddle.index_sample(gallery_labels_transpose, + choosen_indices) + equal_flag = paddle.equal(choosen_label, query_img_id) + if keep_mask is not None: + keep_mask = paddle.index_sample( + keep_mask.astype('float32'), choosen_indices) + equal_flag = paddle.logical_and(equal_flag, + keep_mask.astype('bool')) + equal_flag = paddle.cast(equal_flag, 'float32') + + num_rel = paddle.sum(equal_flag, axis=1) + num_rel = paddle.greater_than(num_rel, paddle.to_tensor(0.)) + num_rel_index = paddle.nonzero(num_rel.astype("int")) + num_rel_index = paddle.reshape(num_rel_index, [num_rel_index.shape[0]]) + equal_flag = paddle.index_select(equal_flag, num_rel_index, axis=0) + + acc_sum = paddle.cumsum(equal_flag, axis=1) + div = paddle.arange(acc_sum.shape[1]).astype("float32") + 1 + precision = paddle.divide(acc_sum, div) + + #calc map + precision_mask = paddle.multiply(equal_flag, precision) + ap = paddle.sum(precision_mask, axis=1) / paddle.sum(equal_flag, + axis=1) + metric_dict["mAP"] = paddle.mean(ap).numpy()[0] + return metric_dict + + +class mINP(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=True) + gallery_labels_transpose = paddle.transpose(gallery_img_id, [1, 0]) + gallery_labels_transpose = paddle.broadcast_to( + gallery_labels_transpose, + shape=[ + choosen_indices.shape[0], gallery_labels_transpose.shape[1] + ]) + choosen_label = paddle.index_sample(gallery_labels_transpose, + choosen_indices) + equal_flag = paddle.equal(choosen_label, query_img_id) + if keep_mask is not None: + keep_mask = paddle.index_sample( + keep_mask.astype('float32'), choosen_indices) + equal_flag = paddle.logical_and(equal_flag, + keep_mask.astype('bool')) + equal_flag = paddle.cast(equal_flag, 'float32') + + num_rel = paddle.sum(equal_flag, axis=1) + num_rel = paddle.greater_than(num_rel, paddle.to_tensor(0.)) + num_rel_index = paddle.nonzero(num_rel.astype("int")) + num_rel_index = paddle.reshape(num_rel_index, [num_rel_index.shape[0]]) + equal_flag = paddle.index_select(equal_flag, num_rel_index, axis=0) + + #do accumulative sum + div = paddle.arange(equal_flag.shape[1]).astype("float32") + 2 + minus = 
paddle.divide(equal_flag, div) + auxilary = paddle.subtract(equal_flag, minus) + hard_index = paddle.argmax(auxilary, axis=1).astype("float32") + all_INP = paddle.divide(paddle.sum(equal_flag, axis=1), hard_index) + mINP = paddle.mean(all_INP) + metric_dict["mINP"] = mINP.numpy()[0] + return metric_dict + + +class Recallk(nn.Layer): + def __init__(self, topk=(1, 5)): + super().__init__() + assert isinstance(topk, (int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + #get cmc + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=True) + gallery_labels_transpose = paddle.transpose(gallery_img_id, [1, 0]) + gallery_labels_transpose = paddle.broadcast_to( + gallery_labels_transpose, + shape=[ + choosen_indices.shape[0], gallery_labels_transpose.shape[1] + ]) + choosen_label = paddle.index_sample(gallery_labels_transpose, + choosen_indices) + equal_flag = paddle.equal(choosen_label, query_img_id) + if keep_mask is not None: + keep_mask = paddle.index_sample( + keep_mask.astype('float32'), choosen_indices) + equal_flag = paddle.logical_and(equal_flag, + keep_mask.astype('bool')) + equal_flag = paddle.cast(equal_flag, 'float32') + real_query_num = paddle.sum(equal_flag, axis=1) + real_query_num = paddle.sum( + paddle.greater_than(real_query_num, paddle.to_tensor(0.)).astype( + "float32")) + + acc_sum = paddle.cumsum(equal_flag, axis=1) + mask = paddle.greater_than(acc_sum, + paddle.to_tensor(0.)).astype("float32") + all_cmc = (paddle.sum(mask, axis=0) / real_query_num).numpy() + + for k in self.topk: + metric_dict["recall{}".format(k)] = all_cmc[k - 1] + return metric_dict + + +class Precisionk(nn.Layer): + def __init__(self, topk=(1, 5)): + super().__init__() + assert isinstance(topk, (int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + #get cmc + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=True) + gallery_labels_transpose = paddle.transpose(gallery_img_id, [1, 0]) + gallery_labels_transpose = paddle.broadcast_to( + gallery_labels_transpose, + shape=[ + choosen_indices.shape[0], gallery_labels_transpose.shape[1] + ]) + choosen_label = paddle.index_sample(gallery_labels_transpose, + choosen_indices) + equal_flag = paddle.equal(choosen_label, query_img_id) + if keep_mask is not None: + keep_mask = paddle.index_sample( + keep_mask.astype('float32'), choosen_indices) + equal_flag = paddle.logical_and(equal_flag, + keep_mask.astype('bool')) + equal_flag = paddle.cast(equal_flag, 'float32') + + Ns = paddle.arange(gallery_img_id.shape[0]) + 1 + equal_flag_cumsum = paddle.cumsum(equal_flag, axis=1) + Precision_at_k = (paddle.mean(equal_flag_cumsum, axis=0) / Ns).numpy() + + for k in self.topk: + metric_dict["precision@{}".format(k)] = Precision_at_k[k - 1] + + return metric_dict + + +class DistillationTopkAcc(TopkAcc): + def __init__(self, model_key, feature_key=None, topk=(1, 5)): + super().__init__(topk=topk) + self.model_key = model_key + self.feature_key = feature_key + + def forward(self, x, label): + if isinstance(x, dict): + x = x[self.model_key] + if self.feature_key is not None: + x = x[self.feature_key] + return super().forward(x, label) + + +class GoogLeNetTopkAcc(TopkAcc): + def __init__(self, topk=(1, 5)): + super().__init__() + assert isinstance(topk, 
(int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + + def forward(self, x, label): + return super().forward(x[0], label) + + +class MultiLabelMetric(object): + def __init__(self): + pass + + def _multi_hot_encode(self, logits, threshold=0.5): + return binarize(logits, threshold=threshold) + + def __call__(self, output): + output = F.sigmoid(output) + preds = self._multi_hot_encode(logits=output.numpy(), threshold=0.5) + return preds + + +class HammingDistance(MultiLabelMetric): + """ + Soft, label-based metric for multi-label classification + Returns: + The smaller the return value is, the better the model is. + """ + + def __init__(self): + super().__init__() + + def __call__(self, output, target): + preds = super().__call__(output) + metric_dict = dict() + metric_dict["HammingDistance"] = paddle.to_tensor( + hamming_loss(target, preds)) + return metric_dict + + +class AccuracyScore(MultiLabelMetric): + """ + Hard metric for multi-label classification + Args: + base: ["sample", "label"], default="label" + if "sample", the score is computed per sample; + if "label", the score is computed per label. + Returns: + accuracy: + """ + + def __init__(self, base="label"): + super().__init__() + assert base in ["sample", "label" + ], 'must be one of ["sample", "label"]' + self.base = base + + def __call__(self, output, target): + preds = super().__call__(output) + metric_dict = dict() + if self.base == "sample": + accuracy = accuracy_metric(target, preds) + elif self.base == "label": + mcm = multilabel_confusion_matrix(target, preds) + tns = mcm[:, 0, 0] + fns = mcm[:, 1, 0] + tps = mcm[:, 1, 1] + fps = mcm[:, 0, 1] + accuracy = (sum(tps) + sum(tns)) / ( + sum(tps) + sum(tns) + sum(fns) + sum(fps)) + precision = sum(tps) / (sum(tps) + sum(fps)) + recall = sum(tps) / (sum(tps) + sum(fns)) + F1 = 2 * (precision * recall) / (precision + recall) + metric_dict["AccuracyScore"] = paddle.to_tensor(accuracy) + return metric_dict
diff --git a/src/PaddleClas/ppcls/optimizer/__init__.py b/src/PaddleClas/ppcls/optimizer/__init__.py new file mode 100644 index 0000000..61db39f --- /dev/null +++ b/src/PaddleClas/ppcls/optimizer/__init__.py @@ -0,0 +1,72 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import paddle + +from ppcls.utils import logger + +from . import optimizer + +__all__ = ['build_optimizer'] + + +def build_lr_scheduler(lr_config, epochs, step_each_epoch): + from .
import learning_rate + lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch}) + if 'name' in lr_config: + lr_name = lr_config.pop('name') + lr = getattr(learning_rate, lr_name)(**lr_config) + if isinstance(lr, paddle.optimizer.lr.LRScheduler): + return lr + else: + return lr() + else: + lr = lr_config['learning_rate'] + return lr + + +# model_list is None in static graph +def build_optimizer(config, epochs, step_each_epoch, model_list=None): + config = copy.deepcopy(config) + # step1 build lr + lr = build_lr_scheduler(config.pop('lr'), epochs, step_each_epoch) + logger.debug("build lr ({}) success..".format(lr)) + # step2 build regularization + if 'regularizer' in config and config['regularizer'] is not None: + if 'weight_decay' in config: + logger.warning( + "ConfigError: Only one of regularizer and weight_decay can be set in Optimizer Config. \"weight_decay\" has been ignored." + ) + reg_config = config.pop('regularizer') + reg_name = reg_config.pop('name') + 'Decay' + reg = getattr(paddle.regularizer, reg_name)(**reg_config) + config["weight_decay"] = reg + logger.debug("build regularizer ({}) success..".format(reg)) + # step3 build optimizer + optim_name = config.pop('name') + if 'clip_norm' in config: + clip_norm = config.pop('clip_norm') + grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm) + else: + grad_clip = None + optim = getattr(optimizer, optim_name)(learning_rate=lr, + grad_clip=grad_clip, + **config)(model_list=model_list) + logger.debug("build optimizer ({}) success..".format(optim)) + return optim, lr diff --git a/src/PaddleClas/ppcls/optimizer/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/optimizer/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..282c4ce Binary files /dev/null and b/src/PaddleClas/ppcls/optimizer/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/optimizer/__pycache__/learning_rate.cpython-39.pyc b/src/PaddleClas/ppcls/optimizer/__pycache__/learning_rate.cpython-39.pyc new file mode 100644 index 0000000..d3b715c Binary files /dev/null and b/src/PaddleClas/ppcls/optimizer/__pycache__/learning_rate.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/optimizer/__pycache__/optimizer.cpython-39.pyc b/src/PaddleClas/ppcls/optimizer/__pycache__/optimizer.cpython-39.pyc new file mode 100644 index 0000000..7d244ac Binary files /dev/null and b/src/PaddleClas/ppcls/optimizer/__pycache__/optimizer.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/optimizer/learning_rate.py b/src/PaddleClas/ppcls/optimizer/learning_rate.py new file mode 100644 index 0000000..b59387d --- /dev/null +++ b/src/PaddleClas/ppcls/optimizer/learning_rate.py @@ -0,0 +1,326 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
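build_optimizer, then, expects a config dict whose 'lr' sub-dict names one of the schedulers defined in learning_rate.py below, plus an optimizer name resolved via getattr against the optimizer module. A minimal sketch, assuming a Momentum wrapper exists in ppcls/optimizer/optimizer.py (that file is not part of this excerpt, so the exact optimizer keys are assumptions inferred from the code above):

import paddle
from ppcls.optimizer import build_optimizer

config = {
    "name": "Momentum",  # assumed wrapper class in ppcls.optimizer.optimizer
    "momentum": 0.9,
    "lr": {"name": "Cosine", "learning_rate": 0.1},
    "regularizer": {"name": "L2", "coeff": 1e-4},  # becomes paddle.regularizer.L2Decay
}
model = paddle.nn.Linear(10, 10)
optim, lr = build_optimizer(
    config, epochs=10, step_each_epoch=100, model_list=[model])

Note that 'regularizer' and 'weight_decay' are mutually exclusive here: when both are present, the code above ignores 'weight_decay' and logs a warning.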
+ +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +from paddle.optimizer import lr +from paddle.optimizer.lr import LRScheduler + +from ppcls.utils import logger + + +class Linear(object): + """ + Linear learning rate decay + Args: + lr (float): The initial learning rate. It is a python float number. + epochs(int): The decay step size. It determines the decay cycle. + end_lr(float, optional): The minimum final learning rate. Default: 0.0001. + power(float, optional): Power of polynomial. Default: 1.0. + warmup_epoch(int): The epoch numbers for LinearWarmup. Default: 0. + warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + learning_rate, + epochs, + step_each_epoch, + end_lr=0.0, + power=1.0, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + **kwargs): + super().__init__() + if warmup_epoch >= epochs: + msg = f"When using warm up, the value of \"Global.epochs\" must be greater than value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}." + logger.warning(msg) + warmup_epoch = epochs + self.learning_rate = learning_rate + self.steps = (epochs - warmup_epoch) * step_each_epoch + self.end_lr = end_lr + self.power = power + self.last_epoch = last_epoch + self.warmup_steps = round(warmup_epoch * step_each_epoch) + self.warmup_start_lr = warmup_start_lr + + def __call__(self): + learning_rate = lr.PolynomialDecay( + learning_rate=self.learning_rate, + decay_steps=self.steps, + end_lr=self.end_lr, + power=self.power, + last_epoch=self. + last_epoch) if self.steps > 0 else self.learning_rate + if self.warmup_steps > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_steps, + start_lr=self.warmup_start_lr, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate + + +class Cosine(object): + """ + Cosine learning rate decay + lr = 0.05 * (math.cos(epoch * (math.pi / epochs)) + 1) + Args: + lr(float): initial learning rate + step_each_epoch(int): steps each epoch + epochs(int): total training epochs + eta_min(float): Minimum learning rate. Default: 0.0. + warmup_epoch(int): The epoch numbers for LinearWarmup. Default: 0. + warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + learning_rate, + step_each_epoch, + epochs, + eta_min=0.0, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + **kwargs): + super().__init__() + if warmup_epoch >= epochs: + msg = f"When using warm up, the value of \"Global.epochs\" must be greater than value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}." + logger.warning(msg) + warmup_epoch = epochs + self.learning_rate = learning_rate + self.T_max = (epochs - warmup_epoch) * step_each_epoch + self.eta_min = eta_min + self.last_epoch = last_epoch + self.warmup_steps = round(warmup_epoch * step_each_epoch) + self.warmup_start_lr = warmup_start_lr + + def __call__(self): + learning_rate = lr.CosineAnnealingDecay( + learning_rate=self.learning_rate, + T_max=self.T_max, + eta_min=self.eta_min, + last_epoch=self. 
+ last_epoch) if self.T_max > 0 else self.learning_rate + if self.warmup_steps > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_steps, + start_lr=self.warmup_start_lr, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate + + +class Step(object): + """ + Piecewise learning rate decay + Args: + step_each_epoch(int): steps each epoch + learning_rate (float): The initial learning rate. It is a python float number. + step_size (int): the interval to update. + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + It should be less than 1.0. Default: 0.1. + warmup_epoch(int): The epoch numbers for LinearWarmup. Default: 0. + warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + learning_rate, + step_size, + step_each_epoch, + epochs, + gamma, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + **kwargs): + super().__init__() + if warmup_epoch >= epochs: + msg = f"When using warm up, the value of \"Global.epochs\" must be greater than value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}." + logger.warning(msg) + warmup_epoch = epochs + self.step_size = step_each_epoch * step_size + self.learning_rate = learning_rate + self.gamma = gamma + self.last_epoch = last_epoch + self.warmup_steps = round(warmup_epoch * step_each_epoch) + self.warmup_start_lr = warmup_start_lr + + def __call__(self): + learning_rate = lr.StepDecay( + learning_rate=self.learning_rate, + step_size=self.step_size, + gamma=self.gamma, + last_epoch=self.last_epoch) + if self.warmup_steps > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_steps, + start_lr=self.warmup_start_lr, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate + + +class Piecewise(object): + """ + Piecewise learning rate decay + Args: + boundaries(list): A list of steps numbers. The type of element in the list is python int. + values(list): A list of learning rate values that will be picked during different epoch boundaries. + The type of element in the list is python float. + warmup_epoch(int): The epoch numbers for LinearWarmup. Default: 0. + warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + step_each_epoch, + decay_epochs, + values, + epochs, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + **kwargs): + super().__init__() + if warmup_epoch >= epochs: + msg = f"When using warm up, the value of \"Global.epochs\" must be greater than value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}." 
+ logger.warning(msg) + warmup_epoch = epochs + self.boundaries = [step_each_epoch * e for e in decay_epochs] + self.values = values + self.last_epoch = last_epoch + self.warmup_steps = round(warmup_epoch * step_each_epoch) + self.warmup_start_lr = warmup_start_lr + + def __call__(self): + learning_rate = lr.PiecewiseDecay( + boundaries=self.boundaries, + values=self.values, + last_epoch=self.last_epoch) + if self.warmup_steps > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_steps, + start_lr=self.warmup_start_lr, + end_lr=self.values[0], + last_epoch=self.last_epoch) + return learning_rate + + +class MultiStepDecay(LRScheduler): + """ + Update the learning rate by ``gamma`` once ``epoch`` reaches one of the milestones. + The algorithm can be described as the code below. + .. code-block:: text + learning_rate = 0.5 + milestones = [30, 50] + gamma = 0.1 + if epoch < 30: + learning_rate = 0.5 + elif epoch < 50: + learning_rate = 0.05 + else: + learning_rate = 0.005 + Args: + learning_rate (float): The initial learning rate. It is a python float number. + milestones (tuple|list): List or tuple of each boundaries. Must be increasing. + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + It should be less than 1.0. Default: 0.1. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``MultiStepDecay`` instance to schedule learning rate. + Examples: + + .. code-block:: python + import paddle + import numpy as np + # train on default dynamic graph mode + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) + for epoch in range(20): + for batch_id in range(5): + x = paddle.uniform([10, 10]) + out = linear(x) + loss = paddle.mean(out) + loss.backward() + sgd.step() + sgd.clear_gradients() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch + # train on static graph mode + paddle.enable_static() + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler) + sgd.minimize(loss) + exe = paddle.static.Executor() + exe.run(start_prog) + for epoch in range(20): + for batch_id in range(5): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=loss.name) + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch + """ + + def __init__(self, + learning_rate, + milestones, + epochs, + step_each_epoch, + gamma=0.1, + last_epoch=-1, + verbose=False): + if not isinstance(milestones, (tuple, list)): + raise TypeError( + "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received 
%s."
+                % type(milestones))
+        if not all([
+            milestones[i] < milestones[i + 1]
+            for i in range(len(milestones) - 1)
+        ]):
+            raise ValueError('The elements of milestones must be increasing')
+        if gamma >= 1.0:
+            raise ValueError('gamma should be < 1.0.')
+        self.milestones = [x * step_each_epoch for x in milestones]
+        self.gamma = gamma
+        super().__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        for i in range(len(self.milestones)):
+            if self.last_epoch < self.milestones[i]:
+                return self.base_lr * (self.gamma**i)
+        return self.base_lr * (self.gamma**len(self.milestones))
diff --git a/src/PaddleClas/ppcls/optimizer/optimizer.py b/src/PaddleClas/ppcls/optimizer/optimizer.py
new file mode 100644
index 0000000..4422ea7
--- /dev/null
+++ b/src/PaddleClas/ppcls/optimizer/optimizer.py
@@ -0,0 +1,217 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from paddle import optimizer as optim
+import paddle
+
+from ppcls.utils import logger
+
+
+class Momentum(object):
+    """
+    Simple Momentum optimizer with velocity state.
+    Args:
+        learning_rate (float|Variable) - The learning rate used to update parameters.
+            Can be a float value or a Variable with one float value as data element.
+        momentum (float) - Momentum factor.
+        weight_decay (WeightDecayRegularizer, optional) - The strategy of weight-decay regularization.
+ """ + + def __init__(self, + learning_rate, + momentum, + weight_decay=None, + grad_clip=None, + multi_precision=True): + super().__init__() + self.learning_rate = learning_rate + self.momentum = momentum + self.weight_decay = weight_decay + self.grad_clip = grad_clip + self.multi_precision = multi_precision + + def __call__(self, model_list): + # model_list is None in static graph + parameters = sum([m.parameters() for m in model_list], + []) if model_list else None + opt = optim.Momentum( + learning_rate=self.learning_rate, + momentum=self.momentum, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + multi_precision=self.multi_precision, + parameters=parameters) + if hasattr(opt, '_use_multi_tensor'): + opt = optim.Momentum( + learning_rate=self.learning_rate, + momentum=self.momentum, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + multi_precision=self.multi_precision, + parameters=parameters, + use_multi_tensor=True) + return opt + + +class Adam(object): + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + parameter_list=None, + weight_decay=None, + grad_clip=None, + name=None, + lazy_mode=False, + multi_precision=False): + self.learning_rate = learning_rate + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.parameter_list = parameter_list + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.grad_clip = grad_clip + self.name = name + self.lazy_mode = lazy_mode + self.multi_precision = multi_precision + + def __call__(self, model_list): + # model_list is None in static graph + parameters = sum([m.parameters() for m in model_list], + []) if model_list else None + opt = optim.Adam( + learning_rate=self.learning_rate, + beta1=self.beta1, + beta2=self.beta2, + epsilon=self.epsilon, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + name=self.name, + lazy_mode=self.lazy_mode, + multi_precision=self.multi_precision, + parameters=parameters) + return opt + + +class RMSProp(object): + """ + Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning rate method. + Args: + learning_rate (float|Variable) - The learning rate used to update parameters. + Can be a float value or a Variable with one float value as data element. + momentum (float) - Momentum factor. + rho (float) - rho value in equation. + epsilon (float) - avoid division by zero, default is 1e-6. + regularization (WeightDecayRegularizer, optional) - The strategy of regularization. 
+ """ + + def __init__(self, + learning_rate, + momentum=0.0, + rho=0.95, + epsilon=1e-6, + weight_decay=None, + grad_clip=None, + multi_precision=False): + super().__init__() + self.learning_rate = learning_rate + self.momentum = momentum + self.rho = rho + self.epsilon = epsilon + self.weight_decay = weight_decay + self.grad_clip = grad_clip + + def __call__(self, model_list): + # model_list is None in static graph + parameters = sum([m.parameters() for m in model_list], + []) if model_list else None + opt = optim.RMSProp( + learning_rate=self.learning_rate, + momentum=self.momentum, + rho=self.rho, + epsilon=self.epsilon, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + parameters=parameters) + return opt + + +class AdamW(object): + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + weight_decay=None, + multi_precision=False, + grad_clip=None, + no_weight_decay_name=None, + one_dim_param_no_weight_decay=False, + **args): + super().__init__() + self.learning_rate = learning_rate + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.grad_clip = grad_clip + self.weight_decay = weight_decay + self.multi_precision = multi_precision + self.no_weight_decay_name_list = no_weight_decay_name.split( + ) if no_weight_decay_name else [] + self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay + + def __call__(self, model_list): + # model_list is None in static graph + parameters = sum([m.parameters() for m in model_list], + []) if model_list else None + + # TODO(gaotingquan): model_list is None when in static graph, "no_weight_decay" not work. + if model_list is None: + if self.one_dim_param_no_weight_decay or len( + self.no_weight_decay_name_list) != 0: + msg = "\"AdamW\" does not support setting \"no_weight_decay\" in static graph. Please use dynamic graph." + logger.error(Exception(msg)) + raise Exception(msg) + + self.no_weight_decay_param_name_list = [ + p.name for model in model_list for n, p in model.named_parameters() + if any(nd in n for nd in self.no_weight_decay_name_list) + ] if model_list else [] + + if self.one_dim_param_no_weight_decay: + self.no_weight_decay_param_name_list += [ + p.name for model in model_list + for n, p in model.named_parameters() if len(p.shape) == 1 + ] if model_list else [] + + opt = optim.AdamW( + learning_rate=self.learning_rate, + beta1=self.beta1, + beta2=self.beta2, + epsilon=self.epsilon, + parameters=parameters, + weight_decay=self.weight_decay, + multi_precision=self.multi_precision, + grad_clip=self.grad_clip, + apply_decay_param_fun=self._apply_decay_param_fun) + return opt + + def _apply_decay_param_fun(self, name): + return name not in self.no_weight_decay_param_name_list diff --git a/src/PaddleClas/ppcls/static/program.py b/src/PaddleClas/ppcls/static/program.py new file mode 100644 index 0000000..b3534a2 --- /dev/null +++ b/src/PaddleClas/ppcls/static/program.py @@ -0,0 +1,449 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time +import numpy as np + +from collections import OrderedDict + +import paddle +import paddle.nn.functional as F + +from paddle.distributed import fleet +from paddle.distributed.fleet import DistributedStrategy + +# from ppcls.optimizer import OptimizerBuilder +# from ppcls.optimizer.learning_rate import LearningRateBuilder + +from ppcls.arch import build_model +from ppcls.loss import build_loss +from ppcls.metric import build_metrics +from ppcls.optimizer import build_optimizer +from ppcls.optimizer import build_lr_scheduler + +from ppcls.utils.misc import AverageMeter +from ppcls.utils import logger, profiler + + +def create_feeds(image_shape, use_mix=False, class_num=None, dtype="float32"): + """ + Create feeds as model input + + Args: + image_shape(list[int]): model input shape, such as [3, 224, 224] + use_mix(bool): whether to use mix(include mixup, cutmix, fmix) + class_num(int): the class number of network, required if use_mix + + Returns: + feeds(dict): dict of model input variables + """ + feeds = OrderedDict() + feeds['data'] = paddle.static.data( + name="data", shape=[None] + image_shape, dtype=dtype) + + if use_mix: + if class_num is None: + msg = "When use MixUp, CutMix and so on, you must set class_num." + logger.error(msg) + raise Exception(msg) + feeds['target'] = paddle.static.data( + name="target", shape=[None, class_num], dtype="float32") + else: + feeds['label'] = paddle.static.data( + name="label", shape=[None, 1], dtype="int64") + + return feeds + + +def create_fetchs(out, + feeds, + architecture, + topk=5, + epsilon=None, + class_num=None, + use_mix=False, + config=None, + mode="Train"): + """ + Create fetchs as model outputs(included loss and measures), + will call create_loss and create_metric(if use_mix). + Args: + out(variable): model output variable + feeds(dict): dict of model input variables. + If use mix_up, it will not include label. + architecture(dict): architecture information, + name(such as ResNet50) is needed + topk(int): usually top5 + epsilon(float): parameter for label smoothing, 0.0 <= epsilon <= 1.0 + class_num(int): the class number of network, required if use_mix + use_mix(bool): whether to use mix(include mixup, cutmix, fmix) + config(dict): model config + + Returns: + fetchs(dict): dict of model outputs(included loss and measures) + """ + fetchs = OrderedDict() + # build loss + if use_mix: + if class_num is None: + msg = "When use MixUp, CutMix and so on, you must set class_num." 
+ logger.error(msg) + raise Exception(msg) + target = paddle.reshape(feeds['target'], [-1, class_num]) + else: + target = paddle.reshape(feeds['label'], [-1, 1]) + + loss_func = build_loss(config["Loss"][mode]) + loss_dict = loss_func(out, target) + + loss_out = loss_dict["loss"] + fetchs['loss'] = (loss_out, AverageMeter('loss', '7.4f', need_avg=True)) + + # build metric + if not use_mix: + metric_func = build_metrics(config["Metric"][mode]) + + metric_dict = metric_func(out, target) + + for key in metric_dict: + if mode != "Train" and paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce( + metric_dict[key], op=paddle.distributed.ReduceOp.SUM) + metric_dict[key] = metric_dict[ + key] / paddle.distributed.get_world_size() + + fetchs[key] = (metric_dict[key], AverageMeter( + key, '7.4f', need_avg=True)) + + return fetchs + + +def create_optimizer(config, step_each_epoch): + # create learning_rate instance + optimizer, lr_sch = build_optimizer( + config["Optimizer"], config["Global"]["epochs"], step_each_epoch) + return optimizer, lr_sch + + +def create_strategy(config): + """ + Create build strategy and exec strategy. + + Args: + config(dict): config + + Returns: + build_strategy: build strategy + exec_strategy: exec strategy + """ + build_strategy = paddle.static.BuildStrategy() + exec_strategy = paddle.static.ExecutionStrategy() + + exec_strategy.num_threads = 1 + exec_strategy.num_iteration_per_drop_scope = ( + 10000 + if 'AMP' in config and config.AMP.get("level", "O1") == "O2" else 10) + + fuse_op = True if 'AMP' in config else False + + fuse_bn_act_ops = config.get('fuse_bn_act_ops', fuse_op) + fuse_elewise_add_act_ops = config.get('fuse_elewise_add_act_ops', fuse_op) + fuse_bn_add_act_ops = config.get('fuse_bn_add_act_ops', fuse_op) + enable_addto = config.get('enable_addto', fuse_op) + + build_strategy.fuse_bn_act_ops = fuse_bn_act_ops + build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops + build_strategy.fuse_bn_add_act_ops = fuse_bn_add_act_ops + build_strategy.enable_addto = enable_addto + + return build_strategy, exec_strategy + + +def dist_optimizer(config, optimizer): + """ + Create a distributed optimizer based on a normal optimizer + + Args: + config(dict): + optimizer(): a normal optimizer + + Returns: + optimizer: a distributed optimizer + """ + build_strategy, exec_strategy = create_strategy(config) + + dist_strategy = DistributedStrategy() + dist_strategy.execution_strategy = exec_strategy + dist_strategy.build_strategy = build_strategy + + dist_strategy.nccl_comm_num = 1 + dist_strategy.fuse_all_reduce_ops = True + dist_strategy.fuse_grad_size_in_MB = 16 + optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy) + + return optimizer + + +def mixed_precision_optimizer(config, optimizer): + if 'AMP' in config: + amp_cfg = config.AMP if config.AMP else dict() + scale_loss = amp_cfg.get('scale_loss', 1.0) + use_dynamic_loss_scaling = amp_cfg.get('use_dynamic_loss_scaling', + False) + use_pure_fp16 = amp_cfg.get("level", "O1") == "O2" + optimizer = paddle.static.amp.decorate( + optimizer, + init_loss_scaling=scale_loss, + use_dynamic_loss_scaling=use_dynamic_loss_scaling, + use_pure_fp16=use_pure_fp16, + use_fp16_guard=True) + + return optimizer + + +def build(config, + main_prog, + startup_prog, + class_num=None, + step_each_epoch=100, + is_train=True, + is_distributed=True): + """ + Build a program using a model and an optimizer + 1. create feeds + 2. create a dataloader + 3. create a model + 4. create fetchs + 5. 
create an optimizer + + Args: + config(dict): config + main_prog(): main program + startup_prog(): startup program + class_num(int): the class number of network, required if use_mix + is_train(bool): train or eval + is_distributed(bool): whether to use distributed training method + + Returns: + dataloader(): a bridge between the model and the data + fetchs(dict): dict of model outputs(included loss and measures) + """ + with paddle.static.program_guard(main_prog, startup_prog): + with paddle.utils.unique_name.guard(): + mode = "Train" if is_train else "Eval" + use_mix = "batch_transform_ops" in config["DataLoader"][mode][ + "dataset"] + feeds = create_feeds( + config["Global"]["image_shape"], + use_mix, + class_num=class_num, + dtype="float32") + + # build model + # data_format should be assigned in arch-dict + input_image_channel = config["Global"]["image_shape"][ + 0] # default as [3, 224, 224] + model = build_model(config) + out = model(feeds["data"]) + # end of build model + + fetchs = create_fetchs( + out, + feeds, + config["Arch"], + epsilon=config.get('ls_epsilon'), + class_num=class_num, + use_mix=use_mix, + config=config, + mode=mode) + lr_scheduler = None + optimizer = None + if is_train: + optimizer, lr_scheduler = build_optimizer( + config["Optimizer"], config["Global"]["epochs"], + step_each_epoch) + optimizer = mixed_precision_optimizer(config, optimizer) + if is_distributed: + optimizer = dist_optimizer(config, optimizer) + optimizer.minimize(fetchs['loss'][0]) + return fetchs, lr_scheduler, feeds, optimizer + + +def compile(config, program, loss_name=None, share_prog=None): + """ + Compile the program + + Args: + config(dict): config + program(): the program which is wrapped by + loss_name(str): loss name + share_prog(): the shared program, used for evaluation during training + + Returns: + compiled_program(): a compiled program + """ + build_strategy, exec_strategy = create_strategy(config) + + compiled_program = paddle.static.CompiledProgram( + program).with_data_parallel( + share_vars_from=share_prog, + loss_name=loss_name, + build_strategy=build_strategy, + exec_strategy=exec_strategy) + + return compiled_program + + +total_step = 0 + + +def run(dataloader, + exe, + program, + feeds, + fetchs, + epoch=0, + mode='train', + config=None, + vdl_writer=None, + lr_scheduler=None, + profiler_options=None): + """ + Feed data to the model and fetch the measures and loss + + Args: + dataloader(paddle io dataloader): + exe(): + program(): + fetchs(dict): dict of measures and the loss + epoch(int): epoch of training or evaluation + model(str): log only + + Returns: + """ + fetch_list = [f[0] for f in fetchs.values()] + metric_dict = OrderedDict([("lr", AverageMeter( + 'lr', 'f', postfix=",", need_avg=False))]) + + for k in fetchs: + metric_dict[k] = fetchs[k][1] + + metric_dict["batch_time"] = AverageMeter( + 'batch_cost', '.5f', postfix=" s,") + metric_dict["reader_time"] = AverageMeter( + 'reader_cost', '.5f', postfix=" s,") + + for m in metric_dict.values(): + m.reset() + + use_dali = config["Global"].get('use_dali', False) + tic = time.time() + + if not use_dali: + dataloader = dataloader() + + idx = 0 + batch_size = None + while True: + # The DALI maybe raise RuntimeError for some particular images, such as ImageNet1k/n04418357_26036.JPEG + try: + batch = next(dataloader) + except StopIteration: + break + except RuntimeError: + logger.warning( + "Except RuntimeError when reading data from dataloader, try to read once again..." 
+ ) + continue + idx += 1 + # ignore the warmup iters + if idx == 5: + metric_dict["batch_time"].reset() + metric_dict["reader_time"].reset() + + metric_dict['reader_time'].update(time.time() - tic) + + profiler.add_profiler_step(profiler_options) + + if use_dali: + batch_size = batch[0]["data"].shape()[0] + feed_dict = batch[0] + else: + batch_size = batch[0].shape()[0] + feed_dict = { + key.name: batch[idx] + for idx, key in enumerate(feeds.values()) + } + + metrics = exe.run(program=program, + feed=feed_dict, + fetch_list=fetch_list) + + for name, m in zip(fetchs.keys(), metrics): + metric_dict[name].update(np.mean(m), batch_size) + metric_dict["batch_time"].update(time.time() - tic) + if mode == "train": + metric_dict['lr'].update(lr_scheduler.get_lr()) + + fetchs_str = ' '.join([ + str(metric_dict[key].mean) + if "time" in key else str(metric_dict[key].value) + for key in metric_dict + ]) + ips_info = " ips: {:.5f} images/sec.".format( + batch_size / metric_dict["batch_time"].avg) + fetchs_str += ips_info + + if lr_scheduler is not None: + lr_scheduler.step() + + if vdl_writer: + global total_step + logger.scaler('loss', metrics[0][0], total_step, vdl_writer) + total_step += 1 + if mode == 'eval': + if idx % config.get('print_interval', 10) == 0: + logger.info("{:s} step:{:<4d} {:s}".format(mode, idx, + fetchs_str)) + else: + epoch_str = "epoch:{:<3d}".format(epoch) + step_str = "{:s} step:{:<4d}".format(mode, idx) + + if idx % config.get('print_interval', 10) == 0: + logger.info("{:s} {:s} {:s}".format(epoch_str, step_str, + fetchs_str)) + + tic = time.time() + + end_str = ' '.join([str(m.mean) for m in metric_dict.values()] + + [metric_dict["batch_time"].total]) + ips_info = "ips: {:.5f} images/sec.".format(batch_size / + metric_dict["batch_time"].avg) + if mode == 'eval': + logger.info("END {:s} {:s} {:s}".format(mode, end_str, ips_info)) + else: + end_epoch_str = "END epoch:{:<3d}".format(epoch) + logger.info("{:s} {:s} {:s} {:s}".format(end_epoch_str, mode, end_str, + ips_info)) + if use_dali: + dataloader.reset() + + # return top1_acc in order to save the best model + if mode == 'eval': + return fetchs["top1"][1].avg diff --git a/src/PaddleClas/ppcls/static/run_dali.sh b/src/PaddleClas/ppcls/static/run_dali.sh new file mode 100644 index 0000000..5bf0ef4 --- /dev/null +++ b/src/PaddleClas/ppcls/static/run_dali.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +python3.7 -m paddle.distributed.launch \ + --gpus="0,1,2,3" \ + ppcls/static/train.py \ + -c ./ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml diff --git a/src/PaddleClas/ppcls/static/save_load.py b/src/PaddleClas/ppcls/static/save_load.py new file mode 100644 index 0000000..13badfd --- /dev/null +++ b/src/PaddleClas/ppcls/static/save_load.py @@ -0,0 +1,139 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
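Note: program.run() above assumes every fetchs entry is a (fetch_var, AverageMeter) pair and reads the meters' avg/mean/value attributes when building the log line. A rough stand-in for ppcls.utils.misc.AverageMeter (not the real implementation, which lives in ppcls/utils/misc.py), showing only the contract run() relies on:

class AverageMeterSketch:
    # Illustrative only; the actual class also carries string formatting
    # (mean/value/total) used by the log messages above.
    def __init__(self, name, fmt='f', postfix='', need_avg=True):
        self.name, self.fmt, self.postfix, self.need_avg = name, fmt, postfix, need_avg
        self.reset()

    def reset(self):
        self.val, self.sum, self.count = 0, 0, 0

    def update(self, val, n=1):
        self.val = val          # last observed value
        self.sum += val * n     # weighted by batch size n
        self.count += n

    @property
    def avg(self):              # running average, e.g. for the ips computation
        return self.sum / max(self.count, 1)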
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import errno
+import os
+import re
+import shutil
+import tempfile
+
+import paddle
+
+from ppcls.utils import logger
+
+__all__ = ['init_model', 'save_model']
+
+
+def _mkdir_if_not_exist(path):
+    """
+    mkdir if not exists; ignore the exception when multiple processes mkdir at the same time
+    """
+    if not os.path.exists(path):
+        try:
+            os.makedirs(path)
+        except OSError as e:
+            if e.errno == errno.EEXIST and os.path.isdir(path):
+                logger.warning(
+                    'another process has already created {}'.format(path))
+            else:
+                raise OSError('Failed to mkdir {}'.format(path))
+
+
+def _load_state(path):
+    if os.path.exists(path + '.pdopt'):
+        # XXX another hack to ignore the optimizer state
+        tmp = tempfile.mkdtemp()
+        dst = os.path.join(tmp, os.path.basename(os.path.normpath(path)))
+        shutil.copy(path + '.pdparams', dst + '.pdparams')
+        state = paddle.static.load_program_state(dst)
+        shutil.rmtree(tmp)
+    else:
+        state = paddle.static.load_program_state(path)
+    return state
+
+
+def load_params(exe, prog, path, ignore_params=None):
+    """
+    Load model from the given path.
+    Args:
+        exe (fluid.Executor): The fluid.Executor object.
+        prog (fluid.Program): the Program object to load weights into.
+        path (string): URL string or local model path.
+        ignore_params (list): variables to skip when finetuning.
+            It can be specified by finetune_exclude_pretrained_params
+            and the usage can refer to the document
+            docs/advanced_tutorials/TRANSFER_LEARNING.md
+    """
+    if not (os.path.isdir(path) or os.path.exists(path + '.pdparams')):
+        raise ValueError("Model pretrain path {} does not "
+                         "exist.".format(path))
+
+    logger.info("Loading parameters from {}...".format(path))
+
+    ignore_set = set()
+    state = _load_state(path)
+
+    # ignore parameters whose shape mismatches between
+    # the model and the pretrained weights.
+    all_var_shape = {}
+    for block in prog.blocks:
+        for param in block.all_parameters():
+            all_var_shape[param.name] = param.shape
+    ignore_set.update([
+        name for name, shape in all_var_shape.items()
+        if name in state and shape != state[name].shape
+    ])
+
+    if ignore_params:
+        all_var_names = [var.name for var in prog.list_vars()]
+        ignore_list = filter(
+            lambda var: any([re.match(name, var) for name in ignore_params]),
+            all_var_names)
+        ignore_set.update(list(ignore_list))
+
+    if len(ignore_set) > 0:
+        for k in ignore_set:
+            if k in state:
+                logger.warning(
+                    'variable {} is already excluded automatically'.format(k))
+                del state[k]
+
+    paddle.static.set_program_state(prog, state)
+
+
+def init_model(config, program, exe):
+    """
+    load model from checkpoint or pretrained_model
+    """
+    checkpoints = config.get('checkpoints')
+    if checkpoints:
+        paddle.static.load(program, checkpoints, exe)
+        logger.info("Finished initializing model from {}".format(checkpoints))
+        return
+
+    pretrained_model = config.get('pretrained_model')
+    if pretrained_model:
+        if not isinstance(pretrained_model, list):
+            pretrained_model = [pretrained_model]
+        for pretrain in pretrained_model:
+            load_params(exe, program, pretrain)
+        logger.info("Finished initializing model from {}".format(
+            pretrained_model))
+
+
+def save_model(program, model_path, epoch_id, prefix='ppcls'):
+    """
+    save model to the target path
+    """
+    if paddle.distributed.get_rank() != 0:
+        return
+    model_path = os.path.join(model_path, str(epoch_id))
+    _mkdir_if_not_exist(model_path)
+    model_prefix = os.path.join(model_path, prefix)
+    paddle.static.save(program, model_prefix)
+    logger.info("Saved model in {}".format(model_path))
diff --git a/src/PaddleClas/ppcls/static/train.py b/src/PaddleClas/ppcls/static/train.py
new file mode 100644
index 0000000..9c03598
--- /dev/null
+++ b/src/PaddleClas/ppcls/static/train.py
@@ -0,0 +1,209 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
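Note: init_model and save_model above implement the resume-vs-finetune split: 'checkpoints' restores a full paddle.static.save snapshot, while 'pretrained_model' goes through load_params with shape checking. A hedged usage sketch (paths are hypothetical, train_prog/exe come from the caller):

global_cfg = {
    'checkpoints': None,  # set to a prefix written by save_model to resume training
    'pretrained_model': './pretrained/ResNet50_pretrained',  # hypothetical path
}
init_model(global_cfg, train_prog, exe)         # no checkpoint, so falls back to load_params()
save_model(train_prog, './output/ResNet50', 0)  # writes ./output/ResNet50/0/ppcls.* on rank 0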
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os +import sys +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../../'))) + +import paddle +from paddle.distributed import fleet +from visualdl import LogWriter + +from ppcls.data import build_dataloader +from ppcls.utils.config import get_config, print_config +from ppcls.utils import logger +from ppcls.utils.logger import init_logger +from ppcls.static.save_load import init_model, save_model +from ppcls.static import program + + +def parse_args(): + parser = argparse.ArgumentParser("PaddleClas train script") + parser.add_argument( + '-c', + '--config', + type=str, + default='configs/ResNet/ResNet50.yaml', + help='config file path') + parser.add_argument( + '-p', + '--profiler_options', + type=str, + default=None, + help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".' + ) + parser.add_argument( + '-o', + '--override', + action='append', + default=[], + help='config options to be overridden') + args = parser.parse_args() + return args + + +def main(args): + """ + all the config of training paradigm should be in config["Global"] + """ + config = get_config(args.config, overrides=args.override, show=False) + global_config = config["Global"] + + mode = "train" + + log_file = os.path.join(global_config['output_dir'], + config["Arch"]["name"], f"{mode}.log") + init_logger(name='root', log_file=log_file) + print_config(config) + + if global_config.get("is_distributed", True): + fleet.init(is_collective=True) + # assign the device + use_gpu = global_config.get("use_gpu", True) + # amp related config + if 'AMP' in config: + AMP_RELATED_FLAGS_SETTING = { + 'FLAGS_cudnn_exhaustive_search': 1, + 'FLAGS_conv_workspace_size_limit': 1500, + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, + 'FLAGS_max_inplace_grad_add': 8, + } + os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1' + paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) + + use_xpu = global_config.get("use_xpu", False) + use_npu = global_config.get("use_npu", False) + assert ( + use_gpu and use_xpu and use_npu + ) is not True, "gpu, xpu and npu can not be true in the same time in static mode!" 
+
+    if use_gpu:
+        device = paddle.set_device('gpu')
+    elif use_xpu:
+        device = paddle.set_device('xpu')
+    elif use_npu:
+        device = paddle.set_device('npu')
+    else:
+        device = paddle.set_device('cpu')
+
+    # visualDL
+    vdl_writer = None
+    if global_config["use_visualdl"]:
+        vdl_dir = os.path.join(global_config["output_dir"], "vdl")
+        vdl_writer = LogWriter(vdl_dir)
+
+    # build dataloader
+    eval_dataloader = None
+    use_dali = global_config.get('use_dali', False)
+
+    class_num = config["Arch"].get("class_num", None)
+    config["DataLoader"].update({"class_num": class_num})
+    train_dataloader = build_dataloader(
+        config["DataLoader"], "Train", device=device, use_dali=use_dali)
+    if global_config["eval_during_train"]:
+        eval_dataloader = build_dataloader(
+            config["DataLoader"], "Eval", device=device, use_dali=use_dali)
+
+    step_each_epoch = len(train_dataloader)
+
+    # startup_prog is used to do some parameter init work,
+    # and train prog is used to hold the network
+    startup_prog = paddle.static.Program()
+    train_prog = paddle.static.Program()
+
+    best_top1_acc = 0.0  # best top1 acc record
+
+    train_fetchs, lr_scheduler, train_feeds, optimizer = program.build(
+        config,
+        train_prog,
+        startup_prog,
+        class_num,
+        step_each_epoch=step_each_epoch,
+        is_train=True,
+        is_distributed=global_config.get("is_distributed", True))
+
+    if global_config["eval_during_train"]:
+        eval_prog = paddle.static.Program()
+        eval_fetchs, _, eval_feeds, _ = program.build(
+            config,
+            eval_prog,
+            startup_prog,
+            is_train=False,
+            is_distributed=global_config.get("is_distributed", True))
+        # clone to prune some content which is irrelevant in eval_prog
+        eval_prog = eval_prog.clone(for_test=True)
+
+    # create the Executor on the specified device
+    exe = paddle.static.Executor(device)
+    # Parameter initialization
+    exe.run(startup_prog)
+    # load pretrained models or checkpoints
+    init_model(global_config, train_prog, exe)
+
+    if 'AMP' in config and config.AMP.get("level", "O1") == "O2":
+        optimizer.amp_init(
+            device,
+            scope=paddle.static.global_scope(),
+            test_program=eval_prog
+            if global_config["eval_during_train"] else None)
+
+    if not global_config.get("is_distributed", True):
+        compiled_train_prog = program.compile(
+            config, train_prog, loss_name=train_fetchs["loss"][0].name)
+    else:
+        compiled_train_prog = train_prog
+
+    if eval_dataloader is not None:
+        compiled_eval_prog = program.compile(config, eval_prog)
+
+    for epoch_id in range(global_config["epochs"]):
+        # 1. train with train dataset
+        program.run(train_dataloader, exe, compiled_train_prog, train_feeds,
+                    train_fetchs, epoch_id, 'train', config, vdl_writer,
+                    lr_scheduler, args.profiler_options)
+        # 2. evaluate with eval dataset
+        if global_config["eval_during_train"] and epoch_id % global_config[
+                "eval_interval"] == 0:
+            top1_acc = program.run(eval_dataloader, exe, compiled_eval_prog,
+                                   eval_feeds, eval_fetchs, epoch_id, "eval",
+                                   config)
+            if top1_acc > best_top1_acc:
+                best_top1_acc = top1_acc
+                message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
+                    best_top1_acc, epoch_id)
+                logger.info(message)
+                if epoch_id % global_config["save_interval"] == 0:
+
+                    model_path = os.path.join(global_config["output_dir"],
                                              config["Arch"]["name"])
+                    save_model(train_prog, model_path, "best_model")
+
+        # 3.
save the persistable model + if epoch_id % global_config["save_interval"] == 0: + model_path = os.path.join(global_config["output_dir"], + config["Arch"]["name"]) + save_model(train_prog, model_path, epoch_id) + + +if __name__ == '__main__': + paddle.enable_static() + args = parse_args() + main(args) diff --git a/src/PaddleClas/ppcls/utils/__init__.py b/src/PaddleClas/ppcls/utils/__init__.py new file mode 100644 index 0000000..632cc78 --- /dev/null +++ b/src/PaddleClas/ppcls/utils/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import logger +from . import misc +from . import model_zoo +from . import metrics + +from .save_load import init_model, save_model +from .config import get_config +from .misc import AverageMeter +from .metrics import multi_hot_encode +from .metrics import hamming_distance +from .metrics import accuracy_score +from .metrics import precision_recall_fscore +from .metrics import mean_average_precision diff --git a/src/PaddleClas/ppcls/utils/__pycache__/__init__.cpython-39.pyc b/src/PaddleClas/ppcls/utils/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..742ad34 Binary files /dev/null and b/src/PaddleClas/ppcls/utils/__pycache__/__init__.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/utils/__pycache__/check.cpython-39.pyc b/src/PaddleClas/ppcls/utils/__pycache__/check.cpython-39.pyc new file mode 100644 index 0000000..5537a77 Binary files /dev/null and b/src/PaddleClas/ppcls/utils/__pycache__/check.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/utils/__pycache__/config.cpython-39.pyc b/src/PaddleClas/ppcls/utils/__pycache__/config.cpython-39.pyc new file mode 100644 index 0000000..bb62e36 Binary files /dev/null and b/src/PaddleClas/ppcls/utils/__pycache__/config.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/utils/__pycache__/download.cpython-39.pyc b/src/PaddleClas/ppcls/utils/__pycache__/download.cpython-39.pyc new file mode 100644 index 0000000..7aafa18 Binary files /dev/null and b/src/PaddleClas/ppcls/utils/__pycache__/download.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/utils/__pycache__/logger.cpython-39.pyc b/src/PaddleClas/ppcls/utils/__pycache__/logger.cpython-39.pyc new file mode 100644 index 0000000..1a80880 Binary files /dev/null and b/src/PaddleClas/ppcls/utils/__pycache__/logger.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/utils/__pycache__/metrics.cpython-39.pyc b/src/PaddleClas/ppcls/utils/__pycache__/metrics.cpython-39.pyc new file mode 100644 index 0000000..d0d59db Binary files /dev/null and b/src/PaddleClas/ppcls/utils/__pycache__/metrics.cpython-39.pyc differ diff --git a/src/PaddleClas/ppcls/utils/__pycache__/misc.cpython-39.pyc b/src/PaddleClas/ppcls/utils/__pycache__/misc.cpython-39.pyc new file mode 100644 index 0000000..bc2f9ff Binary files /dev/null and b/src/PaddleClas/ppcls/utils/__pycache__/misc.cpython-39.pyc differ diff --git 
a/src/PaddleClas/ppcls/utils/__pycache__/model_zoo.cpython-39.pyc b/src/PaddleClas/ppcls/utils/__pycache__/model_zoo.cpython-39.pyc
new file mode 100644
index 0000000..52849be
Binary files /dev/null and b/src/PaddleClas/ppcls/utils/__pycache__/model_zoo.cpython-39.pyc differ
diff --git a/src/PaddleClas/ppcls/utils/__pycache__/profiler.cpython-39.pyc b/src/PaddleClas/ppcls/utils/__pycache__/profiler.cpython-39.pyc
new file mode 100644
index 0000000..87676dd
Binary files /dev/null and b/src/PaddleClas/ppcls/utils/__pycache__/profiler.cpython-39.pyc differ
diff --git a/src/PaddleClas/ppcls/utils/__pycache__/save_load.cpython-39.pyc b/src/PaddleClas/ppcls/utils/__pycache__/save_load.cpython-39.pyc
new file mode 100644
index 0000000..621cfc1
Binary files /dev/null and b/src/PaddleClas/ppcls/utils/__pycache__/save_load.cpython-39.pyc differ
diff --git a/src/PaddleClas/ppcls/utils/check.py b/src/PaddleClas/ppcls/utils/check.py
new file mode 100644
index 0000000..bc70308
--- /dev/null
+++ b/src/PaddleClas/ppcls/utils/check.py
@@ -0,0 +1,151 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+
+import paddle
+from paddle import is_compiled_with_cuda
+
+from ppcls.arch import get_architectures
+from ppcls.arch import similar_architectures
+from ppcls.arch import get_blacklist_model_in_static_mode
+from ppcls.utils import logger
+
+
+def check_version():
+    """
+    Log error and exit when the installed version of paddlepaddle is
+    not satisfied.
+    """
+    err = "PaddlePaddle version 1.8.0 or higher is required, " \
+          "or a suitable develop version. \n" \
+          "Please make sure the installed version matches your code."
+    try:
+        pass
+        # paddle.utils.require_version('0.0.0')
+    except Exception:
+        logger.error(err)
+        sys.exit(1)
+
+
+def check_gpu():
+    """
+    Log error and exit when using paddlepaddle cpu version.
+    """
+    err = "You are using paddlepaddle cpu version! Please try to " \
+          "install paddlepaddle-gpu to run model on GPU."
+
+    try:
+        assert is_compiled_with_cuda()
+    except AssertionError:
+        logger.error(err)
+        sys.exit(1)
+
+
+def check_architecture(architecture):
+    """
+    check architecture and recommend similar architectures
+    """
+    assert isinstance(architecture, dict), \
+        ("the type of architecture({}) should be dict".format(architecture))
+    assert "name" in architecture, \
+        ("\"name\" must be in the architecture keys; got: {}".format(
+            architecture.keys()))
+
+    similar_names = similar_architectures(architecture["name"],
+                                          get_architectures())
+    model_list = ', '.join(similar_names)
+    err = "Architecture [{}] does not exist! Maybe you want: [{}]" \
+          "".format(architecture["name"], model_list)
+    try:
+        assert architecture["name"] in similar_names
+    except AssertionError:
+        logger.error(err)
+        sys.exit(1)
+
+
+def check_model_with_running_mode(architecture):
+    """
+    check whether the model is consistent with the operating mode
+    """
+    # some models are not supported in static mode
+    blacklist = get_blacklist_model_in_static_mode()
+    if not paddle.in_dynamic_mode() and architecture["name"] in blacklist:
+        logger.error("Model: {} is not supported in static mode.".format(
+            architecture["name"]))
+        sys.exit(1)
+    return
+
+
+def check_mix(architecture, use_mix=False):
+    """
+    check mix parameter
+    """
+    err = "Cannot use mix processing in GoogLeNet, " \
+          "please set use_mix = False."
+    try:
+        if architecture["name"] == "GoogLeNet":
+            assert use_mix is not True
+    except AssertionError:
+        logger.error(err)
+        sys.exit(1)
+
+
+def check_classes_num(classes_num):
+    """
+    check classes_num
+    """
+    err = "classes_num({}) should be a positive integer " \
+          "and larger than 1".format(classes_num)
+    try:
+        assert isinstance(classes_num, int)
+        assert classes_num > 1
+    except AssertionError:
+        logger.error(err)
+        sys.exit(1)
+
+
+def check_data_dir(path):
+    """
+    check data_dir
+    """
+    err = "Data path {} does not exist, please provide a correct path" \
+          "".format(path)
+    try:
+        assert os.path.isdir(path)
+    except AssertionError:
+        logger.error(err)
+        sys.exit(1)
+
+
+def check_function_params(config, key):
+    """
+    check the specified config section
+    """
+    k_config = config.get(key)
+    assert k_config is not None, \
+        ('{} is required in config'.format(key))
+
+    assert k_config.get('function'), \
+        ('function is required in {} config'.format(key))
+    params = k_config.get('params')
+    assert params is not None, \
+        ('params is required in {} config'.format(key))
+    assert isinstance(params, dict), \
+        ('the params in {} config should be a dict'.format(key))
diff --git a/src/PaddleClas/ppcls/utils/config.py b/src/PaddleClas/ppcls/utils/config.py
new file mode 100644
index 0000000..e3277c4
--- /dev/null
+++ b/src/PaddleClas/ppcls/utils/config.py
@@ -0,0 +1,210 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
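Note: override_config, defined just below, is what backs the -o/--override command-line flag; each option is a dotted key path, and str2num coerces the string value. A small sketch with illustrative keys:

cfg = {'Global': {'epochs': 120}, 'Optimizer': {'lr': {'learning_rate': 0.1}}}
override_config(cfg, ['Global.epochs=200', 'Optimizer.lr.learning_rate=0.01'])
assert cfg['Global']['epochs'] == 200                    # '200' coerced to int
assert cfg['Optimizer']['lr']['learning_rate'] == 0.01   # '0.01' coerced to float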
+
+import os
+import copy
+import argparse
+import yaml
+from ppcls.utils import logger
+from ppcls.utils import check
+__all__ = ['get_config']
+
+
+class AttrDict(dict):
+    def __getattr__(self, key):
+        return self[key]
+
+    def __setattr__(self, key, value):
+        if key in self.__dict__:
+            self.__dict__[key] = value
+        else:
+            self[key] = value
+
+    def __deepcopy__(self, content):
+        return copy.deepcopy(dict(self))
+
+
+def create_attr_dict(yaml_config):
+    from ast import literal_eval
+    for key, value in yaml_config.items():
+        if type(value) is dict:
+            yaml_config[key] = value = AttrDict(value)
+        if isinstance(value, str):
+            try:
+                value = literal_eval(value)
+            except BaseException:
+                pass
+        if isinstance(value, AttrDict):
+            create_attr_dict(yaml_config[key])
+        else:
+            yaml_config[key] = value
+
+
+def parse_config(cfg_file):
+    """Load a config file into AttrDict"""
+    with open(cfg_file, 'r') as fopen:
+        yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.SafeLoader))
+    create_attr_dict(yaml_config)
+    return yaml_config
+
+
+def print_dict(d, delimiter=0):
+    """
+    Recursively visualize a dict,
+    indenting according to the nesting of its keys.
+    """
+    placeholder = "-" * 60
+    for k, v in sorted(d.items()):
+        if isinstance(v, dict):
+            logger.info("{}{} : ".format(delimiter * " ", k))
+            print_dict(v, delimiter + 4)
+        elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict):
+            logger.info("{}{} : ".format(delimiter * " ", k))
+            for value in v:
+                print_dict(value, delimiter + 4)
+        else:
+            logger.info("{}{} : {}".format(delimiter * " ", k, v))
+        if k.isupper():
+            logger.info(placeholder)
+
+
+def print_config(config):
+    """
+    visualize configs
+    Arguments:
+        config: configs
+    """
+    logger.advertise()
+    print_dict(config)
+
+
+def check_config(config):
+    """
+    Check config
+    """
+    check.check_version()
+    use_gpu = config.get('use_gpu', True)
+    if use_gpu:
+        check.check_gpu()
+    architecture = config.get('ARCHITECTURE')
+    #check.check_architecture(architecture)
+    use_mix = config.get('use_mix', False)
+    check.check_mix(architecture, use_mix)
+    classes_num = config.get('classes_num')
+    check.check_classes_num(classes_num)
+    mode = config.get('mode', 'train')
+    if mode.lower() == 'train':
+        check.check_function_params(config, 'LEARNING_RATE')
+        check.check_function_params(config, 'OPTIMIZER')
+
+
+def override(dl, ks, v):
+    """
+    Recursively replace a value in a nested dict or list
+    Args:
+        dl(dict or list): dict or list to be replaced
+        ks(list): list of keys
+        v(str): value to be replaced
+    """
+
+    def str2num(v):
+        try:
+            return eval(v)
+        except Exception:
+            return v
+
+    assert isinstance(dl, (list, dict)), ("{} should be a list or a dict".format(dl))
+    assert len(ks) > 0, ('length of keys should be larger than 0')
+    if isinstance(dl, list):
+        k = str2num(ks[0])
+        if len(ks) == 1:
+            assert k < len(dl), ('index({}) out of range({})'.format(k, dl))
+            dl[k] = str2num(v)
+        else:
+            override(dl[k], ks[1:], v)
+    else:
+        if len(ks) == 1:
+            # assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl))
+            if not ks[0] in dl:
+                print('A new field ({}) detected!'.format(ks[0]))
+            dl[ks[0]] = str2num(v)
+        else:
+            override(dl[ks[0]], ks[1:], v)
+
+
+def override_config(config, options=None):
+    """
+    Recursively override the config
+    Args:
+        config(dict): dict to be replaced
+        options(list): list of pairs(key0.key1.idx.key2=value)
+            such as: [
+                'topk=2',
+                'VALID.transforms.1.ResizeImage.resize_short=300'
+            ]
+    Returns:
+        config(dict): replaced config
+    """
+    if options is not None:
+        for opt in options:
+            assert isinstance(opt, str), (
+                "option({}) should be a str".format(opt))
+            assert "=" in opt, (
+                "option({}) should contain a '=' "
+                "to distinguish between key and value".format(opt))
+            pair = opt.split('=')
+            assert len(pair) == 2, ("there can be only one '=' in the option")
+            key, value = pair
+            keys = key.split('.')
+            override(config, keys, value)
+    return config
+
+
+def get_config(fname, overrides=None, show=False):
+    """
+    Read config from file
+    """
+    assert os.path.exists(fname), (
+        'config file ({}) does not exist'.format(fname))
+    config = parse_config(fname)
+    override_config(config, overrides)
+    if show:
+        print_config(config)
+    # check_config(config)
+    return config
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("generic-image-rec train script")
+    parser.add_argument(
+        '-c',
+        '--config',
+        type=str,
+        default='configs/config.yaml',
+        help='config file path')
+    parser.add_argument(
+        '-o',
+        '--override',
+        action='append',
+        default=[],
+        help='config options to be overridden')
+    parser.add_argument(
+        '-p',
+        '--profiler_options',
+        type=str,
+        default=None,
+        help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
+    )
+    args = parser.parse_args()
+    return args
diff --git a/src/PaddleClas/ppcls/utils/download.py b/src/PaddleClas/ppcls/utils/download.py
new file mode 100644
index 0000000..9c45750
--- /dev/null
+++ b/src/PaddleClas/ppcls/utils/download.py
@@ -0,0 +1,319 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import os.path as osp
+import shutil
+import requests
+import hashlib
+import tarfile
+import zipfile
+import time
+from collections import OrderedDict
+from tqdm import tqdm
+
+from ppcls.utils import logger
+
+__all__ = ['get_weights_path_from_url']
+
+WEIGHTS_HOME = osp.expanduser("~/.paddleclas/weights")
+
+DOWNLOAD_RETRY_LIMIT = 3
+
+
+def is_url(path):
+    """
+    Whether path is URL.
+    Args:
+        path (string): URL string or not.
+    """
+    return path.startswith('http://') or path.startswith('https://')
+
+
+def get_weights_path_from_url(url, md5sum=None):
+    """Get weights path from WEIGHTS_HOME; if not exists,
+    download it from url.
+
+    Args:
+        url (str): download url
+        md5sum (str): md5 sum of download package
+
+    Returns:
+        str: a local path to save downloaded weights.
+
+    Examples:
+        ..
code-block:: python + + from paddle.utils.download import get_weights_path_from_url + + resnet18_pretrained_weight_url = 'https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams' + local_weight_path = get_weights_path_from_url(resnet18_pretrained_weight_url) + + """ + path = get_path_from_url(url, WEIGHTS_HOME, md5sum) + return path + + +def _map_path(url, root_dir): + # parse path after download under root_dir + fname = osp.split(url)[-1] + fpath = fname + return osp.join(root_dir, fpath) + + +def _get_unique_endpoints(trainer_endpoints): + # Sorting is to avoid different environmental variables for each card + trainer_endpoints.sort() + ips = set() + unique_endpoints = set() + for endpoint in trainer_endpoints: + ip = endpoint.split(":")[0] + if ip in ips: + continue + ips.add(ip) + unique_endpoints.add(endpoint) + logger.info("unique_endpoints {}".format(unique_endpoints)) + return unique_endpoints + + +def get_path_from_url(url, + root_dir, + md5sum=None, + check_exist=True, + decompress=True): + """ Download from given url to root_dir. + if file or directory specified by url is exists under + root_dir, return the path directly, otherwise download + from url and decompress it, return the path. + + Args: + url (str): download url + root_dir (str): root dir for downloading, it should be + WEIGHTS_HOME or DATASET_HOME + md5sum (str): md5 sum of download package + + Returns: + str: a local path to save downloaded models & weights & datasets. + """ + + from paddle.fluid.dygraph.parallel import ParallelEnv + + assert is_url(url), "downloading from {} not a url".format(url) + # parse path after download to decompress under root_dir + fullpath = _map_path(url, root_dir) + # Mainly used to solve the problem of downloading data from different + # machines in the case of multiple machines. Different ips will download + # data, and the same ip will only download data once. + unique_endpoints = _get_unique_endpoints(ParallelEnv() + .trainer_endpoints[:]) + if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum): + logger.info("Found {}".format(fullpath)) + else: + if ParallelEnv().current_endpoint in unique_endpoints: + fullpath = _download(url, root_dir, md5sum) + else: + while not os.path.exists(fullpath): + time.sleep(1) + + if ParallelEnv().current_endpoint in unique_endpoints: + if decompress and (tarfile.is_tarfile(fullpath) or + zipfile.is_zipfile(fullpath)): + fullpath = _decompress(fullpath) + + return fullpath + + +def _download(url, path, md5sum=None): + """ + Download from url, save to path. + + url (str): download url + path (str): download to given path + """ + if not osp.exists(path): + os.makedirs(path) + + fname = osp.split(url)[-1] + fullname = osp.join(path, fname) + retry_cnt = 0 + + while not (osp.exists(fullname) and _md5check(fullname, md5sum)): + if retry_cnt < DOWNLOAD_RETRY_LIMIT: + retry_cnt += 1 + else: + raise RuntimeError("Download from {} failed. " + "Retry limit reached".format(url)) + + logger.info("Downloading {} from {}".format(fname, url)) + + try: + req = requests.get(url, stream=True) + except Exception as e: # requests.exceptions.ConnectionError + logger.info( + "Downloading {} from {} failed {} times with exception {}". 
+def _download(url, path, md5sum=None):
+    """
+    Download from url, save to path.
+
+    url (str): download url
+    path (str): download to given path
+    """
+    if not osp.exists(path):
+        os.makedirs(path)
+
+    fname = osp.split(url)[-1]
+    fullname = osp.join(path, fname)
+    retry_cnt = 0
+
+    while not (osp.exists(fullname) and _md5check(fullname, md5sum)):
+        if retry_cnt < DOWNLOAD_RETRY_LIMIT:
+            retry_cnt += 1
+        else:
+            raise RuntimeError("Download from {} failed. "
+                               "Retry limit reached".format(url))
+
+        logger.info("Downloading {} from {}".format(fname, url))
+
+        try:
+            req = requests.get(url, stream=True)
+        except Exception as e:  # requests.exceptions.ConnectionError
+            logger.info("Downloading {} from {} failed {} times with "
+                        "exception {}".format(fname, url, retry_cnt + 1,
+                                              str(e)))
+            time.sleep(1)
+            continue
+
+        if req.status_code != 200:
+            raise RuntimeError("Downloading from {} failed with code "
+                               "{}!".format(url, req.status_code))
+
+        # To protect against interrupted downloads, write to tmp_fullname
+        # first and move it to fullname once the download has finished.
+        tmp_fullname = fullname + "_tmp"
+        total_size = req.headers.get('content-length')
+        with open(tmp_fullname, 'wb') as f:
+            if total_size:
+                with tqdm(total=(int(total_size) + 1023) // 1024) as pbar:
+                    for chunk in req.iter_content(chunk_size=1024):
+                        f.write(chunk)
+                        pbar.update(1)
+            else:
+                for chunk in req.iter_content(chunk_size=1024):
+                    if chunk:
+                        f.write(chunk)
+        shutil.move(tmp_fullname, fullname)
+
+    return fullname
+
+
+def _md5check(fullname, md5sum=None):
+    if md5sum is None:
+        return True
+
+    logger.info("File {} md5 checking...".format(fullname))
+    md5 = hashlib.md5()
+    with open(fullname, 'rb') as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            md5.update(chunk)
+    calc_md5sum = md5.hexdigest()
+
+    if calc_md5sum != md5sum:
+        logger.info("File {} md5 check failed, {}(calc) != "
+                    "{}(base)".format(fullname, calc_md5sum, md5sum))
+        return False
+    return True
+
+
+def _decompress(fname):
+    """
+    Decompress zip and tar files.
+    """
+    logger.info("Decompressing {}...".format(fname))
+
+    # To protect against interrupted decompression, decompress into an
+    # fpath_tmp directory first; on success, move the files to fpath,
+    # delete fpath_tmp and remove the downloaded archive.
+
+    if tarfile.is_tarfile(fname):
+        uncompressed_path = _uncompress_file_tar(fname)
+    elif zipfile.is_zipfile(fname):
+        uncompressed_path = _uncompress_file_zip(fname)
+    else:
+        raise TypeError("Unsupported compress file type {}".format(fname))
+
+    return uncompressed_path
+
+
+def _uncompress_file_zip(filepath):
+    files = zipfile.ZipFile(filepath, 'r')
+    file_list = files.namelist()
+
+    file_dir = os.path.dirname(filepath)
+
+    if _is_a_single_file(file_list):
+        rootpath = file_list[0]
+        uncompressed_path = os.path.join(file_dir, rootpath)
+
+        for item in file_list:
+            files.extract(item, file_dir)
+
+    elif _is_a_single_dir(file_list):
+        rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1]
+        uncompressed_path = os.path.join(file_dir, rootpath)
+
+        for item in file_list:
+            files.extract(item, file_dir)
+
+    else:
+        rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
+        uncompressed_path = os.path.join(file_dir, rootpath)
+        if not os.path.exists(uncompressed_path):
+            os.makedirs(uncompressed_path)
+        for item in file_list:
+            files.extract(item, os.path.join(file_dir, rootpath))
+
+    files.close()
+
+    return uncompressed_path
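The extraction branches above (and in the tar variant that follows) hinge on the archive layout. A tiny sketch of the two helper predicates, defined a few functions below, applied to made-up member lists:

    # Made-up archive member lists illustrating the three handled layouts.
    print(_is_a_single_file(["weights.pdparams"]))          # True: one member, no path separator
    print(_is_a_single_dir(["data/a.jpg", "data/b.jpg"]))   # True: one shared root directory
    print(_is_a_single_dir(["a.jpg", "b/c.jpg"]))           # False: mixed roots, so everything is
                                                            # extracted into a folder named after the archive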
+def _uncompress_file_tar(filepath, mode="r:*"):
+    files = tarfile.open(filepath, mode)
+    file_list = files.getnames()
+
+    file_dir = os.path.dirname(filepath)
+
+    if _is_a_single_file(file_list):
+        rootpath = file_list[0]
+        uncompressed_path = os.path.join(file_dir, rootpath)
+        for item in file_list:
+            files.extract(item, file_dir)
+    elif _is_a_single_dir(file_list):
+        rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1]
+        uncompressed_path = os.path.join(file_dir, rootpath)
+        for item in file_list:
+            files.extract(item, file_dir)
+    else:
+        rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
+        uncompressed_path = os.path.join(file_dir, rootpath)
+        if not os.path.exists(uncompressed_path):
+            os.makedirs(uncompressed_path)
+
+        for item in file_list:
+            files.extract(item, os.path.join(file_dir, rootpath))
+
+    files.close()
+
+    return uncompressed_path
+
+
+def _is_a_single_file(file_list):
+    # str.find returns -1 when the separator is absent, so the original
+    # "< -1" comparison could never be true; "< 0" is the intended check.
+    if len(file_list) == 1 and file_list[0].find(os.sep) < 0:
+        return True
+    return False
+
+
+def _is_a_single_dir(file_list):
+    new_file_list = []
+    for file_path in file_list:
+        if '/' in file_path:
+            file_path = file_path.replace('/', os.sep)
+        elif '\\' in file_path:
+            file_path = file_path.replace('\\', os.sep)
+        new_file_list.append(file_path)
+
+    file_name = new_file_list[0].split(os.sep)[0]
+    for i in range(1, len(new_file_list)):
+        if file_name != new_file_list[i].split(os.sep)[0]:
+            return False
+    return True
diff --git a/src/PaddleClas/ppcls/utils/ema.py b/src/PaddleClas/ppcls/utils/ema.py
new file mode 100644
index 0000000..b54cdb1
--- /dev/null
+++ b/src/PaddleClas/ppcls/utils/ema.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import numpy as np
+
+
+class ExponentialMovingAverage():
+    """
+    Exponential Moving Average
+    Code was heavily based on https://github.com/Wanger-SJTU/SegToolbox.Pytorch/blob/master/lib/utils/ema.py
+    """
+
+    def __init__(self, model, decay, thres_steps=True):
+        self._model = model
+        self._decay = decay
+        self._thres_steps = thres_steps
+        self._shadow = {}
+        self._backup = {}
+
+    def register(self):
+        # snapshot every trainable parameter as the initial shadow value
+        self._update_step = 0
+        for name, param in self._model.named_parameters():
+            if param.stop_gradient is False:
+                self._shadow[name] = param.numpy().copy()
+
+    def update(self):
+        # optional warm-up: decay = min(decay, (1 + step) / (10 + step))
+        decay = min(self._decay, (1 + self._update_step) / (
+            10 + self._update_step)) if self._thres_steps else self._decay
+        for name, param in self._model.named_parameters():
+            if param.stop_gradient is False:
+                assert name in self._shadow
+                new_val = np.array(param.numpy().copy())
+                old_val = np.array(self._shadow[name])
+                # standard EMA update: shadow = decay * shadow + (1 - decay) * param
+                new_average = decay * old_val + (1 - decay) * new_val
+                self._shadow[name] = new_average
+        self._update_step += 1
+        return decay
+
+    def apply(self):
+        # back up the raw weights, then load the averaged (shadow) weights
+        for name, param in self._model.named_parameters():
+            if param.stop_gradient is False:
+                assert name in self._shadow
+                self._backup[name] = np.array(param.numpy().copy())
+                param.set_value(np.array(self._shadow[name]))
+
+    def restore(self):
+        # put the raw weights back after evaluation
+        for name, param in self._model.named_parameters():
+            if param.stop_gradient is False:
+                assert name in self._backup
+                param.set_value(self._backup[name])
+        self._backup = {}
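A minimal usage sketch for the EMA helper above. The model, data loader, train step, and decay value are placeholders, not names from this repository:

    model = SomeModel()                         # placeholder: any paddle.nn.Layer
    ema = ExponentialMovingAverage(model, decay=0.9999)
    ema.register()                              # snapshot the initial parameters

    for batch in loader:                        # placeholder data loader
        loss = train_step(model, batch)         # placeholder optimizer step
        ema.update()                            # fold the new weights into the shadow copy

    ema.apply()                                 # swap in the averaged weights...
    evaluate(model)                             # ...evaluate with them...
    ema.restore()                               # ...then swap the raw weights back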
diff --git a/src/PaddleClas/ppcls/utils/feature_maps_visualization/fm_vis.py b/src/PaddleClas/ppcls/utils/feature_maps_visualization/fm_vis.py
new file mode 100644
index 0000000..a5368b1
--- /dev/null
+++ b/src/PaddleClas/ppcls/utils/feature_maps_visualization/fm_vis.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+# put this folder and the repo root on sys.path before importing the
+# local modules (utils, resnet) below
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(__dir__)
+sys.path.append(os.path.abspath(os.path.join(__dir__, '../../..')))
+
+import argparse
+import cv2
+import numpy as np
+
+import paddle
+from paddle.distributed import ParallelEnv
+
+import utils
+from resnet import ResNet50
+from ppcls.utils.save_load import load_dygraph_pretrain
+
+
+def parse_args():
+    def str2bool(v):
+        return v.lower() in ("true", "t", "1")
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--image_file", required=True, type=str)
+    parser.add_argument("-c", "--channel_num", type=int)
+    parser.add_argument("-p", "--pretrained_model", type=str)
+    parser.add_argument("--show", type=str2bool, default=False)
+    parser.add_argument("--interpolation", type=int, default=1)
+    parser.add_argument("--save_path", type=str, default=None)
+    parser.add_argument("--use_gpu", type=str2bool, default=True)
+
+    return parser.parse_args()
+
+
+def create_operators(interpolation=1):
+    size = 224
+    img_mean = [0.485, 0.456, 0.406]
+    img_std = [0.229, 0.224, 0.225]
+    img_scale = 1.0 / 255.0
+
+    resize_op = utils.ResizeImage(
+        resize_short=256, interpolation=interpolation)
+    crop_op = utils.CropImage(size=(size, size))
+    normalize_op = utils.NormalizeImage(
+        scale=img_scale, mean=img_mean, std=img_std)
+    totensor_op = utils.ToTensor()
+
+    return [resize_op, crop_op, normalize_op, totensor_op]
+
+
+def preprocess(data, ops):
+    for op in ops:
+        data = op(data)
+    return data
+
+
+def main():
+    args = parse_args()
+    operators = create_operators(args.interpolation)
+    # assign the place
+    place = 'gpu:{}'.format(ParallelEnv().dev_id) if args.use_gpu else 'cpu'
+    place = paddle.set_device(place)
+
+    net = ResNet50()
+    load_dygraph_pretrain(net, args.pretrained_model)
+
+    img = cv2.imread(args.image_file, cv2.IMREAD_COLOR)
+    data = preprocess(img, operators)
+    data = np.expand_dims(data, axis=0)
+    data = paddle.to_tensor(data)
+    net.eval()
+    _, fm = net(data)
+    assert args.channel_num >= 0 and args.channel_num <= fm.shape[
+        1], "channel_num is out of range: should be in {} but got {}".format(
+            [0, fm.shape[1]], args.channel_num)
+
+    fm = (np.squeeze(fm[0][args.channel_num].numpy()) * 255).astype(np.uint8)
+    fm = cv2.resize(fm, (img.shape[1], img.shape[0]))
+    if args.save_path is not None:
+        print("the feature map is saved at: {}".format(args.save_path))
+        cv2.imwrite(args.save_path, fm)
+
+
+if __name__ == "__main__":
+    main()
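To make the data flow concrete, a standalone sketch of what preprocess() does to one image. The image path is a placeholder; the ops come from the sibling utils.py shown later in this diff:

    import cv2
    import utils  # the feature_maps_visualization/utils.py module below

    img = cv2.imread("demo.jpg", cv2.IMREAD_COLOR)   # placeholder path; shape (H, W, 3), BGR
    img = utils.ResizeImage(resize_short=256)(img)   # shorter side scaled to 256
    img = utils.CropImage(size=(224, 224))(img)      # center crop -> (224, 224, 3)
    img = utils.NormalizeImage(scale=1.0 / 255.0,
                               mean=[0.485, 0.456, 0.406],
                               std=[0.229, 0.224, 0.225])(img)
    img = utils.ToTensor()(img)                      # HWC -> CHW, shape (3, 224, 224)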
diff --git a/src/PaddleClas/ppcls/utils/feature_maps_visualization/resnet.py b/src/PaddleClas/ppcls/utils/feature_maps_visualization/resnet.py
new file mode 100644
index 0000000..b758814
--- /dev/null
+++ b/src/PaddleClas/ppcls/utils/feature_maps_visualization/resnet.py
@@ -0,0 +1,535 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import, division, print_function
+
+import numpy as np
+import paddle
+from paddle import ParamAttr
+import paddle.nn as nn
+from paddle.nn import Conv2D, BatchNorm, Linear
+from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D
+from paddle.nn.initializer import Uniform
+import math
+
+from ppcls.arch.backbone.base.theseus_layer import TheseusLayer
+from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url
+
+MODEL_URLS = {
+    "ResNet18":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_pretrained.pdparams",
+    "ResNet18_vd":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_vd_pretrained.pdparams",
+    "ResNet34":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_pretrained.pdparams",
+    "ResNet34_vd":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_vd_pretrained.pdparams",
+    "ResNet50":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_pretrained.pdparams",
+    "ResNet50_vd":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_vd_pretrained.pdparams",
+    "ResNet101":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_pretrained.pdparams",
+    "ResNet101_vd":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_vd_pretrained.pdparams",
+    "ResNet152":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_pretrained.pdparams",
+    "ResNet152_vd":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_vd_pretrained.pdparams",
+    "ResNet200_vd":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet200_vd_pretrained.pdparams",
+}
+
+__all__ = list(MODEL_URLS.keys())
+'''
+ResNet config: dict.
+    key: depth of ResNet.
+    values: config dict of the specific model.
+        keys:
+            block_type: which residual block to use; "BasicBlock" and
+                "BottleneckBlock" are the options.
+            block_depth: the number of blocks in each of the four stages.
+            num_channels: the number of channels entering the next stage.
+''' +NET_CONFIG = { + "18": { + "block_type": "BasicBlock", + "block_depth": [2, 2, 2, 2], + "num_channels": [64, 64, 128, 256] + }, + "34": { + "block_type": "BasicBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 64, 128, 256] + }, + "50": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 256, 512, 1024] + }, + "101": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 23, 3], + "num_channels": [64, 256, 512, 1024] + }, + "152": { + "block_type": "BottleneckBlock", + "block_depth": [3, 8, 36, 3], + "num_channels": [64, 256, 512, 1024] + }, + "200": { + "block_type": "BottleneckBlock", + "block_depth": [3, 12, 48, 3], + "num_channels": [64, 256, 512, 1024] + }, +} + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + self.is_vd_mode = is_vd_mode + self.act = act + self.avg_pool = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=False, + data_format=data_format) + self.bn = BatchNorm( + num_filters, + param_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult), + data_layout=data_format) + self.relu = nn.ReLU() + + def forward(self, x): + if self.is_vd_mode: + x = self.avg_pool(x) + x = self.conv(x) + x = self.bn(x) + if self.act: + x = self.relu(x) + return x + + +class BottleneckBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None, + lr_mult=lr_mult, + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride if if_first else 1, + is_vd_mode=False if if_first else True, + lr_mult=lr_mult, + data_format=data_format) + self.relu = nn.ReLU() + self.shortcut = shortcut + + def forward(self, x): + identity = x + x = self.conv0(x) + x = self.conv1(x) + x = self.conv2(x) + + if self.shortcut: + short = identity + else: + short = self.short(identity) + x = paddle.add(x=x, y=short) + x = self.relu(x) + return x + + +class BasicBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + + self.stride = stride + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + act=None, + lr_mult=lr_mult, + data_format=data_format) + if not shortcut: + self.short = ConvBNLayer( + 
num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=stride if if_first else 1, + is_vd_mode=False if if_first else True, + lr_mult=lr_mult, + data_format=data_format) + self.shortcut = shortcut + self.relu = nn.ReLU() + + def forward(self, x): + identity = x + x = self.conv0(x) + x = self.conv1(x) + if self.shortcut: + short = identity + else: + short = self.short(identity) + x = paddle.add(x=x, y=short) + x = self.relu(x) + return x + + +class ResNet(TheseusLayer): + """ + ResNet + Args: + config: dict. config of ResNet. + version: str="vb". Different version of ResNet, version vd can perform better. + class_num: int=1000. The number of classes. + lr_mult_list: list. Control the learning rate of different stages. + Returns: + model: nn.Layer. Specific ResNet model depends on args. + """ + + def __init__(self, + config, + version="vb", + class_num=1000, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + data_format="NCHW", + input_image_channel=3, + return_patterns=None): + super().__init__() + + self.cfg = config + self.lr_mult_list = lr_mult_list + self.is_vd_mode = version == "vd" + self.class_num = class_num + self.num_filters = [64, 128, 256, 512] + self.block_depth = self.cfg["block_depth"] + self.block_type = self.cfg["block_type"] + self.num_channels = self.cfg["num_channels"] + self.channels_mult = 1 if self.num_channels[-1] == 256 else 4 + + assert isinstance(self.lr_mult_list, ( + list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + assert len(self.lr_mult_list + ) == 5, "lr_mult_list length should be 5 but got {}".format( + len(self.lr_mult_list)) + + self.stem_cfg = { + #num_channels, num_filters, filter_size, stride + "vb": [[input_image_channel, 64, 7, 2]], + "vd": + [[input_image_channel, 32, 3, 2], [32, 32, 3, 1], [32, 64, 3, 1]] + } + + self.stem = nn.Sequential(* [ + ConvBNLayer( + num_channels=in_c, + num_filters=out_c, + filter_size=k, + stride=s, + act="relu", + lr_mult=self.lr_mult_list[0], + data_format=data_format) + for in_c, out_c, k, s in self.stem_cfg[version] + ]) + + self.max_pool = MaxPool2D( + kernel_size=3, stride=2, padding=1, data_format=data_format) + block_list = [] + for block_idx in range(len(self.block_depth)): + shortcut = False + for i in range(self.block_depth[block_idx]): + block_list.append(globals()[self.block_type]( + num_channels=self.num_channels[block_idx] if i == 0 else + self.num_filters[block_idx] * self.channels_mult, + num_filters=self.num_filters[block_idx], + stride=2 if i == 0 and block_idx != 0 else 1, + shortcut=shortcut, + if_first=block_idx == i == 0 if version == "vd" else True, + lr_mult=self.lr_mult_list[block_idx + 1], + data_format=data_format)) + shortcut = True + self.blocks = nn.Sequential(*block_list) + + self.avg_pool = AdaptiveAvgPool2D(1, data_format=data_format) + self.flatten = nn.Flatten() + self.avg_pool_channels = self.num_channels[-1] * 2 + stdv = 1.0 / math.sqrt(self.avg_pool_channels * 1.0) + self.fc = Linear( + self.avg_pool_channels, + self.class_num, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + + self.data_format = data_format + if return_patterns is not None: + self.update_res(return_patterns) + self.register_forward_post_hook(self._return_dict_hook) + + def forward(self, x): + with paddle.static.amp.fp16_guard(): + if self.data_format == "NHWC": + x = paddle.transpose(x, [0, 2, 3, 1]) + x.stop_gradient = True + x = self.stem(x) + fm = x + x = self.max_pool(x) + x = self.blocks(x) + x = self.avg_pool(x) + x = 
self.flatten(x) + x = self.fc(x) + return x, fm + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNet18(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet18 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18` model depends on args. + """ + model = ResNet(config=NET_CONFIG["18"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18"], use_ssld) + return model + + +def ResNet18_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet18_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["18"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18_vd"], use_ssld) + return model + + +def ResNet34(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34` model depends on args. + """ + model = ResNet(config=NET_CONFIG["34"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34"], use_ssld) + return model + + +def ResNet34_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["34"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34_vd"], use_ssld) + return model + + +def ResNet50(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet50` model depends on args. + """ + model = ResNet(config=NET_CONFIG["50"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50"], use_ssld) + return model + + +def ResNet50_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. 
+ Returns: + model: nn.Layer. Specific `ResNet50_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["50"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50_vd"], use_ssld) + return model + + +def ResNet101(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101` model depends on args. + """ + model = ResNet(config=NET_CONFIG["101"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101"], use_ssld) + return model + + +def ResNet101_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["101"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101_vd"], use_ssld) + return model + + +def ResNet152(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152` model depends on args. + """ + model = ResNet(config=NET_CONFIG["152"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152"], use_ssld) + return model + + +def ResNet152_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["152"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152_vd"], use_ssld) + return model + + +def ResNet200_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet200_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet200_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["200"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet200_vd"], use_ssld) + return model diff --git a/src/PaddleClas/ppcls/utils/feature_maps_visualization/utils.py b/src/PaddleClas/ppcls/utils/feature_maps_visualization/utils.py new file mode 100644 index 0000000..7c70149 --- /dev/null +++ b/src/PaddleClas/ppcls/utils/feature_maps_visualization/utils.py @@ -0,0 +1,85 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cv2 +import numpy as np + + +class DecodeImage(object): + def __init__(self, to_rgb=True): + self.to_rgb = to_rgb + + def __call__(self, img): + data = np.frombuffer(img, dtype='uint8') + img = cv2.imdecode(data, 1) + if self.to_rgb: + assert img.shape[2] == 3, 'invalid shape of image[%s]' % ( + img.shape) + img = img[:, :, ::-1] + + return img + + +class ResizeImage(object): + def __init__(self, resize_short=None, interpolation=1): + self.resize_short = resize_short + self.interpolation = interpolation + + def __call__(self, img): + img_h, img_w = img.shape[:2] + percent = float(self.resize_short) / min(img_w, img_h) + w = int(round(img_w * percent)) + h = int(round(img_h * percent)) + return cv2.resize(img, (w, h), interpolation=self.interpolation) + + +class CropImage(object): + def __init__(self, size): + if type(size) is int: + self.size = (size, size) + else: + self.size = size + + def __call__(self, img): + w, h = self.size + img_h, img_w = img.shape[:2] + w_start = (img_w - w) // 2 + h_start = (img_h - h) // 2 + + w_end = w_start + w + h_end = h_start + h + return img[h_start:h_end, w_start:w_end, :] + + +class NormalizeImage(object): + def __init__(self, scale=None, mean=None, std=None): + self.scale = np.float32(scale if scale is not None else 1.0 / 255.0) + mean = mean if mean is not None else [0.485, 0.456, 0.406] + std = std if std is not None else [0.229, 0.224, 0.225] + + shape = (1, 1, 3) + self.mean = np.array(mean).reshape(shape).astype('float32') + self.std = np.array(std).reshape(shape).astype('float32') + + def __call__(self, img): + return (img.astype('float32') * self.scale - self.mean) / self.std + + +class ToTensor(object): + def __init__(self): + pass + + def __call__(self, img): + img = img.transpose((2, 0, 1)) + return img diff --git a/src/PaddleClas/ppcls/utils/gallery2fc.py b/src/PaddleClas/ppcls/utils/gallery2fc.py new file mode 100644 index 0000000..67b0852 --- /dev/null +++ b/src/PaddleClas/ppcls/utils/gallery2fc.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
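Before the implementation that follows, the core trick of gallery2fc.py in a few lines: with L2-normalized query features and L2-normalized gallery embeddings stored as the weight of a bias-free FC layer, the FC output is exactly the cosine similarity to every gallery image, so retrieval reduces to an argmax over logits. A toy numpy sketch (random data, not from the repo):

    import numpy as np

    gallery = np.random.randn(5, 8).astype("float32")          # 5 gallery images, 8-dim embeddings
    gallery /= np.linalg.norm(gallery, axis=1, keepdims=True)  # normalize rows

    query = np.random.randn(8).astype("float32")
    query /= np.linalg.norm(query)

    scores = gallery @ query       # == FC(query) with weight = gallery, no bias
    print(int(scores.argmax()))    # index of the nearest gallery image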
+
+import os
+import paddle
+import cv2
+
+from ppcls.arch import build_model
+from ppcls.utils.config import parse_config, parse_args
+from ppcls.utils.save_load import load_dygraph_pretrain
+from ppcls.utils.logger import init_logger
+from ppcls.data import create_operators
+from ppcls.arch.slim import quantize_model
+
+
+class GalleryLayer(paddle.nn.Layer):
+    def __init__(self, configs):
+        super().__init__()
+        self.configs = configs
+        embedding_size = self.configs["Arch"]["Head"]["embedding_size"]
+        self.batch_size = self.configs["IndexProcess"]["batch_size"]
+        self.image_shape = self.configs["Global"]["image_shape"].copy()
+        self.image_shape.insert(0, self.batch_size)
+
+        image_root = self.configs["IndexProcess"]["image_root"]
+        data_file = self.configs["IndexProcess"]["data_file"]
+        delimiter = self.configs["IndexProcess"]["delimiter"]
+        self.gallery_images = []
+        gallery_docs = []
+        gallery_labels = []
+
+        with open(data_file, 'r', encoding='utf-8') as f:
+            lines = f.readlines()
+            for ori_line in lines:
+                line = ori_line.strip().split(delimiter)
+                text_num = len(line)
+                assert text_num >= 2, f"line({ori_line}) must be split into at least 2 parts, but got {text_num}"
+                image_file = os.path.join(image_root, line[0])
+
+                self.gallery_images.append(image_file)
+                gallery_docs.append(ori_line.strip())
+                gallery_labels.append(line[1].strip())
+        self.gallery_layer = paddle.nn.Linear(
+            embedding_size, len(self.gallery_images), bias_attr=False)
+        self.gallery_layer.skip_quant = True
+        output_label_str = ""
+        for i, label_i in enumerate(gallery_labels):
+            output_label_str += "{} {}\n".format(i, label_i)
+        output_path = configs["Global"]["save_inference_dir"] + "_label.txt"
+        with open(output_path, "w") as f:
+            f.write(output_label_str)
+
+    def forward(self, x, label=None):
+        # cosine similarity of the normalized feature against every
+        # gallery embedding stored as the FC weight
+        x = paddle.nn.functional.normalize(x)
+        x = self.gallery_layer(x)
+        return x
+
+    def build_gallery_layer(self, feature_extractor):
+        transform_configs = self.configs["IndexProcess"]["transform_ops"]
+        preprocess_ops = create_operators(transform_configs)
+        embedding_size = self.configs["Arch"]["Head"]["embedding_size"]
+        batch_index = 0
+        input_tensor = paddle.zeros(self.image_shape)
+        gallery_feature = paddle.zeros(
+            (len(self.gallery_images), embedding_size))
+        for i, image_path in enumerate(self.gallery_images):
+            image = cv2.imread(image_path)[:, :, ::-1]
+            for op in preprocess_ops:
+                image = op(image)
+            input_tensor[batch_index] = image
+            batch_index += 1
+            if batch_index == self.batch_size or i == len(self.gallery_images) - 1:
+                batch_feature = feature_extractor(input_tensor)["features"]
+                for j in range(batch_index):
+                    feature = batch_feature[j]
+                    norm_feature = paddle.nn.functional.normalize(
+                        feature, axis=0)
+                    gallery_feature[i - batch_index + j + 1] = norm_feature
+                # reset the buffer index; without this, the input_tensor
+                # write above overflows on the second batch
+                batch_index = 0
+        self.gallery_layer.set_state_dict({"_layer.weight": gallery_feature.T})
+
+
+def export_fuse_model(configs):
+    slim_config = configs["Slim"].copy()
+    configs["Slim"] = None
+    fuse_model = build_model(configs)
+    fuse_model.head = GalleryLayer(configs)
+    configs["Slim"] = slim_config
+    quantize_model(configs, fuse_model)
+    load_dygraph_pretrain(fuse_model, configs["Global"]["pretrained_model"])
+    fuse_model.eval()
+    fuse_model.head.build_gallery_layer(fuse_model)
+    save_path = configs["Global"]["save_inference_dir"]
+    fuse_model.quanter.save_quantized_model(
+        fuse_model,
+        save_path,
+        input_spec=[
+            paddle.static.InputSpec(
+                shape=[None] + configs["Global"]["image_shape"],
+                dtype='float32')
+        ])
+
+
+def main():
+    args = parse_args()
+    configs = 
parse_config(args.config) + init_logger(name='gallery2fc') + export_fuse_model(configs) + + +if __name__ == '__main__': + main() diff --git a/src/PaddleClas/ppcls/utils/imagenet1k_label_list.txt b/src/PaddleClas/ppcls/utils/imagenet1k_label_list.txt new file mode 100644 index 0000000..376e180 --- /dev/null +++ b/src/PaddleClas/ppcls/utils/imagenet1k_label_list.txt @@ -0,0 +1,1000 @@ +0 tench, Tinca tinca +1 goldfish, Carassius auratus +2 great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias +3 tiger shark, Galeocerdo cuvieri +4 hammerhead, hammerhead shark +5 electric ray, crampfish, numbfish, torpedo +6 stingray +7 cock +8 hen +9 ostrich, Struthio camelus +10 brambling, Fringilla montifringilla +11 goldfinch, Carduelis carduelis +12 house finch, linnet, Carpodacus mexicanus +13 junco, snowbird +14 indigo bunting, indigo finch, indigo bird, Passerina cyanea +15 robin, American robin, Turdus migratorius +16 bulbul +17 jay +18 magpie +19 chickadee +20 water ouzel, dipper +21 kite +22 bald eagle, American eagle, Haliaeetus leucocephalus +23 vulture +24 great grey owl, great gray owl, Strix nebulosa +25 European fire salamander, Salamandra salamandra +26 common newt, Triturus vulgaris +27 eft +28 spotted salamander, Ambystoma maculatum +29 axolotl, mud puppy, Ambystoma mexicanum +30 bullfrog, Rana catesbeiana +31 tree frog, tree-frog +32 tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui +33 loggerhead, loggerhead turtle, Caretta caretta +34 leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea +35 mud turtle +36 terrapin +37 box turtle, box tortoise +38 banded gecko +39 common iguana, iguana, Iguana iguana +40 American chameleon, anole, Anolis carolinensis +41 whiptail, whiptail lizard +42 agama +43 frilled lizard, Chlamydosaurus kingi +44 alligator lizard +45 Gila monster, Heloderma suspectum +46 green lizard, Lacerta viridis +47 African chameleon, Chamaeleo chamaeleon +48 Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis +49 African crocodile, Nile crocodile, Crocodylus niloticus +50 American alligator, Alligator mississipiensis +51 triceratops +52 thunder snake, worm snake, Carphophis amoenus +53 ringneck snake, ring-necked snake, ring snake +54 hognose snake, puff adder, sand viper +55 green snake, grass snake +56 king snake, kingsnake +57 garter snake, grass snake +58 water snake +59 vine snake +60 night snake, Hypsiglena torquata +61 boa constrictor, Constrictor constrictor +62 rock python, rock snake, Python sebae +63 Indian cobra, Naja naja +64 green mamba +65 sea snake +66 horned viper, cerastes, sand viper, horned asp, Cerastes cornutus +67 diamondback, diamondback rattlesnake, Crotalus adamanteus +68 sidewinder, horned rattlesnake, Crotalus cerastes +69 trilobite +70 harvestman, daddy longlegs, Phalangium opilio +71 scorpion +72 black and gold garden spider, Argiope aurantia +73 barn spider, Araneus cavaticus +74 garden spider, Aranea diademata +75 black widow, Latrodectus mactans +76 tarantula +77 wolf spider, hunting spider +78 tick +79 centipede +80 black grouse +81 ptarmigan +82 ruffed grouse, partridge, Bonasa umbellus +83 prairie chicken, prairie grouse, prairie fowl +84 peacock +85 quail +86 partridge +87 African grey, African gray, Psittacus erithacus +88 macaw +89 sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita +90 lorikeet +91 coucal +92 bee eater +93 hornbill +94 hummingbird +95 jacamar +96 toucan +97 drake +98 red-breasted merganser, Mergus serrator +99 goose +100 black 
swan, Cygnus atratus +101 tusker +102 echidna, spiny anteater, anteater +103 platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus +104 wallaby, brush kangaroo +105 koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus +106 wombat +107 jellyfish +108 sea anemone, anemone +109 brain coral +110 flatworm, platyhelminth +111 nematode, nematode worm, roundworm +112 conch +113 snail +114 slug +115 sea slug, nudibranch +116 chiton, coat-of-mail shell, sea cradle, polyplacophore +117 chambered nautilus, pearly nautilus, nautilus +118 Dungeness crab, Cancer magister +119 rock crab, Cancer irroratus +120 fiddler crab +121 king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica +122 American lobster, Northern lobster, Maine lobster, Homarus americanus +123 spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish +124 crayfish, crawfish, crawdad, crawdaddy +125 hermit crab +126 isopod +127 white stork, Ciconia ciconia +128 black stork, Ciconia nigra +129 spoonbill +130 flamingo +131 little blue heron, Egretta caerulea +132 American egret, great white heron, Egretta albus +133 bittern +134 crane +135 limpkin, Aramus pictus +136 European gallinule, Porphyrio porphyrio +137 American coot, marsh hen, mud hen, water hen, Fulica americana +138 bustard +139 ruddy turnstone, Arenaria interpres +140 red-backed sandpiper, dunlin, Erolia alpina +141 redshank, Tringa totanus +142 dowitcher +143 oystercatcher, oyster catcher +144 pelican +145 king penguin, Aptenodytes patagonica +146 albatross, mollymawk +147 grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus +148 killer whale, killer, orca, grampus, sea wolf, Orcinus orca +149 dugong, Dugong dugon +150 sea lion +151 Chihuahua +152 Japanese spaniel +153 Maltese dog, Maltese terrier, Maltese +154 Pekinese, Pekingese, Peke +155 Shih-Tzu +156 Blenheim spaniel +157 papillon +158 toy terrier +159 Rhodesian ridgeback +160 Afghan hound, Afghan +161 basset, basset hound +162 beagle +163 bloodhound, sleuthhound +164 bluetick +165 black-and-tan coonhound +166 Walker hound, Walker foxhound +167 English foxhound +168 redbone +169 borzoi, Russian wolfhound +170 Irish wolfhound +171 Italian greyhound +172 whippet +173 Ibizan hound, Ibizan Podenco +174 Norwegian elkhound, elkhound +175 otterhound, otter hound +176 Saluki, gazelle hound +177 Scottish deerhound, deerhound +178 Weimaraner +179 Staffordshire bullterrier, Staffordshire bull terrier +180 American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier +181 Bedlington terrier +182 Border terrier +183 Kerry blue terrier +184 Irish terrier +185 Norfolk terrier +186 Norwich terrier +187 Yorkshire terrier +188 wire-haired fox terrier +189 Lakeland terrier +190 Sealyham terrier, Sealyham +191 Airedale, Airedale terrier +192 cairn, cairn terrier +193 Australian terrier +194 Dandie Dinmont, Dandie Dinmont terrier +195 Boston bull, Boston terrier +196 miniature schnauzer +197 giant schnauzer +198 standard schnauzer +199 Scotch terrier, Scottish terrier, Scottie +200 Tibetan terrier, chrysanthemum dog +201 silky terrier, Sydney silky +202 soft-coated wheaten terrier +203 West Highland white terrier +204 Lhasa, Lhasa apso +205 flat-coated retriever +206 curly-coated retriever +207 golden retriever +208 Labrador retriever +209 Chesapeake Bay retriever +210 German short-haired pointer +211 vizsla, Hungarian pointer +212 English setter +213 Irish setter, red setter +214 
Gordon setter +215 Brittany spaniel +216 clumber, clumber spaniel +217 English springer, English springer spaniel +218 Welsh springer spaniel +219 cocker spaniel, English cocker spaniel, cocker +220 Sussex spaniel +221 Irish water spaniel +222 kuvasz +223 schipperke +224 groenendael +225 malinois +226 briard +227 kelpie +228 komondor +229 Old English sheepdog, bobtail +230 Shetland sheepdog, Shetland sheep dog, Shetland +231 collie +232 Border collie +233 Bouvier des Flandres, Bouviers des Flandres +234 Rottweiler +235 German shepherd, German shepherd dog, German police dog, alsatian +236 Doberman, Doberman pinscher +237 miniature pinscher +238 Greater Swiss Mountain dog +239 Bernese mountain dog +240 Appenzeller +241 EntleBucher +242 boxer +243 bull mastiff +244 Tibetan mastiff +245 French bulldog +246 Great Dane +247 Saint Bernard, St Bernard +248 Eskimo dog, husky +249 malamute, malemute, Alaskan malamute +250 Siberian husky +251 dalmatian, coach dog, carriage dog +252 affenpinscher, monkey pinscher, monkey dog +253 basenji +254 pug, pug-dog +255 Leonberg +256 Newfoundland, Newfoundland dog +257 Great Pyrenees +258 Samoyed, Samoyede +259 Pomeranian +260 chow, chow chow +261 keeshond +262 Brabancon griffon +263 Pembroke, Pembroke Welsh corgi +264 Cardigan, Cardigan Welsh corgi +265 toy poodle +266 miniature poodle +267 standard poodle +268 Mexican hairless +269 timber wolf, grey wolf, gray wolf, Canis lupus +270 white wolf, Arctic wolf, Canis lupus tundrarum +271 red wolf, maned wolf, Canis rufus, Canis niger +272 coyote, prairie wolf, brush wolf, Canis latrans +273 dingo, warrigal, warragal, Canis dingo +274 dhole, Cuon alpinus +275 African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus +276 hyena, hyaena +277 red fox, Vulpes vulpes +278 kit fox, Vulpes macrotis +279 Arctic fox, white fox, Alopex lagopus +280 grey fox, gray fox, Urocyon cinereoargenteus +281 tabby, tabby cat +282 tiger cat +283 Persian cat +284 Siamese cat, Siamese +285 Egyptian cat +286 cougar, puma, catamount, mountain lion, painter, panther, Felis concolor +287 lynx, catamount +288 leopard, Panthera pardus +289 snow leopard, ounce, Panthera uncia +290 jaguar, panther, Panthera onca, Felis onca +291 lion, king of beasts, Panthera leo +292 tiger, Panthera tigris +293 cheetah, chetah, Acinonyx jubatus +294 brown bear, bruin, Ursus arctos +295 American black bear, black bear, Ursus americanus, Euarctos americanus +296 ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus +297 sloth bear, Melursus ursinus, Ursus ursinus +298 mongoose +299 meerkat, mierkat +300 tiger beetle +301 ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle +302 ground beetle, carabid beetle +303 long-horned beetle, longicorn, longicorn beetle +304 leaf beetle, chrysomelid +305 dung beetle +306 rhinoceros beetle +307 weevil +308 fly +309 bee +310 ant, emmet, pismire +311 grasshopper, hopper +312 cricket +313 walking stick, walkingstick, stick insect +314 cockroach, roach +315 mantis, mantid +316 cicada, cicala +317 leafhopper +318 lacewing, lacewing fly +319 dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk +320 damselfly +321 admiral +322 ringlet, ringlet butterfly +323 monarch, monarch butterfly, milkweed butterfly, Danaus plexippus +324 cabbage butterfly +325 sulphur butterfly, sulfur butterfly +326 lycaenid, lycaenid butterfly +327 starfish, sea star +328 sea urchin +329 sea cucumber, holothurian +330 wood rabbit, cottontail, cottontail rabbit +331 hare 
+332 Angora, Angora rabbit +333 hamster +334 porcupine, hedgehog +335 fox squirrel, eastern fox squirrel, Sciurus niger +336 marmot +337 beaver +338 guinea pig, Cavia cobaya +339 sorrel +340 zebra +341 hog, pig, grunter, squealer, Sus scrofa +342 wild boar, boar, Sus scrofa +343 warthog +344 hippopotamus, hippo, river horse, Hippopotamus amphibius +345 ox +346 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis +347 bison +348 ram, tup +349 bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis +350 ibex, Capra ibex +351 hartebeest +352 impala, Aepyceros melampus +353 gazelle +354 Arabian camel, dromedary, Camelus dromedarius +355 llama +356 weasel +357 mink +358 polecat, fitch, foulmart, foumart, Mustela putorius +359 black-footed ferret, ferret, Mustela nigripes +360 otter +361 skunk, polecat, wood pussy +362 badger +363 armadillo +364 three-toed sloth, ai, Bradypus tridactylus +365 orangutan, orang, orangutang, Pongo pygmaeus +366 gorilla, Gorilla gorilla +367 chimpanzee, chimp, Pan troglodytes +368 gibbon, Hylobates lar +369 siamang, Hylobates syndactylus, Symphalangus syndactylus +370 guenon, guenon monkey +371 patas, hussar monkey, Erythrocebus patas +372 baboon +373 macaque +374 langur +375 colobus, colobus monkey +376 proboscis monkey, Nasalis larvatus +377 marmoset +378 capuchin, ringtail, Cebus capucinus +379 howler monkey, howler +380 titi, titi monkey +381 spider monkey, Ateles geoffroyi +382 squirrel monkey, Saimiri sciureus +383 Madagascar cat, ring-tailed lemur, Lemur catta +384 indri, indris, Indri indri, Indri brevicaudatus +385 Indian elephant, Elephas maximus +386 African elephant, Loxodonta africana +387 lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens +388 giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca +389 barracouta, snoek +390 eel +391 coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch +392 rock beauty, Holocanthus tricolor +393 anemone fish +394 sturgeon +395 gar, garfish, garpike, billfish, Lepisosteus osseus +396 lionfish +397 puffer, pufferfish, blowfish, globefish +398 abacus +399 abaya +400 academic gown, academic robe, judge's robe +401 accordion, piano accordion, squeeze box +402 acoustic guitar +403 aircraft carrier, carrier, flattop, attack aircraft carrier +404 airliner +405 airship, dirigible +406 altar +407 ambulance +408 amphibian, amphibious vehicle +409 analog clock +410 apiary, bee house +411 apron +412 ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin +413 assault rifle, assault gun +414 backpack, back pack, knapsack, packsack, rucksack, haversack +415 bakery, bakeshop, bakehouse +416 balance beam, beam +417 balloon +418 ballpoint, ballpoint pen, ballpen, Biro +419 Band Aid +420 banjo +421 bannister, banister, balustrade, balusters, handrail +422 barbell +423 barber chair +424 barbershop +425 barn +426 barometer +427 barrel, cask +428 barrow, garden cart, lawn cart, wheelbarrow +429 baseball +430 basketball +431 bassinet +432 bassoon +433 bathing cap, swimming cap +434 bath towel +435 bathtub, bathing tub, bath, tub +436 beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon +437 beacon, lighthouse, beacon light, pharos +438 beaker +439 bearskin, busby, shako +440 beer bottle +441 beer glass +442 bell cote, bell cot +443 bib +444 bicycle-built-for-two, tandem bicycle, tandem +445 bikini, two-piece +446 binder, ring-binder +447 binoculars, field glasses, opera 
glasses +448 birdhouse +449 boathouse +450 bobsled, bobsleigh, bob +451 bolo tie, bolo, bola tie, bola +452 bonnet, poke bonnet +453 bookcase +454 bookshop, bookstore, bookstall +455 bottlecap +456 bow +457 bow tie, bow-tie, bowtie +458 brass, memorial tablet, plaque +459 brassiere, bra, bandeau +460 breakwater, groin, groyne, mole, bulwark, seawall, jetty +461 breastplate, aegis, egis +462 broom +463 bucket, pail +464 buckle +465 bulletproof vest +466 bullet train, bullet +467 butcher shop, meat market +468 cab, hack, taxi, taxicab +469 caldron, cauldron +470 candle, taper, wax light +471 cannon +472 canoe +473 can opener, tin opener +474 cardigan +475 car mirror +476 carousel, carrousel, merry-go-round, roundabout, whirligig +477 carpenter's kit, tool kit +478 carton +479 car wheel +480 cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM +481 cassette +482 cassette player +483 castle +484 catamaran +485 CD player +486 cello, violoncello +487 cellular telephone, cellular phone, cellphone, cell, mobile phone +488 chain +489 chainlink fence +490 chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour +491 chain saw, chainsaw +492 chest +493 chiffonier, commode +494 chime, bell, gong +495 china cabinet, china closet +496 Christmas stocking +497 church, church building +498 cinema, movie theater, movie theatre, movie house, picture palace +499 cleaver, meat cleaver, chopper +500 cliff dwelling +501 cloak +502 clog, geta, patten, sabot +503 cocktail shaker +504 coffee mug +505 coffeepot +506 coil, spiral, volute, whorl, helix +507 combination lock +508 computer keyboard, keypad +509 confectionery, confectionary, candy store +510 container ship, containership, container vessel +511 convertible +512 corkscrew, bottle screw +513 cornet, horn, trumpet, trump +514 cowboy boot +515 cowboy hat, ten-gallon hat +516 cradle +517 crane +518 crash helmet +519 crate +520 crib, cot +521 Crock Pot +522 croquet ball +523 crutch +524 cuirass +525 dam, dike, dyke +526 desk +527 desktop computer +528 dial telephone, dial phone +529 diaper, nappy, napkin +530 digital clock +531 digital watch +532 dining table, board +533 dishrag, dishcloth +534 dishwasher, dish washer, dishwashing machine +535 disk brake, disc brake +536 dock, dockage, docking facility +537 dogsled, dog sled, dog sleigh +538 dome +539 doormat, welcome mat +540 drilling platform, offshore rig +541 drum, membranophone, tympan +542 drumstick +543 dumbbell +544 Dutch oven +545 electric fan, blower +546 electric guitar +547 electric locomotive +548 entertainment center +549 envelope +550 espresso maker +551 face powder +552 feather boa, boa +553 file, file cabinet, filing cabinet +554 fireboat +555 fire engine, fire truck +556 fire screen, fireguard +557 flagpole, flagstaff +558 flute, transverse flute +559 folding chair +560 football helmet +561 forklift +562 fountain +563 fountain pen +564 four-poster +565 freight car +566 French horn, horn +567 frying pan, frypan, skillet +568 fur coat +569 garbage truck, dustcart +570 gasmask, respirator, gas helmet +571 gas pump, gasoline pump, petrol pump, island dispenser +572 goblet +573 go-kart +574 golf ball +575 golfcart, golf cart +576 gondola +577 gong, tam-tam +578 gown +579 grand piano, grand +580 greenhouse, nursery, glasshouse +581 grille, radiator grille +582 grocery store, grocery, food market, market +583 guillotine +584 hair slide +585 hair spray +586 half track +587 hammer +588 hamper +589 hand blower, 
blow dryer, blow drier, hair dryer, hair drier +590 hand-held computer, hand-held microcomputer +591 handkerchief, hankie, hanky, hankey +592 hard disc, hard disk, fixed disk +593 harmonica, mouth organ, harp, mouth harp +594 harp +595 harvester, reaper +596 hatchet +597 holster +598 home theater, home theatre +599 honeycomb +600 hook, claw +601 hoopskirt, crinoline +602 horizontal bar, high bar +603 horse cart, horse-cart +604 hourglass +605 iPod +606 iron, smoothing iron +607 jack-o'-lantern +608 jean, blue jean, denim +609 jeep, landrover +610 jersey, T-shirt, tee shirt +611 jigsaw puzzle +612 jinrikisha, ricksha, rickshaw +613 joystick +614 kimono +615 knee pad +616 knot +617 lab coat, laboratory coat +618 ladle +619 lampshade, lamp shade +620 laptop, laptop computer +621 lawn mower, mower +622 lens cap, lens cover +623 letter opener, paper knife, paperknife +624 library +625 lifeboat +626 lighter, light, igniter, ignitor +627 limousine, limo +628 liner, ocean liner +629 lipstick, lip rouge +630 Loafer +631 lotion +632 loudspeaker, speaker, speaker unit, loudspeaker system, speaker system +633 loupe, jeweler's loupe +634 lumbermill, sawmill +635 magnetic compass +636 mailbag, postbag +637 mailbox, letter box +638 maillot +639 maillot, tank suit +640 manhole cover +641 maraca +642 marimba, xylophone +643 mask +644 matchstick +645 maypole +646 maze, labyrinth +647 measuring cup +648 medicine chest, medicine cabinet +649 megalith, megalithic structure +650 microphone, mike +651 microwave, microwave oven +652 military uniform +653 milk can +654 minibus +655 miniskirt, mini +656 minivan +657 missile +658 mitten +659 mixing bowl +660 mobile home, manufactured home +661 Model T +662 modem +663 monastery +664 monitor +665 moped +666 mortar +667 mortarboard +668 mosque +669 mosquito net +670 motor scooter, scooter +671 mountain bike, all-terrain bike, off-roader +672 mountain tent +673 mouse, computer mouse +674 mousetrap +675 moving van +676 muzzle +677 nail +678 neck brace +679 necklace +680 nipple +681 notebook, notebook computer +682 obelisk +683 oboe, hautboy, hautbois +684 ocarina, sweet potato +685 odometer, hodometer, mileometer, milometer +686 oil filter +687 organ, pipe organ +688 oscilloscope, scope, cathode-ray oscilloscope, CRO +689 overskirt +690 oxcart +691 oxygen mask +692 packet +693 paddle, boat paddle +694 paddlewheel, paddle wheel +695 padlock +696 paintbrush +697 pajama, pyjama, pj's, jammies +698 palace +699 panpipe, pandean pipe, syrinx +700 paper towel +701 parachute, chute +702 parallel bars, bars +703 park bench +704 parking meter +705 passenger car, coach, carriage +706 patio, terrace +707 pay-phone, pay-station +708 pedestal, plinth, footstall +709 pencil box, pencil case +710 pencil sharpener +711 perfume, essence +712 Petri dish +713 photocopier +714 pick, plectrum, plectron +715 pickelhaube +716 picket fence, paling +717 pickup, pickup truck +718 pier +719 piggy bank, penny bank +720 pill bottle +721 pillow +722 ping-pong ball +723 pinwheel +724 pirate, pirate ship +725 pitcher, ewer +726 plane, carpenter's plane, woodworking plane +727 planetarium +728 plastic bag +729 plate rack +730 plow, plough +731 plunger, plumber's helper +732 Polaroid camera, Polaroid Land camera +733 pole +734 police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria +735 poncho +736 pool table, billiard table, snooker table +737 pop bottle, soda bottle +738 pot, flowerpot +739 potter's wheel +740 power drill +741 prayer rug, prayer mat +742 printer +743 prison, prison house 
+744 projectile, missile +745 projector +746 puck, hockey puck +747 punching bag, punch bag, punching ball, punchball +748 purse +749 quill, quill pen +750 quilt, comforter, comfort, puff +751 racer, race car, racing car +752 racket, racquet +753 radiator +754 radio, wireless +755 radio telescope, radio reflector +756 rain barrel +757 recreational vehicle, RV, R.V. +758 reel +759 reflex camera +760 refrigerator, icebox +761 remote control, remote +762 restaurant, eating house, eating place, eatery +763 revolver, six-gun, six-shooter +764 rifle +765 rocking chair, rocker +766 rotisserie +767 rubber eraser, rubber, pencil eraser +768 rugby ball +769 rule, ruler +770 running shoe +771 safe +772 safety pin +773 saltshaker, salt shaker +774 sandal +775 sarong +776 sax, saxophone +777 scabbard +778 scale, weighing machine +779 school bus +780 schooner +781 scoreboard +782 screen, CRT screen +783 screw +784 screwdriver +785 seat belt, seatbelt +786 sewing machine +787 shield, buckler +788 shoe shop, shoe-shop, shoe store +789 shoji +790 shopping basket +791 shopping cart +792 shovel +793 shower cap +794 shower curtain +795 ski +796 ski mask +797 sleeping bag +798 slide rule, slipstick +799 sliding door +800 slot, one-armed bandit +801 snorkel +802 snowmobile +803 snowplow, snowplough +804 soap dispenser +805 soccer ball +806 sock +807 solar dish, solar collector, solar furnace +808 sombrero +809 soup bowl +810 space bar +811 space heater +812 space shuttle +813 spatula +814 speedboat +815 spider web, spider's web +816 spindle +817 sports car, sport car +818 spotlight, spot +819 stage +820 steam locomotive +821 steel arch bridge +822 steel drum +823 stethoscope +824 stole +825 stone wall +826 stopwatch, stop watch +827 stove +828 strainer +829 streetcar, tram, tramcar, trolley, trolley car +830 stretcher +831 studio couch, day bed +832 stupa, tope +833 submarine, pigboat, sub, U-boat +834 suit, suit of clothes +835 sundial +836 sunglass +837 sunglasses, dark glasses, shades +838 sunscreen, sunblock, sun blocker +839 suspension bridge +840 swab, swob, mop +841 sweatshirt +842 swimming trunks, bathing trunks +843 swing +844 switch, electric switch, electrical switch +845 syringe +846 table lamp +847 tank, army tank, armored combat vehicle, armoured combat vehicle +848 tape player +849 teapot +850 teddy, teddy bear +851 television, television system +852 tennis ball +853 thatch, thatched roof +854 theater curtain, theatre curtain +855 thimble +856 thresher, thrasher, threshing machine +857 throne +858 tile roof +859 toaster +860 tobacco shop, tobacconist shop, tobacconist +861 toilet seat +862 torch +863 totem pole +864 tow truck, tow car, wrecker +865 toyshop +866 tractor +867 trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi +868 tray +869 trench coat +870 tricycle, trike, velocipede +871 trimaran +872 tripod +873 triumphal arch +874 trolleybus, trolley coach, trackless trolley +875 trombone +876 tub, vat +877 turnstile +878 typewriter keyboard +879 umbrella +880 unicycle, monocycle +881 upright, upright piano +882 vacuum, vacuum cleaner +883 vase +884 vault +885 velvet +886 vending machine +887 vestment +888 viaduct +889 violin, fiddle +890 volleyball +891 waffle iron +892 wall clock +893 wallet, billfold, notecase, pocketbook +894 wardrobe, closet, press +895 warplane, military plane +896 washbasin, handbasin, washbowl, lavabo, wash-hand basin +897 washer, automatic washer, washing machine +898 water bottle +899 water jug +900 water tower +901 whiskey jug +902 whistle 
+903 wig +904 window screen +905 window shade +906 Windsor tie +907 wine bottle +908 wing +909 wok +910 wooden spoon +911 wool, woolen, woollen +912 worm fence, snake fence, snake-rail fence, Virginia fence +913 wreck +914 yawl +915 yurt +916 web site, website, internet site, site +917 comic book +918 crossword puzzle, crossword +919 street sign +920 traffic light, traffic signal, stoplight +921 book jacket, dust cover, dust jacket, dust wrapper +922 menu +923 plate +924 guacamole +925 consomme +926 hot pot, hotpot +927 trifle +928 ice cream, icecream +929 ice lolly, lolly, lollipop, popsicle +930 French loaf +931 bagel, beigel +932 pretzel +933 cheeseburger +934 hotdog, hot dog, red hot +935 mashed potato +936 head cabbage +937 broccoli +938 cauliflower +939 zucchini, courgette +940 spaghetti squash +941 acorn squash +942 butternut squash +943 cucumber, cuke +944 artichoke, globe artichoke +945 bell pepper +946 cardoon +947 mushroom +948 Granny Smith +949 strawberry +950 orange +951 lemon +952 fig +953 pineapple, ananas +954 banana +955 jackfruit, jak, jack +956 custard apple +957 pomegranate +958 hay +959 carbonara +960 chocolate sauce, chocolate syrup +961 dough +962 meat loaf, meatloaf +963 pizza, pizza pie +964 potpie +965 burrito +966 red wine +967 espresso +968 cup +969 eggnog +970 alp +971 bubble +972 cliff, drop, drop-off +973 coral reef +974 geyser +975 lakeside, lakeshore +976 promontory, headland, head, foreland +977 sandbar, sand bar +978 seashore, coast, seacoast, sea-coast +979 valley, vale +980 volcano +981 ballplayer, baseball player +982 groom, bridegroom +983 scuba diver +984 rapeseed +985 daisy +986 yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum +987 corn +988 acorn +989 hip, rose hip, rosehip +990 buckeye, horse chestnut, conker +991 coral fungus +992 agaric +993 gyromitra +994 stinkhorn, carrion fungus +995 earthstar +996 hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa +997 bolete +998 ear, spike, capitulum +999 toilet tissue, toilet paper, bathroom tissue diff --git a/src/PaddleClas/ppcls/utils/logger.py b/src/PaddleClas/ppcls/utils/logger.py new file mode 100644 index 0000000..d4faaa9 --- /dev/null +++ b/src/PaddleClas/ppcls/utils/logger.py @@ -0,0 +1,137 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + +import logging +import datetime +import paddle.distributed as dist + +_logger = None + + +def init_logger(name='root', log_file=None, log_level=logging.INFO): + """Initialize and get a logger by name. + If the logger has not been initialized, this method will initialize the + logger by adding one or two handlers, otherwise the initialized logger will + be directly returned. During initialization, a StreamHandler will always be + added. If `log_file` is specified a FileHandler will also be added. + Args: + name (str): Logger name. + log_file (str | None): The log filename. 
If specified, a FileHandler
+            will be added to the logger.
+        log_level (int): The logger level. Note that only the process of
+            rank 0 is affected, and other processes will set the level to
+            "Error" and thus be silent most of the time.
+    Returns:
+        logging.Logger: The expected logger.
+    """
+    global _logger
+    assert _logger is None, "logger should not be initialized twice or more."
+    _logger = logging.getLogger(name)
+
+    formatter = logging.Formatter(
+        '[%(asctime)s] %(name)s %(levelname)s: %(message)s',
+        datefmt="%Y/%m/%d %H:%M:%S")
+
+    stream_handler = logging.StreamHandler(stream=sys.stdout)
+    stream_handler.setFormatter(formatter)
+    _logger.addHandler(stream_handler)
+    if log_file is not None and dist.get_rank() == 0:
+        log_file_folder = os.path.split(log_file)[0]
+        os.makedirs(log_file_folder, exist_ok=True)
+        file_handler = logging.FileHandler(log_file, 'a')
+        file_handler.setFormatter(formatter)
+        _logger.addHandler(file_handler)
+    if dist.get_rank() == 0:
+        _logger.setLevel(log_level)
+    else:
+        _logger.setLevel(logging.ERROR)
+
+
+def log_at_trainer0(log):
+    """
+    Logs are printed multiple times when the Fleet API is used.
+    Only emit the log on trainer 0 and ignore the others.
+    """
+
+    def wrapper(fmt, *args):
+        if dist.get_rank() == 0:
+            log(fmt, *args)
+
+    return wrapper
+
+
+@log_at_trainer0
+def info(fmt, *args):
+    _logger.info(fmt, *args)
+
+
+@log_at_trainer0
+def debug(fmt, *args):
+    _logger.debug(fmt, *args)
+
+
+@log_at_trainer0
+def warning(fmt, *args):
+    _logger.warning(fmt, *args)
+
+
+@log_at_trainer0
+def error(fmt, *args):
+    _logger.error(fmt, *args)
+
+
+def scaler(name, value, step, writer):
+    """
+    Draw a scalar curve with VisualDL.
+    Usage: install VisualDL: pip3 install visualdl==2.0.0b4
+    and then:
+    visualdl --logdir ./scalar --host 0.0.0.0 --port 8830
+    to preview the loss curve in real time.
+    """
+    if writer is None:
+        return
+    writer.add_scalar(tag=name, step=step, value=value)
+
+
+def advertise():
+    """
+    Show the advertising message like the following:
+
+    ===========================================================
+    ==      PaddleClas is powered by PaddlePaddle !          ==
+    ===========================================================
+    ==                                                       ==
+    ==   For more info please go to the following website.   ==
+    ==                                                       ==
+    ==      https://github.com/PaddlePaddle/PaddleClas       ==
+    ===========================================================
+
+    """
+    copyright = "PaddleClas is powered by PaddlePaddle !"
+    ad = "For more info please go to the following website."
+    website = "https://github.com/PaddlePaddle/PaddleClas"
+    AD_LEN = 6 + len(max([copyright, ad, website], key=len))
+
+    info("\n{0}\n{1}\n{2}\n{3}\n{4}\n{5}\n{6}\n{7}\n".format(
+        "=" * (AD_LEN + 4),
+        "=={}==".format(copyright.center(AD_LEN)),
+        "=" * (AD_LEN + 4),
+        "=={}==".format(' ' * AD_LEN),
+        "=={}==".format(ad.center(AD_LEN)),
+        "=={}==".format(' ' * AD_LEN),
+        "=={}==".format(website.center(AD_LEN)),
+        "=" * (AD_LEN + 4), ))
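A minimal usage sketch of the logger module above (editorial illustration, not part of the diff; the log path is arbitrary, and `init_logger` may only be called once per process):

from ppcls.utils import logger

logger.init_logger(name='ppcls', log_file='./output/train.log')
logger.info("epoch %d, loss %.4f", 1, 0.35)  # emitted only on rank 0
logger.advertise()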
diff --git a/src/PaddleClas/ppcls/utils/metrics.py b/src/PaddleClas/ppcls/utils/metrics.py
new file mode 100644
index 0000000..b0db68a
--- /dev/null
+++ b/src/PaddleClas/ppcls/utils/metrics.py
@@ -0,0 +1,107 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from sklearn.metrics import hamming_loss
+from sklearn.metrics import accuracy_score as accuracy_metric
+from sklearn.metrics import multilabel_confusion_matrix
+from sklearn.metrics import precision_recall_fscore_support
+from sklearn.metrics import average_precision_score
+from sklearn.preprocessing import binarize
+
+import numpy as np
+
+__all__ = ["multi_hot_encode", "hamming_distance", "accuracy_score", "precision_recall_fscore", "mean_average_precision"]
+
+
+def multi_hot_encode(logits, threshold=0.5):
+    """
+    Encode logits into multi-hot vectors, element-wise, for multilabel classification
+    """
+
+    return binarize(logits, threshold=threshold)
+
+
+def hamming_distance(output, target):
+    """
+    Soft, label-based metric for multilabel classification
+    Returns:
+        The smaller the return value is, the better the model is.
+    """
+
+    return hamming_loss(target, output)
+
+
+def accuracy_score(output, target, base="sample"):
+    """
+    Hard metric for multilabel classification
+    Args:
+        output:
+        target:
+        base: ["sample", "label"], default="sample"
+            if "sample", return the metric score based on samples;
+            if "label", return the metric score based on labels.
+    Returns:
+        accuracy:
+    """
+
+    assert base in ["sample", "label"], 'must be one of ["sample", "label"]'
+
+    if base == "sample":
+        accuracy = accuracy_metric(target, output)
+    elif base == "label":
+        mcm = multilabel_confusion_matrix(target, output)
+        tns = mcm[:, 0, 0]
+        fns = mcm[:, 1, 0]
+        tps = mcm[:, 1, 1]
+        fps = mcm[:, 0, 1]
+
+        accuracy = (sum(tps) + sum(tns)) / (sum(tps) + sum(tns) + sum(fns) + sum(fps))
+
+    return accuracy
+
+
+def precision_recall_fscore(output, target):
+    """
+    Label-based metrics for multilabel classification
+    Returns:
+        precisions:
+        recalls:
+        fscores:
+    """
+
+    precisions, recalls, fscores, _ = precision_recall_fscore_support(target, output)
+
+    return precisions, recalls, fscores
+
+
+def mean_average_precision(logits, target):
+    """
+    Calculate the mean average precision over all labels
+    Args:
+        logits: continuous prediction scores from the network (before thresholding)
+        target: ground truth, 0 or 1
+    """
+    if not (isinstance(logits, np.ndarray) and isinstance(target, np.ndarray)):
+        raise TypeError("logits and target should be np.ndarray.")
+
+    aps = []
+    for i in range(target.shape[1]):
+        ap = average_precision_score(target[:, i], logits[:, i])
+        aps.append(ap)
+
+    return np.mean(aps)
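A quick sanity check of these metrics on toy data (a sketch, assuming the module is importable as ppcls.utils.metrics and scikit-learn is installed):

import numpy as np
from ppcls.utils import metrics

scores = np.array([[0.9, 0.2], [0.4, 0.8]])  # two samples, two labels
target = np.array([[1, 0], [0, 1]])
preds = metrics.multi_hot_encode(scores, threshold=0.5)
print(metrics.hamming_distance(preds, target))              # 0.0, lower is better
print(metrics.accuracy_score(preds, target, base="label"))  # 1.0
print(metrics.mean_average_precision(scores, target))       # 1.0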
diff --git a/src/PaddleClas/ppcls/utils/misc.py b/src/PaddleClas/ppcls/utils/misc.py
new file mode 100644
index 0000000..08ab7b6
--- /dev/null
+++ b/src/PaddleClas/ppcls/utils/misc.py
@@ -0,0 +1,63 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['AverageMeter']
+
+
+class AverageMeter(object):
+    """
+    Computes and stores the average and current value.
+    Based on https://github.com/pytorch/examples/blob/master/imagenet/main.py
+    """
+
+    def __init__(self, name='', fmt='f', postfix="", need_avg=True):
+        self.name = name
+        self.fmt = fmt
+        self.postfix = postfix
+        self.need_avg = need_avg
+        self.reset()
+
+    def reset(self):
+        """ reset """
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        """ update """
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+    @property
+    def total(self):
+        return '{self.name}_sum: {self.sum:{self.fmt}}{self.postfix}'.format(
+            self=self)
+
+    @property
+    def total_minute(self):
+        return '{self.name} {s:{self.fmt}}{self.postfix} min'.format(
+            s=self.sum / 60, self=self)
+
+    @property
+    def mean(self):
+        return '{self.name}: {self.avg:{self.fmt}}{self.postfix}'.format(
+            self=self) if self.need_avg else ''
+
+    @property
+    def value(self):
+        return '{self.name}: {self.val:{self.fmt}}{self.postfix}'.format(
+            self=self)
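AverageMeter is typically updated once per batch and read for display; a small sketch with toy loss values (the batch size of 32 is illustrative):

from ppcls.utils.misc import AverageMeter

loss_meter = AverageMeter('loss', fmt='.4f')
for batch_loss in [0.9, 0.7, 0.5]:   # per-batch losses
    loss_meter.update(batch_loss, n=32)
print(loss_meter.mean)   # "loss: 0.7000" (running average)
print(loss_meter.value)  # "loss: 0.5000" (most recent batch)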
diff --git a/src/PaddleClas/ppcls/utils/model_zoo.py b/src/PaddleClas/ppcls/utils/model_zoo.py
new file mode 100644
index 0000000..fc527f6
--- /dev/null
+++ b/src/PaddleClas/ppcls/utils/model_zoo.py
@@ -0,0 +1,213 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import requests
+import shutil
+import tarfile
+import tqdm
+import zipfile
+
+from ppcls.arch import similar_architectures
+from ppcls.utils import logger
+
+__all__ = ['get']
+
+DOWNLOAD_RETRY_LIMIT = 3
+
+
+class UrlError(Exception):
+    """ UrlError
+    """
+
+    def __init__(self, url='', code=''):
+        message = "Downloading from {} failed with code {}!".format(url, code)
+        super(UrlError, self).__init__(message)
+
+
+class ModelNameError(Exception):
+    """ ModelNameError
+    """
+
+    def __init__(self, message=''):
+        super(ModelNameError, self).__init__(message)
+
+
+class RetryError(Exception):
+    """ RetryError
+    """
+
+    def __init__(self, url='', times=''):
+        message = "Download from {} failed. Retry({}) limit reached".format(
+            url, times)
+        super(RetryError, self).__init__(message)
+
+
+def _get_url(architecture, postfix="pdparams"):
+    prefix = "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/"
+    fname = architecture + "_pretrained." + postfix
+    return prefix + fname
+
+
+def _move_and_merge_tree(src, dst):
+    """
+    Move the src directory to dst; if dst already exists,
+    merge src into dst
+    """
+    if not os.path.exists(dst):
+        shutil.move(src, dst)
+    elif os.path.isfile(src):
+        shutil.move(src, dst)
+    else:
+        for fp in os.listdir(src):
+            src_fp = os.path.join(src, fp)
+            dst_fp = os.path.join(dst, fp)
+            if os.path.isdir(src_fp):
+                if os.path.isdir(dst_fp):
+                    _move_and_merge_tree(src_fp, dst_fp)
+                else:
+                    shutil.move(src_fp, dst_fp)
+            elif os.path.isfile(src_fp) and \
+                    not os.path.isfile(dst_fp):
+                shutil.move(src_fp, dst_fp)
+
+
+def _download(url, path):
+    """
+    Download from url, save to path.
+    url (str): download url
+    path (str): download to given path
+    """
+    if not os.path.exists(path):
+        os.makedirs(path)
+
+    fname = os.path.split(url)[-1]
+    fullname = os.path.join(path, fname)
+    retry_cnt = 0
+
+    while not os.path.exists(fullname):
+        if retry_cnt < DOWNLOAD_RETRY_LIMIT:
+            retry_cnt += 1
+        else:
+            raise RetryError(url, DOWNLOAD_RETRY_LIMIT)
+
+        logger.info("Downloading {} from {}".format(fname, url))
+
+        req = requests.get(url, stream=True)
+        if req.status_code != 200:
+            raise UrlError(url, req.status_code)
+
+        # To guard against an interrupted download, write to
+        # tmp_fullname first, then move tmp_fullname to fullname
+        # once the download has finished
+        tmp_fullname = fullname + "_tmp"
+        total_size = req.headers.get('content-length')
+        with open(tmp_fullname, 'wb') as f:
+            if total_size:
+                for chunk in tqdm.tqdm(
+                        req.iter_content(chunk_size=1024),
+                        total=(int(total_size) + 1023) // 1024,
+                        unit='KB'):
+                    f.write(chunk)
+            else:
+                for chunk in req.iter_content(chunk_size=1024):
+                    if chunk:
+                        f.write(chunk)
+        shutil.move(tmp_fullname, fullname)
+
+    return fullname
+
+
+def _decompress(fname):
+    """
+    Decompress zip and tar files
+    """
+    logger.info("Decompressing {}...".format(fname))
+
+    # To guard against interrupted decompression, decompress
+    # into the fpath_tmp directory first; if decompression
+    # succeeds, move the files to fpath, delete fpath_tmp
+    # and remove the downloaded archive.
+    fpath = os.path.split(fname)[0]
+    fpath_tmp = os.path.join(fpath, 'tmp')
+    if os.path.isdir(fpath_tmp):
+        shutil.rmtree(fpath_tmp)
+    os.makedirs(fpath_tmp)
+
+    if fname.find('tar') >= 0:
+        with tarfile.open(fname) as tf:
+            tf.extractall(path=fpath_tmp)
+    elif fname.find('zip') >= 0:
+        with zipfile.ZipFile(fname) as zf:
+            zf.extractall(path=fpath_tmp)
+    else:
+        raise TypeError("Unsupported compress file type {}".format(fname))
+
+    fs = os.listdir(fpath_tmp)
+    assert len(
+        fs
+    ) == 1, "There should just be 1 pretrained path in an archive file but got {}.".format(
+        len(fs))
+
+    f = fs[0]
+    src_dir = os.path.join(fpath_tmp, f)
+    dst_dir = os.path.join(fpath, f)
+    _move_and_merge_tree(src_dir, dst_dir)
+
+    shutil.rmtree(fpath_tmp)
+    os.remove(fname)
+
+    return f
+
+
+def _get_pretrained():
+    with open('./ppcls/utils/pretrained.list') as flist:
+        pretrained = [line.strip() for line in flist]
+    return pretrained
+
+
+def _check_pretrained_name(architecture):
+    assert isinstance(architecture, str), \
+        ("the type of architecture({}) should be str".format(architecture))
+    pretrained = _get_pretrained()
+    similar_names = similar_architectures(architecture, pretrained)
+    model_list = ', '.join(similar_names)
+    err = "{} does not exist! Maybe you want: [{}]" \
+        "".format(architecture, model_list)
+    if architecture not in similar_names:
+        raise ModelNameError(err)
+
+
+def list_models():
+    pretrained = _get_pretrained()
+    msg = "All available pretrained models are as follows: {}".format(
+        pretrained)
+    logger.info(msg)
+    return
+
+
+def get(architecture, path, decompress=False, postfix="pdparams"):
+    """
+    Get the pretrained model.
+    """
+    _check_pretrained_name(architecture)
+    url = _get_url(architecture, postfix=postfix)
+    fname = _download(url, path)
+    if postfix == "tar" and decompress:
+        _decompress(fname)
+    logger.info("Download of {} finished".format(fname))
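A usage sketch for the downloader (network access is required, and since pretrained.list is opened via a relative path, this must be run from the PaddleClas root; the target directory is illustrative):

from ppcls.utils import model_zoo

model_zoo.list_models()  # logs all names from pretrained.list
# Downloads ResNet50_vd_pretrained.pdparams into ./pretrained/
model_zoo.get('ResNet50_vd', path='./pretrained')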
diff --git a/src/PaddleClas/ppcls/utils/pretrained.list b/src/PaddleClas/ppcls/utils/pretrained.list
new file mode 100644
index 0000000..36d70f5
--- /dev/null
+++ b/src/PaddleClas/ppcls/utils/pretrained.list
@@ -0,0 +1,121 @@
+ResNet18 +ResNet34 +ResNet50 +ResNet101 +ResNet152 +ResNet50_vc +ResNet18_vd +ResNet34_vd +ResNet50_vd +ResNet50_vd_v2 +ResNet101_vd +ResNet152_vd +ResNet200_vd +ResNet50_vd_ssld +ResNet50_vd_ssld_v2 +Fix_ResNet50_vd_ssld_v2 +ResNet101_vd_ssld +MobileNetV3_large_x0_35 +MobileNetV3_large_x0_5 +MobileNetV3_large_x0_75 +MobileNetV3_large_x1_0 +MobileNetV3_large_x1_25 +MobileNetV3_small_x0_35 +MobileNetV3_small_x0_5 +MobileNetV3_small_x0_75 +MobileNetV3_small_x1_0 +MobileNetV3_small_x1_25 +MobileNetV3_large_x1_0_ssld +MobileNetV3_large_x1_0_ssld_int8 +MobileNetV3_small_x1_0_ssld +MobileNetV2_x0_25 +MobileNetV2_x0_5 +MobileNetV2_x0_75 +MobileNetV2 +MobileNetV2_x1_5 +MobileNetV2_x2_0 +MobileNetV2_ssld +MobileNetV1_x0_25 +MobileNetV1_x0_5 +MobileNetV1_x0_75 +MobileNetV1 +MobileNetV1_ssld +ShuffleNetV2_x0_25 +ShuffleNetV2_x0_33 +ShuffleNetV2_x0_5 +ShuffleNetV2 +ShuffleNetV2_x1_5 +ShuffleNetV2_x2_0 +ShuffleNetV2_swish +ResNeXt50_32x4d +ResNeXt50_64x4d +ResNeXt101_32x4d +ResNeXt101_64x4d +ResNeXt152_32x4d +ResNeXt152_64x4d +ResNeXt50_vd_32x4d +ResNeXt50_vd_64x4d +ResNeXt101_vd_32x4d +ResNeXt101_vd_64x4d +ResNeXt152_vd_32x4d +ResNeXt152_vd_64x4d +SE_ResNet18_vd +SE_ResNet34_vd +SE_ResNet50_vd +SE_ResNeXt50_32x4d +SE_ResNeXt101_32x4d +SE_ResNeXt50_vd_32x4d +SENet154_vd +Res2Net50_26w_4s +Res2Net50_vd_26w_4s +Res2Net50_14w_8s +Res2Net101_vd_26w_4s +Res2Net200_vd_26w_4s +GoogLeNet +InceptionV4 +Xception41 +Xception41_deeplab +Xception65 +Xception65_deeplab +Xception71 +HRNet_W18_C +HRNet_W30_C +HRNet_W32_C +HRNet_W40_C +HRNet_W44_C +HRNet_W48_C +HRNet_W64_C +DPN68 +DPN92 +DPN98 +DPN107 +DPN131 +DenseNet121 +DenseNet161 +DenseNet169 +DenseNet201 +DenseNet264 +EfficientNetB0_small +EfficientNetB0 +EfficientNetB1 +EfficientNetB2 +EfficientNetB3 +EfficientNetB4 +EfficientNetB5 +EfficientNetB6 +EfficientNetB7 +ResNeXt101_32x8d_wsl +ResNeXt101_32x16d_wsl +ResNeXt101_32x32d_wsl +ResNeXt101_32x48d_wsl +Fix_ResNeXt101_32x48d_wsl +AlexNet +SqueezeNet1_0 +SqueezeNet1_1 +VGG11 +VGG13 +VGG16 +VGG19 +DarkNet53_ImageNet1k +ResNet50_ACNet_deploy +CSPResNet50_leaky
diff --git a/src/PaddleClas/ppcls/utils/profiler.py b/src/PaddleClas/ppcls/utils/profiler.py
new file mode 100644
index 0000000..7cf945a
--- /dev/null
+++ b/src/PaddleClas/ppcls/utils/profiler.py
@@ -0,0 +1,111 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import paddle
+
+# A global variable to record the number of times the profiler functions
+# have been called. It is used to specify the tracing range of training steps.
+_profiler_step_id = 0
+
+# A global variable to avoid re-parsing the options string every time.
+_profiler_options = None
+
+
+class ProfilerOptions(object):
+    '''
+    Use a string to initialize a ProfilerOptions.
+    The string should be in the format: "key1=value1;key2=value2;key3=value3".
+    For example:
+      "profile_path=model.profile"
+      "batch_range=[50, 60]; profile_path=model.profile"
+      "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"
+
+    ProfilerOptions supports the following key-value pairs:
+      batch_range      - an integer list, e.g. [100, 110].
+      state            - a string, the optional values are 'CPU', 'GPU' or 'All'.
+      sorted_key       - a string, the optional values are 'calls', 'total',
+                         'max', 'min' or 'ave'.
+      tracer_option    - a string, the optional values are 'Default', 'OpDetail',
+                         'AllOpDetail'.
+      profile_path     - a string, the path to save the serialized profile data,
+                         which can be used to generate a timeline.
+      exit_on_finished - a boolean.
+    '''

+    def __init__(self, options_str):
+        assert isinstance(options_str, str)
+
+        self._options = {
+            'batch_range': [10, 20],
+            'state': 'All',
+            'sorted_key': 'total',
+            'tracer_option': 'Default',
+            'profile_path': '/tmp/profile',
+            'exit_on_finished': True
+        }
+        self._parse_from_string(options_str)
+
+    def _parse_from_string(self, options_str):
+        for kv in options_str.replace(' ', '').split(';'):
+            key, value = kv.split('=')
+            if key == 'batch_range':
+                value_list = value.replace('[', '').replace(']', '').split(',')
+                value_list = list(map(int, value_list))
+                if len(value_list) >= 2 and value_list[0] >= 0 and value_list[
+                        1] > value_list[0]:
+                    self._options[key] = value_list
+            elif key == 'exit_on_finished':
+                self._options[key] = value.lower() in ("yes", "true", "t", "1")
+            elif key in [
+                    'state', 'sorted_key', 'tracer_option', 'profile_path'
+            ]:
+                self._options[key] = value
+
+    def __getitem__(self, name):
+        if self._options.get(name, None) is None:
+            raise ValueError(
+                "ProfilerOptions does not have an option named %s." % name)
+        return self._options[name]
+
+
+def add_profiler_step(options_str=None):
+    '''
+    Enable operator-level timing using PaddlePaddle's profiler.
+    The profiler uses an independent variable to count the profiler steps.
+    One call of this function is treated as a profiler step.
+
+    Args:
+      options_str - a string used to initialize the ProfilerOptions.
+                    Default is None, meaning the profiler is disabled.
+    '''
+    if options_str is None:
+        return
+
+    global _profiler_step_id
+    global _profiler_options
+
+    if _profiler_options is None:
+        _profiler_options = ProfilerOptions(options_str)
+
+    if _profiler_step_id == _profiler_options['batch_range'][0]:
+        paddle.utils.profiler.start_profiler(
+            _profiler_options['state'], _profiler_options['tracer_option'])
+    elif _profiler_step_id == _profiler_options['batch_range'][1]:
+        paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'],
+                                            _profiler_options['profile_path'])
+        if _profiler_options['exit_on_finished']:
+            sys.exit(0)
+
+    _profiler_step_id += 1
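add_profiler_step is meant to be called once per training batch; a runnable sketch under the assumption that paddle is installed (the range loop stands in for a real data loader, and exit_on_finished=false keeps the process alive after the profile is written):

from ppcls.utils import profiler

opts = "batch_range=[10, 20]; profile_path=/tmp/profile; exit_on_finished=false"
for batch_id in range(30):            # stand-in for iterating a real loader
    profiler.add_profiler_step(opts)  # starts at batch 10, stops and dumps at batch 20
    # ... forward / backward / optimizer step on the real batch ...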
{}".format( + pretrained_model)) + + +def init_model(config, net, optimizer=None): + """ + load model from checkpoint or pretrained_model + """ + checkpoints = config.get('checkpoints') + if checkpoints and optimizer is not None: + assert os.path.exists(checkpoints + ".pdparams"), \ + "Given dir {}.pdparams not exist.".format(checkpoints) + assert os.path.exists(checkpoints + ".pdopt"), \ + "Given dir {}.pdopt not exist.".format(checkpoints) + para_dict = paddle.load(checkpoints + ".pdparams") + opti_dict = paddle.load(checkpoints + ".pdopt") + metric_dict = paddle.load(checkpoints + ".pdstates") + net.set_dict(para_dict) + optimizer.set_state_dict(opti_dict) + logger.info("Finish load checkpoints from {}".format(checkpoints)) + return metric_dict + + pretrained_model = config.get('pretrained_model') + use_distillation = config.get('use_distillation', False) + if pretrained_model: + if use_distillation: + load_distillation_model(net, pretrained_model) + else: # common load + load_dygraph_pretrain(net, path=pretrained_model) + logger.info( + logger.coloring("Finish load pretrained model from {}".format( + pretrained_model), "HEADER")) + + +def save_model(net, + optimizer, + metric_info, + model_path, + model_name="", + prefix='ppcls'): + """ + save model to the target path + """ + if paddle.distributed.get_rank() != 0: + return + model_path = os.path.join(model_path, model_name) + _mkdir_if_not_exist(model_path) + model_path = os.path.join(model_path, prefix) + + paddle.save(net.state_dict(), model_path + ".pdparams") + paddle.save(optimizer.state_dict(), model_path + ".pdopt") + paddle.save(metric_info, model_path + ".pdstates") + logger.info("Already save model in {}".format(model_path))