
Note

You are reading the documentation for MMClassification 0.x, which will be switched to a secondary branch at the end of 2022. We recommend upgrading to MMClassification 1.0 for new features and functionality. Please see the MMClassification 1.0 installation tutorial, migration tutorial, and changelog.

Source code for mmcls.models.heads.vision_transformer_head

# Copyright (c) OpenMMLab. All rights reserved.
import math
from collections import OrderedDict

import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import build_activation_layer
from mmcv.cnn.utils.weight_init import trunc_normal_
from mmcv.runner import Sequential

from ..builder import HEADS
from .cls_head import ClsHead


@HEADS.register_module()
class VisionTransformerClsHead(ClsHead):
    """Vision Transformer classifier head.

    Args:
        num_classes (int): Number of categories excluding the background
            category.
        in_channels (int): Number of channels in the input feature map.
        hidden_dim (int): Number of dimensions of the hidden layer.
            Defaults to None, which means no extra hidden layer.
        act_cfg (dict): The activation config. Only available during
            pre-training. Defaults to ``dict(type='Tanh')``.
        init_cfg (dict): The extra initialization configs. Defaults to
            ``dict(type='Constant', layer='Linear', val=0)``.
    """

    def __init__(self,
                 num_classes,
                 in_channels,
                 hidden_dim=None,
                 act_cfg=dict(type='Tanh'),
                 init_cfg=dict(type='Constant', layer='Linear', val=0),
                 *args,
                 **kwargs):
        super(VisionTransformerClsHead, self).__init__(
            init_cfg=init_cfg, *args, **kwargs)
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.hidden_dim = hidden_dim
        self.act_cfg = act_cfg

        if self.num_classes <= 0:
            raise ValueError(
                f'num_classes={num_classes} must be a positive integer')

        self._init_layers()

    def _init_layers(self):
        if self.hidden_dim is None:
            layers = [('head', nn.Linear(self.in_channels, self.num_classes))]
        else:
            layers = [
                ('pre_logits', nn.Linear(self.in_channels, self.hidden_dim)),
                ('act', build_activation_layer(self.act_cfg)),
                ('head', nn.Linear(self.hidden_dim, self.num_classes)),
            ]
        self.layers = Sequential(OrderedDict(layers))

    def init_weights(self):
        super(VisionTransformerClsHead, self).init_weights()
        # Modified from ClassyVision
        if hasattr(self.layers, 'pre_logits'):
            # Lecun norm
            trunc_normal_(
                self.layers.pre_logits.weight,
                std=math.sqrt(1 / self.layers.pre_logits.in_features))
            nn.init.zeros_(self.layers.pre_logits.bias)

    def pre_logits(self, x):
        # Multi-stage inputs arrive as a tuple of stages; keep only the
        # last stage, which is a (patch_token, cls_token) pair.
        if isinstance(x, tuple):
            x = x[-1]
        _, cls_token = x
        if self.hidden_dim is None:
            return cls_token
        else:
            x = self.layers.pre_logits(cls_token)
            return self.layers.act(x)

    def simple_test(self, x, softmax=True, post_process=True):
        """Inference without augmentation.

        Args:
            x (tuple[tuple[tensor, tensor]]): The input features.
                Multi-stage inputs are acceptable but only the last stage
                will be used to classify. Every item should be a tuple
                which includes patch token and cls token. The cls token
                will be used to classify and its shape should be
                ``(num_samples, in_channels)``.
            softmax (bool): Whether to softmax the classification score.
            post_process (bool): Whether to post-process the inference
                results. It will convert the output to a list.

        Returns:
            Tensor | list: The inference results.

                - If no post processing, the output is a tensor with shape
                  ``(num_samples, num_classes)``.
                - If post processing, the output is a multi-dimensional
                  list of float and the dimensions are
                  ``(num_samples, num_classes)``.
        """
        x = self.pre_logits(x)
        cls_score = self.layers.head(x)

        if softmax:
            pred = (
                F.softmax(cls_score, dim=1) if cls_score is not None else None)
        else:
            pred = cls_score

        if post_process:
            return self.post_process(pred)
        else:
            return pred

    def forward_train(self, x, gt_label, **kwargs):
        x = self.pre_logits(x)
        cls_score = self.layers.head(x)
        losses = self.loss(cls_score, gt_label, **kwargs)
        return losses
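For context, below is a minimal usage sketch; it is not part of the module above. The builder call mirrors how the head is declared in a model config, while the ViT-B feature dimensions (in_channels=768, hidden_dim=3072) and the dummy tensors are illustrative assumptions, not values fixed by this file.

# A minimal usage sketch, assuming ViT-B feature dimensions.
import torch

from mmcls.models import build_head

head = build_head(
    dict(
        type='VisionTransformerClsHead',
        num_classes=1000,
        in_channels=768,
        hidden_dim=3072))  # non-None hidden_dim enables the pre_logits + Tanh block
head.init_weights()

# The head expects a tuple of stage outputs, each a (patch_token, cls_token)
# pair; only the cls token of the last stage is classified.
patch_token = torch.rand(2, 768, 14, 14)  # unused by the head; shape illustrative
cls_token = torch.rand(2, 768)
scores = head.simple_test(((patch_token, cls_token), ),
                          softmax=True,
                          post_process=False)
print(scores.shape)  # torch.Size([2, 1000])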