
    炫云云:AlbertTransformerEncoder

    Posted: 2021-09-09 21:55
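
ALBERT's factorized embedding parameterization, documented in the class below, splits the vocabulary embedding into a ['vocab_size', 'embedding_width'] table followed by an ['embedding_width', 'hidden_size'] projection. A quick back-of-the-envelope sketch of the saving, assuming an illustrative vocab_size of 30000 together with the class defaults embedding_width = 128 and hidden_size = 768:

    # Rough parameter count for factorized embeddings.
    # vocab_size = 30000 is an illustrative value, not taken from the code below.
    vocab_size, embedding_width, hidden_size = 30000, 128, 768

    full_table = vocab_size * hidden_size                                      # V x H, as in BERT
    factorized = vocab_size * embedding_width + embedding_width * hidden_size  # V x E + E x H

    print(full_table)   # 23040000
    print(factorized)   # 3938304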

    class AlbertTransformerEncoder(Layer):
        """BERT (https://arxiv.org/abs/1810.04805),
        ALBERT将嵌入参数重构为两个更小的矩阵,并跨层共享参数。
        """
        def __init__(self,
                     vocab_size,
                     embedding_width = 128,
                     hidden_size = 768,
                     num_hidden_layers = 12,
                     num_attention_heads = 12,
                     num_hidden_groups = 1,
                     inner_group_num = 1,
                     sequence_length = 512,
                     max_sequence_length = None,
                     type_vocab_size = 16,
                     intermediate_size = 3072,
                     activation = gelu,
                     dropout_rate = 0.1,
                     attention_dropout_rate = 0.1,
                     initializer = tf.keras.initializers.TruncatedNormal(stddev = 0.02),
                     return_all_encoder_outputs = False,
                     **kwargs):
            """
            :param vocab_size: The size of the token vocabulary.
            :param embedding_width: The width of the word embeddings. If the embedding width
                            is not equal to the hidden size, the embedding parameters are
                            factored into two matrices with shapes ['vocab_size', 'embedding_width']
                            and ['embedding_width', 'hidden_size'] ('embedding_width' is usually
                            much smaller than 'hidden_size').
            :param hidden_size: The size of the transformer hidden layers.
            :param num_hidden_layers: The number of transformer layers.
            :param num_attention_heads: The number of attention heads for each transformer. The
                                    hidden size must be divisible by the number of attention heads.
            :param num_hidden_groups: The number of groups for the hidden layers; parameters
                                in the same group are shared across layers.
            :param inner_group_num: The number of inner repetitions of attention and FFN
                                within each group (only 1 is supported here).
            :param sequence_length: The sequence length that this encoder expects. If None, the
                              sequence length is dynamic; if an integer, the encoder will require
                              sequences padded to this length.
            :param max_sequence_length:The maximum sequence length that this encoder can
                          consume. If None, max_sequence_length uses the value from sequence length.
                          This determines the variable shape for positional embeddings.
            :param type_vocab_size: The number of types that the 'type_ids' input can take.
            :param intermediate_size:  The intermediate size for the transformer layers.
            :param activation:  The activation to use for the transformer layers.
            :param dropout_rate:  The dropout rate to use for the transformer layers.
            :param attention_dropout_rate: The dropout rate to use for the attention layers
                                 within the transformer layers.
            :param initializer: The initializer to use for all weights in this encoder.
            :param return_all_encoder_outputs: Whether to return the sequence outputs of all
                                 transformer layers instead of only the final layer's output.
            :param kwargs: Additional keyword arguments forwarded to the base Keras Layer.
            """
            super(AlbertTransformerEncoder, self).__init__(**kwargs)
            if inner_group_num != 1:
                raise ValueError("We only support 'inner_group_num' as 1.")
    
            if not max_sequence_length:
                max_sequence_length = sequence_length
    
            self._activation = activation
            self._config_dict = {
                    'vocab_size':                 vocab_size,
                    'embedding_width':            embedding_width,
                    'hidden_size':                hidden_size,
                    'num_hidden_layers':          num_hidden_layers,
                    'num_attention_heads':        num_attention_heads,
                    'num_hidden_groups':          num_hidden_groups,
                    'inner_group_num':            inner_group_num,
                    'sequence_length':            sequence_length,
                    'max_sequence_length':        max_sequence_length,
                    'type_vocab_size':            type_vocab_size,
                    'intermediate_size':          intermediate_size,
                    'activation':                 tf.keras.activations.serialize(activation),  # returns the string identifier of the activation function
                    'dropout_rate':               dropout_rate,
                    'attention_dropout_rate':     attention_dropout_rate,
                    'initializer':                tf.keras.initializers.serialize(initializer),
                    'return_all_encoder_outputs': return_all_encoder_outputs
            }
    
        def build(self, input_shape):
            if isinstance(input_shape, tf.TensorShape):
                input_shape = input_shape.as_list()  # returns a list with an integer or None for each dimension
    
            self._embedding_layer = Layers.OnDeviceEmbedding(
                    vocab_size = self._config_dict['vocab_size'],
                    embedding_width = self._config_dict['embedding_width'],
                    initializer = self._config_dict['initializer'],
                    name = 'word_embeddings')
            # Always uses dynamic slicing for simplicity.
            self._position_embedding_layer = Layers.PositionEmbedding(
                    initializer = self._config_dict['initializer'],
                    use_dynamic_slicing = True,
                    max_sequence_length = self._config_dict['max_sequence_length'],
                    name = 'position_embeddings')
            self._position_embedding_layer.build(input_shape[0] + [self._config_dict["embedding_width"]])
    
            self._type_embedding_layer = Layers.OnDeviceEmbedding(
                    vocab_size = self._config_dict['type_vocab_size'],
                    embedding_width = self._config_dict['embedding_width'],
                    initializer = self._config_dict['initializer'],
                    use_one_hot = True,
                    name = 'type_embeddings')
    
            self._embedding_layer_normalization = tf.keras.layers.LayerNormalization(
                    name = 'embeddings/layer_norm',
                    axis = -1,
                    epsilon = 1e-12,
                    dtype = tf.float32)
            self._embedding_layer_normalization.build(input_shape[0] + [self._config_dict['embedding_width']])
    
            self._embedding_dropout = tf.keras.layers.Dropout(rate = self._config_dict['dropout_rate'])
    
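            # Dense projection from embedding_width up to hidden_size; call() applies it
            # only when the two sizes differ (the factorized-embedding case).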
            self._embedding_projection_layer = Layers.DenseEinsum(
                    output_shape = self._config_dict['hidden_size'],
                    kernel_initializer = self._config_dict['initializer'],
                    name = 'embedding_projection')
    
            self._embedding_projection_layer.build(input_shape = input_shape[0] + [self._config_dict['embedding_width']])
    
            self._self_attention_mask = Layers.SelfAttentionMask()
    
            # transformer layer
            self._transformer_layers = []
            last_name = None
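            # Cross-layer parameter sharing: each of the num_hidden_layers logical layers
            # is mapped to one of num_hidden_groups Transformer instances. Layers that
            # fall into the same group reuse the previous layer object (and its weights);
            # with the default num_hidden_groups = 1, all layers share one set of weights.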
            for layer_idx in range(self._config_dict['num_hidden_layers']):
                group_idx = int(layer_idx / self._config_dict["num_hidden_layes"] * self._config_dict['num_hidden_groups'])
                if group_idx == last_name:
                    layer = self._transformer_layers[-1]
                else:
                    layer = Layers.Transformer(
                            num_attention_heads = self._config_dict['num_attention_heads'],
                            intermediate_size = self._config_dict['intermediate_size'],
                            intermediate_activation = self._activation,
                            dropout_rate = self._config_dict['dropout_rate'],
                            attention_dropout_rate = self._config_dict['attention_dropout_rate'],
                            kernel_initializer = self._config_dict['initializer'],
                            name = 'transformer/layer_%d' % group_idx
                    )
                    layer.build([input_shape[0] + [self._config_dict['hidden_size']], input_shape[0] + input_shape[0][-1:]])
                last_name = group_idx
                self._transformer_layers.append(layer)
    
            self._cls_output_layer = tf.keras.layers.Dense(
                    units = self._config_dict['hidden_size'],
                    activation = 'tanh',
                    kernel_initializer = self._config_dict['initializer'],
                    name = 'pooler_transform')
            self._cls_output_layer.build(input_shape[0] + [self._config_dict['hidden_size']])
    
            super().build(input_shape)
    
        def call(self, inputs):
            word_ids, mask, type_ids = inputs
            word_embeddings = self._embedding_layer(word_ids)
            position_embeddings = self._position_embedding_layer(word_embeddings)
            type_embeddings = self._type_embedding_layer(type_ids)
    
            # embeddings = tf.keras.layers.Add()([word_embeddings, position_embeddings, type_embeddings])
            embeddings = word_embeddings + position_embeddings + type_embeddings
            embeddings = self._embedding_layer_normalization(embeddings)
            embeddings = self._embedding_dropout(embeddings)  # let Keras propagate the training flag instead of forcing dropout on
    
            # Project the 'embedding' output to 'hidden_size' if it is not already 'hidden_size'.
            if self._config_dict['embedding_width'] != self._config_dict['hidden_size']:
                embeddings = self._embedding_projection_layer(embeddings)
    
            data = embeddings
            attention_mask = self._self_attention_mask([data, mask])
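
Cross-layer parameter sharing is driven entirely by the group_idx computation in build(): each of the num_hidden_layers logical layers is mapped to one of num_hidden_groups Transformer instances, and layers that land in the same group reuse the same object and therefore the same weights. A small self-contained sketch of that mapping, using illustrative values (num_hidden_layers = 12, num_hidden_groups = 3, unlike the defaults above, where a single group is shared by all layers):

    # Mirrors the group_idx formula from build(); the values are illustrative only.
    num_hidden_layers = 12
    num_hidden_groups = 3

    mapping = [int(layer_idx / num_hidden_layers * num_hidden_groups)
               for layer_idx in range(num_hidden_layers)]
    print(mapping)  # [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2] -> only 3 distinct Transformer instances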