AlbertTransformerEncoder
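# Assumed context (not part of the original post): the class below expects TensorFlow
# imported as `tf`, `Layer` to be `tf.keras.layers.Layer`, `gelu` to be a GELU activation
# callable, and `Layers` to be a project-local module exposing OnDeviceEmbedding,
# PositionEmbedding, DenseEinsum, SelfAttentionMask and Transformer layers
# (mirroring the TF Model Garden NLP modeling layers).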
class AlbertTransformerEncoder(Layer):
"""BERT (https://arxiv.org/abs/1810.04805),
ALBERT将嵌入参数重构为两个更小的矩阵,并跨层共享参数。
"""
def __init__(self,
vocab_size,
embedding_width = 128,
hidden_size = 768,
num_hidden_layers = 12,
num_attention_heads = 12,
num_hidden_groups = 1,
inner_group_num = 1,
sequence_length = 512,
max_sequence_length = None,
type_vocab_size = 16,
intermediate_size = 3072,
activation = gelu,
dropout_rate = 0.1,
attention_dropout_rate = 0.1,
initializer = tf.keras.initializers.TruncatedNormal(stddev = 0.02),
return_all_encoder_outputs = False,
**kwargs):
"""
:param vocab_size:标记词汇表的大小。
:param embedding_width: 单词嵌入的宽度。如果嵌入宽度不等于隐藏大小,
嵌入参数将分解成两个矩阵,形状为['vocab_size', ' embeddding_width ']
和[' embeddding_width ', 'hidden_size'] (' embeddding_width '通常比'hidden_size'小得多)。
:param hidden_size: The size of the transformer hidden layers.
:param num_hidden_layers: The number of transformer layers.
:param num_attention_heads: The number of attention heads for each transformer. The
hidden size must be divisible by the number of attention heads.
        :param num_hidden_groups: The number of groups for the hidden layers; transformer
            layers that fall into the same group share parameters.
        :param inner_group_num: The number of inner repetitions per hidden group. Only a
            value of 1 is currently supported (enforced in __init__ below).
:param sequence_length: The sequence length that this encoder expects. If None, the
sequence length is dynamic; if an integer, the encoder will require
sequences padded to this length.
        :param max_sequence_length: The maximum sequence length that this encoder can
consume. If None, max_sequence_length uses the value from sequence length.
This determines the variable shape for positional embeddings.
:param type_vocab_size: The number of types that the 'type_ids' input can take.
:param intermediate_size: The intermediate size for the transformer layers.
:param activation: The activation to use for the transformer layers.
:param dropout_rate: The dropout rate to use for the transformer layers.
:param attention_dropout_rate: The dropout rate to use for the attention layers
within the transformer layers.
        :param initializer: The initializer to use for all weights in this encoder.
        :param return_all_encoder_outputs: Whether to return the sequence outputs of all
            transformer layers, or only the output of the final layer.
        :param kwargs: Additional keyword arguments forwarded to the base Layer.
"""
super(AlbertTransformerEncoder, self).__init__(**kwargs)
if inner_group_num != 1:
raise ValueError("We only support 'inner_group_num' as 1.")
if not max_sequence_length:
max_sequence_length = sequence_length
self._activation = activation
self._config_dict = {
'vocab_size': vocab_size,
'embedding_width': embedding_width,
'hidden_size': hidden_size,
'num_hidden_layers': num_hidden_layers,
'num_attention_heads': num_attention_heads,
'num_hidden_groups': num_hidden_groups,
'inner_group_num': inner_group_num,
'sequence_length': sequence_length,
'max_sequence_length': max_sequence_length,
'type_vocab_size': type_vocab_size,
'intermediate_size': intermediate_size,
            'activation': tf.keras.activations.serialize(activation),  # string identifier of the activation function
'dropout_rate': dropout_rate,
'attention_dropout_rate': attention_dropout_rate,
'initializer': tf.keras.initializers.serialize(initializer),
'return_all_encoder_outputs': return_all_encoder_outputs
}
def build(self, input_shape):
if isinstance(input_shape, tf.TensorShape):
            input_shape = input_shape.as_list()  # returns a list with an integer or `None` for each dimension
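        # When the encoder is called with [word_ids, mask, type_ids], `input_shape` is a
        # list of three shapes, so `input_shape[0]` below is the [batch_size, seq_length]
        # shape of the word_ids input.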
self._embedding_layer = Layers.OnDeviceEmbedding(
vocab_size = self._config_dict['vocab_size'],
embedding_width = self._config_dict['embedding_width'],
initializer = self._config_dict['initializer'],
name = 'word_embeddings')
# Always uses dynamic slicing for simplicity.
self._position_embedding_layer = Layers.PositionEmbedding(
initializer = self._config_dict['initializer'],
use_dynamic_slicing = True,
max_sequence_length = self._config_dict['max_sequence_length'],
name = 'position_embeddings')
self._position_embedding_layer.build(input_shape[0] + [self._config_dict["embedding_width"]])
self._type_embedding_layer = Layers.OnDeviceEmbedding(
vocab_size = self._config_dict['type_vocab_size'],
embedding_width = self._config_dict['embedding_width'],
initializer = self._config_dict['initializer'],
use_one_hot = True,
name = 'type_embeddings')
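        # Word, position and type embeddings are all created with `embedding_width`
        # (not `hidden_size`), so they can be summed before the optional projection
        # to `hidden_size` in call().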
self._embedding_layer_normalization = tf.keras.layers.LayerNormalization(
name = 'embeddings/layer_norm',
axis = -1,
epsilon = 1e-12,
dtype = tf.float32)
self._embedding_layer_normalization.build(input_shape[0] + [self._config_dict['embedding_width']])
self._embedding_dropout = tf.keras.layers.Dropout(rate = self._config_dict['dropout_rate'])
self._embedding_projection_layer = Layers.DenseEinsum(
output_shape = self._config_dict['hidden_size'],
kernel_initializer = self._config_dict['initializer'],
name = 'embedding_projection')
self._embedding_projection_layer.build(input_shape = input_shape[0] + [self._config_dict['embedding_width']])
self._self_attention_mask = Layers.SelfAttentionMask()
# transformer layer
self._transformer_layers = []
last_name = None
for layer_idx in range(self._config_dict['num_hidden_layers']):
            group_idx = int(layer_idx / self._config_dict['num_hidden_layers'] * self._config_dict['num_hidden_groups'])
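            # Maps each layer index to a group index, e.g. with num_hidden_layers=12 and
            # num_hidden_groups=1 every layer gets group 0 (full cross-layer sharing);
            # with num_hidden_groups=3, layers 0-3, 4-7 and 8-11 form three shared groups.
            # Layers in the same group reuse the previously built Transformer instance.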
if group_idx == last_name:
layer = self._transformer_layers[-1]
else:
layer = Layers.Transformer(
num_attention_heads = self._config_dict['num_attention_heads'],
intermediate_size = self._config_dict['intermediate_size'],
intermediate_activation = self._activation,
dropout_rate = self._config_dict['dropout_rate'],
attention_dropout_rate = self._config_dict['attention_dropout_rate'],
kernel_initializer = self._config_dict['initializer'],
name = 'transformer/layer_%d' % group_idx
)
layer.build([input_shape[0] + [self._config_dict['hidden_size']], input_shape[0] + input_shape[0][-1:]])
last_name = group_idx
self._transformer_layers.append(layer)
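        # Pooler head: a dense layer with tanh activation (the BERT-style 'pooler_transform'),
        # conventionally applied to the hidden state of the first token to produce a
        # fixed-size representation of the whole sequence.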
self._cls_output_layer = tf.keras.layers.Dense(
units = self._config_dict['hidden_size'],
activation = 'tanh',
kernel_initializer = self._config_dict['initializer'],
name = 'pooler_transform')
self._cls_output_layer.build(input_shape[0] + [self._config_dict['hidden_size']])
super().build(input_shape)
def call(self, inputs):
word_ids, mask, type_ids = inputs
word_embeddings = self._embedding_layer(word_ids)
position_embeddings = self._position_embedding_layer(word_embeddings)
type_embeddings = self._type_embedding_layer(type_ids)
# embeddings = tf.keras.layers.Add()([word_embeddings, position_embeddings, type_embeddings])
embeddings = word_embeddings + position_embeddings + type_embeddings
embeddings = self._embedding_layer_normalization(embeddings)
        embeddings = self._embedding_dropout(embeddings)
        # Project the embedding output to 'hidden_size' if it is not already 'hidden_size'.
if self._config_dict['embedding_width'] != self._config_dict['hidden_size']:
embeddings = self._embedding_projection_layer(embeddings)
data = embeddings
        attention_mask = self._self_attention_mask([data, mask])