Foreword

Let's continue with the pre-training part we haven't finished yet. In the previous post (BERT源码分析(PART II)) we covered how the input data is processed; now let's see how BERT trains its two objectives, "Masked LM" and "Next Sentence Prediction".
  • run_pretraining[1]
Besides the notes outside the code blocks, there are comments inside them as well. The black background used for code before seemed a bit hard on the eyes, so I'm trying white this time.
Also, the whole BERT source code analysis series has been compiled into a PDF for easier reading; you can grab it at the end of this post (don't scroll down just yet, finish reading this one first).

Task #1: Masked LM

The get_masked_lm_output function computes the training loss for Task #1. Its input is the final-layer sequence_output of BertModel ([batch_size, seq_length, hidden_size]): predicting the MASK tokens of a sequence is essentially a tagging problem, so the output states of the whole sequence are needed.
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Get loss and log probs for the masked LM."""
  # Gather the encoder outputs at the masked positions
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # Apply an extra non-linear transform before the output layer;
    # it is only used during pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # output_weights is the same matrix as the input word embeddings;
    # an extra output-only bias is added here.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # label_ids are the vocabulary ids of the masked tokens
    label_ids = tf.reshape(label_ids, [-1])
    label_weights = tf.reshape(label_weights, [-1])

    one_hot_labels = tf.one_hot(
        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

    # The number of actually masked tokens may be less than
    # max_predictions_per_seq (e.g. 18 instead of 20). In that case label_ids
    # ends with 2 padding zeros and label_weights = [1, 1, ..., 0, 0], meaning
    # the last two label_ids are padding and must be excluded from the loss.
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
    numerator = tf.reduce_sum(label_weights * per_example_loss)
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator

  return (loss, per_example_loss, log_probs)
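A quick note on gather_indexes, which is called at the top of the function but defined elsewhere in run_pretraining.py: it flattens the [batch_size, seq_length, hidden_size] tensor and picks out the hidden states at the masked positions. The snippet below is my own simplified sketch of that idea (the function name and details are illustrative, not a verbatim copy of the source):

import tensorflow as tf

def gather_indexes_sketch(sequence_tensor, positions):
  """Gather hidden states at `positions` (illustrative sketch).

  sequence_tensor: [batch_size, seq_length, width] float tensor
  positions:       [batch_size, max_predictions_per_seq] int32 tensor
  returns:         [batch_size * max_predictions_per_seq, width]
  """
  batch_size = tf.shape(sequence_tensor)[0]
  seq_length = tf.shape(sequence_tensor)[1]
  width = tf.shape(sequence_tensor)[2]

  # Offset each example's positions by where that example starts in the
  # flattened [batch_size * seq_length, width] tensor, then gather.
  flat_offsets = tf.reshape(
      tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
  flat_positions = tf.reshape(positions + flat_offsets, [-1])
  flat_sequence_tensor = tf.reshape(sequence_tensor,
                                    [batch_size * seq_length, width])
  return tf.gather(flat_sequence_tensor, flat_positions)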


Task #2: Next Sentence Prediction

The get_next_sentence_output function computes the training loss for Task #2. Its input is the final-layer pooled_output of BertModel ([batch_size, hidden_size]): since this is a binary classification problem, only the first token [CLS] of each sequence is needed.
def get_next_sentence_output(bert_config, input_tensor, labels):
  """Get loss and log probs for the next sentence prediction."""

  # Label 0 means sentence B is the actual next sentence;
  # label 1 means it is a random sentence.
  # The parameters of this classifier are discarded at fine-tuning time.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, bert_config.hidden_size],
        initializer=modeling.create_initializer(bert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    labels = tf.reshape(labels, [-1])
    one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, per_example_loss, log_probs)
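For context, the pooled_output passed into this function is produced inside modeling.BertModel: the final hidden state of the first token ([CLS]) is fed through a dense layer with a tanh activation. Roughly, the pooler looks like the simplified sketch below (not a verbatim copy of modeling.py; it assumes sequence_output and bert_config are in scope, and the variable scoping is approximate):

# Simplified sketch of BertModel's pooler, not a verbatim copy of modeling.py.
with tf.variable_scope("bert/pooler"):
  # sequence_output: [batch_size, seq_length, hidden_size]
  first_token_tensor = tf.squeeze(sequence_output[:, 0:1, :], axis=1)  # [CLS]
  pooled_output = tf.layers.dense(
      first_token_tensor,
      bert_config.hidden_size,
      activation=tf.tanh,
      kernel_initializer=modeling.create_initializer(
          bert_config.initializer_range))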


Custom Model

The model_fn_builder function constructs the model_fn used by the Estimator. With the two training tasks above defined, we can write out the training procedure and then feed in the training set for the Estimator to train automatically.
def model_fn_builder(bert_config, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     use_one_hot_embeddings):

  def model_fn(features, labels, mode, params):
    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    masked_lm_positions = features["masked_lm_positions"]
    masked_lm_ids = features["masked_lm_ids"]
    masked_lm_weights = features["masked_lm_weights"]
    next_sentence_labels = features["next_sentence_labels"]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # Instantiate the Transformer model
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # Per-example loss, mean loss and log-prob matrix for the masked LM task
    (masked_lm_loss,
     masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output(
         bert_config, model.get_sequence_output(), model.get_embedding_table(),
         masked_lm_positions, masked_lm_ids, masked_lm_weights)

    # Per-example loss, mean loss and log-prob matrix for next sentence prediction
    (next_sentence_loss, next_sentence_example_loss,
     next_sentence_log_probs) = get_next_sentence_output(
         bert_config, model.get_pooled_output(), next_sentence_labels)

    # The total loss is defined as the sum of the two task losses
    total_loss = masked_lm_loss + next_sentence_loss

    # Collect all trainable variables
    tvars = tf.trainable_variables()

    initialized_variable_names = {}
    scaffold_fn = None
    # Restore from a previously saved checkpoint if one is given
    if init_checkpoint:
      (assignment_map, initialized_variable_names
      ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
      if use_tpu:

        def tpu_scaffold():
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
          return tf.train.Scaffold()

        scaffold_fn = tpu_scaffold
      else:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)

    output_spec = None
    # Training: build the optimizer and the TRAIN spec
    if mode == tf.estimator.ModeKeys.TRAIN:
      train_op = optimization.create_optimizer(
          total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op,
          scaffold_fn=scaffold_fn)
    # Evaluation: build the EVAL spec with metrics
    elif mode == tf.estimator.ModeKeys.EVAL:

      def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                    masked_lm_weights, next_sentence_example_loss,
                    next_sentence_log_probs, next_sentence_labels):
        """Compute loss and accuracy for both tasks."""
        masked_lm_log_probs = tf.reshape(masked_lm_log_probs,
                                         [-1, masked_lm_log_probs.shape[-1]])
        masked_lm_predictions = tf.argmax(
            masked_lm_log_probs, axis=-1, output_type=tf.int32)
        masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
        masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
        masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
        masked_lm_accuracy = tf.metrics.accuracy(
            labels=masked_lm_ids,
            predictions=masked_lm_predictions,
            weights=masked_lm_weights)
        masked_lm_mean_loss = tf.metrics.mean(
            values=masked_lm_example_loss, weights=masked_lm_weights)

        next_sentence_log_probs = tf.reshape(
            next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])
        next_sentence_predictions = tf.argmax(
            next_sentence_log_probs, axis=-1, output_type=tf.int32)
        next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
        next_sentence_accuracy = tf.metrics.accuracy(
            labels=next_sentence_labels, predictions=next_sentence_predictions)
        next_sentence_mean_loss = tf.metrics.mean(
            values=next_sentence_example_loss)

        return {
            "masked_lm_accuracy": masked_lm_accuracy,
            "masked_lm_loss": masked_lm_mean_loss,
            "next_sentence_accuracy": next_sentence_accuracy,
            "next_sentence_loss": next_sentence_mean_loss,
        }

      eval_metrics = (metric_fn, [
          masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
          masked_lm_weights, next_sentence_example_loss,
          next_sentence_log_probs, next_sentence_labels
      ])
      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=eval_metrics,
          scaffold_fn=scaffold_fn)
    else:
      raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode))

    return output_spec

  return model_fn
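One design note: the (metric_fn, list_of_tensors) tuple passed as eval_metrics is a TPUEstimatorSpec convention. If you were to adapt this model_fn to a plain tf.estimator.Estimator, the equivalent would be an eval_metric_ops dict on a regular EstimatorSpec; a hypothetical sketch of that adaptation (not part of the original script):

# Hypothetical non-TPU variant: EstimatorSpec takes eval_metric_ops, a dict of
# name -> (value, update_op) pairs, which is exactly what tf.metrics.* returns,
# so metric_fn's return value can be passed through directly.
eval_metric_ops = metric_fn(
    masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
    masked_lm_weights, next_sentence_example_loss,
    next_sentence_log_probs, next_sentence_labels)
output_spec = tf.estimator.EstimatorSpec(
    mode=mode, loss=total_loss, eval_metric_ops=eval_metric_ops)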

Main Function

The training procedure is implemented on top of the functions above:
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  if not FLAGS.do_train and not FLAGS.do_eval:
    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  tf.gfile.MakeDirs(FLAGS.output_dir)

  input_files = []
  for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.gfile.Glob(input_pattern))

  tf.logging.info("*** Input Files ***")
  for input_file in input_files:
    tf.logging.info("  %s" % input_file)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  # The custom model_fn used by the Estimator
  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=FLAGS.num_train_steps,
      num_warmup_steps=FLAGS.num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu)

  # If no TPU is available, this falls back to a normal CPU/GPU Estimator
  estimator = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size)

  if FLAGS.do_train:
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    train_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=True)
    estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps)

  if FLAGS.do_eval:
    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

    eval_input_fn = input_fn_builder(
        input_files=input_files,
        max_seq_length=FLAGS.max_seq_length,
        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        is_training=False)

    result = estimator.evaluate(
        input_fn=eval_input_fn, steps=FLAGS.max_eval_steps)

    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
      tf.logging.info("***** Eval results *****")
      for key in sorted(result.keys()):
        tf.logging.info("  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))
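main() also depends on input_fn_builder, which lives in run_pretraining.py but isn't shown in this post: it turns the TFRecord files written by create_pretraining_data.py (see PART II) into the feature dict that model_fn expects. Below is a simplified sketch of what it does; the real version additionally uses parallel file interleaving and TPU-oriented tuning, so treat this as illustrative rather than a verbatim copy:

def input_fn_builder_sketch(input_files, max_seq_length,
                            max_predictions_per_seq, is_training):
  """Simplified pre-training input pipeline (illustrative only)."""

  def input_fn(params):
    batch_size = params["batch_size"]

    name_to_features = {
        "input_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
        "masked_lm_positions":
            tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
        "masked_lm_ids":
            tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
        "masked_lm_weights":
            tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
        "next_sentence_labels": tf.FixedLenFeature([1], tf.int64),
    }

    def decode_record(record):
      example = tf.parse_single_example(record, name_to_features)
      # TPUs only support int32, so cast the int64 features down.
      for name in list(example.keys()):
        if example[name].dtype == tf.int64:
          example[name] = tf.to_int32(example[name])
      return example

    d = tf.data.TFRecordDataset(input_files)
    if is_training:
      d = d.repeat()
      d = d.shuffle(buffer_size=10000)
    d = d.map(decode_record)
    d = d.batch(batch_size, drop_remainder=True)
    return d

  return input_fn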

Testing the Code

Pre-training launch script:
python run_pretraining.py \
  --input_file=/tmp/tf_examples.tfrecord \
  --output_dir=/tmp/pretraining_output \
  --do_train=True \
  --do_eval=True \
  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
  --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
  --train_batch_size=32 \
  --max_seq_length=128 \
  --max_predictions_per_seq=20 \
  --num_train_steps=20 \
  --num_warmup_steps=10 \
  --learning_rate=2e-5

Afterwards you should see output logs similar to the following:
***** Eval results *****
global_step = 20
loss = 0.0979674
masked_lm_accuracy = 0.985479
masked_lm_loss = 0.0979328
next_sentence_accuracy = 1.0
next_sentence_loss = 3.45724e-05

Finally, some tips on the pre-training process [not that I could run it at full scale myself anyway, just have a look =.=]

Over~ That's the end of the BERT source code series!
For easier reading, I've packaged everything into a complete PDF; follow the NewBeeNLP WeChat account and reply "BERT源码" to download it.
P.S. BERT itself has seen quite a few updates by now, such as Whole Word Masking, so if you spot any mistakes in earlier posts, please do point them out so I can fix them promptly~

References

[1] run_pretraining: https://github.com/google-research/bert/blob/master/run_pretraining.py