diff --git a/Basic/Albert/albert_tiny_tf/albert/__pycache__/__init__.cpython-36.pyc b/Basic/Albert/albert_tiny_tf/albert/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..07ec522 Binary files /dev/null and b/Basic/Albert/albert_tiny_tf/albert/__pycache__/__init__.cpython-36.pyc differ diff --git a/Basic/Albert/albert_tiny_tf/albert/args.py b/Basic/Albert/albert_tiny_tf/albert/args.py new file mode 100644 index 0000000..993c1bd --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/args.py @@ -0,0 +1,37 @@ +import os +import tensorflow as tf + +tf.logging.set_verbosity(tf.logging.INFO) + +file_path = os.path.dirname(__file__) + + +#模型目录 +model_dir = os.path.join(file_path, 'albert_lcqmc_checkpoints/') + +#config文件 +config_name = os.path.join(file_path, 'albert_config/albert_config.json') +#ckpt文件名称 +ckpt_name = os.path.join(model_dir, 'model.ckpt') +#输出文件目录 +output_dir = os.path.join(file_path, 'albert_lcqmc_checkpoints/') +#vocab文件目录 +vocab_file = os.path.join(file_path, 'albert_config/vocab.txt') +#数据目录 +data_dir = os.path.join(file_path, 'data/') + +num_train_epochs = 10 +batch_size = 128 +learning_rate = 0.00005 + +# gpu使用率 +gpu_memory_fraction = 0.8 + +# 默认取倒数第二层的输出值作为句向量 +layer_indexes = [-2] + +# 序列的最大程度,单文本建议把该值调小 +max_seq_len = 128 + +# graph名字 +graph_file = os.path.join(file_path, 'albert_lcqmc_checkpoints/graph') \ No newline at end of file diff --git a/Basic/Albert/albert_tiny_tf/albert/bert_utils.py b/Basic/Albert/albert_tiny_tf/albert/bert_utils.py new file mode 100644 index 0000000..f0a731f --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/bert_utils.py @@ -0,0 +1,148 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import copy +import json +import math +import re +import six +import tensorflow as tf + + +def get_shape_list(tensor, expected_rank=None, name=None): + """Returns a list of the shape of tensor, preferring static dimensions. + + Args: + tensor: A tf.Tensor object to find the shape of. + expected_rank: (optional) int. The expected rank of `tensor`. If this is + specified and the `tensor` has a different rank, and exception will be + thrown. + name: Optional name of the tensor for the error message. + + Returns: + A list of dimensions of the shape of tensor. All static dimensions will + be returned as python integers, and dynamic dimensions will be returned + as tf.Tensor scalars. + """ + if name is None: + name = tensor.name + + if expected_rank is not None: + assert_rank(tensor, expected_rank, name) + + shape = tensor.shape.as_list() + + non_static_indexes = [] + for (index, dim) in enumerate(shape): + if dim is None: + non_static_indexes.append(index) + + if not non_static_indexes: + return shape + + dyn_shape = tf.shape(tensor) + for index in non_static_indexes: + shape[index] = dyn_shape[index] + return shape + + +def reshape_to_matrix(input_tensor): + """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" + ndims = input_tensor.shape.ndims + if ndims < 2: + raise ValueError("Input tensor must have at least rank 2. 
Shape = %s" % + (input_tensor.shape)) + if ndims == 2: + return input_tensor + + width = input_tensor.shape[-1] + output_tensor = tf.reshape(input_tensor, [-1, width]) + return output_tensor + + +def reshape_from_matrix(output_tensor, orig_shape_list): + """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" + if len(orig_shape_list) == 2: + return output_tensor + + output_shape = get_shape_list(output_tensor) + + orig_dims = orig_shape_list[0:-1] + width = output_shape[-1] + + return tf.reshape(output_tensor, orig_dims + [width]) + + +def assert_rank(tensor, expected_rank, name=None): + """Raises an exception if the tensor rank is not of the expected rank. + + Args: + tensor: A tf.Tensor to check the rank of. + expected_rank: Python integer or list of integers, expected rank. + name: Optional name of the tensor for the error message. + + Raises: + ValueError: If the expected shape doesn't match the actual shape. + """ + if name is None: + name = tensor.name + + expected_rank_dict = {} + if isinstance(expected_rank, six.integer_types): + expected_rank_dict[expected_rank] = True + else: + for x in expected_rank: + expected_rank_dict[x] = True + + actual_rank = tensor.shape.ndims + if actual_rank not in expected_rank_dict: + scope_name = tf.get_variable_scope().name + raise ValueError( + "For the tensor `%s` in scope `%s`, the actual rank " + "`%d` (shape = %s) is not equal to the expected rank `%s`" % + (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) + + +def gather_indexes(sequence_tensor, positions): + """Gathers the vectors at the specific positions over a minibatch.""" + sequence_shape = get_shape_list(sequence_tensor, expected_rank=3) + batch_size = sequence_shape[0] + seq_length = sequence_shape[1] + width = sequence_shape[2] + + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + return output_tensor + + +# add sequence mask for: +# 1. random shuffle lm modeling---xlnet with random shuffled input +# 2. left2right and right2left language modeling +# 3. 
conditional generation +def generate_seq2seq_mask(attention_mask, mask_sequence, seq_type, **kargs): + if seq_type == 'seq2seq': + if mask_sequence is not None: + seq_shape = get_shape_list(mask_sequence, expected_rank=2) + seq_len = seq_shape[1] + ones = tf.ones((1, seq_len, seq_len)) + a_mask = tf.matrix_band_part(ones, -1, 0) + s_ex12 = tf.expand_dims(tf.expand_dims(mask_sequence, 1), 2) + s_ex13 = tf.expand_dims(tf.expand_dims(mask_sequence, 1), 3) + a_mask = (1 - s_ex13) * (1 - s_ex12) + s_ex13 * a_mask + # generate mask of batch x seq_len x seq_len + a_mask = tf.reshape(a_mask, (-1, seq_len, seq_len)) + out_mask = attention_mask * a_mask + else: + ones = tf.ones_like(attention_mask[:1]) + mask = (tf.matrix_band_part(ones, -1, 0)) + out_mask = attention_mask * mask + else: + out_mask = attention_mask + + return out_mask diff --git a/Basic/Albert/albert_tiny_tf/albert/create_pretrain_data.sh b/Basic/Albert/albert_tiny_tf/albert/create_pretrain_data.sh new file mode 100644 index 0000000..b7185f1 --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/create_pretrain_data.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +BERT_BASE_DIR=./albert_config +python3 create_pretraining_data.py --do_whole_word_mask=True --input_file=data/news_zh_1.txt \ +--output_file=data/tf_news_2016_zh_raw_news2016zh_1.tfrecord --vocab_file=$BERT_BASE_DIR/vocab.txt --do_lower_case=True \ +--max_seq_length=512 --max_predictions_per_seq=51 --masked_lm_prob=0.10 \ No newline at end of file diff --git a/Basic/Albert/albert_tiny_tf/albert/create_pretraining_data.py b/Basic/Albert/albert_tiny_tf/albert/create_pretraining_data.py new file mode 100644 index 0000000..63a4234 --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/create_pretraining_data.py @@ -0,0 +1,708 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Create masked LM/next sentence masked_lm TF examples for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import random +import tokenization +import tensorflow as tf +import jieba +import re +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, + "Input raw text file (or comma-separated list of files).") + +flags.DEFINE_string( + "output_file", None, + "Output TF example file (or comma-separated list of files).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. 
Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_bool( + "do_whole_word_mask", False, + "Whether to use whole word masking rather than per-WordPiece masking.") + +flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") + +flags.DEFINE_integer("max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence.") + +flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") + +flags.DEFINE_integer( + "dupe_factor", 10, + "Number of times to duplicate the input data (with different masks).") + +flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") + +flags.DEFINE_float( + "short_seq_prob", 0.1, + "Probability of creating sequences which are shorter than the " + "maximum length.") + +flags.DEFINE_bool("non_chinese", False,"manually set this to True if you are not doing chinese pre-train task.") + + +class TrainingInstance(object): + """A single training instance (sentence pair).""" + + def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, + is_random_next): + self.tokens = tokens + self.segment_ids = segment_ids + self.is_random_next = is_random_next + self.masked_lm_positions = masked_lm_positions + self.masked_lm_labels = masked_lm_labels + + def __str__(self): + s = "" + s += "tokens: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens])) + s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) + s += "is_random_next: %s\n" % self.is_random_next + s += "masked_lm_positions: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions])) + s += "masked_lm_labels: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels])) + s += "\n" + return s + + def __repr__(self): + return self.__str__() + + +def write_instance_to_example_files(instances, tokenizer, max_seq_length, + max_predictions_per_seq, output_files): + """Create TF example files from `TrainingInstance`s.""" + writers = [] + for output_file in output_files: + writers.append(tf.python_io.TFRecordWriter(output_file)) + + writer_index = 0 + + total_written = 0 + for (inst_index, instance) in enumerate(instances): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + assert len(input_ids) <= max_seq_length + + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) + masked_lm_weights = [1.0] * len(masked_lm_ids) + + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + masked_lm_weights.append(0.0) + + next_sentence_label = 1 if instance.is_random_next else 0 + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(input_ids) + features["input_mask"] = create_int_feature(input_mask) + features["segment_ids"] = create_int_feature(segment_ids) + features["masked_lm_positions"] = create_int_feature(masked_lm_positions) + features["masked_lm_ids"] = create_int_feature(masked_lm_ids) + features["masked_lm_weights"] = create_float_feature(masked_lm_weights) + features["next_sentence_labels"] = create_int_feature([next_sentence_label]) + + tf_example 
= tf.train.Example(features=tf.train.Features(feature=features)) + + writers[writer_index].write(tf_example.SerializeToString()) + writer_index = (writer_index + 1) % len(writers) + + total_written += 1 + + if inst_index < 20: + tf.logging.info("*** Example ***") + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in instance.tokens])) + + for feature_name in features.keys(): + feature = features[feature_name] + values = [] + if feature.int64_list.value: + values = feature.int64_list.value + elif feature.float_list.value: + values = feature.float_list.value + tf.logging.info( + "%s: %s" % (feature_name, " ".join([str(x) for x in values]))) + + for writer in writers: + writer.close() + + tf.logging.info("Wrote %d total instances", total_written) + + +def create_int_feature(values): + feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return feature + + +def create_float_feature(values): + feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return feature + + +def create_training_instances(input_files, tokenizer, max_seq_length, + dupe_factor, short_seq_prob, masked_lm_prob, + max_predictions_per_seq, rng): + """Create `TrainingInstance`s from raw text.""" + all_documents = [[]] + + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. (Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. + for input_file in input_files: + with tf.gfile.GFile(input_file, "r") as reader: + while True: + strings=reader.readline() + strings=strings.replace(" "," ").replace(" "," ") # 如果有两个或三个空格,替换为一个空格 + line = tokenization.convert_to_unicode(strings) + if not line: + break + line = line.strip() + + # Empty lines are used as document delimiters + if not line: + all_documents.append([]) + tokens = tokenizer.tokenize(line) + if tokens: + all_documents[-1].append(tokens) + + # Remove empty documents + all_documents = [x for x in all_documents if x] + rng.shuffle(all_documents) + + vocab_words = list(tokenizer.vocab.keys()) + instances = [] + for _ in range(dupe_factor): + for document_index in range(len(all_documents)): + instances.extend( + create_instances_from_document_albert( # change to albert style for sentence order prediction(SOP), 2019-08-28, brightmart + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) + + rng.shuffle(instances) + return instances + +def get_new_segment(segment): # 新增的方法 #### + """ + 输入一句话,返回一句经过处理的话: 为了支持中文全称mask,将被分开的词,将上特殊标记("#"),使得后续处理模块,能够知道哪些字是属于同一个词的。 + :param segment: 一句话. e.g. ['悬', '灸', '技', '术', '培', '训', '专', '家', '教', '你', '艾', '灸', '降', '血', '糖', ',', '为', '爸', '妈', '收', '好', '了', '!'] + :return: 一句处理过的话 e.g. 
['悬', '##灸', '技', '术', '培', '训', '专', '##家', '教', '你', '艾', '##灸', '降', '##血', '##糖', ',', '为', '爸', '##妈', '收', '##好', '了', '!'] + """ + seq_cws = jieba.lcut("".join(segment)) # 分词 + seq_cws_dict = {x: 1 for x in seq_cws} # 分词后的词加入到词典dict + new_segment = [] + i = 0 + while i < len(segment): # 从句子的第一个字开始处理,知道处理完整个句子 + if len(re.findall('[\u4E00-\u9FA5]', segment[i])) == 0: # 如果找不到中文的,原文加进去即不用特殊处理。 + new_segment.append(segment[i]) + i += 1 + continue + + has_add = False + for length in range(3, 0, -1): + if i + length > len(segment): + continue + if ''.join(segment[i:i + length]) in seq_cws_dict: + new_segment.append(segment[i]) + for l in range(1, length): + new_segment.append('##' + segment[i + l]) + i += length + has_add = True + break + if not has_add: + new_segment.append(segment[i]) + i += 1 + # print("get_new_segment.wwm.get_new_segment:",new_segment) + return new_segment + +def create_instances_from_document_albert( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document. + This method is changed to create sentence-order prediction (SOP) followed by idea from paper of ALBERT, 2019-08-28, brightmart + """ + document = all_documents[document_index] # 得到一个文档 + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. + target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: # 有一定的比例,如10%的概率,我们使用比较短的序列长度,以缓解预训练的长序列和调优阶段(可能的)短序列的不一致情况 + target_seq_length = rng.randint(2, max_num_tokens) + + # We DON'T just concatenate all of the tokens from a document into a long + # sequence and choose an arbitrary split point because this would make the + # next sentence prediction task too easy. Instead, we split the input into + # segments "A" and "B" based on the actual "sentences" provided by the user + # input. + # 设法使用实际的句子,而不是任意的截断句子,从而更好的构造句子连贯性预测的任务 + instances = [] + current_chunk = [] # 当前处理的文本段,包含多个句子 + current_length = 0 + i = 0 + # print("###document:",document) # 一个document可以是一整篇文章、新闻、词条等. 
document:[['是', '爷', '们', ',', '就', '得', '给', '媳', '妇', '幸', '福'], ['关', '注', '【', '晨', '曦', '教', '育', '】', ',', '获', '取', '育', '儿', '的', '智', '慧', ',', '与', '孩', '子', '一', '同', '成', '长', '!'], ['方', '法', ':', '打', '开', '微', '信', '→', '添', '加', '朋', '友', '→', '搜', '号', '→', '##he', '##bc', '##x', '##jy', '##→', '关', '注', '!', '我', '是', '一', '个', '爷', '们', ',', '孝', '顺', '是', '做', '人', '的', '第', '一', '准', '则', '。'], ['甭', '管', '小', '时', '候', '怎', '么', '跟', '家', '长', '犯', '混', '蛋', ',', '长', '大', '了', ',', '就', '底', '报', '答', '父', '母', ',', '以', '后', '我', '媳', '妇', '也', '必', '须', '孝', '顺', '。'], ['我', '是', '一', '个', '爷', '们', ',', '可', '以', '花', '心', ',', '可', '以', '好', '玩', '。'], ['但', '我', '一', '定', '会', '找', '一', '个', '管', '的', '住', '我', '的', '女', '人', ',', '和', '我', '一', '起', '生', '活', '。'], ['28', '岁', '以', '前', '在', '怎', '么', '玩', '都', '行', ',', '但', '我', '最', '后', '一', '定', '会', '找', '一', '个', '勤', '俭', '持', '家', '的', '女', '人', '。'], ['我', '是', '一', '爷', '们', ',', '我', '不', '会', '让', '自', '己', '的', '女', '人', '受', '一', '点', '委', '屈', ',', '每', '次', '把', '她', '抱', '在', '怀', '里', ',', '看', '她', '洋', '溢', '着', '幸', '福', '的', '脸', ',', '我', '都', '会', '引', '以', '为', '傲', ',', '这', '特', '么', '就', '是', '我', '的', '女', '人', '。'], ['我', '是', '一', '爷', '们', ',', '干', '什', '么', '也', '不', '能', '忘', '了', '自', '己', '媳', '妇', ',', '就', '算', '和', '哥', '们', '一', '起', '喝', '酒', ',', '喝', '到', '很', '晚', ',', '也', '要', '提', '前', '打', '电', '话', '告', '诉', '她', ',', '让', '她', '早', '点', '休', '息', '。'], ['我', '是', '一', '爷', '们', ',', '我', '媳', '妇', '绝', '对', '不', '能', '抽', '烟', ',', '喝', '酒', '还', '勉', '强', '过', '得', '去', ',', '不', '过', '该', '喝', '的', '时', '候', '喝', ',', '不', '该', '喝', '的', '时', '候', ',', '少', '扯', '纳', '极', '薄', '蛋', '。'], ['我', '是', '一', '爷', '们', ',', '我', '媳', '妇', '必', '须', '听', '我', '话', ',', '在', '人', '前', '一', '定', '要', '给', '我', '面', '子', ',', '回', '家', '了', '咱', '什', '么', '都', '好', '说', '。'], ['我', '是', '一', '爷', '们', ',', '就', '算', '难', '的', '吃', '不', '上', '饭', '了', ',', '都', '不', '张', '口', '跟', '媳', '妇', '要', '一', '分', '钱', '。'], ['我', '是', '一', '爷', '们', ',', '不', '管', '上', '学', '还', '是', '上', '班', ',', '我', '都', '会', '送', '媳', '妇', '回', '家', '。'], ['我', '是', '一', '爷', '们', ',', '交', '往', '不', '到', '1', '年', ',', '绝', '对', '不', '会', '和', '媳', '妇', '提', '过', '分', '的', '要', '求', ',', '我', '会', '尊', '重', '她', '。'], ['我', '是', '一', '爷', '们', ',', '游', '戏', '永', '远', '比', '不', '上', '我', '媳', '妇', '重', '要', ',', '只', '要', '媳', '妇', '发', '话', ',', '我', '绝', '对', '唯', '命', '是', '从', '。'], ['我', '是', '一', '爷', '们', ',', '上', 'q', '绝', '对', '是', '为', '了', '等', '媳', '妇', ',', '所', '有', '暧', '昧', '的', '心', '情', '只', '为', '她', '一', '个', '女', '人', '而', '写', ',', '我', '不', '一', '定', '会', '经', '常', '写', '日', '志', ',', '可', '是', '我', '会', '告', '诉', '全', '世', '界', ',', '我', '很', '爱', '她', '。'], ['我', '是', '一', '爷', '们', ',', '不', '一', '定', '要', '经', '常', '制', '造', '浪', '漫', '、', '偶', '尔', '过', '个', '节', '日', '也', '要', '送', '束', '玫', '瑰', '花', '给', '媳', '妇', '抱', '回', '家', '。'], ['我', '是', '一', '爷', '们', ',', '手', '机', '会', '24', '小', '时', '为', '她', '开', '机', ',', '让', '她', '半', '夜', '痛', '经', '的', '时', '候', ',', '做', '恶', '梦', '的', '时', '候', ',', '随', '时', '可', '以', '联', '系', '到', '我', '。'], ['我', '是', '一', '爷', '们', ',', '我', '会', '经', '常', '带', '媳', '妇', '出', '去', '玩', ',', '她', '不', '一', '定', '要', '和', '我', '所', '有', '的', '哥', '们', '都', '认', '识', ',', '但', '见', '面', '能', '说', '的', '上', '话', '就', '行', '。'], ['我', '是', '一', '爷', '们', ',', '我', '会', '和', '媳', '妇', '的', '姐', '妹', '哥', '们', '搞', '好', 
'关', '系', ',', '让', '她', '们', '相', '信', '我', '一', '定', '可', '以', '给', '我', '媳', '妇', '幸', '福', '。'], ['我', '是', '一', '爷', '们', ',', '吵', '架', '后', '、', '也', '要', '主', '动', '打', '电', '话', '关', '心', '她', ',', '咱', '是', '一', '爷', '们', ',', '给', '媳', '妇', '服', '个', '软', ',', '道', '个', '歉', '怎', '么', '了', '?'], ['我', '是', '一', '爷', '们', ',', '绝', '对', '不', '会', '嫌', '弃', '自', '己', '媳', '妇', ',', '拿', '她', '和', '别', '人', '比', ',', '说', '她', '这', '不', '如', '人', '家', ',', '纳', '不', '如', '人', '家', '的', '。'], ['我', '是', '一', '爷', '们', ',', '陪', '媳', '妇', '逛', '街', '时', ',', '碰', '见', '熟', '人', ',', '无', '论', '我', '媳', '妇', '长', '的', '好', '看', '与', '否', ',', '我', '都', '会', '大', '方', '的', '介', '绍', '。'], ['谁', '让', '咱', '爷', '们', '就', '好', '这', '口', '呢', '。'], ['我', '是', '一', '爷', '们', ',', '我', '想', '我', '会', '给', '我', '媳', '妇', '最', '好', '的', '幸', '福', '。'], ['【', '我', '们', '重', '在', '分', '享', '。'], ['所', '有', '文', '字', '和', '美', '图', ',', '来', '自', '网', '络', ',', '晨', '欣', '教', '育', '整', '理', '。'], ['对', '原', '文', '作', '者', ',', '表', '示', '敬', '意', '。'], ['】', '关', '注', '晨', '曦', '教', '育', '[UNK]', '[UNK]', '晨', '曦', '教', '育', '(', '微', '信', '号', ':', 'he', '##bc', '##x', '##jy', ')', '。'], ['打', '开', '微', '信', ',', '扫', '描', '二', '维', '码', ',', '关', '注', '[UNK]', '晨', '曦', '教', '育', '[UNK]', ',', '获', '取', '更', '多', '育', '儿', '资', '源', '。'], ['点', '击', '下', '面', '订', '阅', '按', '钮', '订', '阅', ',', '会', '有', '更', '多', '惊', '喜', '哦', '!']] + while i < len(document): # 从文档的第一个位置开始,按个往下看 + segment = document[i] # segment是列表,代表的是按字分开的一个完整句子,如 segment=['我', '是', '一', '爷', '们', ',', '我', '想', '我', '会', '给', '我', '媳', '妇', '最', '好', '的', '幸', '福', '。'] + if FLAGS.non_chinese==False: # if non chinese is False, that means it is chinese, then do something to make chinese whole word mask works. + segment = get_new_segment(segment) # whole word mask for chinese: 结合分词的中文的whole mask设置即在需要的地方加上“##” + + current_chunk.append(segment) # 将一个独立的句子加入到当前的文本块中 + current_length += len(segment) # 累计到为止位置接触到句子的总长度 + if i == len(document) - 1 or current_length >= target_seq_length: + # 如果累计的序列长度达到了目标的长度,或当前走到了文档结尾==>构造并添加到“A[SEP]B“中的A和B中; + if current_chunk: # 如果当前块不为空 + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. + a_end = 1 + if len(current_chunk) >= 2: # 当前块,如果包含超过两个句子,取当前块的一部分作为“A[SEP]B“中的A部分 + a_end = rng.randint(1, len(current_chunk) - 1) + # 将当前文本段中选取出来的前半部分,赋值给A即tokens_a + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + # 构造“A[SEP]B“中的B部分(有一部分是正常的当前文档中的后半部;在原BERT的实现中一部分是随机的从另一个文档中选取的,) + tokens_b = [] + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + + # 有百分之50%的概率交换一下tokens_a和tokens_b的位置 + # print("tokens_a length1:",len(tokens_a)) + # print("tokens_b length1:",len(tokens_b)) # len(tokens_b) = 0 + + if len(tokens_a) == 0 or len(tokens_b) == 0: i += 1; continue + if rng.random() < 0.5: # 交换一下tokens_a和tokens_b + is_random_next=True + temp=tokens_a + tokens_a=tokens_b + tokens_b=temp + else: + is_random_next=False + + truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) + + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + # 把tokens_a & tokens_b加入到按照bert的风格,即以[CLS]tokens_a[SEP]tokens_b[SEP]的形式,结合到一起,作为最终的tokens; 也带上segment_ids,前面部分segment_ids的值是0,后面部分的值是1. 
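+              # Assemble the final example in BERT format: [CLS] tokens_a [SEP] tokens_b [SEP],
+              # with segment_ids of 0 for the [CLS]/tokens_a portion and 1 for the tokens_b portion.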
+ tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + + tokens.append("[SEP]") + segment_ids.append(0) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + # 创建masked LM的任务的数据 Creates the predictions for the masked LM objective + (tokens, masked_lm_positions, + masked_lm_labels) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) + instance = TrainingInstance( # 创建训练实例的对象 + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels) + instances.append(instance) + current_chunk = [] # 清空当前块 + current_length = 0 # 重置当前文本块的长度 + i += 1 # 接着文档中的内容往后看 + + return instances + + +def create_instances_from_document_original( # THIS IS ORIGINAL BERT STYLE FOR CREATE DATA OF MLM AND NEXT SENTENCE PREDICTION TASK + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] # 得到一个文档 + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. + target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: # 有一定的比例,如10%的概率,我们使用比较短的序列长度,以缓解预训练的长序列和调优阶段(可能的)短序列的不一致情况 + target_seq_length = rng.randint(2, max_num_tokens) + + # We DON'T just concatenate all of the tokens from a document into a long + # sequence and choose an arbitrary split point because this would make the + # next sentence prediction task too easy. Instead, we split the input into + # segments "A" and "B" based on the actual "sentences" provided by the user + # input. + # 设法使用实际的句子,而不是任意的截断句子,从而更好的构造句子连贯性预测的任务 + instances = [] + current_chunk = [] # 当前处理的文本段,包含多个句子 + current_length = 0 + i = 0 + # print("###document:",document) # 一个document可以是一整篇文章、新闻、一个词条等. 
document:[['是', '爷', '们', ',', '就', '得', '给', '媳', '妇', '幸', '福'], ['关', '注', '【', '晨', '曦', '教', '育', '】', ',', '获', '取', '育', '儿', '的', '智', '慧', ',', '与', '孩', '子', '一', '同', '成', '长', '!'], ['方', '法', ':', '打', '开', '微', '信', '→', '添', '加', '朋', '友', '→', '搜', '号', '→', '##he', '##bc', '##x', '##jy', '##→', '关', '注', '!', '我', '是', '一', '个', '爷', '们', ',', '孝', '顺', '是', '做', '人', '的', '第', '一', '准', '则', '。'], ['甭', '管', '小', '时', '候', '怎', '么', '跟', '家', '长', '犯', '混', '蛋', ',', '长', '大', '了', ',', '就', '底', '报', '答', '父', '母', ',', '以', '后', '我', '媳', '妇', '也', '必', '须', '孝', '顺', '。'], ['我', '是', '一', '个', '爷', '们', ',', '可', '以', '花', '心', ',', '可', '以', '好', '玩', '。'], ['但', '我', '一', '定', '会', '找', '一', '个', '管', '的', '住', '我', '的', '女', '人', ',', '和', '我', '一', '起', '生', '活', '。'], ['28', '岁', '以', '前', '在', '怎', '么', '玩', '都', '行', ',', '但', '我', '最', '后', '一', '定', '会', '找', '一', '个', '勤', '俭', '持', '家', '的', '女', '人', '。'], ['我', '是', '一', '爷', '们', ',', '我', '不', '会', '让', '自', '己', '的', '女', '人', '受', '一', '点', '委', '屈', ',', '每', '次', '把', '她', '抱', '在', '怀', '里', ',', '看', '她', '洋', '溢', '着', '幸', '福', '的', '脸', ',', '我', '都', '会', '引', '以', '为', '傲', ',', '这', '特', '么', '就', '是', '我', '的', '女', '人', '。'], ['我', '是', '一', '爷', '们', ',', '干', '什', '么', '也', '不', '能', '忘', '了', '自', '己', '媳', '妇', ',', '就', '算', '和', '哥', '们', '一', '起', '喝', '酒', ',', '喝', '到', '很', '晚', ',', '也', '要', '提', '前', '打', '电', '话', '告', '诉', '她', ',', '让', '她', '早', '点', '休', '息', '。'], ['我', '是', '一', '爷', '们', ',', '我', '媳', '妇', '绝', '对', '不', '能', '抽', '烟', ',', '喝', '酒', '还', '勉', '强', '过', '得', '去', ',', '不', '过', '该', '喝', '的', '时', '候', '喝', ',', '不', '该', '喝', '的', '时', '候', ',', '少', '扯', '纳', '极', '薄', '蛋', '。'], ['我', '是', '一', '爷', '们', ',', '我', '媳', '妇', '必', '须', '听', '我', '话', ',', '在', '人', '前', '一', '定', '要', '给', '我', '面', '子', ',', '回', '家', '了', '咱', '什', '么', '都', '好', '说', '。'], ['我', '是', '一', '爷', '们', ',', '就', '算', '难', '的', '吃', '不', '上', '饭', '了', ',', '都', '不', '张', '口', '跟', '媳', '妇', '要', '一', '分', '钱', '。'], ['我', '是', '一', '爷', '们', ',', '不', '管', '上', '学', '还', '是', '上', '班', ',', '我', '都', '会', '送', '媳', '妇', '回', '家', '。'], ['我', '是', '一', '爷', '们', ',', '交', '往', '不', '到', '1', '年', ',', '绝', '对', '不', '会', '和', '媳', '妇', '提', '过', '分', '的', '要', '求', ',', '我', '会', '尊', '重', '她', '。'], ['我', '是', '一', '爷', '们', ',', '游', '戏', '永', '远', '比', '不', '上', '我', '媳', '妇', '重', '要', ',', '只', '要', '媳', '妇', '发', '话', ',', '我', '绝', '对', '唯', '命', '是', '从', '。'], ['我', '是', '一', '爷', '们', ',', '上', 'q', '绝', '对', '是', '为', '了', '等', '媳', '妇', ',', '所', '有', '暧', '昧', '的', '心', '情', '只', '为', '她', '一', '个', '女', '人', '而', '写', ',', '我', '不', '一', '定', '会', '经', '常', '写', '日', '志', ',', '可', '是', '我', '会', '告', '诉', '全', '世', '界', ',', '我', '很', '爱', '她', '。'], ['我', '是', '一', '爷', '们', ',', '不', '一', '定', '要', '经', '常', '制', '造', '浪', '漫', '、', '偶', '尔', '过', '个', '节', '日', '也', '要', '送', '束', '玫', '瑰', '花', '给', '媳', '妇', '抱', '回', '家', '。'], ['我', '是', '一', '爷', '们', ',', '手', '机', '会', '24', '小', '时', '为', '她', '开', '机', ',', '让', '她', '半', '夜', '痛', '经', '的', '时', '候', ',', '做', '恶', '梦', '的', '时', '候', ',', '随', '时', '可', '以', '联', '系', '到', '我', '。'], ['我', '是', '一', '爷', '们', ',', '我', '会', '经', '常', '带', '媳', '妇', '出', '去', '玩', ',', '她', '不', '一', '定', '要', '和', '我', '所', '有', '的', '哥', '们', '都', '认', '识', ',', '但', '见', '面', '能', '说', '的', '上', '话', '就', '行', '。'], ['我', '是', '一', '爷', '们', ',', '我', '会', '和', '媳', '妇', '的', '姐', '妹', '哥', '们', '搞', '好', 
'关', '系', ',', '让', '她', '们', '相', '信', '我', '一', '定', '可', '以', '给', '我', '媳', '妇', '幸', '福', '。'], ['我', '是', '一', '爷', '们', ',', '吵', '架', '后', '、', '也', '要', '主', '动', '打', '电', '话', '关', '心', '她', ',', '咱', '是', '一', '爷', '们', ',', '给', '媳', '妇', '服', '个', '软', ',', '道', '个', '歉', '怎', '么', '了', '?'], ['我', '是', '一', '爷', '们', ',', '绝', '对', '不', '会', '嫌', '弃', '自', '己', '媳', '妇', ',', '拿', '她', '和', '别', '人', '比', ',', '说', '她', '这', '不', '如', '人', '家', ',', '纳', '不', '如', '人', '家', '的', '。'], ['我', '是', '一', '爷', '们', ',', '陪', '媳', '妇', '逛', '街', '时', ',', '碰', '见', '熟', '人', ',', '无', '论', '我', '媳', '妇', '长', '的', '好', '看', '与', '否', ',', '我', '都', '会', '大', '方', '的', '介', '绍', '。'], ['谁', '让', '咱', '爷', '们', '就', '好', '这', '口', '呢', '。'], ['我', '是', '一', '爷', '们', ',', '我', '想', '我', '会', '给', '我', '媳', '妇', '最', '好', '的', '幸', '福', '。'], ['【', '我', '们', '重', '在', '分', '享', '。'], ['所', '有', '文', '字', '和', '美', '图', ',', '来', '自', '网', '络', ',', '晨', '欣', '教', '育', '整', '理', '。'], ['对', '原', '文', '作', '者', ',', '表', '示', '敬', '意', '。'], ['】', '关', '注', '晨', '曦', '教', '育', '[UNK]', '[UNK]', '晨', '曦', '教', '育', '(', '微', '信', '号', ':', 'he', '##bc', '##x', '##jy', ')', '。'], ['打', '开', '微', '信', ',', '扫', '描', '二', '维', '码', ',', '关', '注', '[UNK]', '晨', '曦', '教', '育', '[UNK]', ',', '获', '取', '更', '多', '育', '儿', '资', '源', '。'], ['点', '击', '下', '面', '订', '阅', '按', '钮', '订', '阅', ',', '会', '有', '更', '多', '惊', '喜', '哦', '!']] + while i < len(document): # 从文档的第一个位置开始,按个往下看 + segment = document[i] # segment是列表,代表的是按字分开的一个完整句子,如 segment=['我', '是', '一', '爷', '们', ',', '我', '想', '我', '会', '给', '我', '媳', '妇', '最', '好', '的', '幸', '福', '。'] + # print("###i:",i,";segment:",segment) + current_chunk.append(segment) # 将一个独立的句子加入到当前的文本块中 + current_length += len(segment) # 累计到为止位置接触到句子的总长度 + if i == len(document) - 1 or current_length >= target_seq_length: # 如果累计的序列长度达到了目标的长度==>构造并添加到“A[SEP]B“中的A和B中。 + if current_chunk: # 如果当前块不为空 + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. + a_end = 1 + if len(current_chunk) >= 2: # 当前块,如果包含超过两个句子,怎取当前块的一部分作为“A[SEP]B“中的A部分 + a_end = rng.randint(1, len(current_chunk) - 1) + # 将当前文本段中选取出来的前半部分,赋值给A即tokens_a + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + # 构造“A[SEP]B“中的B部分(原本的B有一部分是随机的从另一个文档中选取的,有一部分是正常的当前文档中的后半部) + tokens_b = [] + # Random next + is_random_next = False + if len(current_chunk) == 1 or rng.random() < 0.5: # 有50%的概率,是从其他文档中随机的选取一个文档,并得到这个文档的后半版本作为B即tokens_b + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. + random_document_index=0 + for _ in range(10): # 随机的选出一个与当前的文档不一样的文档的索引 + random_document_index = rng.randint(0, len(all_documents) - 1) + if random_document_index != document_index: + break + + random_document = all_documents[random_document_index] # 选出这个文档 + random_start = rng.randint(0, len(random_document) - 1) # 从这个文档选出一个段落的开始位置 + for j in range(random_start, len(random_document)): # 从这个文档的开始位置到结束,作为我们的“A[SEP]B“中的B即tokens_b + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. 这里是为了防止文本的浪费的一个小技巧 + num_unused_segments = len(current_chunk) - a_end # e.g. 
550-200=350 + i -= num_unused_segments # i=i-num_unused_segments, e.g. i=400, num_unused_segments=350, 那么 i=i-num_unused_segments=400-350=50 + # Actual next + else: # 有另外50%的几乎,从当前文本块(长度为max_sequence_length)中的后段中填充到tokens_b即“A[SEP]B“中的B。 + is_random_next = False + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) + + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + # 把tokens_a & tokens_b加入到按照bert的风格,即以[CLS]tokens_a[SEP]tokens_b[SEP]的形式,结合到一起,作为最终的tokens; 也带上segment_ids,前面部分segment_ids的值是0,后面部分的值是1. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + + tokens.append("[SEP]") + segment_ids.append(0) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + # 创建masked LM的任务的数据 Creates the predictions for the masked LM objective + (tokens, masked_lm_positions, + masked_lm_labels) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) + instance = TrainingInstance( # 创建训练实例的对象 + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels) + instances.append(instance) + current_chunk = [] # 清空当前块 + current_length = 0 # 重置当前文本块的长度 + i += 1 # 接着文档中的内容往后看 + + return instances + + +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) + + +def create_masked_lm_predictions(tokens, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng): + """Creates the predictions for the masked LM objective.""" + + cand_indexes = [] + for (i, token) in enumerate(tokens): + if token == "[CLS]" or token == "[SEP]": + continue + # Whole Word Masking means that if we mask all of the wordpieces + # corresponding to an original word. When a word has been split into + # WordPieces, the first token does not have any marker and any subsequence + # tokens are prefixed with ##. So whenever we see the ## token, we + # append it to the previous set of word indexes. + # + # Note that Whole Word Masking does *not* change the training code + # at all -- we still predict each WordPiece independently, softmaxed + # over the entire vocabulary. + if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and + token.startswith("##")): + cand_indexes[-1].append(i) + else: + cand_indexes.append([i]) + + rng.shuffle(cand_indexes) + + if FLAGS.non_chinese==False: # if non chinese is False, that means it is chinese, then try to remove "##" which is added previously + output_tokens = [t[2:] if len(re.findall('##[\u4E00-\u9FA5]', t)) > 0 else t for t in tokens] # 去掉"##" + else: # english and other language, which is not chinese + output_tokens = list(tokens) + + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + masked_lms = [] + covered_indexes = set() + for index_set in cand_indexes: + if len(masked_lms) >= num_to_predict: + break + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. 
+ if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + + masked_token = None + # 80% of the time, replace with [MASK] + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # 10% of the time, keep original + if rng.random() < 0.5: + if FLAGS.non_chinese == False: # if non chinese is False, that means it is chinese, then try to remove "##" which is added previously + masked_token = tokens[index][2:] if len(re.findall('##[\u4E00-\u9FA5]', tokens[index])) > 0 else tokens[index] # 去掉"##" + else: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] + + output_tokens[index] = masked_token + + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + assert len(masked_lms) <= num_to_predict + masked_lms = sorted(masked_lms, key=lambda x: x.index) + + masked_lm_positions = [] + masked_lm_labels = [] + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + + # tf.logging.info('%s' % (tokens)) + # tf.logging.info('%s' % (output_tokens)) + return (output_tokens, masked_lm_positions, masked_lm_labels) + +def create_masked_lm_predictions_original(tokens, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng): + """Creates the predictions for the masked LM objective.""" + + cand_indexes = [] + for (i, token) in enumerate(tokens): + if token == "[CLS]" or token == "[SEP]": + continue + # Whole Word Masking means that if we mask all of the wordpieces + # corresponding to an original word. When a word has been split into + # WordPieces, the first token does not have any marker and any subsequence + # tokens are prefixed with ##. So whenever we see the ## token, we + # append it to the previous set of word indexes. + # + # Note that Whole Word Masking does *not* change the training code + # at all -- we still predict each WordPiece independently, softmaxed + # over the entire vocabulary. + if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and + token.startswith("##")): + cand_indexes[-1].append(i) + else: + cand_indexes.append([i]) + + rng.shuffle(cand_indexes) + + output_tokens = list(tokens) + + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + masked_lms = [] + covered_indexes = set() + for index_set in cand_indexes: + if len(masked_lms) >= num_to_predict: + break + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. 
+ if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + + masked_token = None + # 80% of the time, replace with [MASK] + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # 10% of the time, keep original + if rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] + + output_tokens[index] = masked_token + + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + assert len(masked_lms) <= num_to_predict + masked_lms = sorted(masked_lms, key=lambda x: x.index) + + masked_lm_positions = [] + masked_lm_labels = [] + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + + return (output_tokens, masked_lm_positions, masked_lm_labels) + + +def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break + + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 + + # We want to sometimes truncate from the front and sometimes from the + # back to add more randomness and avoid biases. + if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Reading from input files ***") + for input_file in input_files: + tf.logging.info(" %s", input_file) + + rng = random.Random(FLAGS.random_seed) + instances = create_training_instances( + input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, + FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, + rng) + + output_files = FLAGS.output_file.split(",") + tf.logging.info("*** Writing to output files ***") + for output_file in output_files: + tf.logging.info(" %s", output_file) + + write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, + FLAGS.max_predictions_per_seq, output_files) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("output_file") + flags.mark_flag_as_required("vocab_file") + tf.app.run() \ No newline at end of file diff --git a/Basic/Albert/albert_tiny_tf/albert/modeling.py b/Basic/Albert/albert_tiny_tf/albert/modeling.py new file mode 100644 index 0000000..a521690 --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/modeling.py @@ -0,0 +1,1280 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""The main BERT model and related functions.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import copy +import json +import math +import re +import numpy as np +import six +import tensorflow as tf +from albert import bert_utils + + +class BertConfig(object): + """Configuration for `BertModel`.""" + + def __init__(self, + vocab_size, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02): + """Constructs BertConfig. + + Args: + vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. + hidden_dropout_prob: The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The stdev of the truncated_normal_initializer for + initializing all weight matrices. + """ + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size=None) + for (key, value) in six.iteritems(json_object): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with tf.gfile.GFile(json_file, "r") as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + +class BertModel(object): + """BERT model ("Bidirectional Encoder Representations from Transformers"). 
+ + Example usage: + + ```python + # Already been converted into WordPiece token ids + input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) + input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) + token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) + + config = modeling.BertConfig(vocab_size=32000, hidden_size=512, + num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) + + model = modeling.BertModel(config=config, is_training=True, + input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) + + label_embeddings = tf.get_variable(...) + pooled_output = model.get_pooled_output() + logits = tf.matmul(pooled_output, label_embeddings) + ... + ``` + """ + + def __init__(self, + config, + is_training, + input_ids, + input_mask=None, + token_type_ids=None, + use_one_hot_embeddings=False, + scope=None): + """Constructor for BertModel. + + Args: + config: `BertConfig` instance. + is_training: bool. true for training model, false for eval model. Controls + whether dropout will be applied. + input_ids: int32 Tensor of shape [batch_size, seq_length]. + input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + use_one_hot_embeddings: (optional) bool. Whether to use one-hot word + embeddings or tf.embedding_lookup() for the word embeddings. + scope: (optional) variable scope. Defaults to "bert". + + Raises: + ValueError: The config is invalid or one of the input tensor shapes + is invalid. + """ + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 + + input_shape = get_shape_list(input_ids, expected_rank=2) + batch_size = input_shape[0] + seq_length = input_shape[1] + + if input_mask is None: + input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) + + if token_type_ids is None: + token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) + + with tf.variable_scope(scope, default_name="bert"): + with tf.variable_scope("embeddings"): + # Perform embedding lookup on the word ids, but use stype of factorized embedding parameterization from albert. add by brightmart, 2019-09-28 + (self.embedding_output, self.embedding_table, self.embedding_table_2) = embedding_lookup_factorized( + input_ids=input_ids, + vocab_size=config.vocab_size, + hidden_size=config.hidden_size, + embedding_size=config.embedding_size, + initializer_range=config.initializer_range, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=use_one_hot_embeddings) + + # Add positional embeddings and token type embeddings, then layer + # normalize and perform dropout. + self.embedding_output = embedding_postprocessor( + input_tensor=self.embedding_output, + use_token_type=True, + token_type_ids=token_type_ids, + token_type_vocab_size=config.type_vocab_size, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=config.initializer_range, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob) + + with tf.variable_scope("encoder"): + # This converts a 2D mask of shape [batch_size, seq_length] to a 3D + # mask of shape [batch_size, seq_length, seq_length] which is used + # for the attention scores. + attention_mask = create_attention_mask_from_input_mask( + input_ids, input_mask) + + # Run the stacked transformer. 
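+        # Note: `ln_type` in the config selects the encoder variant below. A missing or
+        # 'postln' value keeps the original post-layer-norm transformer; any other value
+        # switches to the pre-layer-norm variant used by the larger ALBERT models.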
+ # `sequence_output` shape = [batch_size, seq_length, hidden_size]. + ln_type = config.ln_type + + if ln_type == 'postln' or ln_type is None: # currently, base or large of albert used post-LN structure + print("old structure of transformer.use: transformer_model,which use post-LN") + self.all_encoder_layers = transformer_model( + input_tensor=self.embedding_output, + attention_mask=attention_mask, + hidden_size=config.hidden_size, + num_hidden_layers=config.num_hidden_layers, + num_attention_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + intermediate_act_fn=get_activation(config.hidden_act), + hidden_dropout_prob=config.hidden_dropout_prob, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + initializer_range=config.initializer_range, + do_return_all_layers=True) + else: # xlarge or xxlarge of albert, used pre-LN structure + print("new structure of transformer.use: prelln_transformer_model,which use pre-LN") + self.all_encoder_layers = prelln_transformer_model( + # change by brightmart, 4th, oct, 2019. pre-Layer Normalization can converge fast and better. check paper: ON LAYER NORMALIZATION IN THE TRANSFORMER ARCHITECTURE + input_tensor=self.embedding_output, + attention_mask=attention_mask, + hidden_size=config.hidden_size, + num_hidden_layers=config.num_hidden_layers, + num_attention_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + intermediate_act_fn=get_activation(config.hidden_act), + hidden_dropout_prob=config.hidden_dropout_prob, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + initializer_range=config.initializer_range, + do_return_all_layers=True, + shared_type='all') # do_return_all_layers=True + + self.sequence_output = self.all_encoder_layers[-1] # [batch_size, seq_length, hidden_size] + # The "pooler" converts the encoded sequence tensor of shape + # [batch_size, seq_length, hidden_size] to a tensor of shape + # [batch_size, hidden_size]. This is necessary for segment-level + # (or segment-pair-level) classification tasks where we need a fixed + # dimensional representation of the segment. + with tf.variable_scope("pooler"): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. We assume that this has been pre-trained + first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) + self.pooled_output = tf.layers.dense( + first_token_tensor, + config.hidden_size, + activation=tf.tanh, + kernel_initializer=create_initializer(config.initializer_range)) + + def get_pooled_output(self): + return self.pooled_output + + def get_sequence_output(self): + """Gets final hidden layer of encoder. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size] corresponding + to the final hidden of the transformer encoder. + """ + return self.sequence_output + + def get_all_encoder_layers(self): + return self.all_encoder_layers + + def get_embedding_output(self): + """Gets output of the embedding lookup (i.e., input to the transformer). + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size] corresponding + to the output of the embedding layer, after summing the word + embeddings with the positional embeddings and the token type embeddings, + then performing layer normalization. This is the input to the transformer. 
+ """ + return self.embedding_output + + def get_embedding_table(self): + return self.embedding_table + + def get_embedding_table_2(self): + return self.embedding_table_2 + + +def gelu(x): + """Gaussian Error Linear Unit. + + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + + Returns: + `x` with the GELU activation applied. + """ + cdf = 0.5 * (1.0 + tf.tanh( + (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + + +def get_activation(activation_string): + """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. + + Args: + activation_string: String name of the activation function. + + Returns: + A Python function corresponding to the activation function. If + `activation_string` is None, empty, or "linear", this will return None. + If `activation_string` is not a string, it will return `activation_string`. + + Raises: + ValueError: The `activation_string` does not correspond to a known + activation. + """ + + # We assume that anything that"s not a string is already an activation + # function, so we just return it. + if not isinstance(activation_string, six.string_types): + return activation_string + + if not activation_string: + return None + + act = activation_string.lower() + if act == "linear": + return None + elif act == "relu": + return tf.nn.relu + elif act == "gelu": + return gelu + elif act == "tanh": + return tf.tanh + else: + raise ValueError("Unsupported activation: %s" % act) + + +def get_assignment_map_from_checkpoint(tvars, init_checkpoint): + """Compute the union of the current variables and checkpoint variables.""" + assignment_map = {} + initialized_variable_names = {} + + name_to_variable = collections.OrderedDict() + for var in tvars: + name = var.name + m = re.match("^(.*):\\d+$", name) + if m is not None: + name = m.group(1) + name_to_variable[name] = var + + init_vars = tf.train.list_variables(init_checkpoint) + + assignment_map = collections.OrderedDict() + for x in init_vars: + (name, var) = (x[0], x[1]) + if name not in name_to_variable: + continue + assignment_map[name] = name + initialized_variable_names[name] = 1 + initialized_variable_names[name + ":0"] = 1 + + return (assignment_map, initialized_variable_names) + + +def dropout(input_tensor, dropout_prob): + """Perform dropout. + + Args: + input_tensor: float Tensor. + dropout_prob: Python float. The probability of dropping out a value (NOT of + *keeping* a dimension as in `tf.nn.dropout`). + + Returns: + A version of `input_tensor` with dropout applied. 
+ """ + if dropout_prob is None or dropout_prob == 0.0: + return input_tensor + + output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) + return output + + +def layer_norm(input_tensor, name=None): + """Run layer normalization on the last dimension of the tensor.""" + return tf.contrib.layers.layer_norm( + inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) + + +def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): + """Runs layer normalization followed by dropout.""" + output_tensor = layer_norm(input_tensor, name) + output_tensor = dropout(output_tensor, dropout_prob) + return output_tensor + + +def create_initializer(initializer_range=0.02): + """Creates a `truncated_normal_initializer` with the given range.""" + return tf.truncated_normal_initializer(stddev=initializer_range) + + +def embedding_lookup(input_ids, + vocab_size, + embedding_size=128, + initializer_range=0.02, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=False): + """Looks up words embeddings for id tensor. + + Args: + input_ids: int32 Tensor of shape [batch_size, seq_length] containing word + ids. + vocab_size: int. Size of the embedding vocabulary. + embedding_size: int. Width of the word embeddings. + initializer_range: float. Embedding initialization range. + word_embedding_name: string. Name of the embedding table. + use_one_hot_embeddings: bool. If True, use one-hot method for word + embeddings. If False, use `tf.gather()`. + + Returns: + float Tensor of shape [batch_size, seq_length, embedding_size]. + """ + # This function assumes that the input is of shape [batch_size, seq_length, + # num_inputs]. + # + # If the input is a 2D tensor of shape [batch_size, seq_length], we + # reshape to [batch_size, seq_length, 1]. + if input_ids.shape.ndims == 2: + input_ids = tf.expand_dims(input_ids, axis=[-1]) # shape of input_ids is:[ batch_size, seq_length, 1] + + embedding_table = tf.get_variable( # [vocab_size, embedding_size] + name=word_embedding_name, + shape=[vocab_size, embedding_size], + initializer=create_initializer(initializer_range)) + + flat_input_ids = tf.reshape(input_ids, [-1]) # one rank. shape as (batch_size * sequence_length,) + if use_one_hot_embeddings: + one_hot_input_ids = tf.one_hot(flat_input_ids, + depth=vocab_size) # one_hot_input_ids=[batch_size * sequence_length,vocab_size] + output = tf.matmul(one_hot_input_ids, embedding_table) # output=[batch_size * sequence_length,embedding_size] + else: + output = tf.gather(embedding_table, + flat_input_ids) # [vocab_size, embedding_size]*[batch_size * sequence_length,]--->[batch_size * sequence_length,embedding_size] + + input_shape = get_shape_list(input_ids) # input_shape=[ batch_size, seq_length, 1] + + output = tf.reshape(output, input_shape[0:-1] + [ + input_shape[-1] * embedding_size]) # output=[batch_size,sequence_length,embedding_size] + return (output, embedding_table) + + +def embedding_lookup_factorized(input_ids, # Factorized embedding parameterization provide by albert + vocab_size, + hidden_size, + embedding_size=128, + initializer_range=0.02, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=False): + """Looks up words embeddings for id tensor, but in a factorized style followed by albert. it is used to reduce much percentage of parameters previous exists. + Check "Factorized embedding parameterization" session in the paper. + + Args: + input_ids: int32 Tensor of shape [batch_size, seq_length] containing word + ids. + vocab_size: int. Size of the embedding vocabulary. 
+ embedding_size: int. Width of the word embeddings. + initializer_range: float. Embedding initialization range. + word_embedding_name: string. Name of the embedding table. + use_one_hot_embeddings: bool. If True, use one-hot method for word + embeddings. If False, use `tf.gather()`. + + Returns: + float Tensor of shape [batch_size, seq_length, embedding_size]. + """ + # This function assumes that the input is of shape [batch_size, seq_length, + # num_inputs]. + # + # If the input is a 2D tensor of shape [batch_size, seq_length], we + # reshape to [batch_size, seq_length, 1]. + + # 1.first project one-hot vectors into a lower dimensional embedding space of size E + print("embedding_lookup_factorized. factorized embedding parameterization is used.") + if input_ids.shape.ndims == 2: + input_ids = tf.expand_dims(input_ids, axis=[-1]) # shape of input_ids is:[ batch_size, seq_length, 1] + + embedding_table = tf.get_variable( # [vocab_size, embedding_size] + name=word_embedding_name, + shape=[vocab_size, embedding_size], + initializer=create_initializer(initializer_range)) + + flat_input_ids = tf.reshape(input_ids, [-1]) # one rank. shape as (batch_size * sequence_length,) + if use_one_hot_embeddings: + one_hot_input_ids = tf.one_hot(flat_input_ids, + depth=vocab_size) # one_hot_input_ids=[batch_size * sequence_length,vocab_size] + output_middle = tf.matmul(one_hot_input_ids, + embedding_table) # output=[batch_size * sequence_length,embedding_size] + else: + output_middle = tf.gather(embedding_table, + flat_input_ids) # [vocab_size, embedding_size]*[batch_size * sequence_length,]--->[batch_size * sequence_length,embedding_size] + + # 2. project vector(output_middle) to the hidden space + project_variable = tf.get_variable( # [embedding_size, hidden_size] + name=word_embedding_name + "_2", + shape=[embedding_size, hidden_size], + initializer=create_initializer(initializer_range)) + output = tf.matmul(output_middle, + project_variable) # ([batch_size * sequence_length, embedding_size] * [embedding_size, hidden_size])--->[batch_size * sequence_length, hidden_size] + # reshape back to 3 rank + input_shape = get_shape_list(input_ids) # input_shape=[ batch_size, seq_length, 1] + batch_size, sequene_length, _ = input_shape + output = tf.reshape(output, + (batch_size, sequene_length, hidden_size)) # output=[batch_size, sequence_length, hidden_size] + return (output, embedding_table, project_variable) + + +def embedding_postprocessor(input_tensor, + use_token_type=False, + token_type_ids=None, + token_type_vocab_size=16, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=0.02, + max_position_embeddings=512, + dropout_prob=0.1): + """Performs various post-processing on a word embedding tensor. + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, + embedding_size]. + use_token_type: bool. Whether to add embeddings for `token_type_ids`. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + Must be specified if `use_token_type` is True. + token_type_vocab_size: int. The vocabulary size of `token_type_ids`. + token_type_embedding_name: string. The name of the embedding table variable + for token type ids. + use_position_embeddings: bool. Whether to add position embeddings for the + position of each token in the sequence. + position_embedding_name: string. The name of the embedding table variable + for positional embeddings. + initializer_range: float. 
Range of the weight initialization. + max_position_embeddings: int. Maximum sequence length that might ever be + used with this model. This can be longer than the sequence length of + input_tensor, but cannot be shorter. + dropout_prob: float. Dropout probability applied to the final output tensor. + + Returns: + float tensor with same shape as `input_tensor`. + + Raises: + ValueError: One of the tensor shapes or input values is invalid. + """ + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + width = input_shape[2] + + output = input_tensor + + if use_token_type: + if token_type_ids is None: + raise ValueError("`token_type_ids` must be specified if" + "`use_token_type` is True.") + token_type_table = tf.get_variable( + name=token_type_embedding_name, + shape=[token_type_vocab_size, width], + initializer=create_initializer(initializer_range)) + # This vocab will be small so we always do one-hot here, since it is always + # faster for a small vocabulary. + flat_token_type_ids = tf.reshape(token_type_ids, [-1]) + one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) + token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) + token_type_embeddings = tf.reshape(token_type_embeddings, + [batch_size, seq_length, width]) + output += token_type_embeddings + + if use_position_embeddings: + assert_op = tf.assert_less_equal(seq_length, max_position_embeddings) + with tf.control_dependencies([assert_op]): + full_position_embeddings = tf.get_variable( + name=position_embedding_name, + shape=[max_position_embeddings, width], + initializer=create_initializer(initializer_range)) + # Since the position embedding table is a learned variable, we create it + # using a (long) sequence length `max_position_embeddings`. The actual + # sequence length might be shorter than this, for faster training of + # tasks that do not have long sequences. + # + # So `full_position_embeddings` is effectively an embedding table + # for position [0, 1, 2, ..., max_position_embeddings-1], and the current + # sequence has positions [0, 1, 2, ... seq_length-1], so we can just + # perform a slice. + position_embeddings = tf.slice(full_position_embeddings, [0, 0], + [seq_length, -1]) + num_dims = len(output.shape.as_list()) + + # Only the last two dimensions are relevant (`seq_length` and `width`), so + # we broadcast among the first dimensions, which is typically just + # the batch size. + position_broadcast_shape = [] + for _ in range(num_dims - 2): + position_broadcast_shape.append(1) + position_broadcast_shape.extend([seq_length, width]) + position_embeddings = tf.reshape(position_embeddings, + position_broadcast_shape) + output += position_embeddings + + output = layer_norm_and_dropout( + output, dropout_prob) + + return output + + +def create_attention_mask_from_input_mask(from_tensor, to_mask): + """Create 3D attention mask from a 2D tensor mask. + + Args: + from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. + to_mask: int32 Tensor of shape [batch_size, to_seq_length]. + + Returns: + float Tensor of shape [batch_size, from_seq_length, to_seq_length]. 
+ """ + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + batch_size = from_shape[0] + from_seq_length = from_shape[1] + + to_shape = get_shape_list(to_mask, expected_rank=2) + to_seq_length = to_shape[1] + + to_mask = tf.cast( + tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) + + # We don't assume that `from_tensor` is a mask (although it could be). We + # don't actually care if we attend *from* padding tokens (only *to* padding) + # tokens so we create a tensor of all ones. + # + # `broadcast_ones` = [batch_size, from_seq_length, 1] + broadcast_ones = tf.ones( + shape=[batch_size, from_seq_length, 1], dtype=tf.float32) + + # Here we broadcast along two dimensions to create the mask. + mask = broadcast_ones * to_mask + + return mask + + +def attention_layer(from_tensor, + to_tensor, + attention_mask=None, + num_attention_heads=1, + size_per_head=512, + query_act=None, + key_act=None, + value_act=None, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + do_return_2d_tensor=False, + batch_size=None, + from_seq_length=None, + to_seq_length=None): + """Performs multi-headed attention from `from_tensor` to `to_tensor`. + + This is an implementation of multi-headed attention based on "Attention + is all you Need". If `from_tensor` and `to_tensor` are the same, then + this is self-attention. Each timestep in `from_tensor` attends to the + corresponding sequence in `to_tensor`, and returns a fixed-with vector. + + This function first projects `from_tensor` into a "query" tensor and + `to_tensor` into "key" and "value" tensors. These are (effectively) a list + of tensors of length `num_attention_heads`, where each tensor is of shape + [batch_size, seq_length, size_per_head]. + + Then, the query and key tensors are dot-producted and scaled. These are + softmaxed to obtain attention probabilities. The value tensors are then + interpolated by these probabilities, then concatenated back to a single + tensor and returned. + + In practice, the multi-headed attention are done with transposes and + reshapes rather than actual separate tensors. + + Args: + from_tensor: float Tensor of shape [batch_size, from_seq_length, + from_width]. + to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. + attention_mask: (optional) int32 Tensor of shape [batch_size, + from_seq_length, to_seq_length]. The values should be 1 or 0. The + attention scores will effectively be set to -infinity for any positions in + the mask that are 0, and will be unchanged for positions that are 1. + num_attention_heads: int. Number of attention heads. + size_per_head: int. Size of each attention head. + query_act: (optional) Activation function for the query transform. + key_act: (optional) Activation function for the key transform. + value_act: (optional) Activation function for the value transform. + attention_probs_dropout_prob: (optional) float. Dropout probability of the + attention probabilities. + initializer_range: float. Range of the weight initializer. + do_return_2d_tensor: bool. If True, the output will be of shape [batch_size + * from_seq_length, num_attention_heads * size_per_head]. If False, the + output will be of shape [batch_size, from_seq_length, num_attention_heads + * size_per_head]. + batch_size: (Optional) int. If the input is 2D, this might be the batch size + of the 3D version of the `from_tensor` and `to_tensor`. + from_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `from_tensor`. 
+ to_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `to_tensor`. + + Returns: + float Tensor of shape [batch_size, from_seq_length, + num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is + true, this will be of shape [batch_size * from_seq_length, + num_attention_heads * size_per_head]). + + Raises: + ValueError: Any of the arguments or tensor shapes are invalid. + """ + + def transpose_for_scores(input_tensor, batch_size, num_attention_heads, + seq_length, width): + output_tensor = tf.reshape( + input_tensor, [batch_size, seq_length, num_attention_heads, width]) + + output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) + return output_tensor + + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) + + if len(from_shape) != len(to_shape): + raise ValueError( + "The rank of `from_tensor` must match the rank of `to_tensor`.") + + if len(from_shape) == 3: + batch_size = from_shape[0] + from_seq_length = from_shape[1] + to_seq_length = to_shape[1] + elif len(from_shape) == 2: + if (batch_size is None or from_seq_length is None or to_seq_length is None): + raise ValueError( + "When passing in rank 2 tensors to attention_layer, the values " + "for `batch_size`, `from_seq_length`, and `to_seq_length` " + "must all be specified.") + + # Scalar dimensions referenced here: + # B = batch size (number of sequences) + # F = `from_tensor` sequence length + # T = `to_tensor` sequence length + # N = `num_attention_heads` + # H = `size_per_head` + + from_tensor_2d = reshape_to_matrix(from_tensor) + to_tensor_2d = reshape_to_matrix(to_tensor) + + # `query_layer` = [B*F, N*H] + query_layer = tf.layers.dense( + from_tensor_2d, + num_attention_heads * size_per_head, + activation=query_act, + name="query", + kernel_initializer=create_initializer(initializer_range)) + + # `key_layer` = [B*T, N*H] + key_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=key_act, + name="key", + kernel_initializer=create_initializer(initializer_range)) + + # `value_layer` = [B*T, N*H] + value_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=value_act, + name="value", + kernel_initializer=create_initializer(initializer_range)) + + # `query_layer` = [B, N, F, H] + query_layer = transpose_for_scores(query_layer, batch_size, + num_attention_heads, from_seq_length, + size_per_head) + + # `key_layer` = [B, N, T, H] + key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, + to_seq_length, size_per_head) + + # Take the dot product between "query" and "key" to get the raw + # attention scores. + # `attention_scores` = [B, N, F, T] + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + attention_scores = tf.multiply(attention_scores, + 1.0 / math.sqrt(float(size_per_head))) + + if attention_mask is not None: + # `attention_mask` = [B, 1, F, T] + attention_mask = tf.expand_dims(attention_mask, axis=[1]) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 + + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
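+ # [Editor's note, illustrative numbers only] With the mask convention above,
+ # a masked position (mask value 0) receives adder = (1.0 - 0.0) * -10000.0,
+ # so its raw score becomes roughly score - 10000 and its softmax weight
+ # underflows to ~0; an attendable position (mask value 1) receives adder = 0.0
+ # and its score passes through unchanged.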
+ attention_scores += adder + + # Normalize the attention scores to probabilities. + # `attention_probs` = [B, N, F, T] + attention_probs = tf.nn.softmax(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = dropout(attention_probs, attention_probs_dropout_prob) + + # `value_layer` = [B, T, N, H] + value_layer = tf.reshape( + value_layer, + [batch_size, to_seq_length, num_attention_heads, size_per_head]) + + # `value_layer` = [B, N, T, H] + value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) + + # `context_layer` = [B, N, F, H] + context_layer = tf.matmul(attention_probs, value_layer) + + # `context_layer` = [B, F, N, H] + context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) + + if do_return_2d_tensor: + # `context_layer` = [B*F, N*H] + context_layer = tf.reshape( + context_layer, + [batch_size * from_seq_length, num_attention_heads * size_per_head]) + else: + # `context_layer` = [B, F, N*H] + context_layer = tf.reshape( + context_layer, + [batch_size, from_seq_length, num_attention_heads * size_per_head]) + + return context_layer + + +def transformer_model(input_tensor, + attention_mask=None, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + intermediate_act_fn=gelu, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + do_return_all_layers=False, + share_parameter_across_layers=True): + """Multi-headed, multi-layer Transformer from "Attention is All You Need". + + This is almost an exact implementation of the original Transformer encoder. + + See the original paper: + https://arxiv.org/abs/1706.03762 + + Also see: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. + attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, + seq_length], with 1 for positions that can be attended to and 0 in + positions that should not be. + hidden_size: int. Hidden size of the Transformer. + num_hidden_layers: int. Number of layers (blocks) in the Transformer. + num_attention_heads: int. Number of attention heads in the Transformer. + intermediate_size: int. The size of the "intermediate" (a.k.a., feed + forward) layer. + intermediate_act_fn: function. The non-linear activation function to apply + to the output of the intermediate/feed-forward layer. + hidden_dropout_prob: float. Dropout probability for the hidden layers. + attention_probs_dropout_prob: float. Dropout probability of the attention + probabilities. + initializer_range: float. Range of the initializer (stddev of truncated + normal). + do_return_all_layers: Whether to also return all layers or just the final + layer. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size], the final + hidden layer of the Transformer. + + Raises: + ValueError: A Tensor shape or parameter is invalid. 
+ """ + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads)) + + attention_head_size = int(hidden_size / num_attention_heads) + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + input_width = input_shape[2] + + # The Transformer performs sum residuals on all layers so the input needs + # to be the same as the hidden size. + if input_width != hidden_size: + raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % + (input_width, hidden_size)) + + # We keep the representation as a 2D tensor to avoid re-shaping it back and + # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on + # the GPU/CPU but may not be free on the TPU, so we want to minimize them to + # help the optimizer. + prev_output = reshape_to_matrix(input_tensor) + all_layer_outputs = [] + for layer_idx in range(num_hidden_layers): + if share_parameter_across_layers: + name_variable_scope = "layer_shared" + else: + name_variable_scope = "layer_%d" % layer_idx + # share all parameters across layers. add by brightmart, 2019-09-28. previous it is like this: "layer_%d" % layer_idx + with tf.variable_scope(name_variable_scope, + reuse=True if (share_parameter_across_layers and layer_idx > 0) else False): + + layer_input = prev_output + + with tf.variable_scope("attention"): + attention_heads = [] + with tf.variable_scope("self"): + attention_head = attention_layer( + from_tensor=layer_input, + to_tensor=layer_input, + attention_mask=attention_mask, + num_attention_heads=num_attention_heads, + size_per_head=attention_head_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + initializer_range=initializer_range, + do_return_2d_tensor=True, + batch_size=batch_size, + from_seq_length=seq_length, + to_seq_length=seq_length) + attention_heads.append(attention_head) + + attention_output = None + if len(attention_heads) == 1: + attention_output = attention_heads[0] + else: + # In the case where we have other sequences, we just concatenate + # them to the self-attention head before the projection. + attention_output = tf.concat(attention_heads, axis=-1) + + # Run a linear projection of `hidden_size` then add a residual + # with `layer_input`. + with tf.variable_scope("output"): + attention_output = tf.layers.dense( + attention_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + attention_output = dropout(attention_output, hidden_dropout_prob) + attention_output = layer_norm(attention_output + layer_input) + + # The activation is only applied to the "intermediate" hidden layer. + with tf.variable_scope("intermediate"): + intermediate_output = tf.layers.dense( + attention_output, + intermediate_size, + activation=intermediate_act_fn, + kernel_initializer=tf.glorot_normal_initializer()) + + # Down-project back to `hidden_size` then add the residual. 
+ with tf.variable_scope("output"): + layer_output = tf.layers.dense( + intermediate_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + + layer_output = dropout(layer_output, hidden_dropout_prob) + layer_output = layer_norm(layer_output + attention_output) + prev_output = layer_output + all_layer_outputs.append(layer_output) + + if do_return_all_layers: + final_outputs = [] + for layer_output in all_layer_outputs: + final_output = reshape_from_matrix(layer_output, input_shape) + final_outputs.append(final_output) + return final_outputs + else: + final_output = reshape_from_matrix(prev_output, input_shape) + return final_output + + +def get_shape_list(tensor, expected_rank=None, name=None): + """Returns a list of the shape of tensor, preferring static dimensions. + + Args: + tensor: A tf.Tensor object to find the shape of. + expected_rank: (optional) int. The expected rank of `tensor`. If this is + specified and the `tensor` has a different rank, and exception will be + thrown. + name: Optional name of the tensor for the error message. + + Returns: + A list of dimensions of the shape of tensor. All static dimensions will + be returned as python integers, and dynamic dimensions will be returned + as tf.Tensor scalars. + """ + if name is None: + name = tensor.name + + if expected_rank is not None: + assert_rank(tensor, expected_rank, name) + + shape = tensor.shape.as_list() + + non_static_indexes = [] + for (index, dim) in enumerate(shape): + if dim is None: + non_static_indexes.append(index) + + if not non_static_indexes: + return shape + + dyn_shape = tf.shape(tensor) + for index in non_static_indexes: + shape[index] = dyn_shape[index] + return shape + + +def reshape_to_matrix(input_tensor): + """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" + ndims = input_tensor.shape.ndims + if ndims < 2: + raise ValueError("Input tensor must have at least rank 2. Shape = %s" % + (input_tensor.shape)) + if ndims == 2: + return input_tensor + + width = input_tensor.shape[-1] + output_tensor = tf.reshape(input_tensor, [-1, width]) + return output_tensor + + +def reshape_from_matrix(output_tensor, orig_shape_list): + """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" + if len(orig_shape_list) == 2: + return output_tensor + + output_shape = get_shape_list(output_tensor) + + orig_dims = orig_shape_list[0:-1] + width = output_shape[-1] + + return tf.reshape(output_tensor, orig_dims + [width]) + + +def assert_rank(tensor, expected_rank, name=None): + """Raises an exception if the tensor rank is not of the expected rank. + + Args: + tensor: A tf.Tensor to check the rank of. + expected_rank: Python integer or list of integers, expected rank. + name: Optional name of the tensor for the error message. + + Raises: + ValueError: If the expected shape doesn't match the actual shape. 
+ """ + if name is None: + name = tensor.name + + expected_rank_dict = {} + if isinstance(expected_rank, six.integer_types): + expected_rank_dict[expected_rank] = True + else: + for x in expected_rank: + expected_rank_dict[x] = True + + actual_rank = tensor.shape.ndims + if actual_rank not in expected_rank_dict: + scope_name = tf.get_variable_scope().name + raise ValueError( + "For the tensor `%s` in scope `%s`, the actual rank " + "`%d` (shape = %s) is not equal to the expected rank `%s`" % + (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) + + +def prelln_transformer_model(input_tensor, + attention_mask=None, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + intermediate_act_fn=gelu, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + do_return_all_layers=False, + shared_type='all', # None, + adapter_fn=None): + """Multi-headed, multi-layer Transformer from "Attention is All You Need". + + This is almost an exact implementation of the original Transformer encoder. + + See the original paper: + https://arxiv.org/abs/1706.03762 + + Also see: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. + attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, + seq_length], with 1 for positions that can be attended to and 0 in + positions that should not be. + hidden_size: int. Hidden size of the Transformer. + num_hidden_layers: int. Number of layers (blocks) in the Transformer. + num_attention_heads: int. Number of attention heads in the Transformer. + intermediate_size: int. The size of the "intermediate" (a.k.a., feed + forward) layer. + intermediate_act_fn: function. The non-linear activation function to apply + to the output of the intermediate/feed-forward layer. + hidden_dropout_prob: float. Dropout probability for the hidden layers. + attention_probs_dropout_prob: float. Dropout probability of the attention + probabilities. + initializer_range: float. Range of the initializer (stddev of truncated + normal). + do_return_all_layers: Whether to also return all layers or just the final + layer. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size], the final + hidden layer of the Transformer. + + Raises: + ValueError: A Tensor shape or parameter is invalid. + """ + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads)) + + attention_head_size = int(hidden_size / num_attention_heads) + + input_shape = bert_utils.get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + input_width = input_shape[2] + + # The Transformer performs sum residuals on all layers so the input needs + # to be the same as the hidden size. + if input_width != hidden_size: + raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % + (input_width, hidden_size)) + + # We keep the representation as a 2D tensor to avoid re-shaping it back and + # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on + # the GPU/CPU but may not be free on the TPU, so we want to minimize them to + # help the optimizer. 
+ prev_output = bert_utils.reshape_to_matrix(input_tensor) + + all_layer_outputs = [] + + def layer_scope(idx, shared_type): + if shared_type == 'all': + tmp = { + "layer": "layer_shared", + 'attention': 'attention', + 'intermediate': 'intermediate', + 'output': 'output' + } + elif shared_type == 'attention': + tmp = { + "layer": "layer_shared", + 'attention': 'attention', + 'intermediate': 'intermediate_{}'.format(idx), + 'output': 'output_{}'.format(idx) + } + elif shared_type == 'ffn': + tmp = { + "layer": "layer_shared", + 'attention': 'attention_{}'.format(idx), + 'intermediate': 'intermediate', + 'output': 'output' + } + else: + tmp = { + "layer": "layer_{}".format(idx), + 'attention': 'attention', + 'intermediate': 'intermediate', + 'output': 'output' + } + + return tmp + + all_layer_outputs = [] + + for layer_idx in range(num_hidden_layers): + + idx_scope = layer_scope(layer_idx, shared_type) + + with tf.variable_scope(idx_scope['layer'], reuse=tf.AUTO_REUSE): + layer_input = prev_output + + with tf.variable_scope(idx_scope['attention'], reuse=tf.AUTO_REUSE): + attention_heads = [] + + with tf.variable_scope("output", reuse=tf.AUTO_REUSE): + layer_input_pre = layer_norm(layer_input) + + with tf.variable_scope("self"): + attention_head = attention_layer( + from_tensor=layer_input_pre, + to_tensor=layer_input_pre, + attention_mask=attention_mask, + num_attention_heads=num_attention_heads, + size_per_head=attention_head_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + initializer_range=initializer_range, + do_return_2d_tensor=True, + batch_size=batch_size, + from_seq_length=seq_length, + to_seq_length=seq_length) + attention_heads.append(attention_head) + + attention_output = None + if len(attention_heads) == 1: + attention_output = attention_heads[0] + else: + # In the case where we have other sequences, we just concatenate + # them to the self-attention head before the projection. + attention_output = tf.concat(attention_heads, axis=-1) + + # Run a linear projection of `hidden_size` then add a residual + # with `layer_input`. + with tf.variable_scope("output", reuse=tf.AUTO_REUSE): + attention_output = tf.layers.dense( + attention_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + attention_output = dropout(attention_output, hidden_dropout_prob) + + # attention_output = layer_norm(attention_output + layer_input) + attention_output = attention_output + layer_input + + with tf.variable_scope(idx_scope['output'], reuse=tf.AUTO_REUSE): + attention_output_pre = layer_norm(attention_output) + + # The activation is only applied to the "intermediate" hidden layer. + with tf.variable_scope(idx_scope['intermediate'], reuse=tf.AUTO_REUSE): + intermediate_output = tf.layers.dense( + attention_output_pre, + intermediate_size, + activation=intermediate_act_fn, + kernel_initializer=create_initializer(initializer_range)) + + # Down-project back to `hidden_size` then add the residual. 
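+ # [Editor's note] In this pre-LN block the residual below is added without a
+ # trailing layer_norm (the post-LN call is left commented out); normalization
+ # instead happens at the start of the next sub-layer.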
+ with tf.variable_scope(idx_scope['output'], reuse=tf.AUTO_REUSE): + layer_output = tf.layers.dense( + intermediate_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + layer_output = dropout(layer_output, hidden_dropout_prob) + + # layer_output = layer_norm(layer_output + attention_output) + layer_output = layer_output + attention_output + prev_output = layer_output + all_layer_outputs.append(layer_output) + + if do_return_all_layers: + final_outputs = [] + for layer_output in all_layer_outputs: + final_output = bert_utils.reshape_from_matrix(layer_output, input_shape) + final_outputs.append(final_output) + return final_outputs + else: + final_output = bert_utils.reshape_from_matrix(prev_output, input_shape) + return final_output diff --git a/Basic/Albert/albert_tiny_tf/albert/optimization.py b/Basic/Albert/albert_tiny_tf/albert/optimization.py new file mode 100644 index 0000000..ac7e79a --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/optimization.py @@ -0,0 +1,300 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions and classes related to optimization (weight updates).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re +import tensorflow as tf + + +def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): + """Creates an optimizer training op.""" + global_step = tf.train.get_or_create_global_step() + + learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) + + # Implements linear decay of the learning rate. + learning_rate = tf.train.polynomial_decay( + learning_rate, + global_step, + num_train_steps, + end_learning_rate=0.0, + power=1.0, + cycle=False) + + # Implements linear warmup. I.e., if global_step < num_warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. + if num_warmup_steps: + global_steps_int = tf.cast(global_step, tf.int32) + warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) + + global_steps_float = tf.cast(global_steps_int, tf.float32) + warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) + + warmup_percent_done = global_steps_float / warmup_steps_float + warmup_learning_rate = init_lr * warmup_percent_done + + is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) + learning_rate = ( + (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) + + # It is recommended that you use this optimizer for fine tuning, since this + # is how the model was trained (note that the Adam m/v variables are NOT + # loaded from init_checkpoint.) 
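+ # [Editor's note] Although the comment above refers to the Adam m/v variables,
+ # the optimizer constructed here is the LAMBOptimizer defined below; it reuses
+ # the same m/v slot variables and decoupled weight decay, and adds a per-layer
+ # trust-ratio rescaling of the step.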
+ optimizer = LAMBOptimizer( + learning_rate=learning_rate, + weight_decay_rate=0.01, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) + + if use_tpu: + optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) + + tvars = tf.trainable_variables() + grads = tf.gradients(loss, tvars) + + # This is how the model was pre-trained. + (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) + + train_op = optimizer.apply_gradients( + zip(grads, tvars), global_step=global_step) + + # Normally the global step update is done inside of `apply_gradients`. + # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use + # a different optimizer, you should probably take this line out. + new_global_step = global_step + 1 + train_op = tf.group(train_op, [global_step.assign(new_global_step)]) + return train_op + + +class AdamWeightDecayOptimizer(tf.train.Optimizer): + """A basic Adam optimizer that includes "correct" L2 weight decay.""" + + def __init__(self, + learning_rate, + weight_decay_rate=0.0, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=None, + name="AdamWeightDecayOptimizer"): + """Constructs a AdamWeightDecayOptimizer.""" + super(AdamWeightDecayOptimizer, self).__init__(False, name) + + self.learning_rate = learning_rate + self.weight_decay_rate = weight_decay_rate + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.exclude_from_weight_decay = exclude_from_weight_decay + + def apply_gradients(self, grads_and_vars, global_step=None, name=None): + """See base class.""" + assignments = [] + for (grad, param) in grads_and_vars: + if grad is None or param is None: + continue + + param_name = self._get_variable_name(param.name) + + m = tf.get_variable( + name=param_name + "/adam_m", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + v = tf.get_variable( + name=param_name + "/adam_v", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + + # Standard Adam update. + next_m = ( + tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) + next_v = ( + tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, + tf.square(grad))) + + update = next_m / (tf.sqrt(next_v) + self.epsilon) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want ot decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. 
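+ # [Editor's note, update sketch] The decoupled decay below amounts to
+ #   update = next_m / (sqrt(next_v) + epsilon) + weight_decay_rate * param
+ #   param  = param - learning_rate * update
+ # rather than folding an L2 penalty into the gradient itself.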
+ if self._do_use_weight_decay(param_name): + update += self.weight_decay_rate * param + + update_with_lr = self.learning_rate * update + + next_param = param - update_with_lr + + assignments.extend( + [param.assign(next_param), + m.assign(next_m), + v.assign(next_v)]) + return tf.group(*assignments, name=name) + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + def _get_variable_name(self, param_name): + """Get the variable name from the tensor name.""" + m = re.match("^(.*):\\d+$", param_name) + if m is not None: + param_name = m.group(1) + return param_name + + +# +class LAMBOptimizer(tf.train.Optimizer): + """ + LAMBOptimizer optimizer. + https://github.com/ymcui/LAMB_Optimizer_TF + # IMPORTANT NOTE + - This is NOT an official implementation. + - LAMB optimizer is changed from arXiv v1 ~ v3. + - We implement v3 version (which is the latest version on June, 2019.). + - Our implementation is based on `AdamWeightDecayOptimizer` in BERT (provided by Google). + + # References + - Large Batch Optimization for Deep Learning: Training BERT in 76 minutes. https://arxiv.org/abs/1904.00962v3 + - BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. https://arxiv.org/abs/1810.04805 + # Parameters + - There is nothing special, just the same as `AdamWeightDecayOptimizer`. + """ + + def __init__(self, + learning_rate, + weight_decay_rate=0.01, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=None, + name="LAMBOptimizer"): + """Constructs a LAMBOptimizer.""" + super(LAMBOptimizer, self).__init__(False, name) + + self.learning_rate = learning_rate + self.weight_decay_rate = weight_decay_rate + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.exclude_from_weight_decay = exclude_from_weight_decay + + def apply_gradients(self, grads_and_vars, global_step=None, name=None): + """See base class.""" + assignments = [] + for (grad, param) in grads_and_vars: + if grad is None or param is None: + continue + + param_name = self._get_variable_name(param.name) + + m = tf.get_variable( + name=param_name + "/lamb_m", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + v = tf.get_variable( + name=param_name + "/lamb_v", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + + # Standard Adam update. + next_m = ( + tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) + next_v = ( + tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, + tf.square(grad))) + + update = next_m / (tf.sqrt(next_v) + self.epsilon) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want ot decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. 
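+ # [Editor's note] The LAMB-specific scaling applied further below is
+ #   r = ||param||_2 / ||update||_2   (falling back to 1.0 when either norm is 0)
+ #   param = param - (learning_rate * r) * update
+ # i.e. the Adam-style step is rescaled per variable by this trust ratio.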
+ if self._do_use_weight_decay(param_name): + update += self.weight_decay_rate * param + + ############## BELOW ARE THE SPECIFIC PARTS FOR LAMB ############## + + # Note: Here are two choices for scaling function \phi(z) + # minmax: \phi(z) = min(max(z, \gamma_l), \gamma_u) + # identity: \phi(z) = z + # The authors does not mention what is \gamma_l and \gamma_u + # UPDATE: after asking authors, they provide me the code below. + # ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where( + # math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0), 1.0) + + r1 = tf.sqrt(tf.reduce_sum(tf.square(param))) + r2 = tf.sqrt(tf.reduce_sum(tf.square(update))) + + r = tf.where(tf.greater(r1, 0.0), + tf.where(tf.greater(r2, 0.0), + r1 / r2, + 1.0), + 1.0) + + eta = self.learning_rate * r + + update_with_lr = eta * update + + next_param = param - update_with_lr + + assignments.extend( + [param.assign(next_param), + m.assign(next_m), + v.assign(next_v)]) + return tf.group(*assignments, name=name) + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + def _get_variable_name(self, param_name): + """Get the variable name from the tensor name.""" + m = re.match("^(.*):\\d+$", param_name) + if m is not None: + param_name = m.group(1) + return param_name diff --git a/Basic/Albert/albert_tiny_tf/albert/optimization_finetuning.py b/Basic/Albert/albert_tiny_tf/albert/optimization_finetuning.py new file mode 100644 index 0000000..dd9311b --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/optimization_finetuning.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions and classes related to optimization (weight updates).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re +import tensorflow as tf + + +def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): + """Creates an optimizer training op.""" + global_step = tf.train.get_or_create_global_step() + + learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) + + # Implements linear decay of the learning rate. + learning_rate = tf.train.polynomial_decay( + learning_rate, + global_step, + num_train_steps, + end_learning_rate=0.0, + power=1.0, + cycle=False) + + # Implements linear warmup. I.e., if global_step < num_warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. 
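+ # [Editor's note, illustrative numbers] With init_lr = 5e-5 and a hypothetical
+ # num_warmup_steps = 1000, step 500 uses 0.5 * 5e-5 = 2.5e-5; once
+ # global_step >= num_warmup_steps the polynomial_decay above takes over and
+ # the rate decreases linearly to 0.0 at num_train_steps (power=1.0,
+ # end_learning_rate=0.0).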
+ if num_warmup_steps: + global_steps_int = tf.cast(global_step, tf.int32) + warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) + + global_steps_float = tf.cast(global_steps_int, tf.float32) + warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) + + warmup_percent_done = global_steps_float / warmup_steps_float + warmup_learning_rate = init_lr * warmup_percent_done + + is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) + learning_rate = ( + (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) + + # It is recommended that you use this optimizer for fine tuning, since this + # is how the model was trained (note that the Adam m/v variables are NOT + # loaded from init_checkpoint.) + optimizer = AdamWeightDecayOptimizer( + learning_rate=learning_rate, + weight_decay_rate=0.01, + beta_1=0.9, + beta_2=0.999, # 0.98 ONLY USED FOR PRETRAIN. MUST CHANGE AT FINE-TUNING 0.999, + epsilon=1e-6, + exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) + + if use_tpu: + optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) + + tvars = tf.trainable_variables() + grads = tf.gradients(loss, tvars) + + # This is how the model was pre-trained. + # (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) + + train_op = optimizer.apply_gradients( + zip(grads, tvars), global_step=global_step) + + # Normally the global step update is done inside of `apply_gradients`. + # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use + # a different optimizer, you should probably take this line out. + new_global_step = global_step + 1 + train_op = tf.group(train_op, [global_step.assign(new_global_step)]) + return train_op + + +class AdamWeightDecayOptimizer(tf.train.Optimizer): + """A basic Adam optimizer that includes "correct" L2 weight decay.""" + + def __init__(self, + learning_rate, + weight_decay_rate=0.0, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=None, + name="AdamWeightDecayOptimizer"): + """Constructs a AdamWeightDecayOptimizer.""" + super(AdamWeightDecayOptimizer, self).__init__(False, name) + + self.learning_rate = learning_rate + self.weight_decay_rate = weight_decay_rate + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.exclude_from_weight_decay = exclude_from_weight_decay + + def apply_gradients(self, grads_and_vars, global_step=None, name=None): + """See base class.""" + assignments = [] + for (grad, param) in grads_and_vars: + if grad is None or param is None: + continue + + param_name = self._get_variable_name(param.name) + + m = tf.get_variable( + name=param_name + "/adam_m", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + v = tf.get_variable( + name=param_name + "/adam_v", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + + # Standard Adam update. + next_m = ( + tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) + next_v = ( + tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, + tf.square(grad))) + + update = next_m / (tf.sqrt(next_v) + self.epsilon) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want ot decay the weights in a manner that doesn't interact + # with the m/v parameters. 
This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. + if self._do_use_weight_decay(param_name): + update += self.weight_decay_rate * param + + update_with_lr = self.learning_rate * update + + next_param = param - update_with_lr + + assignments.extend( + [param.assign(next_param), + m.assign(next_m), + v.assign(next_v)]) + return tf.group(*assignments, name=name) + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + def _get_variable_name(self, param_name): + """Get the variable name from the tensor name.""" + m = re.match("^(.*):\\d+$", param_name) + if m is not None: + param_name = m.group(1) + return param_name diff --git a/Basic/Albert/albert_tiny_tf/albert/run.sh b/Basic/Albert/albert_tiny_tf/albert/run.sh new file mode 100644 index 0000000..1e093a6 --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/run.sh @@ -0,0 +1,4 @@ +python run_classifier.py --task_name=lcqmc --do_train=true --do_eval=true --data_dir=../task_data/lcqmc --vocab_file=pre_trained_model/albert_tiny/vocab.txt --bert_config_file=pre_trained_model/albert_tiny/albert_config_tiny.json --max_seq_length=128 --train_batch_size=64 --learning_rate=1e-4 --num_train_epochs=5 --output_dir=output/lcqmc --init_checkpoint=pre_trained_model/albert_tiny/albert_model.ckpt + + +python run_classifier.py --task_name=tnews --do_train=true --do_eval=true --data_dir=../task_data/tnews --vocab_file=pre_trained_model/albert_large/vocab.txt --bert_config_file=pre_trained_model/albert_large/albert_config_large.json --max_seq_length=128 --train_batch_size=8 --learning_rate=2e-5 --num_train_epochs=5 --output_dir=output/tnews --init_checkpoint=pre_trained_model/albert_large/albert_model.ckpt diff --git a/Basic/Albert/albert_tiny_tf/albert/run_classifier.py b/Basic/Albert/albert_tiny_tf/albert/run_classifier.py new file mode 100644 index 0000000..84acb91 --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/run_classifier.py @@ -0,0 +1,1013 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT finetuning runner.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import csv +import os +import modeling +import optimization_finetuning as optimization +import tokenization +import tensorflow as tf + +# from loss import bi_tempered_logistic_loss + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "data_dir", None, + "The input data dir. Should contain the .tsv files (or other data files) " + "for the task.") + +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. 
" + "This specifies the model architecture.") + +flags.DEFINE_string("task_name", None, "The name of the task to train.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_bool( + "do_predict", False, + "Whether to run the model in inference mode on the test set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, + "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class PaddingInputExample(object): + """Fake example so the num input examples is a multiple of the batch size. + When running eval/predict on the TPU, we need to pad the number of examples + to be a multiple of the batch size, because the TPU requires a fixed batch + size. The alternative is to drop the last batch, which is bad because it means + the entire output data won't be generated. + We use this class instead of `None` because treating `None` as padding + battches could cause silent errors. + """ + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + input_ids, + input_mask, + segment_ids, + label_id, + is_real_example=True): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.is_real_example = is_real_example + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_test_examples(self, data_dir): + """Gets a collection of `InputExample`s for prediction.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with tf.gfile.Open(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + lines.append(line) + return lines + + +def convert_single_example(ex_index, example, label_list, max_seq_length, + tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + + if isinstance(example, PaddingInputExample): + return InputFeatures( + input_ids=[0] * max_seq_length, + input_mask=[0] * max_seq_length, + segment_ids=[0] * max_seq_length, + label_id=0, + is_real_example=False) + + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). 
This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + return feature + + +def file_based_convert_examples_to_features( + examples, label_list, max_seq_length, tokenizer, output_file): + """Convert a set of `InputExample`s to a TFRecord file.""" + + writer = tf.python_io.TFRecordWriter(output_file) + + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["label_ids"] = create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + writer.close() + + +def file_based_input_fn_builder(input_file, seq_length, is_training, + drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.FixedLenFeature([], tf.int64), + "is_real_example": tf.FixedLenFeature([], tf.int64), + } + + def _decode_record(record, name_to_features): + 
"""Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder)) + + return d + + return input_fn + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, + labels, num_labels, use_one_hot_embeddings): + """Creates a classification model.""" + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + # In the demo, we are doing a simple classification task on the entire + # segment. + # + # If you want to use the token-level output, use model.get_sequence_output() + # instead. + output_layer = model.get_pooled_output() + + hidden_size = output_layer.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) + + with tf.variable_scope("loss"): + ln_type = bert_config.ln_type + if ln_type == 'preln': # add by brightmart, 10-06. if it is preln, we need to an additonal layer: layer normalization as suggested in paper "ON LAYER NORMALIZATION IN THE TRANSFORMER ARCHITECTURE" + print("ln_type is preln. 
add LN layer.") + output_layer = layer_norm(output_layer) + else: + print("ln_type is postln or other,do nothing.") + + if is_training: + # I.e., 0.1 dropout + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + + logits = tf.matmul(output_layer, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + print("logits", logits) + probabilities = tf.nn.softmax(logits, axis=-1) + log_probs = tf.nn.log_softmax(logits, axis=-1) + # + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + # per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=one_hot_labels) + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) # todo 08-29 try temp-loss + ###############bi_tempered_logistic_loss############################################################################ + # print("##cross entropy loss is used...."); tf.logging.info("##cross entropy loss is used....") + # t1=0.9 #t1=0.90 + # t2=1.05 #t2=1.05 + # per_example_loss=bi_tempered_logistic_loss(log_probs,one_hot_labels,t1,t2,label_smoothing=0.1,num_iters=5) # TODO label_smoothing=0.0 + # tf.logging.info("per_example_loss:"+str(per_example_loss.shape)) + ##############bi_tempered_logistic_loss############################################################################# + + loss = tf.reduce_mean(per_example_loss) + print("loss", loss) + + return (loss, per_example_loss, logits, probabilities) + + +def layer_norm(input_tensor, name=None): + """Run layer normalization on the last dimension of the tensor.""" + return tf.contrib.layers.layer_norm( + inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) + + +def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + is_real_example = None + if "is_real_example" in features: + is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) + else: + is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, probabilities) = create_model( + bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + + tvars = tf.trainable_variables() + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == 
tf.estimator.ModeKeys.TRAIN: + + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(per_example_loss, label_ids, logits, is_real_example): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.metrics.accuracy( + labels=label_ids, predictions=predictions, weights=is_real_example) + loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } + + eval_metrics = (metric_fn, + [per_example_loss, label_ids, logits, is_real_example]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + predictions={"probabilities": probabilities}, + scaffold_fn=scaffold_fn) + return output_spec + + return model_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. +def input_fn_builder(features, seq_length, is_training, drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + all_input_ids = [] + all_input_mask = [] + all_segment_ids = [] + all_label_ids = [] + + for feature in features: + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_segment_ids.append(feature.segment_ids) + all_label_ids.append(feature.label_id) + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + num_examples = len(features) + + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. 
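A quick aside on the `metric_fn` in the EVAL branch above: every example is weighted by `is_real_example`, so the fake `PaddingInputExample`s added for TPU evaluation contribute nothing to accuracy or loss. A minimal numpy sketch of that weighted accuracy, with made-up predictions for two real examples plus one padding example:

    import numpy as np

    # Hypothetical mini-batch: the last entry is a PaddingInputExample (weight 0.0).
    predictions     = np.array([1, 0, 1])
    label_ids       = np.array([1, 1, 0])
    is_real_example = np.array([1.0, 1.0, 0.0])

    # Mirrors tf.metrics.accuracy(labels=..., predictions=..., weights=is_real_example).
    correct  = (predictions == label_ids).astype(np.float64)
    accuracy = np.sum(correct * is_real_example) / np.sum(is_real_example)
    print(accuracy)  # 0.5 -- the padding example contributes nothing
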
+ d = tf.data.Dataset.from_tensor_slices({ + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "segment_ids": + tf.constant( + all_segment_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + "label_ids": + tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), + }) + + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) + return d + + return input_fn + + +class TNEWSClassifierProcessor(object): + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_data(os.path.join(data_dir, "train.txt")), "train") + # dev_0827.tsv + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_data(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_data(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self, data_dir): + """See base class.""" + with open(os.path.join(data_dir, "labels.txt"), "r", encoding="utf8") as fr: + labels = [line.strip() for line in fr.readlines()] + return labels + + def _read_data(self, data_dir): + with open(data_dir, "r", encoding="utf8") as fr: + lines = [line.strip().split("") for line in fr.readlines()] + return lines + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + try: + label = tokenization.convert_to_unicode(line[1]) + text_a = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_a, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + + +class LCQMCPairClassificationProcessor(DataProcessor): # TODO NEED CHANGE2 + """Processor for the internal data set. 
sentence pair classification""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + # dev_0827.tsv + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + # return ["-1","0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + # print('#i:',i,line) + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + try: + label = tokenization.convert_to_unicode(line[2]) + text_a = tokenization.convert_to_unicode(line[0]) + text_b = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + + +class SentencePairClassificationProcessor(DataProcessor): + """Processor for the internal data set. sentence pair classification""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train_0827.tsv")), "train") + # dev_0827.tsv + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_0827.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test_0827.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + # return ["-1","0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + # print('#i:',i,line) + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + try: + label = tokenization.convert_to_unicode(line[0]) + text_a = tokenization.convert_to_unicode(line[1]) + text_b = tokenization.convert_to_unicode(line[2]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. 
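As an aside on the two pair processors above: both read tab-separated files but expect different column orders. `LCQMCPairClassificationProcessor` takes `text_a`, `text_b`, `label` per row, while `SentencePairClassificationProcessor` takes `label`, `text_a`, `text_b`; both skip the first row as a header. Illustrative rows (with invented sentences) matching the index usage in `_create_examples`:

    # Hypothetical LCQMC-style row (train.tsv): text_a \t text_b \t label
    line = ["今天天气怎么样", "今天天气如何", "1"]
    text_a, text_b, label = line[0], line[1], line[2]

    # Hypothetical SentencePair-style row (train_0827.tsv): label \t text_a \t text_b
    line = ["1", "今天天气怎么样", "今天天气如何"]
    label, text_a, text_b = line[0], line[1], line[2]
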
+def convert_examples_to_features(examples, label_list, max_seq_length, + tokenizer): + """Convert a set of `InputExample`s to a list of `InputFeatures`.""" + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + features.append(feature) + return features + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + # 数据处理类 + processors = { + "sentence_pair": SentencePairClassificationProcessor, + "lcqmc": LCQMCPairClassificationProcessor, + "tnews": TNEWSClassifierProcessor + + } + + tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, + FLAGS.init_checkpoint) + + if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: + raise ValueError( + "At least one of `do_train`, `do_eval` or `do_predict' must be True.") + + # 加载模型参数 + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + if FLAGS.max_seq_length > bert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the BERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, bert_config.max_position_embeddings)) + + # 创建output文件 + tf.gfile.MakeDirs(FLAGS.output_dir) + + task_name = FLAGS.task_name.lower() + + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + processor = processors[task_name]() + + label_list = processor.get_labels() + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + # Cloud TPU: Invalid TPU configuration, ensure ClusterResolver is passed to tpu. + print("###tpu_cluster_resolver:", tpu_cluster_resolver) + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + train_examples = processor.get_train_examples(FLAGS.data_dir) # TODO + print("###length of total train_examples:", len(train_examples)) + num_train_steps = int(len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + model_fn = model_fn_builder( + bert_config=bert_config, + num_labels=len(label_list), + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
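A quick aside on the step arithmetic a few lines up: the training schedule is derived from the data size, batch size, epoch count and warmup proportion. With purely hypothetical numbers it works out like this:

    # Hypothetical values -- the real ones come from FLAGS and the training set.
    num_examples      = 100000
    train_batch_size  = 64
    num_train_epochs  = 3.0
    warmup_proportion = 0.1

    num_train_steps  = int(num_examples / train_batch_size * num_train_epochs)  # 4687
    num_warmup_steps = int(num_train_steps * warmup_proportion)                 # 468
    print(num_train_steps, num_warmup_steps)
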
+ estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + train_file = os.path.join(FLAGS.output_dir, "train.tf_record") + train_file_exists = os.path.exists(train_file) + print("###train_file_exists:", train_file_exists, " ;train_file:", train_file) + if not train_file_exists: # if tf_record file not exist, convert from raw text file. # TODO + file_based_convert_examples_to_features(train_examples, label_list, FLAGS.max_seq_length, tokenizer, + train_file) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = file_based_input_fn_builder( + input_file=train_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + + tensors_to_log = {"train loss": "loss/Mean:0"} + logging_hook = tf.train.LoggingTensorHook( + tensors=tensors_to_log, every_n_iter=100) + estimator.train(input_fn=train_input_fn, hooks=[logging_hook], max_steps=num_train_steps) + + if FLAGS.do_eval: + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + num_actual_eval_examples = len(eval_examples) + if FLAGS.use_tpu: + # TPU requires a fixed batch size for all batches, therefore the number + # of examples must be a multiple of the batch size, or else examples + # will get dropped. So we pad with fake examples which are ignored + # later on. These do NOT count towards the metric (all tf.metrics + # support a per-instance weight, and these get a weight of 0.0). + while len(eval_examples) % FLAGS.eval_batch_size != 0: + eval_examples.append(PaddingInputExample()) + + eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d (%d actual, %d padding)", + len(eval_examples), num_actual_eval_examples, + len(eval_examples) - num_actual_eval_examples) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + # This tells the estimator to run through the entire set. + eval_steps = None + # However, if running eval on the TPU, you will need to specify the + # number of steps. 
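On TPU the eval set is padded with `PaddingInputExample`s until its length is a multiple of the eval batch size, and `eval_steps` is then simply the padded length divided by that batch size. A small sketch with hypothetical sizes:

    # Hypothetical sizes, illustrating the padding logic used for TPU evaluation.
    num_actual_eval_examples = 1005
    eval_batch_size          = 8

    num_padding  = (-num_actual_eval_examples) % eval_batch_size  # 3 PaddingInputExamples appended
    padded_total = num_actual_eval_examples + num_padding         # 1008
    eval_steps   = padded_total // eval_batch_size                # 126
    print(num_padding, padded_total, eval_steps)
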
+ if FLAGS.use_tpu: + assert len(eval_examples) % FLAGS.eval_batch_size == 0 + eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) + + eval_drop_remainder = True if FLAGS.use_tpu else False + eval_input_fn = file_based_input_fn_builder( + input_file=eval_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=eval_drop_remainder) + + ####################################################################################################################### + # evaluate all checkpoints; you can use the checkpoint with the best dev accuarcy + steps_and_files = [] + filenames = tf.gfile.ListDirectory(FLAGS.output_dir) + for filename in filenames: + if filename.endswith(".index"): + ckpt_name = filename[:-6] + cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) + global_step = int(cur_filename.split("-")[-1]) + tf.logging.info("Add {} to eval list.".format(cur_filename)) + steps_and_files.append([global_step, cur_filename]) + steps_and_files = sorted(steps_and_files, key=lambda x: x[0]) + + output_eval_file = os.path.join(FLAGS.data_dir, "eval_results_albert_zh.txt") + print("output_eval_file:", output_eval_file) + tf.logging.info("output_eval_file:" + output_eval_file) + with tf.gfile.GFile(output_eval_file, "w") as writer: + for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]): + result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=filename) + + tf.logging.info("***** Eval results %s *****" % (filename)) + writer.write("***** Eval results %s *****\n" % (filename)) + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + ####################################################################################################################### + + # result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) + # + # output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + # with tf.gfile.GFile(output_eval_file, "w") as writer: + # tf.logging.info("***** Eval results *****") + # for key in sorted(result.keys()): + # tf.logging.info(" %s = %s", key, str(result[key])) + # writer.write("%s = %s\n" % (key, str(result[key]))) + + if FLAGS.do_predict: + predict_examples = processor.get_test_examples(FLAGS.data_dir) + num_actual_predict_examples = len(predict_examples) + if FLAGS.use_tpu: + # TPU requires a fixed batch size for all batches, therefore the number + # of examples must be a multiple of the batch size, or else examples + # will get dropped. So we pad with fake examples which are ignored + # later on. 
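A note on the checkpoint sweep in the evaluation block above: instead of evaluating only the latest checkpoint, it lists every `.index` file in the output directory, recovers the global step from the filename, and evaluates the checkpoints in training order. A minimal sketch of that bookkeeping, with hypothetical filenames:

    import os

    # Hypothetical directory listing; only the ".index" files identify checkpoints.
    output_dir = "albert_lcqmc_checkpoints"
    filenames  = ["model.ckpt-1000.index", "model.ckpt-2000.index",
                  "model.ckpt-500.index", "model.ckpt-2000.data-00000-of-00001"]

    steps_and_files = []
    for filename in filenames:
        if filename.endswith(".index"):
            ckpt_name   = filename[:-len(".index")]
            global_step = int(ckpt_name.split("-")[-1])
            steps_and_files.append([global_step, os.path.join(output_dir, ckpt_name)])

    # Evaluate oldest to newest: 500, 1000, 2000.
    steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
    print(steps_and_files)
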
+ while len(predict_examples) % FLAGS.predict_batch_size != 0: + predict_examples.append(PaddingInputExample()) + + predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") + file_based_convert_examples_to_features(predict_examples, label_list, + FLAGS.max_seq_length, tokenizer, + predict_file) + + tf.logging.info("***** Running prediction*****") + tf.logging.info(" Num examples = %d (%d actual, %d padding)", + len(predict_examples), num_actual_predict_examples, + len(predict_examples) - num_actual_predict_examples) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + + predict_drop_remainder = True if FLAGS.use_tpu else False + predict_input_fn = file_based_input_fn_builder( + input_file=predict_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=predict_drop_remainder) + + result = estimator.predict(input_fn=predict_input_fn) + + output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") + with tf.gfile.GFile(output_predict_file, "w") as writer: + num_written_lines = 0 + tf.logging.info("***** Predict results *****") + for (i, prediction) in enumerate(result): + probabilities = prediction["probabilities"] + if i >= num_actual_predict_examples: + break + output_line = "\t".join( + str(class_probability) + for class_probability in probabilities) + "\n" + writer.write(output_line) + num_written_lines += 1 + assert num_written_lines == num_actual_predict_examples + + +if __name__ == "__main__": + flags.mark_flag_as_required("data_dir") + flags.mark_flag_as_required("task_name") + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/Basic/Albert/albert_tiny_tf/albert/run_pretraining.py b/Basic/Albert/albert_tiny_tf/albert/run_pretraining.py new file mode 100644 index 0000000..346d8bb --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/run_pretraining.py @@ -0,0 +1,501 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Run masked LM/next sentence masked_lm pre-training for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import modeling +import optimization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. 
" + "This specifies the model architecture.") + +flags.DEFINE_string( + "input_file", None, + "Input TF example files (can be a glob or comma separated).") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded. Must match data generation.") + +flags.DEFINE_integer( + "max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence. " + "Must match data generation.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_integer("num_train_steps", 100000, "Number of training steps.") + +flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. 
Total number of TPU cores to use.") + + +def model_fn_builder(bert_config, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + masked_lm_positions = features["masked_lm_positions"] + masked_lm_ids = features["masked_lm_ids"] + masked_lm_weights = features["masked_lm_weights"] + next_sentence_labels = features["next_sentence_labels"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + (masked_lm_loss, + masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output( + bert_config, model.get_sequence_output(), model.get_embedding_table(),model.get_embedding_table_2(), + masked_lm_positions, masked_lm_ids, masked_lm_weights) + + (next_sentence_loss, next_sentence_example_loss, + next_sentence_log_probs) = get_next_sentence_output( + bert_config, model.get_pooled_output(), next_sentence_labels) + + total_loss = masked_lm_loss + next_sentence_loss + + tvars = tf.trainable_variables() + + initialized_variable_names = {} + print("init_checkpoint:",init_checkpoint) + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels): + """Computes the loss and accuracy of the model.""" + masked_lm_log_probs = tf.reshape(masked_lm_log_probs,[-1, masked_lm_log_probs.shape[-1]]) + masked_lm_predictions = tf.argmax(masked_lm_log_probs, axis=-1, output_type=tf.int32) + masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) + masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) + masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) + masked_lm_accuracy = tf.metrics.accuracy( + labels=masked_lm_ids, + predictions=masked_lm_predictions, + weights=masked_lm_weights) + masked_lm_mean_loss = tf.metrics.mean( + values=masked_lm_example_loss, weights=masked_lm_weights) + + 
next_sentence_log_probs = tf.reshape( + next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]]) + next_sentence_predictions = tf.argmax( + next_sentence_log_probs, axis=-1, output_type=tf.int32) + next_sentence_labels = tf.reshape(next_sentence_labels, [-1]) + next_sentence_accuracy = tf.metrics.accuracy( + labels=next_sentence_labels, predictions=next_sentence_predictions) + next_sentence_mean_loss = tf.metrics.mean( + values=next_sentence_example_loss) + + return { + "masked_lm_accuracy": masked_lm_accuracy, + "masked_lm_loss": masked_lm_mean_loss, + "next_sentence_accuracy": next_sentence_accuracy, + "next_sentence_loss": next_sentence_mean_loss, + } + + # next_sentence_example_loss=0.0 TODO + # next_sentence_log_probs=0.0 # TODO + eval_metrics = (metric_fn, [ + masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels + ]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def get_masked_lm_output(bert_config, input_tensor, output_weights,project_weights, positions, + label_ids, label_weights): + """Get loss and log probs for the masked LM.""" + input_tensor = gather_indexes(input_tensor, positions) + + with tf.variable_scope("cls/predictions"): + # We apply one more non-linear transformation before the output layer. + # This matrix is not used after pre-training. + with tf.variable_scope("transform"): + input_tensor = tf.layers.dense( + input_tensor, + units=bert_config.hidden_size, + activation=modeling.get_activation(bert_config.hidden_act), + kernel_initializer=modeling.create_initializer( + bert_config.initializer_range)) + input_tensor = modeling.layer_norm(input_tensor) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + output_bias = tf.get_variable( + "output_bias", + shape=[bert_config.vocab_size], + initializer=tf.zeros_initializer()) + # logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + # input_tensor=[-1,hidden_size], project_weights=[embedding_size, hidden_size], project_weights_transpose=[hidden_size, embedding_size]--->[-1, embedding_size] + input_project = tf.matmul(input_tensor, project_weights, transpose_b=True) + logits = tf.matmul(input_project, output_weights, transpose_b=True) + # # input_project=[-1, embedding_size], output_weights=[vocab_size, embedding_size], output_weights_transpose=[embedding_size, vocab_size] ---> [-1, vocab_size] + + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + label_ids = tf.reshape(label_ids, [-1]) + label_weights = tf.reshape(label_weights, [-1]) + + one_hot_labels = tf.one_hot(label_ids, depth=bert_config.vocab_size, dtype=tf.float32) + + # The `positions` tensor might be zero-padded (if the sequence is too + # short to have the maximum number of predictions). The `label_weights` + # tensor has a value of 1.0 for every real prediction and 0.0 for the + # padding predictions. 
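Stepping back to the two matmuls above: this is where ALBERT's factorized embedding shows up in the masked-LM head. The gathered hidden states are first projected from `hidden_size` down to the embedding size via `project_weights`, and only then multiplied against the shared `output_weights` of shape `[vocab_size, embedding_size]`. A numpy shape sketch with hypothetical sizes:

    import numpy as np

    # Hypothetical ALBERT-style sizes (not read from any config file here).
    num_predictions, hidden_size, embedding_size, vocab_size = 20, 312, 128, 21128

    input_tensor    = np.zeros((num_predictions, hidden_size))  # gathered masked positions
    project_weights = np.zeros((embedding_size, hidden_size))   # hidden -> embedding projection
    output_weights  = np.zeros((vocab_size, embedding_size))    # shared word-embedding table

    input_project = input_tensor.dot(project_weights.T)         # [num_predictions, embedding_size]
    logits        = input_project.dot(output_weights.T)         # [num_predictions, vocab_size]
    print(logits.shape)                                         # (20, 21128)

Compared with an unfactorized head, the output projection costs roughly `vocab_size * embedding_size` instead of `vocab_size * hidden_size` parameters. The loss that follows uses `label_weights` to zero out the padded prediction slots described in the comment just above.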
+ per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) + numerator = tf.reduce_sum(label_weights * per_example_loss) + denominator = tf.reduce_sum(label_weights) + 1e-5 + loss = numerator / denominator + + return (loss, per_example_loss, log_probs) + + +def get_next_sentence_output(bert_config, input_tensor, labels): + """Get loss and log probs for the next sentence prediction.""" + + # Simple binary classification. Note that 0 is "next sentence" and 1 is + # "random sentence". This weight matrix is not used after pre-training. + with tf.variable_scope("cls/seq_relationship"): + output_weights = tf.get_variable( + "output_weights", + shape=[2, bert_config.hidden_size], + initializer=modeling.create_initializer(bert_config.initializer_range)) + output_bias = tf.get_variable( + "output_bias", shape=[2], initializer=tf.zeros_initializer()) + + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + labels = tf.reshape(labels, [-1]) + one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32) + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + return (loss, per_example_loss, log_probs) + + +def gather_indexes(sequence_tensor, positions): + """Gathers the vectors at the specific positions over a minibatch.""" + sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) + batch_size = sequence_shape[0] + seq_length = sequence_shape[1] + width = sequence_shape[2] + + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + return output_tensor + + +def input_fn_builder(input_files, + max_seq_length, + max_predictions_per_seq, + is_training, + num_cpu_threads=4): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + name_to_features = { + "input_ids": + tf.FixedLenFeature([max_seq_length], tf.int64), + "input_mask": + tf.FixedLenFeature([max_seq_length], tf.int64), + "segment_ids": + tf.FixedLenFeature([max_seq_length], tf.int64), + "masked_lm_positions": + tf.FixedLenFeature([max_predictions_per_seq], tf.int64), + "masked_lm_ids": + tf.FixedLenFeature([max_predictions_per_seq], tf.int64), + "masked_lm_weights": + tf.FixedLenFeature([max_predictions_per_seq], tf.float32), + "next_sentence_labels": + tf.FixedLenFeature([1], tf.int64), + } + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + if is_training: + d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files)) + d = d.repeat() + d = d.shuffle(buffer_size=len(input_files)) + + # `cycle_length` is the number of parallel files that get read. + cycle_length = min(num_cpu_threads, len(input_files)) + + # `sloppy` mode means that the interleaving is not exact. This adds + # even more randomness to the training pipeline. 
+ d = d.apply( + tf.contrib.data.parallel_interleave( + tf.data.TFRecordDataset, + sloppy=is_training, + cycle_length=cycle_length)) + d = d.shuffle(buffer_size=100) + else: + d = tf.data.TFRecordDataset(input_files) + # Since we evaluate for a fixed number of steps we don't want to encounter + # out-of-range exceptions. + d = d.repeat() + + # We must `drop_remainder` on training because the TPU requires fixed + # size dimensions. For eval, we assume we are evaluating on the CPU or GPU + # and we *don't* want to drop the remainder, otherwise we wont cover + # every sample. + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + num_parallel_batches=num_cpu_threads, + drop_remainder=True)) + return d + + return input_fn + + +def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + if not FLAGS.do_train and not FLAGS.do_eval: # 必须是训练或验证的类型 + raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) # 从json文件中获得配置信息 + + tf.gfile.MakeDirs(FLAGS.output_dir) + + input_files = [] # 输入可以是多个文件,以“逗号隔开”;可以是一个匹配形式的,如“input_x*” + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Input Files ***") + for input_file in input_files: + tf.logging.info(" %s" % input_file) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( # TODO + tpu=FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + print("###tpu_cluster_resolver:",tpu_cluster_resolver,";FLAGS.use_tpu:",FLAGS.use_tpu,";FLAGS.tpu_name:",FLAGS.tpu_name,";FLAGS.tpu_zone:",FLAGS.tpu_zone) + # ###tpu_cluster_resolver: ;FLAGS.use_tpu: True ;FLAGS.tpu_name: grpc://10.240.1.83:8470 + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + keep_checkpoint_max=20, # 10 + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + model_fn = model_fn_builder( + bert_config=bert_config, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=FLAGS.num_train_steps, + num_warmup_steps=FLAGS.num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
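One small note on the input handling in `main` above: `FLAGS.input_file` is a comma-separated list of glob patterns that gets flattened into a single file list. A tiny sketch of the same expansion using the standard `glob` module and made-up paths:

    import glob

    # Hypothetical flag value: two comma-separated glob patterns.
    input_file_flag = "data/pretrain_part1_*.tfrecord,data/pretrain_part2_*.tfrecord"

    input_files = []
    for input_pattern in input_file_flag.split(","):
        # The script uses tf.gfile.Glob; plain glob.glob behaves the same for local paths.
        input_files.extend(glob.glob(input_pattern))

    print(input_files)  # empty list here unless the made-up files actually exist
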
+ estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size) + + if FLAGS.do_train: + tf.logging.info("***** Running training *****") + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + train_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=True) + estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps) + + if FLAGS.do_eval: + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + eval_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=False) + + result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with tf.gfile.GFile(output_eval_file, "w") as writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/Basic/Albert/albert_tiny_tf/albert/similarity.py b/Basic/Albert/albert_tiny_tf/albert/similarity.py new file mode 100644 index 0000000..a944bab --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/similarity.py @@ -0,0 +1,274 @@ +""" +进行文本相似度预测的示例。可以直接运行进行预测。 +参考了项目:https://github.com/chdd/bert-utils + +""" + + +import tensorflow as tf +import args +import tokenization +import modeling +from run_classifier import InputFeatures, InputExample, DataProcessor, create_model, convert_examples_to_features + + +# os.environ['CUDA_VISIBLE_DEVICES'] = '1' + + +class SimProcessor(DataProcessor): + def get_sentence_examples(self, questions): + examples = [] + for index, data in enumerate(questions): + guid = 'test-%d' % index + text_a = tokenization.convert_to_unicode(str(data[0])) + text_b = tokenization.convert_to_unicode(str(data[1])) + label = str(0) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_labels(self): + return ['0', '1'] + + +""" +模型类,负责载入checkpoint初始化模型 +""" +class BertSim: + def __init__(self, batch_size=args.batch_size): + self.mode = None + self.max_seq_length = args.max_seq_len + self.tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=True) + self.batch_size = batch_size + self.estimator = None + self.processor = SimProcessor() + tf.logging.set_verbosity(tf.logging.INFO) + + + + #载入estimator,构造模型 + def start_model(self): + self.estimator = self.get_estimator() + + + def model_fn_builder(self, bert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, + use_one_hot_embeddings): + """Returns `model_fn` closurimport_tfe for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + from tensorflow.python.estimator.model_fn import EstimatorSpec + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + 
input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, probabilities) = create_model( + bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + + tvars = tf.trainable_variables() + initialized_variable_names = {} + + if init_checkpoint: + (assignment_map, initialized_variable_names) \ + = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + output_spec = EstimatorSpec(mode=mode, predictions=probabilities) + + return output_spec + + return model_fn + + def get_estimator(self): + + from tensorflow.python.estimator.estimator import Estimator + from tensorflow.python.estimator.run_config import RunConfig + + bert_config = modeling.BertConfig.from_json_file(args.config_name) + label_list = self.processor.get_labels() + if self.mode == tf.estimator.ModeKeys.TRAIN: + init_checkpoint = args.ckpt_name + else: + init_checkpoint = args.output_dir + + model_fn = self.model_fn_builder( + bert_config=bert_config, + num_labels=len(label_list), + init_checkpoint=init_checkpoint, + learning_rate=args.learning_rate, + num_train_steps=None, + num_warmup_steps=None, + use_one_hot_embeddings=False) + + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory_fraction + config.log_device_placement = False + + return Estimator(model_fn=model_fn, config=RunConfig(session_config=config), model_dir=args.output_dir, + params={'batch_size': self.batch_size}) + + def predict_sentences(self,sentences): + results= self.estimator.predict(input_fn=input_fn_builder(self,sentences), yield_single_examples=False) + #打印预测结果 + for i in results: + print(i) + + def _truncate_seq_pair(self, tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + def convert_single_example(self, ex_index, example, label_list, max_seq_length, tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. 
+ # Account for [CLS], [SEP], [SEP] with "- 3" + self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id) + return feature + + + + +def input_fn_builder(bertSim,sentences): + def predict_input_fn(): + return (tf.data.Dataset.from_generator( + generate_from_input, + output_types={ + 'input_ids': tf.int32, + 'input_mask': tf.int32, + 'segment_ids': tf.int32, + 'label_ids': tf.int32}, + output_shapes={ + 'input_ids': (None, bertSim.max_seq_length), + 'input_mask': (None, bertSim.max_seq_length), + 'segment_ids': (None, bertSim.max_seq_length), + 'label_ids': (1,)}).prefetch(10)) + + def generate_from_input(): + processor = bertSim.processor + predict_examples = processor.get_sentence_examples(sentences) + features = convert_examples_to_features(predict_examples, processor.get_labels(), args.max_seq_len, + bertSim.tokenizer) + yield { + 'input_ids': [f.input_ids for f in features], + 'input_mask': [f.input_mask for f in features], + 'segment_ids': [f.segment_ids for f in features], + 'label_ids': 
[f.label_id for f in features] + } + + return predict_input_fn + + +if __name__ == '__main__': + sim = BertSim() + sim.start_model() + sim.predict_sentences([("我喜欢妈妈做的汤", "妈妈做的汤我很喜欢喝")]) diff --git a/Basic/Albert/albert_tiny_tf/albert/test_changes.py b/Basic/Albert/albert_tiny_tf/albert/test_changes.py new file mode 100644 index 0000000..f5f1d2e --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/test_changes.py @@ -0,0 +1,87 @@ +# coding=utf-8 +import tensorflow as tf +from modeling import embedding_lookup_factorized,transformer_model +import os + +""" +测试albert主要的改进点:词嵌入的因式分解、层间参数共享、段落间连贯性 +test main change of albert from bert +""" +batch_size = 2048 +sequence_length = 512 +vocab_size = 30000 +hidden_size = 1024 +num_attention_heads = int(hidden_size / 64) + +def get_total_parameters(): + """ + get total parameters of a graph + :return: + """ + total_parameters = 0 + for variable in tf.trainable_variables(): + # shape is an array of tf.Dimension + shape = variable.get_shape() + # print(shape) + # print(len(shape)) + variable_parameters = 1 + for dim in shape: + # print(dim) + variable_parameters *= dim.value + # print(variable_parameters) + total_parameters += variable_parameters + return total_parameters + +def test_factorized_embedding(): + """ + test of Factorized embedding parameterization + :return: + """ + input_ids=tf.zeros((batch_size, sequence_length),dtype=tf.int32) + output, embedding_table, embedding_table_2=embedding_lookup_factorized(input_ids,vocab_size,hidden_size) + print("output:",output) + +def test_share_parameters(): + """ + test of share parameters across all layers: how many parameter after share parameter across layers of transformer. + :return: + """ + def total_parameters_transformer(share_parameter_across_layers): + input_tensor=tf.zeros((batch_size, sequence_length, hidden_size),dtype=tf.float32) + print("transformer_model. input:",input_tensor) + transformer_result=transformer_model(input_tensor,hidden_size=hidden_size,num_attention_heads=num_attention_heads,share_parameter_across_layers=share_parameter_across_layers) + print("transformer_result:",transformer_result) + total_parameters=get_total_parameters() + print('total_parameters(not share):',total_parameters) + + share_parameter_across_layers=False + total_parameters_transformer(share_parameter_across_layers) # total parameters, not share: 125,976,576 = 125 million + + tf.reset_default_graph() # Clears the default graph stack and resets the global default graph + share_parameter_across_layers=True + total_parameters_transformer(share_parameter_across_layers) # total parameters, share: 10,498,048 = 10.5 million + +def test_sentence_order_prediction(): + """ + sentence order prediction. + + check method of create_instances_from_document_albert from create_pretrining_data.py + + :return: + """ + # 添加运行权限 + os.system("chmod +x create_pretrain_data.sh") + + os.system("./create_pretrain_data.sh") + + +# 1.test of Factorized embedding parameterization +#test_factorized_embedding() + +# 2. test of share parameters across all layers: how many parameter after share parameter across layers of transformer. +# before share parameter: 125,976,576; after share parameter: +#test_share_parameters() + +# 3. 
test of sentence order prediction(SOP) +test_sentence_order_prediction() + diff --git a/Basic/Albert/albert_tiny_tf/albert/tokenization.py b/Basic/Albert/albert_tiny_tf/albert/tokenization.py new file mode 100644 index 0000000..f7020e8 --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/tokenization.py @@ -0,0 +1,401 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import re +import unicodedata +import six +import tensorflow as tf + + +def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): + """Checks whether the casing config is consistent with the checkpoint name.""" + + # The casing has to be passed in by the user and there is no explicit check + # as to whether it matches the checkpoint. The casing information probably + # should have been stored in the bert_config.json file, but it's not, so + # we have to heuristically detect it to validate. + + if not init_checkpoint: + return + + m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint) + if m is None: + return + + model_name = m.group(1) + + lower_models = [ + "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12", + "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12" + ] + + cased_models = [ + "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16", + "multi_cased_L-12_H-768_A-12" + ] + + is_bad_config = False + if model_name in lower_models and not do_lower_case: + is_bad_config = True + actual_flag = "False" + case_name = "lowercased" + opposite_flag = "True" + + if model_name in cased_models and do_lower_case: + is_bad_config = True + actual_flag = "True" + case_name = "cased" + opposite_flag = "False" + + if is_bad_config: + raise ValueError( + "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. " + "However, `%s` seems to be a %s model, so you " + "should pass in `--do_lower_case=%s` so that the fine-tuning matches " + "how the model was pre-training. If this error is wrong, please " + "just comment out this check." 
% (actual_flag, init_checkpoint, + model_name, case_name, opposite_flag)) + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with tf.gfile.GFile(vocab_file, "r") as reader: + while True: + token = convert_to_unicode(reader.readline()) + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + #print("items:",items) #['[CLS]', '日', '##期', ',', '但', '被', '##告', '金', '##东', '##福', '载', '##明', '[MASK]', 'U', '##N', '##K', ']', '保', '##证', '本', '##月', '1', '##4', '[MASK]', '到', '##位', ',', '2', '##0', '##1', '##5', '年', '6', '[MASK]', '1', '##1', '日', '[', 'U', '##N', '##K', ']', ',', '原', '##告', '[MASK]', '认', '##可', '于', '2', '##0', '##1', '##5', '[MASK]', '6', '月', '[MASK]', '[MASK]', '日', '##向', '被', '##告', '主', '##张', '权', '##利', '。', '而', '[MASK]', '[MASK]', '自', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '年', '6', '月', '1', '##1', '日', '[SEP]', '原', '##告', '于', '2', '##0', '##1', '##6', '[MASK]', '6', '[MASK]', '2', '##4', '日', '起', '##诉', ',', '主', '##张', '保', '##证', '责', '##任', ',', '已', '超', '##过', '保', '##证', '期', '##限', '[MASK]', '保', '##证', '人', '依', '##法', '不', '##再', '承', '##担', '保', '##证', '[MASK]', '[MASK]', '[MASK]', '[SEP]'] + for i,item in enumerate(items): + #print(i,"item:",item) # ##期 + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def 
tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
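+    # For reference, the ranges checked below are: 0x4E00-0x9FFF (the base CJK
+    # Unified Ideographs block), 0x3400-0x4DBF plus the 0x20000-0x2CEAF ranges
+    # (Extensions A-E), and 0xF900-0xFAFF / 0x2F800-0x2FA1F (Compatibility
+    # Ideographs). Hangul, Hiragana and Katakana live outside these ranges.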
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenziation.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer. + + Returns: + A list of wordpiece tokens. + """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat in ("Cc", "Cf"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
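+  # The four ASCII ranges tested below are 33-47 (!"#$%&'()*+,-./),
+  # 58-64 (:;<=>?@), 91-96 ([\]^_`), and 123-126 ({|}~).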
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/Basic/Albert/albert_tiny_tf/albert_model/albert_tiny/albert_config.json b/Basic/Albert/albert_tiny_tf/albert_model/albert_tiny/albert_config.json new file mode 100644 index 0000000..dc97f5b --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert_model/albert_tiny/albert_config.json @@ -0,0 +1,23 @@ +{ + "attention_probs_dropout_prob": 0.0, + "directionality": "bidi", + "hidden_act": "gelu", + "hidden_dropout_prob": 0.0, + "hidden_size": 312, + "embedding_size": 128, + "initializer_range": 0.02, + "intermediate_size": 1248 , + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 4, + + "pooler_fc_size": 768, + "pooler_num_attention_heads": 12, + "pooler_num_fc_layers": 3, + "pooler_size_per_head": 128, + "pooler_type": "first_token_transform", + "type_vocab_size": 2, + "vocab_size": 21128, + "ln_type":"postln" + +} diff --git a/Basic/Albert/albert_tiny_tf/classifier_task/README.md b/Basic/Albert/albert_tiny_tf/classifier_task/README.md new file mode 100644 index 0000000..4804335 --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/classifier_task/README.md @@ -0,0 +1,32 @@ +#### config文件解读 + +##### 以inews_config.json为例 + +* model_name:模型名称 +* epochs:迭代epoch的数量 +* checkpoint_every:间隔多少步保存一次模型 +* eval_every:间隔多少步验证一次模型 +* learning_rate:学习速率,推荐2e-5, 5e-5, 1e-4 +* sequence_length:序列长度,单GPU时不要超过128 +* batch_size:单GPU时不要超过32 +* num_classes:文本分类的类别数量,若是二分类设置为1 +* warmup_rate:训练时的预热比例,建议0.05, 0.1 +* output_path:输出文件夹,用来存储label_to_index等文件 +* bert_model_path:预训练模型文件夹路径 +* train_data:训练数据路径 +* eval_data:验证数据路径 +* ckpt_model_path:checkpoint模型文件保存路径 + + +######处理好的数据 +data文件夹下 +训练数据中,train:valid=4:1 + + +#########训练和测试 +cd到classifier文件夹目录下 +训练:sh run.sh +测试:CUDA_VISIBLE_DEVICES=2 python testnew.py + +######运行环境 +tensorflow-GPU-1.14-cp37(与代码不是最合适的版本,会有许多警告) \ No newline at end of file diff --git a/BertToSimple/textcnn/predict.py b/BertToSimple/textcnn/predict.py new file mode 100644 index 0000000..80c6d98 --- /dev/null +++ b/BertToSimple/textcnn/predict.py @@ -0,0 +1,83 @@ +# -*-coding:utf-8-*- + +import pickle, numpy as np +import time +from keras.layers import * +from keras.models import Model +from keras.initializers import Constant +from keras.preprocessing import sequence +from keras.models import load_model +from keras.utils.np_utils import to_categorical +from sklearn.metrics import accuracy_score +from utils import load_data + +def get_textcnn(x_len, v_size, embs): + x = Input(shape=(x_len,),dtype='int32') + # embed = Embedding(v_size,300)(x) + embed = Embedding(v_size,300,embeddings_initializer=Constant(embs),trainable=False)(x) + cnn1 = Convolution1D(256,3,padding='same',strides=1,activation='relu')(embed) + cnn1 = MaxPool1D(pool_size=4)(cnn1) + cnn2 = Convolution1D(256,4,padding='same',strides=1,activation='relu')(embed) + cnn2 = MaxPool1D(pool_size=4)(cnn2) + cnn3 = Convolution1D(256,5,padding='same',strides=1,activation='relu')(embed) + cnn3 = MaxPool1D(pool_size=4)(cnn3) + cnn = concatenate([cnn1,cnn2,cnn3],axis=-1) + flat = Flatten()(cnn) + drop = Dropout(0.2,name='drop')(flat) + y = Dense(3,activation='softmax')(drop) + model = Model(inputs=x,outputs=y) + return model + +def get_birnn(x_len, v_size, embs): + x = Input(shape=(x_len,),dtype='int32') + # embed = Embedding(v_size,300)(x) + embed = 
Embedding(v_size,300,embeddings_initializer=Constant(embs),trainable=False)(x) + # bi = Bidirectional(GRU(256,activation='tanh',recurrent_dropout=0.2,dropout=0.2,return_sequences=True))(embed) + bi = Bidirectional(GRU(256,activation='tanh',recurrent_dropout=0.2,dropout=0.2))(embed) + bi_1 = Bidirectional(GRU(256, activation='tanh', recurrent_dropout=0.2, dropout=0.2))(embed) + y = Dense(3,activation='softmax')(bi_1) + model = Model(inputs=x,outputs=y) + return model + + +def predict(): + x_len = 50 + + # ----- ----- ----- ----- ----- + # from keras.datasets import imdb + # (x_tr,y_tr),(x_te,y_te) = imdb.load_data(num_words=10000) + # ----- ----- ----- ----- ----- + + name = 'hotel' # clothing, fruit, hotel, pda, shampoo + (x_tr,y_tr,_),(x_de,y_de,_),(x_te,y_te,_),v_size,embs = load_data(name) + x_tr = sequence.pad_sequences(x_tr,maxlen=x_len) + x_de = sequence.pad_sequences(x_de,maxlen=x_len) + x_te = sequence.pad_sequences(x_te,maxlen=x_len) + y_tr = to_categorical(y_tr,3) + y_de = to_categorical(y_de,3) + y_te = to_categorical(y_te,3) + #with open('data/cache/t_tr','rb') as fin: y_tr = pickle.load(fin) + #with open('data/cache/t_de','rb') as fin: y_de = pickle.load(fin) + # y_tr = to_categorical(y_tr.argmax(axis=1),3) + # y_de = to_categorical(y_de.argmax(axis=1),3) + + # ----- ----- predict ----- ----- + # 模型的加载及使用 + + print("Using loaded model to predict...") + + load_model1 = load_model("model_weight(textcnn).h5") + start= time.time() + predicted = load_model1.predict(x_te) + predict_1 = np.argmax(predicted,axis=1) + print(predict_1.shape) + end = time.time() + print('time:',end-start,' acc:',accuracy_score(y_te, predict_1)) + + load_model1.summary() + + # ----- ----- ----- ----- ----- + +if __name__ == '__main__': + # run_small() + predict() diff --git a/BertToSimple/textcnn/readme.md b/BertToSimple/textcnn/readme.md new file mode 100644 index 0000000..c3775e3 --- /dev/null +++ b/BertToSimple/textcnn/readme.md @@ -0,0 +1,10 @@ +数据格式: +0 询问是否认识借款人,我是那个 +invalid -1 yes-2 no-3 + +时间,17000条数据预测时间为12s,每条8ms 准确率:87.44% + +训练:test.py +预测: predict.py + +测试集,验证集,训练集请放在与代码同级的data文件夹 data下设置一个hotel文件夹放置,分别test.txt dev.txt train.txt diff --git a/BertToSimple/textcnn/test.py b/BertToSimple/textcnn/test.py new file mode 100644 index 0000000..b0084b9 --- /dev/null +++ b/BertToSimple/textcnn/test.py @@ -0,0 +1,89 @@ +# -*-coding:utf-8-*- + +import pickle, numpy as np +from keras.layers import * +from keras.models import Model +from keras.initializers import Constant +from keras.preprocessing import sequence +from keras.models import load_model +from keras.utils.np_utils import to_categorical +from utils import load_data + +def get_textcnn(x_len, v_size, embs): + x = Input(shape=(x_len,),dtype='int32') + # embed = Embedding(v_size,300)(x) + embed = Embedding(v_size,300,embeddings_initializer=Constant(embs),trainable=False)(x) + cnn1 = Convolution1D(256,3,padding='same',strides=1,activation='relu')(embed) + cnn1 = MaxPool1D(pool_size=4)(cnn1) + cnn2 = Convolution1D(256,4,padding='same',strides=1,activation='relu')(embed) + cnn2 = MaxPool1D(pool_size=4)(cnn2) + cnn3 = Convolution1D(256,5,padding='same',strides=1,activation='relu')(embed) + cnn3 = MaxPool1D(pool_size=4)(cnn3) + cnn = concatenate([cnn1,cnn2,cnn3],axis=-1) + flat = Flatten()(cnn) + drop = Dropout(0.2,name='drop')(flat) + y = Dense(3,activation='softmax')(drop) + model = Model(inputs=x,outputs=y) + return model + +def get_birnn(x_len, v_size, embs): + x = Input(shape=(x_len,),dtype='int32') + # embed = Embedding(v_size,300)(x) + embed = 
Embedding(v_size,300,embeddings_initializer=Constant(embs),trainable=False)(x) + # bi = Bidirectional(GRU(256,activation='tanh',recurrent_dropout=0.2,dropout=0.2,return_sequences=True))(embed) + bi = Bidirectional(GRU(256,activation='tanh',recurrent_dropout=0.2,dropout=0.2))(embed) + bi_1 = Bidirectional(GRU(256, activation='tanh', recurrent_dropout=0.2, dropout=0.2))(embed) + y = Dense(3,activation='softmax')(bi_1) + model = Model(inputs=x,outputs=y) + return model + +def run_small(): + x_len = 50 + name = 'hotel' # clothing, fruit, hotel, pda, shampoo + (x_tr,y_tr,_),_,(x_te,y_te,_),v_size,embs = load_data(name) + x_tr = sequence.pad_sequences(x_tr,maxlen=x_len) + x_te = sequence.pad_sequences(x_te,maxlen=x_len) + y_tr = to_categorical(y_tr,3) + y_te = to_categorical(y_te,3) + # model = get_textcnn(x_len,v_size,embs) + model = get_birnn(x_len,v_size,embs) + model.compile(loss='softmax_crossentropy',optimizer='adam',metrics=['accuracy']) + model.fit(x_tr,y_tr,batch_size=32,epochs=5,validation_data=(x_te,y_te)) + +def run_distill(): + x_len = 50 + + # ----- ----- ----- ----- ----- + # from keras.datasets import imdb + # (x_tr,y_tr),(x_te,y_te) = imdb.load_data(num_words=10000) + # ----- ----- ----- ----- ----- + + name = 'hotel' # clothing, fruit, hotel, pda, shampoo + (x_tr,y_tr,_),(x_de,y_de,_),(x_te,y_te,_),v_size,embs = load_data(name) + x_tr = sequence.pad_sequences(x_tr,maxlen=x_len) + x_de = sequence.pad_sequences(x_de,maxlen=x_len) + x_te = sequence.pad_sequences(x_te,maxlen=x_len) + y_tr = to_categorical(y_tr,3) + y_de = to_categorical(y_de,3) + y_te = to_categorical(y_te,3) + #with open('data/cache/t_tr','rb') as fin: y_tr = pickle.load(fin) + #with open('data/cache/t_de','rb') as fin: y_de = pickle.load(fin) + # y_tr = to_categorical(y_tr.argmax(axis=1),2) + # y_de = to_categorical(y_de.argmax(axis=1),2) + + # ----- ----- distill ----- ----- + model = get_textcnn(x_len,v_size,embs) + #model = get_birnn(x_len,v_size,embs) + x_tr = np.vstack([x_tr,x_de]) + y_tr = np.vstack([y_tr,y_de]) + model.compile(loss='mse',optimizer='adam',metrics=['accuracy']) + print(x_tr.shape,y_tr.shape,x_te.shape,y_te.shape,x_de.shape,y_de.shape) + # model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy']) + model.summary() + model.fit(x_tr,y_tr,batch_size=32,epochs=5,validation_data=(x_te,y_te)) + model.save('model_weight.h5') # creates a HDF5 file 'my_model.h5' + # ----- ----- ----- ----- ----- + +if __name__ == '__main__': + # run_small() + run_distill() diff --git a/BertToSimple/textcnn/utils.py b/BertToSimple/textcnn/utils.py new file mode 100644 index 0000000..b7a4f47 --- /dev/null +++ b/BertToSimple/textcnn/utils.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- + +import jieba, random, fileinput, numpy as np +from keras.preprocessing.text import Tokenizer +from sklearn.model_selection import train_test_split + +def load_data(name): + def get_w2v(): + for line in open('data/cache/word2vec',encoding='utf-8').read().strip().split('\n'): + line = line.strip().split() + if not line: continue + yield line[0],np.array(list(map(float,line[1:]))) + tokenizer = Tokenizer(filters='',lower=True,split=' ',oov_token=1) + texts = [' '.join(jieba.cut(line.split('\t',1)[1].strip()))\ + for line in open('data/{}/train.txt'.format(name,name),encoding='utf-8' + ).read().strip().split('\n')] + tokenizer.fit_on_texts(texts) + # with open('word2vec','w') as out: + # for line in fileinput.input('sgns.sogou.word'): + # word = line.strip().split()[0] + # if word in tokenizer.word_index: + # 
out.write(line+'\n') + # fileinput.close() + x_train,y_train = [],[]; text_train = [] + for line in open('data/{}/train.txt'.format(name),encoding='utf-8').read().strip().split('\n'): + label,text = line.split('\t',1) + text_train.append(text.strip()) + x_train.append(' '.join(jieba.cut(text.strip()))) + y_train.append(int(label)) + x_train = tokenizer.texts_to_sequences(x_train) + x_dev,y_dev = [],[]; text_dev = [] + for line in open('data/{}/dev.txt'.format(name),encoding='utf-8').read().strip().split('\n'): + label,text = line.split('\t',1) + text_dev.append(text.strip()) + x_dev.append(' '.join(jieba.cut(text.strip()))) + y_dev.append(int(label)) + x_dev = tokenizer.texts_to_sequences(x_dev) + x_test,y_test = [],[]; text_test = [] + for line in open('data/{}/test.txt'.format(name),encoding='utf-8').read().strip().split('\n'): + label,text = line.split('\t',1) + text_test.append(text.strip()) + x_test.append(' '.join(jieba.cut(text.strip()))) + y_test.append(int(label)) + x_test = tokenizer.texts_to_sequences(x_test) + v_size = len(tokenizer.word_index)+1 + embs,w2v = np.zeros((v_size,300)),dict(get_w2v()) + for word,index in tokenizer.word_index.items(): + if word in w2v: embs[index] = w2v[word] + return (x_train,y_train,text_train),\ + (x_dev,y_dev,text_dev),\ + (x_test,y_test,text_test),\ + v_size,embs + +if __name__ == '__main__': + load_data(name='hotel') diff --git a/Preprocessor/InfoExtraction/caiyang.py b/Preprocessor/InfoExtraction/caiyang.py new file mode 100644 index 0000000..3fbf0e4 --- /dev/null +++ b/Preprocessor/InfoExtraction/caiyang.py @@ -0,0 +1,9 @@ +import pandas as pd +def split(path): + total = pd.read_excel(path, encoding='utf-8') + set_1 = total.sample(n=1000, random_state=0, axis=0) + set_2 =total.loc[~total.index.isin(set_1.index)] + set_1.to_csv("m2_1000.csv") + set_2.to_csv("m2_yu.csv") + +split("input_m2_4-10/哈银M2.xlsx") \ No newline at end of file diff --git a/Preprocessor/InfoExtraction/lexicon_external.txt b/Preprocessor/InfoExtraction/lexicon_external.txt new file mode 100644 index 0000000..813d6a9 --- /dev/null +++ b/Preprocessor/InfoExtraction/lexicon_external.txt @@ -0,0 +1,23 @@ +是 +能 +唉 +行 +啊 +哦 +嗯 +额 +零 +好的 +对的 +好嘞 +好了 +是的 +等一下 +不是 +不知道 +知道了 +有印象 +打错了 +没问题 +我知道 +我知道了 \ No newline at end of file diff --git a/Preprocessor/InfoExtraction/m1.py b/Preprocessor/InfoExtraction/m1.py new file mode 100644 index 0000000..7f06504 --- /dev/null +++ b/Preprocessor/InfoExtraction/m1.py @@ -0,0 +1,209 @@ +import pandas as pd +import numpy as np +from copy import deepcopy +from m1_dict import ai_map,in_dct,out_dct +import jieba +import os + + +def del_duplicate(info, info_name): + """ + # 删除重复的文本 在同一个数据集下 + """ + # ['processid', 'in_node', 'type_robot', 'msg', 'msg_del_dup', 'out_true', 'type', 'type_combine'] + set_only = set() + new_info = [] + for item in info: + msg = item[3] + if msg not in set_only: + set_only.add(msg) + new_info.append(item) + # print("\n对 {} 去重, 被删除的重复的语料的数量(以msg为准): {}. 
Final number: {}" + # .format(info_name, len(info) - len(new_info), len(new_info))) + return np.array(new_info) + + +def build_map_by_what(info, what_col): + # 按照意图节点切分: what_col = 2; 按照type切分: what_col = 6 + # print(info) + info = np.array(info) + nodes = np.unique(np.array(info[:, what_col])) + map_temp = dict(zip(nodes, [[] for _ in range(len(nodes))])) + for item in info: + map_temp[item[what_col]].append(item) + return map_temp + + +def read_data(file_dir): + cols = ["通话状态","通话记录"] + all_data = [] + for root, dirs, files in os.walk(file_dir): + for curr_file in files: + print("read file: {}".format(os.path.join(root, curr_file))) + temp = pd.read_csv(os.path.join(root, curr_file), usecols=cols).values + all_data += list(temp) + # print("------------------------------") + # print(temp) + all_data = [x for x in all_data if x[0] == '已接听' and isinstance(x[1], str) and len(x[1]) != 0 ] + return all_data + + +def split_ai_me(data,ai_map,wu_dic,out_dic): + in_node = [] + type_robot = [] + me = [] + out_node = [] + type = [] + AI = [] + S_id = [] #session_id + for line in data: + # ["通话记录ID","通话状态", "通话记录详情"] + texts = line[1].split('\n') + # print("***********************************") + # print(texts) + # s_id = str(line[1]).strip('') + + # 初始化index count + index= 0 + pre_ai, rear_ai = '', '' #in_node 和 out_node + pre_ai_key,rear_ai_key='*','**' + # 遍历文本:AI 和 ME + while index < len(texts): + temp = texts[index] + if not texts[index].startswith('ME'): #当前的ai问题是什么 in_node + if texts[index].startswith('AI'): + tt = texts[index] + for key_word in ai_map.keys(): + if key_word in texts[index]: + # print("//////////////////////////////////") + # print(key_word) + # print(texts[index]) + pre_ai = ai_map[key_word] + pre_ai_key=key_word + # print(wu_dic[pre_ai_key][1]) + break + pre_ai_key='*' #有AI说话无关键词 + index += 1 + continue + while texts[index].startswith('ME'): + kk = texts[index] + # 当前标签 + index_ai = index + while index_ai < len(texts) and (not texts[index_ai].startswith('AI')): + index_ai += 1 + if index_ai < len(texts) and texts[index_ai].startswith('AI'):# me 回答后的ai问题是什么 out_node + for key_word in ai_map.keys(): + if key_word in texts[index_ai]: + rear_ai = ai_map[key_word] + rear_ai_key = key_word + break + rear_ai_key = '*' #有AI说话但无关键字 + in_node.append(wu_dic[pre_ai_key][0]) + type_robot.append(wu_dic[pre_ai_key][1]) + me.append(texts[index][3:]) + out_node.append(out_dic[rear_ai_key][0]) + type.append(out_dic[rear_ai_key][1]) + + AI.append(tt) + # S_id.append(s_id) + rear_ai_key ='**' #重置 + # index加一:下一条AI或ME + index += 1 + + # 把 p_id in_node type_robot me out_node type 放在一起 + ans, temp = [], [] + for index in range(len(in_node)): + if type_robot[index] != "": + temp.append('hayinm1') + temp.append(in_node[index]) + temp.append(type_robot[index]) + temp.append(me[index]) + temp.append(out_node[index]) + temp.append(type[index]) + temp.append(AI[index]) + # temp.append(S_id[index]) + ans.append(deepcopy(temp)) + temp.clear() + return ans + + +def del_dul_word(dul_word): + new_string = [] + pre_ch = None + for ch in dul_word: + if ch != pre_ch: + new_string.append(ch) + pre_ch = ch + return new_string + + +def insert_cols(data): + jieba.load_userdict('./lexicon_external.txt') + ans = [] + for item in data: + msg = item[3].strip() + # 仅保留中文 + msg = ''.join([ch for ch in msg if ('\u4e00' <= ch <= '\u9fa5')]) + if len(msg) == 0: + continue + item[3] = msg + # 去叠词:msg_del_dul + msg_del_dul = del_dul_word([word for word in jieba.cut(msg)]) + item.append(''.join(msg_del_dul)) + # item += [''] + 
item.append('') + ans.append(item) + print('过滤 msg左右空格+符号+英文+数字 仅留中文汉字: {} -> {}'.format(len(data), len(ans))) + return ans + + +if __name__ == '__main__': + input_dir = './imput_m1_11+_1000' + high_data = read_data(input_dir) + + # 处理 "通话记录详情" + high_data = split_ai_me(high_data,ai_map,in_dct,out_dct) + print("6666666666666666666666666") + print(high_data) + + #插入新列:去叠词 + high_data = insert_cols(high_data) + print("777777777777777777777777777") + print(high_data) + + # 去重复 + high_data_by_node = build_map_by_what(high_data, 2) + del high_data + high_data = [] + for key_node, val_node in high_data_by_node.items(): + high_data += list(del_duplicate(val_node, key_node)) + # print("当前数量量:{}".format(len(high_data))) + # del high_data_by_node + # 保存 + str = 'hayinm1duolun_new1' # 直接更改文件名标识 + sava_path = './output_m1_1000/'+str+'_result.csv' + # original_col = ["processid", "in_node", "type_robot", "msg", "out_true", "type", "AI_Q","session_id", "msg_del_dup", "type_combine"] + original_col = ["processid", "in_node", "type_robot", "msg", "out_true", "type", "AI_Q", "msg_del_dup", "type_combine"] + new_col = ["session_id","processid", "AI_Q", "in_node", "type_robot", "msg", "msg_del_dup", "out_true", "type", "type_combine"] + #所有数据保存 + high_data = pd.DataFrame(high_data, columns=original_col) + high_data = high_data.reindex(columns=new_col) + high_data.to_csv(sava_path, encoding='utf-8-sig', index_label='id') + + # # 划分:需要标注 and 不需要标注/入结点知识库+表示结束 + tag ,un_tag = [], [] + for item in high_data.values: + if item[3] in ['无AI其他','知识库','敏感词','其他','4.1','4.4','5.1','5.2','5.3','5.4','2.1','3.2']: + un_tag.append(item) + else: + tag.append(item) + + # 需要标注 + tag = sorted(tag, key=lambda k: (k[4])) + tag=pd.DataFrame(tag,columns=new_col) + tag.to_csv('./output_m1_1000/'+str+'_tag.csv',encoding='utf-8-sig', index_label='id') + + # 不需要标注 + un_tag = sorted(un_tag, key=lambda k: (k[4])) + un_tag = pd.DataFrame(un_tag, columns=new_col) + un_tag.to_csv('./output_m1_1000/'+str+'_untag.csv', encoding='utf-8-sig', index_label='id') \ No newline at end of file diff --git a/Preprocessor/InfoExtraction/m1_dict.py b/Preprocessor/InfoExtraction/m1_dict.py new file mode 100644 index 0000000..73c7cef --- /dev/null +++ b/Preprocessor/InfoExtraction/m1_dict.py @@ -0,0 +1,150 @@ + +ai_map = {"本人吗": '身份确认', + "您是不是": '身份确认', + "原因": '询问未还款原因', + "联系方式": '询问本人是否方便接听电话', + "方便接听": '询问本人是否方便接听电话', + "今天赶紧想办法": "询问今日能否还款", + "今天能": "询问今日能否还款", + "麻烦他本人": "请本人接听电话", + "那让他本人": "请本人接听电话", + "请您让他": "请本人接听电话", + "朋友": "是否为亲属朋友", + "今天务必": "结束4.1", + "能想办法还": "询问明日能否还款", + "明天您能": "询问明日能否还款", + "办理了相关业务": "请求转告", + "请他务必尽快解决": "结束4.4", + "记录下来": "结束5.1", + "很遗憾": "结束5.2", + "您的配合": "结束5.3", + "尽量帮忙转告": "结束5.4", + "来电是提醒您": "询问案源机构", + "消费贷款目前已经": "询问欠款金额", + "一笔应还款未处理,": "不认可金额", + "稍后重新处理": "还款失败", + "减免利息": "减免利息", + "还款日期应为": "询问逾期时间", + "人工接待": "转人工", + "办理分期": "如何办理账单分期", + "定时划扣": "如何自动扣款", + "还错": "还错卡", + "目的是": "询问工号", + "诶": "打断", + "无法在线核实": "询问绑定卡号", + "疑问": "质疑身份", + "逾期时间太长": "过几天还", + "我们的客服热线": "客服热线", + "智能语音助手": "询问是否为机器人", + "首先很抱歉": "投诉/涉媒/公检法", + "延时": "已还款", + "先忙": "在忙", + "好的, 稍后": "持卡人吸毒", + "节哀": "持卡人死亡", + "不好意思,稍后": "持卡人重病/持卡人坐牢", + "(直接挂机)": "挂机", + "您如果想把逾期影响降到最低":"没钱还", + "不良影响就越大":"不配合还款", + "便捷还款方式": "还款不便", + } +in_dct = {"本人吗": ['1.1', '身份确认'], + "您是不是": ['1.1', '身份确认'], + "原因": ['2.1', '询问未还款原因'], + "联系方式": ['2.2', '询问本人是否方便接听电话'], + "方便接听": ['2.2', '询问本人是否方便接听电话'], + "今天赶紧想办法": ['3.1', "询问今日能否还款"], + "今天能": ['3.1', "询问今日能否还款"], + "麻烦他本人": ["3.2", "请本人接听电话"], + "那让他本人": ["3.2", "请本人接听电话"], + 
"请您让他": ["3.2", "请本人接听电话"], + "朋友": ['3.3', "是否为亲属朋友"], + "今天务必": ['4.1', "结束4.1"], + "能想办法还": ['4.2', "询问明日能否还款"], + "明天您能": ['4.2', "询问明日能否还款"], + "办理了相关业务": ['4.3', "请求转告"], + "请他务必尽快解决": ['4.4', "结束4.4"], + "记录下来": ['5.1', "结束5.1"], + "很遗憾": ['5.2', "结束5.2"], + "您的配合": ['5.3', "结束5.3"], + "尽量帮忙转告": ['5.4', "结束5.4"], + "**": ['无AI的其他', ''], + "*": ['其他', ''], + "来电是提醒您": ['知识库', "询问案源机构"], + "消费贷款目前已经": ['知识库', "询问欠款金额"], + "一笔应还款未处理": ['知识库', "不认可金额"], + "稍后重新处理": ['知识库', "还款失败"], + "减免利息": ['知识库', "减免利息"], + "还款日期应为": ['知识库', "询问逾期时间"], + "人工接待": ['知识库', "转人工"], + "办理分期": ['知识库', "如何办理账单分期"], + "定时划扣": ['知识库', "如何自动扣款"], + "还错": ['知识库', "还错卡"], + "目的是": ['知识库', "询问工号"], + "诶": ['知识库', "打断"], + "无法在线核实": ['知识库', "询问绑定卡号"], + "疑问": ['知识库', "质疑身份"], + "逾期时间太长": ['知识库', "过几天还"], + "我们的客服热线": ['知识库', "客服热线"], + "智能语音助手": ['知识库', "询问是否为机器人"], + "首先很抱歉": ['敏感词', "投诉/涉媒/公检法"], + "延时": ['敏感词', "已还款"], + "先忙": ['敏感词', "在忙"], + "好的, 稍后": ['敏感词', "持卡人吸毒"], + "节哀": ['敏感词', "持卡人死亡"], + "不好意思,稍后": ['敏感词', "持卡人重病/持卡人坐牢"], + "(直接挂机)": ["知识库", "挂机"], + "您如果想把逾期影响降到最低": ["没钱还3.2","没钱还"], + "不良影响就越大": ["不配合还款2.1","不配合还款"], + "便捷还款方式":["还款不便3.2","还款不便"], + } + +out_dct = {"本人吗": ['1.1', '身份确认'], + "您是不是": ['1.1', '身份确认'], + "原因": ['2.1', '本人'], + "联系方式": ['2.2', '非本人'], + "方便接听": ['2.2', '非本人'], + "今天赶紧想办法": ['3.1', "说出原因"], + "今天能": ['3.1', "说出原因"], + "麻烦他本人": ["3.2", "方便"], + "那让他本人": ["3.2", "方便"], + "请您让他": ["3.2", "方便"], + "朋友": ['3.3', "不方便"], + "今天务必": ['4.1', "同意今日还"], + "能想办法还": ['4.2', "今日还不了"], + "明天您能": ['4.2', "今日还不了"], + "办理了相关业务": ['4.3', "关联人"], + "请他务必尽快解决": ['4.4', "非关联人"], + "记录下来": ['5.1', "同意明日还"], + "很遗憾": ['5.2', "明日还不了"], + "您的配合": ['5.3', "同意转告"], + "尽量帮忙转告": ['5.4', "不同意转告"], + "**": ['无AI的其他', ''], + "*": ['其他', ''], + "来电是提醒您": ['知识库', "询问案源机构"], + "消费贷款目前已经": ['知识库', "询问欠款金额"], + "一笔应还款未处理": ['知识库', "不认可金额"], + "稍后重新处理": ['知识库', "还款失败"], + "减免利息": ['知识库', "减免利息"], + "还款日期应为": ['知识库', "询问逾期时间"], + "人工接待": ['知识库', "转人工"], + "办理分期": ['知识库', "如何办理账单分期"], + "定时划扣": ['知识库', "如何自动扣款"], + "还错": ['知识库', "还错卡"], + "目的是": ['知识库', "询问工号"], + "诶": ['知识库', "打断"], + "无法在线核实": ['知识库', "询问绑定卡号"], + "疑问": ['知识库', "质疑身份"], + "逾期时间太长": ['知识库', "过几天还"], + "我们的客服热线": ['知识库', "客服热线"], + "智能语音助手": ['知识库', "询问是否为机器人"], + "首先很抱歉": ['敏感词', "投诉/涉媒/公检法"], + "延时": ['敏感词', "已还款"], + "先忙": ['敏感词', "在忙"], + "好的, 稍后": ['敏感词', "持卡人吸毒"], + "节哀": ['敏感词', "持卡人死亡"], + "不好意思,稍后": ['敏感词', "持卡人重病/持卡人坐牢"], + "(直接挂机)": ["知识库", "挂机"], + "您如果想把逾期影响降到最低": ["没钱还3.2", "没钱还"], + "不良影响就越大": ["不配合还款2.1", "不配合还款"], + "便捷还款方式": ["还款不便3.2", "还款不便"], + } \ No newline at end of file diff --git a/Preprocessor/InfoExtraction/m2.py b/Preprocessor/InfoExtraction/m2.py new file mode 100644 index 0000000..5df807a --- /dev/null +++ b/Preprocessor/InfoExtraction/m2.py @@ -0,0 +1,201 @@ +import pandas as pd +import numpy as np +from copy import deepcopy +from m2_dict import ai_map,in_dct,out_dct +import jieba +import os + + +def del_duplicate(info, info_name): + """ + # 删除重复的文本 在同一个数据集下 + """ + # ['processid', 'in_node', 'type_robot', 'msg', 'msg_del_dup', 'out_true', 'type', 'type_combine'] + set_only = set() + new_info = [] + for item in info: + msg = item[3] + if msg not in set_only: + set_only.add(msg) + new_info.append(item) + print("\n对 {} 去重, 被删除的重复的语料的数量(以msg为准): {}. 
Final number: {}" + .format(info_name, len(info) - len(new_info), len(new_info))) + return np.array(new_info) + + +def build_map_by_what(info, what_col): + # 按照意图节点切分: what_col = 2; 按照type切分: what_col = 6 + # print(info) + info = np.array(info) + nodes = np.unique(np.array(info[:, what_col])) + map_temp = dict(zip(nodes, [[] for _ in range(len(nodes))])) + for item in info: + map_temp[item[what_col]].append(item) + return map_temp + + +def read_data(file_dir): + cols = ["通话状态", "通话记录id","通话记录"] + all_data = [] + for root, dirs, files in os.walk(file_dir): + for curr_file in files: + print("read file: {}".format(os.path.join(root, curr_file))) + temp = pd.read_csv(os.path.join(root, curr_file), usecols=cols).values + print("---------------------------------") + print(temp) + all_data += list(temp) + all_data = [x for x in all_data if x[1] == '已接听' and isinstance(x[2], str) and len(x[2]) != 0 ] + return all_data + + +def split_ai_me(data,ai_map,wu_dic,out_dic): + in_node = [] + type_robot = [] + me = [] + out_node = [] + type = [] + AI = [] + S_id = [] #session_id + for line in data: + # ["通话记录ID","通话状态", "通话记录详情"] + texts = line[2].split('\n') + s_id = str(line[0]).strip('') + + # 初始化index count + index= 0 + pre_ai, rear_ai = '', '' #in_node 和 out_node + pre_ai_key,rear_ai_key='*','**' + # 遍历文本:AI 和 ME + while index < len(texts): + temp = texts[index] + if not texts[index].startswith('ME'): #当前的ai问题是什么 in_node + if texts[index].startswith('AI'): + tt = texts[index] + for key_word in ai_map.keys(): + if key_word in texts[index]: + pre_ai = ai_map[key_word] + pre_ai_key=key_word + break + pre_ai_key='*' #有AI说话无关键词 + index += 1 + continue + while texts[index].startswith('ME'): + kk = texts[index] + # 当前标签 + index_ai = index + while index_ai < len(texts) and (not texts[index_ai].startswith('AI')): + index_ai += 1 + if index_ai < len(texts) and texts[index_ai].startswith('AI'):# me 回答后的ai问题是什么 out_node + for key_word in ai_map.keys(): + if key_word in texts[index_ai]: + rear_ai = ai_map[key_word] + rear_ai_key = key_word + break + rear_ai_key = '*' #有AI说话但无关键字 + in_node.append(wu_dic[pre_ai_key][0]) + type_robot.append(wu_dic[pre_ai_key][1]) + me.append(texts[index][3:]) + out_node.append(out_dic[rear_ai_key][0]) + type.append(out_dic[rear_ai_key][1]) + + AI.append(tt) + S_id.append(s_id) + rear_ai_key ='**' #重置 + # index加一:下一条AI或ME + index += 1 + + # 把 p_id in_node type_robot me out_node type 放在一起 + ans, temp = [], [] + for index in range(len(in_node)): + if type_robot[index] != "": + temp.append('hayinm2') + temp.append(in_node[index]) + temp.append(type_robot[index]) + temp.append(me[index]) + temp.append(out_node[index]) + temp.append(type[index]) + temp.append(AI[index]) + temp.append(S_id[index]) + ans.append(deepcopy(temp)) + temp.clear() + return ans + + +def del_dul_word(dul_word): + new_string = [] + pre_ch = None + for ch in dul_word: + if ch != pre_ch: + new_string.append(ch) + pre_ch = ch + return new_string + + +def insert_cols(data): + jieba.load_userdict('./lexicon_external.txt') + ans = [] + for item in data: + msg = item[3].strip() + # 仅保留中文 + msg = ''.join([ch for ch in msg if ('\u4e00' <= ch <= '\u9fa5')]) + if len(msg) == 0: + continue + item[3] = msg + # 去叠词:msg_del_dul + msg_del_dul = del_dul_word([word for word in jieba.cut(msg)]) + item.append(''.join(msg_del_dul)) + # item += [''] + item.append('') + ans.append(item) + print('过滤 msg左右空格+符号+英文+数字 仅留中文汉字: {} -> {}'.format(len(data), len(ans))) + return ans + + +if __name__ == '__main__': + input_dir = 
'./input_m2_4-10_1000' + high_data = read_data(input_dir) + + + # 处理 "通话记录详情" + high_data = split_ai_me(high_data,ai_map,in_dct,out_dct) + + + #插入新列:去叠词 + high_data = insert_cols(high_data) + + + # 去重复 + high_data_by_node = build_map_by_what(high_data, 2) + del high_data + high_data = [] + for key_node, val_node in high_data_by_node.items(): + high_data += list(del_duplicate(val_node, key_node)) + print("当前数量量:{}".format(len(high_data))) + # del high_data_by_node + # 保存 + str = 'hayinm2duolun' # 直接更改文件名标识 + sava_path = './output_m2_1000/'+str+'_result.csv' + original_col = ["processid", "in_node", "type_robot", "msg", "out_true", "type", "AI_Q","session_id", "msg_del_dup", "type_combine"] + new_col = ["session_id","processid", "AI_Q", "in_node", "type_robot", "msg", "msg_del_dup", "out_true", "type", "type_combine"] + #所有数据保存 + high_data = pd.DataFrame(high_data, columns=original_col) + high_data = high_data.reindex(columns=new_col) + high_data.to_csv(sava_path, encoding='utf-8-sig', index_label='id') + + # # 划分:需要标注 and 不需要标注/入结点知识库+表示结束 + tag ,un_tag = [], [] + for item in high_data.values: + if item[3] in ['无AI其他','知识库','敏感词','其他','3.1','3.3','4.4','5.1/5.3','5.2/5.4','5.5','5.6'] : + un_tag.append(item) + else: + tag.append(item) + + # 需要标注 + tag = sorted(tag, key=lambda k: (k[4])) + tag=pd.DataFrame(tag,columns=new_col) + tag.to_csv('./output_m2_1000/'+str+'_tag.csv',encoding='utf-8-sig', index_label='id') + + # 不需要标注 + un_tag = sorted(un_tag, key=lambda k: (k[4])) + un_tag = pd.DataFrame(un_tag, columns=new_col) + un_tag.to_csv('./output_m2_1000/'+str+'_untag.csv', encoding='utf-8-sig', index_label='id') \ No newline at end of file diff --git a/Preprocessor/InfoExtraction/m2_dict.py b/Preprocessor/InfoExtraction/m2_dict.py new file mode 100644 index 0000000..e886d4a --- /dev/null +++ b/Preprocessor/InfoExtraction/m2_dict.py @@ -0,0 +1,184 @@ +ai_map = {"本人吗": "身份确认", + "您是不是": "身份确认", + "今天处理一下": "询问今日能否还款", + "您今天能否处理一下": "询问今日能否还款", + "联系方式": "询问电话号码是否是本人在用", + "这个号码": "询问电话号码是否是本人在用", + "务必今天尽快还款": "结束3.1", + "今天处理不了呢": "询问是没时间还是没钱", + "没时间还": "询问是没时间还是没钱", + "我们再联系他": "稍后联系麻烦转告接听", + "朋友": "是否为亲属或者朋友", + "明天能想办法处理吗": "询问明日能否还入最低还款额", + "明天先处理一下": "询问明日能否还入最低还款额", + "明天能把钱还": "询问明日能否还入最低还款额", + "明天务必": "询问明日能否还款", + "明天能不能": "询问明日能否还款", + "注意接听电话": "请求转告", + "再次联系他": "结束4.4", + "那稍后我会跟进您的还": "结束5.1/5.3", + "很遗憾未能与您达成": "结束5.2/5.4", + "非常感谢您的配合": "结束5.5", + "尽量帮忙转告": "结束5.6", + "**": ['无AI的其他', ''], + "*": ['其他', ''], + "来电是提醒您": "询问案源机构", + "消费贷款目前已经": "询问欠款金额", + "一笔应还款未处理,": "不认可金额", + "稍后重新处理": "还款失败", + "减免利息": "减免利息", + "还款日期应为": "询问逾期时间", + "人工接待": "转人工", + "办理分期": "如何办理账单分期", + "定时划扣": "如何自动扣款", + "还错": "还错卡", + "目的是": "询问工号", + "诶": "打断", + "无法在线核实": "询问绑定卡号", + "疑问": "质疑身份", + "逾期时间太长": "过几天还", + "我们的客服热线": "客服热线", + "智能语音助手": "询问是否为机器人", + "首先很抱歉": "投诉/涉媒/公检法", + "延时": "已还款", + "先忙": "在忙", + "好的, 稍后": "持卡人吸毒", + "节哀": "持卡人死亡", + "不好意思,稍后": "持卡人重病/持卡人坐牢", + + "(直接挂机)": "挂机", + + "您如果想把逾期影响降到最低": "没钱还", + + + "不良影响就越大": "不配合还款", + + + "明天赶紧想办法": "还款不便询问明日能否还款", + "明天能不能想办法": "还款不便询问明日能否还款", + "明天能想办法": "还款不便询问明日能否还款", + } + +in_dct = {"本人吗": ['1.1', "身份确认"], + "您是不是": ['1.1', "身份确认"], + "今天处理一下": ['2.1', "询问今日能否还款"], + "您今天能否处理一下": ['2.1', "询问今日能否还款"], + "联系方式": ['2.2', "询问电话号码是否是本人在用"], + "这个号码": ['2.2', "询问电话号码是否是本人在用"], + "务必今天尽快还款": ['3.1', "结束3.1"], + "今天处理不了呢": ['3.2', "询问是没时间还是没钱"], + "没时间还": ['3.2', "询问是没时间还是没钱"], + "我们再联系他": ['3.3', "稍后联系麻烦转告接听"], + "朋友": ['3.4', "是否为亲属或者朋友"], + "明天能想办法处理吗": ['4.1', "询问明日能否还入最低还款额"], + "明天先处理一下": ['4.1', "询问明日能否还入最低还款额"], + 
"明天能把钱还": ['4.1', "询问明日能否还入最低还款额"], + "明天务必": ['4.2', "询问明日能否还款"], + "明天能不能": ['4.2', "询问明日能否还款"], + "注意接听电话": ['4.3', "请求转告"], + "再次联系他": ['4.4', "结束4.4"], + "那稍后我会跟进您的还": ['5.1/5.3', "结束5.1/5.3"], + "很遗憾未能与您达成": ['5.2/5.4', "结束5.2/5.4"], + "非常感谢您的配合": ['5.5', "结束5.5"], + "尽量帮忙转告": ['5.6', "结束5.6"], + "**": ['无AI的其他', ''], + "*": ['其他', ''], + "来电是提醒您": ['知识库', "询问案源机构"], + "消费贷款目前已经": ['知识库', "询问欠款金额"], + "一笔应还款未处理": ['知识库', "不认可金额"], + "稍后重新处理": ['知识库', "还款失败"], + "减免利息": ['知识库', "减免利息"], + "还款日期应为": ['知识库', "询问逾期时间"], + "人工接待": ['知识库', "转人工"], + "办理分期": ['知识库', "如何办理账单分期"], + "定时划扣": ['知识库', "如何自动扣款"], + "还错": ['知识库', "还错卡"], + "目的是": ['知识库', "询问工号"], + "诶": ['知识库', "打断"], + "无法在线核实": ['知识库', "询问绑定卡号"], + "疑问": ['知识库', "质疑身份"], + "逾期时间太长": ['知识库', "过几天还"], + "我们的客服热线": ['知识库', "客服热线"], + "智能语音助手": ['知识库', "询问是否为机器人"], + "首先很抱歉": ['敏感词', "投诉/涉媒/公检法"], + "延时": ['敏感词', "已还款"], + "先忙": ['敏感词', "在忙"], + "好的, 稍后": ['敏感词', "持卡人吸毒"], + "节哀": ['敏感词', "持卡人死亡"], + "不好意思,稍后": ['敏感词', "持卡人重病/持卡人坐牢"], + + "(直接挂机)": ["知识库", "挂机"], + + "您如果想把逾期影响降到最低": ["没钱还3.2","没钱还"], + + + "不良影响就越大": ["不配合还款2.1","不配合还款"], + + + "明天赶紧想办法":["还款不便3.2","还款不便"], + "明天能不能想办法": ["还款不便3.2", "还款不便"], + "明天能想办法": ["还款不便3.2", "还款不便"] + + + } + +out_dct = {"本人吗": ['1.1', "身份确认"], + "您是不是": ['1.1', "身份确认"], + "今天处理一下": ['2.1', "本人"], + "您今天能否处理一下": ['2.1', "本人"], + "联系方式": ['2.2', "非本人"], + "这个号码": ['2.2', "非本人"], + "务必今天尽快还款": ['3.1', "同意今日还"], + "今天处理不了呢": ['3.2', "今日还不了"], + "没时间还": ['3.2', "今日还不了"], + "我们再联系他": ['3.3', "号码为本人在用"], + "朋友": ['3.4', "号码不是本人在用"], + "明天能想办法处理吗": ['4.1', "没钱"], + "明天先处理一下": ['4.1', "没钱"], + "明天能把钱还": ['4.1', "没钱"], + "明天务必": ['4.2', "没时间"], + "明天能不能": ['4.2', "没时间"], + "注意接听电话": ['4.3', "关联人"], + "再次联系他": ['4.4', "非关联人"], + "那稍后我会跟进您的还": ['5.1/5.3', "同意明日还"], + "很遗憾未能与您达成": ['5.2/5.4', "明日还不了"], + "非常感谢您的配合": ['5.5', "同意转告"], + "尽量帮忙转告": ['5.6', "不同意转告"], + "**": ['无AI的其他', ''], + "*": ['其他', ''], + "来电是提醒您": ['知识库', "询问案源机构"], + "消费贷款目前已经": ['知识库', "询问欠款金额"], + "一笔应还款未处理": ['知识库', "不认可金额"], + "稍后重新处理": ['知识库', "还款失败"], + "减免利息": ['知识库', "减免利息"], + "还款日期应为": ['知识库', "询问逾期时间"], + "人工接待": ['知识库', "转人工"], + "办理分期": ['知识库', "如何办理账单分期"], + "定时划扣": ['知识库', "如何自动扣款"], + "还错": ['知识库', "还错卡"], + "目的是": ['知识库', "询问工号"], + "诶": ['知识库', "打断"], + "无法在线核实": ['知识库', "询问绑定卡号"], + "疑问": ['知识库', "质疑身份"], + "逾期时间太长": ['知识库', "过几天还"], + "我们的客服热线": ['知识库', "客服热线"], + "智能语音助手": ['知识库', "询问是否为机器人"], + "首先很抱歉": ['敏感词', "投诉/涉媒/公检法"], + "延时": ['敏感词', "已还款"], + "先忙": ['敏感词', "在忙"], + "好的, 稍后": ['敏感词', "持卡人吸毒"], + "节哀": ['敏感词', "持卡人死亡"], + "不好意思,稍后": ['敏感词', "持卡人重病/持卡人坐牢"], + + "(直接挂机)": ["知识库", "挂机"], + + "您如果想把逾期影响降到最低": ["没钱还3.2", "没钱还"], + + "不良影响就越大": ["不配合还款2.1", "不配合还款"], + + "明天赶紧想办法": ["还款不便3.2", "还款不便"], + "明天能不能想办法": ["还款不便3.2", "还款不便"], + "明天能想办法": ["还款不便3.2", "还款不便"] + } + + diff --git a/Preprocessor/InfoExtraction/nodel_m1.py b/Preprocessor/InfoExtraction/nodel_m1.py new file mode 100644 index 0000000..7283080 --- /dev/null +++ b/Preprocessor/InfoExtraction/nodel_m1.py @@ -0,0 +1,163 @@ +import pandas as pd +import numpy as np +from copy import deepcopy +from m1_dict import ai_map,in_dct,out_dct +import jieba +import os + + + + +def read_data(file_dir): + cols = ["通话状态", "通话记录id","通话记录"] + all_data = [] + for root, dirs, files in os.walk(file_dir): + for curr_file in files: + print("read file: {}".format(os.path.join(root, curr_file))) + temp = pd.read_csv(os.path.join(root, curr_file), usecols=cols).values + all_data += list(temp) + all_data = [x for x in all_data if x[0] == '已接听' and isinstance(x[2], str) 
and len(x[2]) != 0 ] + return all_data + + +def split_ai_me(data,ai_map,wu_dic,out_dic): + in_node = [] + type_robot = [] + me = [] + out_node = [] + type = [] + AI = [] + S_id = [] #session_id + for line in data: + # ["通话记录ID","通话状态", "通话记录详情"] + texts = line[2].split('\n') + s_id = str(line[1]).strip('') + + # 初始化index count + index= 0 + pre_ai, rear_ai = '', '' #in_node 和 out_node + pre_ai_key,rear_ai_key='*','**' + # 遍历文本:AI 和 ME + while index < len(texts): + temp = texts[index] + if not texts[index].startswith('ME'): #当前的ai问题是什么 in_node + if texts[index].startswith('AI'): + tt = texts[index] + for key_word in ai_map.keys(): + if key_word in texts[index]: + pre_ai = ai_map[key_word] + pre_ai_key=key_word + break + pre_ai_key='*' #有AI说话无关键词 + index += 1 + continue + while texts[index].startswith('ME'): + kk = texts[index] + # 当前标签 + index_ai = index + while index_ai < len(texts) and (not texts[index_ai].startswith('AI')): + index_ai += 1 + if index_ai < len(texts) and texts[index_ai].startswith('AI'):# me 回答后的ai问题是什么 out_node + for key_word in ai_map.keys(): + if key_word in texts[index_ai]: + rear_ai = ai_map[key_word] + rear_ai_key = key_word + break + rear_ai_key = '*' #有AI说话但无关键字 + in_node.append(wu_dic[pre_ai_key][0]) + type_robot.append(wu_dic[pre_ai_key][1]) + me.append(texts[index][3:]) + out_node.append(out_dic[rear_ai_key][0]) + type.append(out_dic[rear_ai_key][1]) + + AI.append(tt) + S_id.append(s_id) + rear_ai_key ='**' #重置 + # index加一:下一条AI或ME + index += 1 + + # 把 p_id in_node type_robot me out_node type 放在一起 + ans, temp = [], [] + for index in range(len(in_node)): + temp.append('hayinm1') + temp.append(in_node[index]) + temp.append(type_robot[index]) + temp.append(me[index]) + temp.append(out_node[index]) + temp.append(type[index]) + temp.append(AI[index]) + temp.append(S_id[index]) + ans.append(deepcopy(temp)) + temp.clear() + return ans + + +def del_dul_word(dul_word): + new_string = [] + pre_ch = None + for ch in dul_word: + if ch != pre_ch: + new_string.append(ch) + pre_ch = ch + return new_string + + +def insert_cols(data): + jieba.load_userdict('./lexicon_external.txt') + ans = [] + for item in data: + msg = item[3].strip() + # 仅保留中文 + msg = ''.join([ch for ch in msg if ('\u4e00' <= ch <= '\u9fa5')]) + if len(msg) == 0: + continue + item[3] = msg + # 去叠词:msg_del_dul + msg_del_dul = del_dul_word([word for word in jieba.cut(msg)]) + item.append(''.join(msg_del_dul)) + # item += [''] + item.append('') + ans.append(item) + print('过滤 msg左右空格+符号+英文+数字 仅留中文汉字: {} -> {}'.format(len(data), len(ans))) + return ans + + +if __name__ == '__main__': + input_dir = './imput_m1_11+_1000' + high_data = read_data(input_dir) + + # 处理 "通话记录详情" + high_data = split_ai_me(high_data,ai_map,in_dct,out_dct) + + + #插入新列:去叠词 + high_data = insert_cols(high_data) + + + + str = 'hayinm1duolun_nodel_new1' # 直接更改文件名标识 + sava_path = './output_m1_1000/'+str+'_result.csv' + original_col = ["processid", "in_node", "type_robot", "msg", "out_true", "type", "AI_Q","session_id", "msg_del_dup", "type_combine"] + new_col = ["session_id","processid", "AI_Q", "in_node", "type_robot", "msg", "msg_del_dup", "out_true", "type", "type_combine"] + #所有数据保存 + high_data = pd.DataFrame(high_data, columns=original_col) + high_data = high_data.reindex(columns=new_col) + high_data.to_csv(sava_path, encoding='utf-8-sig', index_label='id') + + # # 划分:需要标注 and 不需要标注/入结点知识库+表示结束 + tag ,un_tag = [], [] + for item in high_data.values: + if item[3] in ['无AI其他','知识库','敏感词','其他','4.1','4.4','5.1','5.2','5.3','5.4','2.1','3.2']: + 
un_tag.append(item) + else: + tag.append(item) + + # 需要标注 + tag = sorted(tag, key=lambda k: (k[4])) + tag=pd.DataFrame(tag,columns=new_col) + tag.to_csv('./output_m1_1000/'+str+'_tag.csv',encoding='utf-8-sig', index_label='id') + + # 不需要标注 + un_tag = sorted(un_tag, key=lambda k: (k[4])) + un_tag = pd.DataFrame(un_tag, columns=new_col) + un_tag.to_csv('./output_m1_1000/'+str+'_untag.csv', encoding='utf-8-sig', index_label='id') \ No newline at end of file diff --git a/Preprocessor/InfoExtraction/nodel_m2.py b/Preprocessor/InfoExtraction/nodel_m2.py new file mode 100644 index 0000000..72bb35c --- /dev/null +++ b/Preprocessor/InfoExtraction/nodel_m2.py @@ -0,0 +1,163 @@ +import pandas as pd +import numpy as np +from copy import deepcopy +from m2_dict import ai_map,in_dct,out_dct +import jieba +import os + + + + +def read_data(file_dir): + cols = ["通话状态", "通话记录id","通话记录"] + all_data = [] + for root, dirs, files in os.walk(file_dir): + for curr_file in files: + print("read file: {}".format(os.path.join(root, curr_file))) + temp = pd.read_csv(os.path.join(root, curr_file), usecols=cols).values + all_data += list(temp) + all_data = [x for x in all_data if x[0] == '已接听' and isinstance(x[2], str) and len(x[2]) != 0 ] + return all_data + + +def split_ai_me(data,ai_map,wu_dic,out_dic): + in_node = [] + type_robot = [] + me = [] + out_node = [] + type = [] + AI = [] + S_id = [] #session_id + for line in data: + # ["通话记录ID","通话状态", "通话记录详情"] + texts = line[2].split('\n') + s_id = str(line[1]).strip('') + + # 初始化index count + index= 0 + pre_ai, rear_ai = '', '' #in_node 和 out_node + pre_ai_key,rear_ai_key='*','**' + # 遍历文本:AI 和 ME + while index < len(texts): + temp = texts[index] + if not texts[index].startswith('ME'): #当前的ai问题是什么 in_node + if texts[index].startswith('AI'): + tt = texts[index] + for key_word in ai_map.keys(): + if key_word in texts[index]: + pre_ai = ai_map[key_word] + pre_ai_key=key_word + break + pre_ai_key='*' #有AI说话无关键词 + index += 1 + continue + while texts[index].startswith('ME'): + kk = texts[index] + # 当前标签 + index_ai = index + while index_ai < len(texts) and (not texts[index_ai].startswith('AI')): + index_ai += 1 + if index_ai < len(texts) and texts[index_ai].startswith('AI'):# me 回答后的ai问题是什么 out_node + for key_word in ai_map.keys(): + if key_word in texts[index_ai]: + rear_ai = ai_map[key_word] + rear_ai_key = key_word + break + rear_ai_key = '*' #有AI说话但无关键字 + in_node.append(wu_dic[pre_ai_key][0]) + type_robot.append(wu_dic[pre_ai_key][1]) + me.append(texts[index][3:]) + out_node.append(out_dic[rear_ai_key][0]) + type.append(out_dic[rear_ai_key][1]) + + AI.append(tt) + S_id.append(s_id) + rear_ai_key ='**' #重置 + # index加一:下一条AI或ME + index += 1 + + # 把 p_id in_node type_robot me out_node type 放在一起 + ans, temp = [], [] + for index in range(len(in_node)): + temp.append('hayinm2') + temp.append(in_node[index]) + temp.append(type_robot[index]) + temp.append(me[index]) + temp.append(out_node[index]) + temp.append(type[index]) + temp.append(AI[index]) + temp.append(S_id[index]) + ans.append(deepcopy(temp)) + temp.clear() + return ans + + +def del_dul_word(dul_word): + new_string = [] + pre_ch = None + for ch in dul_word: + if ch != pre_ch: + new_string.append(ch) + pre_ch = ch + return new_string + + +def insert_cols(data): + jieba.load_userdict('./lexicon_external.txt') + ans = [] + for item in data: + msg = item[3].strip() + # 仅保留中文 + msg = ''.join([ch for ch in msg if ('\u4e00' <= ch <= '\u9fa5')]) + if len(msg) == 0: + continue + item[3] = msg + # 去叠词:msg_del_dul + msg_del_dul = 
del_dul_word([word for word in jieba.cut(msg)]) + item.append(''.join(msg_del_dul)) + # item += [''] + item.append('') + ans.append(item) + print('过滤 msg左右空格+符号+英文+数字 仅留中文汉字: {} -> {}'.format(len(data), len(ans))) + return ans + + +if __name__ == '__main__': + input_dir = './input_m2_4-10_1000' + high_data = read_data(input_dir) + # 处理 "通话记录详情" + high_data = split_ai_me(high_data,ai_map,in_dct,out_dct) + + + #插入新列:去叠词 + high_data = insert_cols(high_data) + + + + # 保存 + str = 'hayinm2duolun_nodel_new' # 直接更改文件名标识 + sava_path = './output_m2_1000/'+str+'_result.xlsx' + original_col = ["processid", "in_node", "type_robot", "msg", "out_true", "type", "AI_Q","session_id", "msg_del_dup", "type_combine"] + new_col = ["session_id","processid", "AI_Q", "in_node", "type_robot", "msg", "msg_del_dup", "out_true", "type", "type_combine"] + #所有数据保存 + high_data = pd.DataFrame(high_data, columns=original_col) + high_data = high_data.reindex(columns=new_col) + high_data.to_csv(sava_path, encoding='utf-8-sig', index_label='id') + + # # 划分:需要标注 and 不需要标注/入结点知识库+表示结束 + tag ,un_tag = [], [] + for item in high_data.values: + if item[3] in ['无AI其他','知识库','敏感词','其他','3.1','3.3','4.4','5.1/5.3','5.2/5.4','5.5','5.6'] : + un_tag.append(item) + else: + tag.append(item) + + # 需要标注 + tag = sorted(tag, key=lambda k: (k[4])) + tag=pd.DataFrame(tag,columns=new_col) + tag.to_csv('./output_m2_1000/'+str+'_tag.xlsx',encoding='utf-8-sig', index_label='id') + + # 不需要标注 + un_tag = sorted(un_tag, key=lambda k: (k[4])) + un_tag = pd.DataFrame(un_tag, columns=new_col) + un_tag.to_csv('./output_m2_1000/'+str+'_untag.xlsx', encoding='utf-8-sig', index_label='id') \ No newline at end of file diff --git "a/Preprocessor/InfoExtraction/\345\223\210\351\223\266\346\225\260\346\215\256\345\244\204\347\220\206\350\257\264\346\230\216.md" "b/Preprocessor/InfoExtraction/\345\223\210\351\223\266\346\225\260\346\215\256\345\244\204\347\220\206\350\257\264\346\230\216.md" new file mode 100644 index 0000000..6c4c8b7 --- /dev/null +++ "b/Preprocessor/InfoExtraction/\345\223\210\351\223\266\346\225\260\346\215\256\345\244\204\347\220\206\350\257\264\346\230\216.md" @@ -0,0 +1,14 @@ +# ReadeMe + +input*文件夹是原始通话记录数据 input*_1000是 +output*是数据解析后的文件存放位置。 +caiyang.py 是对原始数据随机采样1000条的结果 +m1.py 对m1数据进行解析然后去重之后的结果 +m2.py 对m1数据进行解析然后去重之后的结果 + +nodel_m1.py 是对m1数据进行解析按照入节点去重等待后续标注 +nodel_m2.py 是对m2数据进行解析按照入节点去重等待后续标注 + +*dict*.py 存放特定流程关键字的字典,用于解析数据时确定出入节点。 + +备注:在处理数据时,只需要更换相应流程的字典即可。特殊情况时可以考虑修改代码。 \ No newline at end of file
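As a closing illustration, the sketch below condenses the keyword-matching logic that split_ai_me() applies in m1.py, m2.py, nodel_m1.py and nodel_m2.py: each AI turn is matched against the keyword dictionaries to find the current question node, and every ME turn is labelled with the node of the preceding AI turn (in node) and the following AI turn (out node). This is a minimal, self-contained sketch only; label_turns(), the trimmed dictionaries and the demo transcript are illustrative stand-ins, and the real scripts additionally strip non-Chinese characters, collapse repeated words, deduplicate by message and split the output into tagged/untagged CSV files.

# Minimal sketch (not part of the repository) of the in/out node extraction.
# ai_map/in_dct/out_dct here are tiny stand-ins for m1_dict.py / m2_dict.py.
ai_map = {"本人吗": "身份确认", "原因": "询问未还款原因"}
in_dct = {"本人吗": ["1.1", "身份确认"], "原因": ["2.1", "询问未还款原因"], "*": ["其他", ""]}
out_dct = {"本人吗": ["1.1", "身份确认"], "原因": ["2.1", "本人"],
           "*": ["其他", ""], "**": ["无AI的其他", ""]}

def label_turns(transcript):
    """Return (in_node, customer_text, out_node) rows from an AI/ME transcript."""
    lines = transcript.split("\n")
    rows, pre_key = [], "*"
    for i, line in enumerate(lines):
        if line.startswith("AI"):
            # remember which question the robot just asked (the in node)
            pre_key = next((k for k in ai_map if k in line), "*")
        elif line.startswith("ME"):
            # the next AI turn (if any) determines the out node
            rear_key = "**"
            for later in lines[i + 1:]:
                if later.startswith("AI"):
                    rear_key = next((k for k in ai_map if k in later), "*")
                    break
            rows.append((in_dct[pre_key][0], line[3:].strip(), out_dct[rear_key][0]))
    return rows

demo = "AI: 请问是张先生本人吗\nME: 是的\nAI: 请问您未还款的原因是什么\nME: 最近没钱"
for row in label_turns(demo):
    print(row)  # ('1.1', '是的', '2.1') then ('2.1', '最近没钱', '无AI的其他')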