diff --git a/Basic/Albert/albert_tiny_tf/albert/__pycache__/__init__.cpython-36.pyc b/Basic/Albert/albert_tiny_tf/albert/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..07ec522 Binary files /dev/null and b/Basic/Albert/albert_tiny_tf/albert/__pycache__/__init__.cpython-36.pyc differ diff --git a/Basic/Albert/albert_tiny_tf/albert/args.py b/Basic/Albert/albert_tiny_tf/albert/args.py new file mode 100644 index 0000000..993c1bd --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/args.py @@ -0,0 +1,37 @@ +import os +import tensorflow as tf + +tf.logging.set_verbosity(tf.logging.INFO) + +file_path = os.path.dirname(__file__) + + +#模型目录 +model_dir = os.path.join(file_path, 'albert_lcqmc_checkpoints/') + +#config文件 +config_name = os.path.join(file_path, 'albert_config/albert_config.json') +#ckpt文件名称 +ckpt_name = os.path.join(model_dir, 'model.ckpt') +#输出文件目录 +output_dir = os.path.join(file_path, 'albert_lcqmc_checkpoints/') +#vocab文件目录 +vocab_file = os.path.join(file_path, 'albert_config/vocab.txt') +#数据目录 +data_dir = os.path.join(file_path, 'data/') + +num_train_epochs = 10 +batch_size = 128 +learning_rate = 0.00005 + +# gpu使用率 +gpu_memory_fraction = 0.8 + +# 默认取倒数第二层的输出值作为句向量 +layer_indexes = [-2] + +# 序列的最大程度,单文本建议把该值调小 +max_seq_len = 128 + +# graph名字 +graph_file = os.path.join(file_path, 'albert_lcqmc_checkpoints/graph') \ No newline at end of file diff --git a/Basic/Albert/albert_tiny_tf/albert/bert_utils.py b/Basic/Albert/albert_tiny_tf/albert/bert_utils.py new file mode 100644 index 0000000..f0a731f --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/bert_utils.py @@ -0,0 +1,148 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import copy +import json +import math +import re +import six +import tensorflow as tf + + +def get_shape_list(tensor, expected_rank=None, name=None): + """Returns a list of the shape of tensor, preferring static dimensions. + + Args: + tensor: A tf.Tensor object to find the shape of. + expected_rank: (optional) int. The expected rank of `tensor`. If this is + specified and the `tensor` has a different rank, and exception will be + thrown. + name: Optional name of the tensor for the error message. + + Returns: + A list of dimensions of the shape of tensor. All static dimensions will + be returned as python integers, and dynamic dimensions will be returned + as tf.Tensor scalars. + """ + if name is None: + name = tensor.name + + if expected_rank is not None: + assert_rank(tensor, expected_rank, name) + + shape = tensor.shape.as_list() + + non_static_indexes = [] + for (index, dim) in enumerate(shape): + if dim is None: + non_static_indexes.append(index) + + if not non_static_indexes: + return shape + + dyn_shape = tf.shape(tensor) + for index in non_static_indexes: + shape[index] = dyn_shape[index] + return shape + + +def reshape_to_matrix(input_tensor): + """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" + ndims = input_tensor.shape.ndims + if ndims < 2: + raise ValueError("Input tensor must have at least rank 2. 
Shape = %s" % + (input_tensor.shape)) + if ndims == 2: + return input_tensor + + width = input_tensor.shape[-1] + output_tensor = tf.reshape(input_tensor, [-1, width]) + return output_tensor + + +def reshape_from_matrix(output_tensor, orig_shape_list): + """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" + if len(orig_shape_list) == 2: + return output_tensor + + output_shape = get_shape_list(output_tensor) + + orig_dims = orig_shape_list[0:-1] + width = output_shape[-1] + + return tf.reshape(output_tensor, orig_dims + [width]) + + +def assert_rank(tensor, expected_rank, name=None): + """Raises an exception if the tensor rank is not of the expected rank. + + Args: + tensor: A tf.Tensor to check the rank of. + expected_rank: Python integer or list of integers, expected rank. + name: Optional name of the tensor for the error message. + + Raises: + ValueError: If the expected shape doesn't match the actual shape. + """ + if name is None: + name = tensor.name + + expected_rank_dict = {} + if isinstance(expected_rank, six.integer_types): + expected_rank_dict[expected_rank] = True + else: + for x in expected_rank: + expected_rank_dict[x] = True + + actual_rank = tensor.shape.ndims + if actual_rank not in expected_rank_dict: + scope_name = tf.get_variable_scope().name + raise ValueError( + "For the tensor `%s` in scope `%s`, the actual rank " + "`%d` (shape = %s) is not equal to the expected rank `%s`" % + (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) + + +def gather_indexes(sequence_tensor, positions): + """Gathers the vectors at the specific positions over a minibatch.""" + sequence_shape = get_shape_list(sequence_tensor, expected_rank=3) + batch_size = sequence_shape[0] + seq_length = sequence_shape[1] + width = sequence_shape[2] + + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + return output_tensor + + +# add sequence mask for: +# 1. random shuffle lm modeling---xlnet with random shuffled input +# 2. left2right and right2left language modeling +# 3. 
conditional generation +def generate_seq2seq_mask(attention_mask, mask_sequence, seq_type, **kargs): + if seq_type == 'seq2seq': + if mask_sequence is not None: + seq_shape = get_shape_list(mask_sequence, expected_rank=2) + seq_len = seq_shape[1] + ones = tf.ones((1, seq_len, seq_len)) + a_mask = tf.matrix_band_part(ones, -1, 0) + s_ex12 = tf.expand_dims(tf.expand_dims(mask_sequence, 1), 2) + s_ex13 = tf.expand_dims(tf.expand_dims(mask_sequence, 1), 3) + a_mask = (1 - s_ex13) * (1 - s_ex12) + s_ex13 * a_mask + # generate mask of batch x seq_len x seq_len + a_mask = tf.reshape(a_mask, (-1, seq_len, seq_len)) + out_mask = attention_mask * a_mask + else: + ones = tf.ones_like(attention_mask[:1]) + mask = (tf.matrix_band_part(ones, -1, 0)) + out_mask = attention_mask * mask + else: + out_mask = attention_mask + + return out_mask diff --git a/Basic/Albert/albert_tiny_tf/albert/create_pretrain_data.sh b/Basic/Albert/albert_tiny_tf/albert/create_pretrain_data.sh new file mode 100644 index 0000000..b7185f1 --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/create_pretrain_data.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +BERT_BASE_DIR=./albert_config +python3 create_pretraining_data.py --do_whole_word_mask=True --input_file=data/news_zh_1.txt \ +--output_file=data/tf_news_2016_zh_raw_news2016zh_1.tfrecord --vocab_file=$BERT_BASE_DIR/vocab.txt --do_lower_case=True \ +--max_seq_length=512 --max_predictions_per_seq=51 --masked_lm_prob=0.10 \ No newline at end of file diff --git a/Basic/Albert/albert_tiny_tf/albert/create_pretraining_data.py b/Basic/Albert/albert_tiny_tf/albert/create_pretraining_data.py new file mode 100644 index 0000000..63a4234 --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/create_pretraining_data.py @@ -0,0 +1,708 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Create masked LM/next sentence masked_lm TF examples for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import random +import tokenization +import tensorflow as tf +import jieba +import re +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, + "Input raw text file (or comma-separated list of files).") + +flags.DEFINE_string( + "output_file", None, + "Output TF example file (or comma-separated list of files).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. 
Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_bool( + "do_whole_word_mask", False, + "Whether to use whole word masking rather than per-WordPiece masking.") + +flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") + +flags.DEFINE_integer("max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence.") + +flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") + +flags.DEFINE_integer( + "dupe_factor", 10, + "Number of times to duplicate the input data (with different masks).") + +flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") + +flags.DEFINE_float( + "short_seq_prob", 0.1, + "Probability of creating sequences which are shorter than the " + "maximum length.") + +flags.DEFINE_bool("non_chinese", False,"manually set this to True if you are not doing chinese pre-train task.") + + +class TrainingInstance(object): + """A single training instance (sentence pair).""" + + def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, + is_random_next): + self.tokens = tokens + self.segment_ids = segment_ids + self.is_random_next = is_random_next + self.masked_lm_positions = masked_lm_positions + self.masked_lm_labels = masked_lm_labels + + def __str__(self): + s = "" + s += "tokens: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens])) + s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) + s += "is_random_next: %s\n" % self.is_random_next + s += "masked_lm_positions: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions])) + s += "masked_lm_labels: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels])) + s += "\n" + return s + + def __repr__(self): + return self.__str__() + + +def write_instance_to_example_files(instances, tokenizer, max_seq_length, + max_predictions_per_seq, output_files): + """Create TF example files from `TrainingInstance`s.""" + writers = [] + for output_file in output_files: + writers.append(tf.python_io.TFRecordWriter(output_file)) + + writer_index = 0 + + total_written = 0 + for (inst_index, instance) in enumerate(instances): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + assert len(input_ids) <= max_seq_length + + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) + masked_lm_weights = [1.0] * len(masked_lm_ids) + + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + masked_lm_weights.append(0.0) + + next_sentence_label = 1 if instance.is_random_next else 0 + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(input_ids) + features["input_mask"] = create_int_feature(input_mask) + features["segment_ids"] = create_int_feature(segment_ids) + features["masked_lm_positions"] = create_int_feature(masked_lm_positions) + features["masked_lm_ids"] = create_int_feature(masked_lm_ids) + features["masked_lm_weights"] = create_float_feature(masked_lm_weights) + features["next_sentence_labels"] = create_int_feature([next_sentence_label]) + + tf_example 
= tf.train.Example(features=tf.train.Features(feature=features)) + + writers[writer_index].write(tf_example.SerializeToString()) + writer_index = (writer_index + 1) % len(writers) + + total_written += 1 + + if inst_index < 20: + tf.logging.info("*** Example ***") + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in instance.tokens])) + + for feature_name in features.keys(): + feature = features[feature_name] + values = [] + if feature.int64_list.value: + values = feature.int64_list.value + elif feature.float_list.value: + values = feature.float_list.value + tf.logging.info( + "%s: %s" % (feature_name, " ".join([str(x) for x in values]))) + + for writer in writers: + writer.close() + + tf.logging.info("Wrote %d total instances", total_written) + + +def create_int_feature(values): + feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return feature + + +def create_float_feature(values): + feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return feature + + +def create_training_instances(input_files, tokenizer, max_seq_length, + dupe_factor, short_seq_prob, masked_lm_prob, + max_predictions_per_seq, rng): + """Create `TrainingInstance`s from raw text.""" + all_documents = [[]] + + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. (Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. + for input_file in input_files: + with tf.gfile.GFile(input_file, "r") as reader: + while True: + strings=reader.readline() + strings=strings.replace(" "," ").replace(" "," ") # 如果有两个或三个空格,替换为一个空格 + line = tokenization.convert_to_unicode(strings) + if not line: + break + line = line.strip() + + # Empty lines are used as document delimiters + if not line: + all_documents.append([]) + tokens = tokenizer.tokenize(line) + if tokens: + all_documents[-1].append(tokens) + + # Remove empty documents + all_documents = [x for x in all_documents if x] + rng.shuffle(all_documents) + + vocab_words = list(tokenizer.vocab.keys()) + instances = [] + for _ in range(dupe_factor): + for document_index in range(len(all_documents)): + instances.extend( + create_instances_from_document_albert( # change to albert style for sentence order prediction(SOP), 2019-08-28, brightmart + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) + + rng.shuffle(instances) + return instances + +def get_new_segment(segment): # 新增的方法 #### + """ + 输入一句话,返回一句经过处理的话: 为了支持中文全称mask,将被分开的词,将上特殊标记("#"),使得后续处理模块,能够知道哪些字是属于同一个词的。 + :param segment: 一句话. e.g. ['悬', '灸', '技', '术', '培', '训', '专', '家', '教', '你', '艾', '灸', '降', '血', '糖', ',', '为', '爸', '妈', '收', '好', '了', '!'] + :return: 一句处理过的话 e.g. 
['悬', '##灸', '技', '术', '培', '训', '专', '##家', '教', '你', '艾', '##灸', '降', '##血', '##糖', ',', '为', '爸', '##妈', '收', '##好', '了', '!'] + """ + seq_cws = jieba.lcut("".join(segment)) # 分词 + seq_cws_dict = {x: 1 for x in seq_cws} # 分词后的词加入到词典dict + new_segment = [] + i = 0 + while i < len(segment): # 从句子的第一个字开始处理,知道处理完整个句子 + if len(re.findall('[\u4E00-\u9FA5]', segment[i])) == 0: # 如果找不到中文的,原文加进去即不用特殊处理。 + new_segment.append(segment[i]) + i += 1 + continue + + has_add = False + for length in range(3, 0, -1): + if i + length > len(segment): + continue + if ''.join(segment[i:i + length]) in seq_cws_dict: + new_segment.append(segment[i]) + for l in range(1, length): + new_segment.append('##' + segment[i + l]) + i += length + has_add = True + break + if not has_add: + new_segment.append(segment[i]) + i += 1 + # print("get_new_segment.wwm.get_new_segment:",new_segment) + return new_segment + +def create_instances_from_document_albert( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document. + This method is changed to create sentence-order prediction (SOP) followed by idea from paper of ALBERT, 2019-08-28, brightmart + """ + document = all_documents[document_index] # 得到一个文档 + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. + target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: # 有一定的比例,如10%的概率,我们使用比较短的序列长度,以缓解预训练的长序列和调优阶段(可能的)短序列的不一致情况 + target_seq_length = rng.randint(2, max_num_tokens) + + # We DON'T just concatenate all of the tokens from a document into a long + # sequence and choose an arbitrary split point because this would make the + # next sentence prediction task too easy. Instead, we split the input into + # segments "A" and "B" based on the actual "sentences" provided by the user + # input. + # 设法使用实际的句子,而不是任意的截断句子,从而更好的构造句子连贯性预测的任务 + instances = [] + current_chunk = [] # 当前处理的文本段,包含多个句子 + current_length = 0 + i = 0 + # print("###document:",document) # 一个document可以是一整篇文章、新闻、词条等. 
document:[['是', '爷', '们', ',', '就', '得', '给', '媳', '妇', '幸', '福'], ['关', '注', '【', '晨', '曦', '教', '育', '】', ',', '获', '取', '育', '儿', '的', '智', '慧', ',', '与', '孩', '子', '一', '同', '成', '长', '!'], ['方', '法', ':', '打', '开', '微', '信', '→', '添', '加', '朋', '友', '→', '搜', '号', '→', '##he', '##bc', '##x', '##jy', '##→', '关', '注', '!', '我', '是', '一', '个', '爷', '们', ',', '孝', '顺', '是', '做', '人', '的', '第', '一', '准', '则', '。'], ['甭', '管', '小', '时', '候', '怎', '么', '跟', '家', '长', '犯', '混', '蛋', ',', '长', '大', '了', ',', '就', '底', '报', '答', '父', '母', ',', '以', '后', '我', '媳', '妇', '也', '必', '须', '孝', '顺', '。'], ['我', '是', '一', '个', '爷', '们', ',', '可', '以', '花', '心', ',', '可', '以', '好', '玩', '。'], ['但', '我', '一', '定', '会', '找', '一', '个', '管', '的', '住', '我', '的', '女', '人', ',', '和', '我', '一', '起', '生', '活', '。'], ['28', '岁', '以', '前', '在', '怎', '么', '玩', '都', '行', ',', '但', '我', '最', '后', '一', '定', '会', '找', '一', '个', '勤', '俭', '持', '家', '的', '女', '人', '。'], ['我', '是', '一', '爷', '们', ',', '我', '不', '会', '让', '自', '己', '的', '女', '人', '受', '一', '点', '委', '屈', ',', '每', '次', '把', '她', '抱', '在', '怀', '里', ',', '看', '她', '洋', '溢', '着', '幸', '福', '的', '脸', ',', '我', '都', '会', '引', '以', '为', '傲', ',', '这', '特', '么', '就', '是', '我', '的', '女', '人', '。'], ['我', '是', '一', '爷', '们', ',', '干', '什', '么', '也', '不', '能', '忘', '了', '自', '己', '媳', '妇', ',', '就', '算', '和', '哥', '们', '一', '起', '喝', '酒', ',', '喝', '到', '很', '晚', ',', '也', '要', '提', '前', '打', '电', '话', '告', '诉', '她', ',', '让', '她', '早', '点', '休', '息', '。'], ['我', '是', '一', '爷', '们', ',', '我', '媳', '妇', '绝', '对', '不', '能', '抽', '烟', ',', '喝', '酒', '还', '勉', '强', '过', '得', '去', ',', '不', '过', '该', '喝', '的', '时', '候', '喝', ',', '不', '该', '喝', '的', '时', '候', ',', '少', '扯', '纳', '极', '薄', '蛋', '。'], ['我', '是', '一', '爷', '们', ',', '我', '媳', '妇', '必', '须', '听', '我', '话', ',', '在', '人', '前', '一', '定', '要', '给', '我', '面', '子', ',', '回', '家', '了', '咱', '什', '么', '都', '好', '说', '。'], ['我', '是', '一', '爷', '们', ',', '就', '算', '难', '的', '吃', '不', '上', '饭', '了', ',', '都', '不', '张', '口', '跟', '媳', '妇', '要', '一', '分', '钱', '。'], ['我', '是', '一', '爷', '们', ',', '不', '管', '上', '学', '还', '是', '上', '班', ',', '我', '都', '会', '送', '媳', '妇', '回', '家', '。'], ['我', '是', '一', '爷', '们', ',', '交', '往', '不', '到', '1', '年', ',', '绝', '对', '不', '会', '和', '媳', '妇', '提', '过', '分', '的', '要', '求', ',', '我', '会', '尊', '重', '她', '。'], ['我', '是', '一', '爷', '们', ',', '游', '戏', '永', '远', '比', '不', '上', '我', '媳', '妇', '重', '要', ',', '只', '要', '媳', '妇', '发', '话', ',', '我', '绝', '对', '唯', '命', '是', '从', '。'], ['我', '是', '一', '爷', '们', ',', '上', 'q', '绝', '对', '是', '为', '了', '等', '媳', '妇', ',', '所', '有', '暧', '昧', '的', '心', '情', '只', '为', '她', '一', '个', '女', '人', '而', '写', ',', '我', '不', '一', '定', '会', '经', '常', '写', '日', '志', ',', '可', '是', '我', '会', '告', '诉', '全', '世', '界', ',', '我', '很', '爱', '她', '。'], ['我', '是', '一', '爷', '们', ',', '不', '一', '定', '要', '经', '常', '制', '造', '浪', '漫', '、', '偶', '尔', '过', '个', '节', '日', '也', '要', '送', '束', '玫', '瑰', '花', '给', '媳', '妇', '抱', '回', '家', '。'], ['我', '是', '一', '爷', '们', ',', '手', '机', '会', '24', '小', '时', '为', '她', '开', '机', ',', '让', '她', '半', '夜', '痛', '经', '的', '时', '候', ',', '做', '恶', '梦', '的', '时', '候', ',', '随', '时', '可', '以', '联', '系', '到', '我', '。'], ['我', '是', '一', '爷', '们', ',', '我', '会', '经', '常', '带', '媳', '妇', '出', '去', '玩', ',', '她', '不', '一', '定', '要', '和', '我', '所', '有', '的', '哥', '们', '都', '认', '识', ',', '但', '见', '面', '能', '说', '的', '上', '话', '就', '行', '。'], ['我', '是', '一', '爷', '们', ',', '我', '会', '和', '媳', '妇', '的', '姐', '妹', '哥', '们', '搞', '好', 
'关', '系', ',', '让', '她', '们', '相', '信', '我', '一', '定', '可', '以', '给', '我', '媳', '妇', '幸', '福', '。'], ['我', '是', '一', '爷', '们', ',', '吵', '架', '后', '、', '也', '要', '主', '动', '打', '电', '话', '关', '心', '她', ',', '咱', '是', '一', '爷', '们', ',', '给', '媳', '妇', '服', '个', '软', ',', '道', '个', '歉', '怎', '么', '了', '?'], ['我', '是', '一', '爷', '们', ',', '绝', '对', '不', '会', '嫌', '弃', '自', '己', '媳', '妇', ',', '拿', '她', '和', '别', '人', '比', ',', '说', '她', '这', '不', '如', '人', '家', ',', '纳', '不', '如', '人', '家', '的', '。'], ['我', '是', '一', '爷', '们', ',', '陪', '媳', '妇', '逛', '街', '时', ',', '碰', '见', '熟', '人', ',', '无', '论', '我', '媳', '妇', '长', '的', '好', '看', '与', '否', ',', '我', '都', '会', '大', '方', '的', '介', '绍', '。'], ['谁', '让', '咱', '爷', '们', '就', '好', '这', '口', '呢', '。'], ['我', '是', '一', '爷', '们', ',', '我', '想', '我', '会', '给', '我', '媳', '妇', '最', '好', '的', '幸', '福', '。'], ['【', '我', '们', '重', '在', '分', '享', '。'], ['所', '有', '文', '字', '和', '美', '图', ',', '来', '自', '网', '络', ',', '晨', '欣', '教', '育', '整', '理', '。'], ['对', '原', '文', '作', '者', ',', '表', '示', '敬', '意', '。'], ['】', '关', '注', '晨', '曦', '教', '育', '[UNK]', '[UNK]', '晨', '曦', '教', '育', '(', '微', '信', '号', ':', 'he', '##bc', '##x', '##jy', ')', '。'], ['打', '开', '微', '信', ',', '扫', '描', '二', '维', '码', ',', '关', '注', '[UNK]', '晨', '曦', '教', '育', '[UNK]', ',', '获', '取', '更', '多', '育', '儿', '资', '源', '。'], ['点', '击', '下', '面', '订', '阅', '按', '钮', '订', '阅', ',', '会', '有', '更', '多', '惊', '喜', '哦', '!']] + while i < len(document): # 从文档的第一个位置开始,按个往下看 + segment = document[i] # segment是列表,代表的是按字分开的一个完整句子,如 segment=['我', '是', '一', '爷', '们', ',', '我', '想', '我', '会', '给', '我', '媳', '妇', '最', '好', '的', '幸', '福', '。'] + if FLAGS.non_chinese==False: # if non chinese is False, that means it is chinese, then do something to make chinese whole word mask works. + segment = get_new_segment(segment) # whole word mask for chinese: 结合分词的中文的whole mask设置即在需要的地方加上“##” + + current_chunk.append(segment) # 将一个独立的句子加入到当前的文本块中 + current_length += len(segment) # 累计到为止位置接触到句子的总长度 + if i == len(document) - 1 or current_length >= target_seq_length: + # 如果累计的序列长度达到了目标的长度,或当前走到了文档结尾==>构造并添加到“A[SEP]B“中的A和B中; + if current_chunk: # 如果当前块不为空 + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. + a_end = 1 + if len(current_chunk) >= 2: # 当前块,如果包含超过两个句子,取当前块的一部分作为“A[SEP]B“中的A部分 + a_end = rng.randint(1, len(current_chunk) - 1) + # 将当前文本段中选取出来的前半部分,赋值给A即tokens_a + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + # 构造“A[SEP]B“中的B部分(有一部分是正常的当前文档中的后半部;在原BERT的实现中一部分是随机的从另一个文档中选取的,) + tokens_b = [] + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + + # 有百分之50%的概率交换一下tokens_a和tokens_b的位置 + # print("tokens_a length1:",len(tokens_a)) + # print("tokens_b length1:",len(tokens_b)) # len(tokens_b) = 0 + + if len(tokens_a) == 0 or len(tokens_b) == 0: i += 1; continue + if rng.random() < 0.5: # 交换一下tokens_a和tokens_b + is_random_next=True + temp=tokens_a + tokens_a=tokens_b + tokens_b=temp + else: + is_random_next=False + + truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) + + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + # 把tokens_a & tokens_b加入到按照bert的风格,即以[CLS]tokens_a[SEP]tokens_b[SEP]的形式,结合到一起,作为最终的tokens; 也带上segment_ids,前面部分segment_ids的值是0,后面部分的值是1. 
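+              # Assemble the final example in BERT format: [CLS] tokens_a [SEP] tokens_b [SEP],
+              # with segment_ids of 0 for the [CLS]/tokens_a portion and 1 for the tokens_b portion.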
+ tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + + tokens.append("[SEP]") + segment_ids.append(0) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + # 创建masked LM的任务的数据 Creates the predictions for the masked LM objective + (tokens, masked_lm_positions, + masked_lm_labels) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) + instance = TrainingInstance( # 创建训练实例的对象 + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels) + instances.append(instance) + current_chunk = [] # 清空当前块 + current_length = 0 # 重置当前文本块的长度 + i += 1 # 接着文档中的内容往后看 + + return instances + + +def create_instances_from_document_original( # THIS IS ORIGINAL BERT STYLE FOR CREATE DATA OF MLM AND NEXT SENTENCE PREDICTION TASK + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] # 得到一个文档 + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. + target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: # 有一定的比例,如10%的概率,我们使用比较短的序列长度,以缓解预训练的长序列和调优阶段(可能的)短序列的不一致情况 + target_seq_length = rng.randint(2, max_num_tokens) + + # We DON'T just concatenate all of the tokens from a document into a long + # sequence and choose an arbitrary split point because this would make the + # next sentence prediction task too easy. Instead, we split the input into + # segments "A" and "B" based on the actual "sentences" provided by the user + # input. + # 设法使用实际的句子,而不是任意的截断句子,从而更好的构造句子连贯性预测的任务 + instances = [] + current_chunk = [] # 当前处理的文本段,包含多个句子 + current_length = 0 + i = 0 + # print("###document:",document) # 一个document可以是一整篇文章、新闻、一个词条等. 
document:[['是', '爷', '们', ',', '就', '得', '给', '媳', '妇', '幸', '福'], ['关', '注', '【', '晨', '曦', '教', '育', '】', ',', '获', '取', '育', '儿', '的', '智', '慧', ',', '与', '孩', '子', '一', '同', '成', '长', '!'], ['方', '法', ':', '打', '开', '微', '信', '→', '添', '加', '朋', '友', '→', '搜', '号', '→', '##he', '##bc', '##x', '##jy', '##→', '关', '注', '!', '我', '是', '一', '个', '爷', '们', ',', '孝', '顺', '是', '做', '人', '的', '第', '一', '准', '则', '。'], ['甭', '管', '小', '时', '候', '怎', '么', '跟', '家', '长', '犯', '混', '蛋', ',', '长', '大', '了', ',', '就', '底', '报', '答', '父', '母', ',', '以', '后', '我', '媳', '妇', '也', '必', '须', '孝', '顺', '。'], ['我', '是', '一', '个', '爷', '们', ',', '可', '以', '花', '心', ',', '可', '以', '好', '玩', '。'], ['但', '我', '一', '定', '会', '找', '一', '个', '管', '的', '住', '我', '的', '女', '人', ',', '和', '我', '一', '起', '生', '活', '。'], ['28', '岁', '以', '前', '在', '怎', '么', '玩', '都', '行', ',', '但', '我', '最', '后', '一', '定', '会', '找', '一', '个', '勤', '俭', '持', '家', '的', '女', '人', '。'], ['我', '是', '一', '爷', '们', ',', '我', '不', '会', '让', '自', '己', '的', '女', '人', '受', '一', '点', '委', '屈', ',', '每', '次', '把', '她', '抱', '在', '怀', '里', ',', '看', '她', '洋', '溢', '着', '幸', '福', '的', '脸', ',', '我', '都', '会', '引', '以', '为', '傲', ',', '这', '特', '么', '就', '是', '我', '的', '女', '人', '。'], ['我', '是', '一', '爷', '们', ',', '干', '什', '么', '也', '不', '能', '忘', '了', '自', '己', '媳', '妇', ',', '就', '算', '和', '哥', '们', '一', '起', '喝', '酒', ',', '喝', '到', '很', '晚', ',', '也', '要', '提', '前', '打', '电', '话', '告', '诉', '她', ',', '让', '她', '早', '点', '休', '息', '。'], ['我', '是', '一', '爷', '们', ',', '我', '媳', '妇', '绝', '对', '不', '能', '抽', '烟', ',', '喝', '酒', '还', '勉', '强', '过', '得', '去', ',', '不', '过', '该', '喝', '的', '时', '候', '喝', ',', '不', '该', '喝', '的', '时', '候', ',', '少', '扯', '纳', '极', '薄', '蛋', '。'], ['我', '是', '一', '爷', '们', ',', '我', '媳', '妇', '必', '须', '听', '我', '话', ',', '在', '人', '前', '一', '定', '要', '给', '我', '面', '子', ',', '回', '家', '了', '咱', '什', '么', '都', '好', '说', '。'], ['我', '是', '一', '爷', '们', ',', '就', '算', '难', '的', '吃', '不', '上', '饭', '了', ',', '都', '不', '张', '口', '跟', '媳', '妇', '要', '一', '分', '钱', '。'], ['我', '是', '一', '爷', '们', ',', '不', '管', '上', '学', '还', '是', '上', '班', ',', '我', '都', '会', '送', '媳', '妇', '回', '家', '。'], ['我', '是', '一', '爷', '们', ',', '交', '往', '不', '到', '1', '年', ',', '绝', '对', '不', '会', '和', '媳', '妇', '提', '过', '分', '的', '要', '求', ',', '我', '会', '尊', '重', '她', '。'], ['我', '是', '一', '爷', '们', ',', '游', '戏', '永', '远', '比', '不', '上', '我', '媳', '妇', '重', '要', ',', '只', '要', '媳', '妇', '发', '话', ',', '我', '绝', '对', '唯', '命', '是', '从', '。'], ['我', '是', '一', '爷', '们', ',', '上', 'q', '绝', '对', '是', '为', '了', '等', '媳', '妇', ',', '所', '有', '暧', '昧', '的', '心', '情', '只', '为', '她', '一', '个', '女', '人', '而', '写', ',', '我', '不', '一', '定', '会', '经', '常', '写', '日', '志', ',', '可', '是', '我', '会', '告', '诉', '全', '世', '界', ',', '我', '很', '爱', '她', '。'], ['我', '是', '一', '爷', '们', ',', '不', '一', '定', '要', '经', '常', '制', '造', '浪', '漫', '、', '偶', '尔', '过', '个', '节', '日', '也', '要', '送', '束', '玫', '瑰', '花', '给', '媳', '妇', '抱', '回', '家', '。'], ['我', '是', '一', '爷', '们', ',', '手', '机', '会', '24', '小', '时', '为', '她', '开', '机', ',', '让', '她', '半', '夜', '痛', '经', '的', '时', '候', ',', '做', '恶', '梦', '的', '时', '候', ',', '随', '时', '可', '以', '联', '系', '到', '我', '。'], ['我', '是', '一', '爷', '们', ',', '我', '会', '经', '常', '带', '媳', '妇', '出', '去', '玩', ',', '她', '不', '一', '定', '要', '和', '我', '所', '有', '的', '哥', '们', '都', '认', '识', ',', '但', '见', '面', '能', '说', '的', '上', '话', '就', '行', '。'], ['我', '是', '一', '爷', '们', ',', '我', '会', '和', '媳', '妇', '的', '姐', '妹', '哥', '们', '搞', '好', 
'关', '系', ',', '让', '她', '们', '相', '信', '我', '一', '定', '可', '以', '给', '我', '媳', '妇', '幸', '福', '。'], ['我', '是', '一', '爷', '们', ',', '吵', '架', '后', '、', '也', '要', '主', '动', '打', '电', '话', '关', '心', '她', ',', '咱', '是', '一', '爷', '们', ',', '给', '媳', '妇', '服', '个', '软', ',', '道', '个', '歉', '怎', '么', '了', '?'], ['我', '是', '一', '爷', '们', ',', '绝', '对', '不', '会', '嫌', '弃', '自', '己', '媳', '妇', ',', '拿', '她', '和', '别', '人', '比', ',', '说', '她', '这', '不', '如', '人', '家', ',', '纳', '不', '如', '人', '家', '的', '。'], ['我', '是', '一', '爷', '们', ',', '陪', '媳', '妇', '逛', '街', '时', ',', '碰', '见', '熟', '人', ',', '无', '论', '我', '媳', '妇', '长', '的', '好', '看', '与', '否', ',', '我', '都', '会', '大', '方', '的', '介', '绍', '。'], ['谁', '让', '咱', '爷', '们', '就', '好', '这', '口', '呢', '。'], ['我', '是', '一', '爷', '们', ',', '我', '想', '我', '会', '给', '我', '媳', '妇', '最', '好', '的', '幸', '福', '。'], ['【', '我', '们', '重', '在', '分', '享', '。'], ['所', '有', '文', '字', '和', '美', '图', ',', '来', '自', '网', '络', ',', '晨', '欣', '教', '育', '整', '理', '。'], ['对', '原', '文', '作', '者', ',', '表', '示', '敬', '意', '。'], ['】', '关', '注', '晨', '曦', '教', '育', '[UNK]', '[UNK]', '晨', '曦', '教', '育', '(', '微', '信', '号', ':', 'he', '##bc', '##x', '##jy', ')', '。'], ['打', '开', '微', '信', ',', '扫', '描', '二', '维', '码', ',', '关', '注', '[UNK]', '晨', '曦', '教', '育', '[UNK]', ',', '获', '取', '更', '多', '育', '儿', '资', '源', '。'], ['点', '击', '下', '面', '订', '阅', '按', '钮', '订', '阅', ',', '会', '有', '更', '多', '惊', '喜', '哦', '!']] + while i < len(document): # 从文档的第一个位置开始,按个往下看 + segment = document[i] # segment是列表,代表的是按字分开的一个完整句子,如 segment=['我', '是', '一', '爷', '们', ',', '我', '想', '我', '会', '给', '我', '媳', '妇', '最', '好', '的', '幸', '福', '。'] + # print("###i:",i,";segment:",segment) + current_chunk.append(segment) # 将一个独立的句子加入到当前的文本块中 + current_length += len(segment) # 累计到为止位置接触到句子的总长度 + if i == len(document) - 1 or current_length >= target_seq_length: # 如果累计的序列长度达到了目标的长度==>构造并添加到“A[SEP]B“中的A和B中。 + if current_chunk: # 如果当前块不为空 + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. + a_end = 1 + if len(current_chunk) >= 2: # 当前块,如果包含超过两个句子,怎取当前块的一部分作为“A[SEP]B“中的A部分 + a_end = rng.randint(1, len(current_chunk) - 1) + # 将当前文本段中选取出来的前半部分,赋值给A即tokens_a + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + # 构造“A[SEP]B“中的B部分(原本的B有一部分是随机的从另一个文档中选取的,有一部分是正常的当前文档中的后半部) + tokens_b = [] + # Random next + is_random_next = False + if len(current_chunk) == 1 or rng.random() < 0.5: # 有50%的概率,是从其他文档中随机的选取一个文档,并得到这个文档的后半版本作为B即tokens_b + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. + random_document_index=0 + for _ in range(10): # 随机的选出一个与当前的文档不一样的文档的索引 + random_document_index = rng.randint(0, len(all_documents) - 1) + if random_document_index != document_index: + break + + random_document = all_documents[random_document_index] # 选出这个文档 + random_start = rng.randint(0, len(random_document) - 1) # 从这个文档选出一个段落的开始位置 + for j in range(random_start, len(random_document)): # 从这个文档的开始位置到结束,作为我们的“A[SEP]B“中的B即tokens_b + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. 这里是为了防止文本的浪费的一个小技巧 + num_unused_segments = len(current_chunk) - a_end # e.g. 
550-200=350 + i -= num_unused_segments # i=i-num_unused_segments, e.g. i=400, num_unused_segments=350, 那么 i=i-num_unused_segments=400-350=50 + # Actual next + else: # 有另外50%的几乎,从当前文本块(长度为max_sequence_length)中的后段中填充到tokens_b即“A[SEP]B“中的B。 + is_random_next = False + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) + + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + # 把tokens_a & tokens_b加入到按照bert的风格,即以[CLS]tokens_a[SEP]tokens_b[SEP]的形式,结合到一起,作为最终的tokens; 也带上segment_ids,前面部分segment_ids的值是0,后面部分的值是1. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + + tokens.append("[SEP]") + segment_ids.append(0) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + # 创建masked LM的任务的数据 Creates the predictions for the masked LM objective + (tokens, masked_lm_positions, + masked_lm_labels) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) + instance = TrainingInstance( # 创建训练实例的对象 + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels) + instances.append(instance) + current_chunk = [] # 清空当前块 + current_length = 0 # 重置当前文本块的长度 + i += 1 # 接着文档中的内容往后看 + + return instances + + +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) + + +def create_masked_lm_predictions(tokens, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng): + """Creates the predictions for the masked LM objective.""" + + cand_indexes = [] + for (i, token) in enumerate(tokens): + if token == "[CLS]" or token == "[SEP]": + continue + # Whole Word Masking means that if we mask all of the wordpieces + # corresponding to an original word. When a word has been split into + # WordPieces, the first token does not have any marker and any subsequence + # tokens are prefixed with ##. So whenever we see the ## token, we + # append it to the previous set of word indexes. + # + # Note that Whole Word Masking does *not* change the training code + # at all -- we still predict each WordPiece independently, softmaxed + # over the entire vocabulary. + if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and + token.startswith("##")): + cand_indexes[-1].append(i) + else: + cand_indexes.append([i]) + + rng.shuffle(cand_indexes) + + if FLAGS.non_chinese==False: # if non chinese is False, that means it is chinese, then try to remove "##" which is added previously + output_tokens = [t[2:] if len(re.findall('##[\u4E00-\u9FA5]', t)) > 0 else t for t in tokens] # 去掉"##" + else: # english and other language, which is not chinese + output_tokens = list(tokens) + + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + masked_lms = [] + covered_indexes = set() + for index_set in cand_indexes: + if len(masked_lms) >= num_to_predict: + break + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. 
+ if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + + masked_token = None + # 80% of the time, replace with [MASK] + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # 10% of the time, keep original + if rng.random() < 0.5: + if FLAGS.non_chinese == False: # if non chinese is False, that means it is chinese, then try to remove "##" which is added previously + masked_token = tokens[index][2:] if len(re.findall('##[\u4E00-\u9FA5]', tokens[index])) > 0 else tokens[index] # 去掉"##" + else: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] + + output_tokens[index] = masked_token + + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + assert len(masked_lms) <= num_to_predict + masked_lms = sorted(masked_lms, key=lambda x: x.index) + + masked_lm_positions = [] + masked_lm_labels = [] + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + + # tf.logging.info('%s' % (tokens)) + # tf.logging.info('%s' % (output_tokens)) + return (output_tokens, masked_lm_positions, masked_lm_labels) + +def create_masked_lm_predictions_original(tokens, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng): + """Creates the predictions for the masked LM objective.""" + + cand_indexes = [] + for (i, token) in enumerate(tokens): + if token == "[CLS]" or token == "[SEP]": + continue + # Whole Word Masking means that if we mask all of the wordpieces + # corresponding to an original word. When a word has been split into + # WordPieces, the first token does not have any marker and any subsequence + # tokens are prefixed with ##. So whenever we see the ## token, we + # append it to the previous set of word indexes. + # + # Note that Whole Word Masking does *not* change the training code + # at all -- we still predict each WordPiece independently, softmaxed + # over the entire vocabulary. + if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and + token.startswith("##")): + cand_indexes[-1].append(i) + else: + cand_indexes.append([i]) + + rng.shuffle(cand_indexes) + + output_tokens = list(tokens) + + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + masked_lms = [] + covered_indexes = set() + for index_set in cand_indexes: + if len(masked_lms) >= num_to_predict: + break + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. 
+ if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + + masked_token = None + # 80% of the time, replace with [MASK] + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # 10% of the time, keep original + if rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] + + output_tokens[index] = masked_token + + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + assert len(masked_lms) <= num_to_predict + masked_lms = sorted(masked_lms, key=lambda x: x.index) + + masked_lm_positions = [] + masked_lm_labels = [] + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + + return (output_tokens, masked_lm_positions, masked_lm_labels) + + +def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break + + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 + + # We want to sometimes truncate from the front and sometimes from the + # back to add more randomness and avoid biases. + if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Reading from input files ***") + for input_file in input_files: + tf.logging.info(" %s", input_file) + + rng = random.Random(FLAGS.random_seed) + instances = create_training_instances( + input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, + FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, + rng) + + output_files = FLAGS.output_file.split(",") + tf.logging.info("*** Writing to output files ***") + for output_file in output_files: + tf.logging.info(" %s", output_file) + + write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, + FLAGS.max_predictions_per_seq, output_files) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("output_file") + flags.mark_flag_as_required("vocab_file") + tf.app.run() \ No newline at end of file diff --git a/Basic/Albert/albert_tiny_tf/albert/modeling.py b/Basic/Albert/albert_tiny_tf/albert/modeling.py new file mode 100644 index 0000000..a521690 --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/modeling.py @@ -0,0 +1,1280 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""The main BERT model and related functions.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import copy +import json +import math +import re +import numpy as np +import six +import tensorflow as tf +from albert import bert_utils + + +class BertConfig(object): + """Configuration for `BertModel`.""" + + def __init__(self, + vocab_size, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02): + """Constructs BertConfig. + + Args: + vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. + hidden_dropout_prob: The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The stdev of the truncated_normal_initializer for + initializing all weight matrices. + """ + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size=None) + for (key, value) in six.iteritems(json_object): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with tf.gfile.GFile(json_file, "r") as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + +class BertModel(object): + """BERT model ("Bidirectional Encoder Representations from Transformers"). 
+ + Example usage: + + ```python + # Already been converted into WordPiece token ids + input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) + input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) + token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) + + config = modeling.BertConfig(vocab_size=32000, hidden_size=512, + num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) + + model = modeling.BertModel(config=config, is_training=True, + input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) + + label_embeddings = tf.get_variable(...) + pooled_output = model.get_pooled_output() + logits = tf.matmul(pooled_output, label_embeddings) + ... + ``` + """ + + def __init__(self, + config, + is_training, + input_ids, + input_mask=None, + token_type_ids=None, + use_one_hot_embeddings=False, + scope=None): + """Constructor for BertModel. + + Args: + config: `BertConfig` instance. + is_training: bool. true for training model, false for eval model. Controls + whether dropout will be applied. + input_ids: int32 Tensor of shape [batch_size, seq_length]. + input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + use_one_hot_embeddings: (optional) bool. Whether to use one-hot word + embeddings or tf.embedding_lookup() for the word embeddings. + scope: (optional) variable scope. Defaults to "bert". + + Raises: + ValueError: The config is invalid or one of the input tensor shapes + is invalid. + """ + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 + + input_shape = get_shape_list(input_ids, expected_rank=2) + batch_size = input_shape[0] + seq_length = input_shape[1] + + if input_mask is None: + input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) + + if token_type_ids is None: + token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) + + with tf.variable_scope(scope, default_name="bert"): + with tf.variable_scope("embeddings"): + # Perform embedding lookup on the word ids, but use stype of factorized embedding parameterization from albert. add by brightmart, 2019-09-28 + (self.embedding_output, self.embedding_table, self.embedding_table_2) = embedding_lookup_factorized( + input_ids=input_ids, + vocab_size=config.vocab_size, + hidden_size=config.hidden_size, + embedding_size=config.embedding_size, + initializer_range=config.initializer_range, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=use_one_hot_embeddings) + + # Add positional embeddings and token type embeddings, then layer + # normalize and perform dropout. + self.embedding_output = embedding_postprocessor( + input_tensor=self.embedding_output, + use_token_type=True, + token_type_ids=token_type_ids, + token_type_vocab_size=config.type_vocab_size, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=config.initializer_range, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob) + + with tf.variable_scope("encoder"): + # This converts a 2D mask of shape [batch_size, seq_length] to a 3D + # mask of shape [batch_size, seq_length, seq_length] which is used + # for the attention scores. + attention_mask = create_attention_mask_from_input_mask( + input_ids, input_mask) + + # Run the stacked transformer. 
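+        # Note: `ln_type` in the config selects the encoder variant below. A missing or
+        # 'postln' value keeps the original post-layer-norm transformer; any other value
+        # switches to the pre-layer-norm variant used by the larger ALBERT models.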
+ # `sequence_output` shape = [batch_size, seq_length, hidden_size]. + ln_type = config.ln_type + + if ln_type == 'postln' or ln_type is None: # currently, base or large of albert used post-LN structure + print("old structure of transformer.use: transformer_model,which use post-LN") + self.all_encoder_layers = transformer_model( + input_tensor=self.embedding_output, + attention_mask=attention_mask, + hidden_size=config.hidden_size, + num_hidden_layers=config.num_hidden_layers, + num_attention_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + intermediate_act_fn=get_activation(config.hidden_act), + hidden_dropout_prob=config.hidden_dropout_prob, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + initializer_range=config.initializer_range, + do_return_all_layers=True) + else: # xlarge or xxlarge of albert, used pre-LN structure + print("new structure of transformer.use: prelln_transformer_model,which use pre-LN") + self.all_encoder_layers = prelln_transformer_model( + # change by brightmart, 4th, oct, 2019. pre-Layer Normalization can converge fast and better. check paper: ON LAYER NORMALIZATION IN THE TRANSFORMER ARCHITECTURE + input_tensor=self.embedding_output, + attention_mask=attention_mask, + hidden_size=config.hidden_size, + num_hidden_layers=config.num_hidden_layers, + num_attention_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + intermediate_act_fn=get_activation(config.hidden_act), + hidden_dropout_prob=config.hidden_dropout_prob, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + initializer_range=config.initializer_range, + do_return_all_layers=True, + shared_type='all') # do_return_all_layers=True + + self.sequence_output = self.all_encoder_layers[-1] # [batch_size, seq_length, hidden_size] + # The "pooler" converts the encoded sequence tensor of shape + # [batch_size, seq_length, hidden_size] to a tensor of shape + # [batch_size, hidden_size]. This is necessary for segment-level + # (or segment-pair-level) classification tasks where we need a fixed + # dimensional representation of the segment. + with tf.variable_scope("pooler"): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. We assume that this has been pre-trained + first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) + self.pooled_output = tf.layers.dense( + first_token_tensor, + config.hidden_size, + activation=tf.tanh, + kernel_initializer=create_initializer(config.initializer_range)) + + def get_pooled_output(self): + return self.pooled_output + + def get_sequence_output(self): + """Gets final hidden layer of encoder. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size] corresponding + to the final hidden of the transformer encoder. + """ + return self.sequence_output + + def get_all_encoder_layers(self): + return self.all_encoder_layers + + def get_embedding_output(self): + """Gets output of the embedding lookup (i.e., input to the transformer). + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size] corresponding + to the output of the embedding layer, after summing the word + embeddings with the positional embeddings and the token type embeddings, + then performing layer normalization. This is the input to the transformer. 
+ """ + return self.embedding_output + + def get_embedding_table(self): + return self.embedding_table + + def get_embedding_table_2(self): + return self.embedding_table_2 + + +def gelu(x): + """Gaussian Error Linear Unit. + + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + + Returns: + `x` with the GELU activation applied. + """ + cdf = 0.5 * (1.0 + tf.tanh( + (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + + +def get_activation(activation_string): + """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. + + Args: + activation_string: String name of the activation function. + + Returns: + A Python function corresponding to the activation function. If + `activation_string` is None, empty, or "linear", this will return None. + If `activation_string` is not a string, it will return `activation_string`. + + Raises: + ValueError: The `activation_string` does not correspond to a known + activation. + """ + + # We assume that anything that"s not a string is already an activation + # function, so we just return it. + if not isinstance(activation_string, six.string_types): + return activation_string + + if not activation_string: + return None + + act = activation_string.lower() + if act == "linear": + return None + elif act == "relu": + return tf.nn.relu + elif act == "gelu": + return gelu + elif act == "tanh": + return tf.tanh + else: + raise ValueError("Unsupported activation: %s" % act) + + +def get_assignment_map_from_checkpoint(tvars, init_checkpoint): + """Compute the union of the current variables and checkpoint variables.""" + assignment_map = {} + initialized_variable_names = {} + + name_to_variable = collections.OrderedDict() + for var in tvars: + name = var.name + m = re.match("^(.*):\\d+$", name) + if m is not None: + name = m.group(1) + name_to_variable[name] = var + + init_vars = tf.train.list_variables(init_checkpoint) + + assignment_map = collections.OrderedDict() + for x in init_vars: + (name, var) = (x[0], x[1]) + if name not in name_to_variable: + continue + assignment_map[name] = name + initialized_variable_names[name] = 1 + initialized_variable_names[name + ":0"] = 1 + + return (assignment_map, initialized_variable_names) + + +def dropout(input_tensor, dropout_prob): + """Perform dropout. + + Args: + input_tensor: float Tensor. + dropout_prob: Python float. The probability of dropping out a value (NOT of + *keeping* a dimension as in `tf.nn.dropout`). + + Returns: + A version of `input_tensor` with dropout applied. 
+ """ + if dropout_prob is None or dropout_prob == 0.0: + return input_tensor + + output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) + return output + + +def layer_norm(input_tensor, name=None): + """Run layer normalization on the last dimension of the tensor.""" + return tf.contrib.layers.layer_norm( + inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) + + +def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): + """Runs layer normalization followed by dropout.""" + output_tensor = layer_norm(input_tensor, name) + output_tensor = dropout(output_tensor, dropout_prob) + return output_tensor + + +def create_initializer(initializer_range=0.02): + """Creates a `truncated_normal_initializer` with the given range.""" + return tf.truncated_normal_initializer(stddev=initializer_range) + + +def embedding_lookup(input_ids, + vocab_size, + embedding_size=128, + initializer_range=0.02, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=False): + """Looks up words embeddings for id tensor. + + Args: + input_ids: int32 Tensor of shape [batch_size, seq_length] containing word + ids. + vocab_size: int. Size of the embedding vocabulary. + embedding_size: int. Width of the word embeddings. + initializer_range: float. Embedding initialization range. + word_embedding_name: string. Name of the embedding table. + use_one_hot_embeddings: bool. If True, use one-hot method for word + embeddings. If False, use `tf.gather()`. + + Returns: + float Tensor of shape [batch_size, seq_length, embedding_size]. + """ + # This function assumes that the input is of shape [batch_size, seq_length, + # num_inputs]. + # + # If the input is a 2D tensor of shape [batch_size, seq_length], we + # reshape to [batch_size, seq_length, 1]. + if input_ids.shape.ndims == 2: + input_ids = tf.expand_dims(input_ids, axis=[-1]) # shape of input_ids is:[ batch_size, seq_length, 1] + + embedding_table = tf.get_variable( # [vocab_size, embedding_size] + name=word_embedding_name, + shape=[vocab_size, embedding_size], + initializer=create_initializer(initializer_range)) + + flat_input_ids = tf.reshape(input_ids, [-1]) # one rank. shape as (batch_size * sequence_length,) + if use_one_hot_embeddings: + one_hot_input_ids = tf.one_hot(flat_input_ids, + depth=vocab_size) # one_hot_input_ids=[batch_size * sequence_length,vocab_size] + output = tf.matmul(one_hot_input_ids, embedding_table) # output=[batch_size * sequence_length,embedding_size] + else: + output = tf.gather(embedding_table, + flat_input_ids) # [vocab_size, embedding_size]*[batch_size * sequence_length,]--->[batch_size * sequence_length,embedding_size] + + input_shape = get_shape_list(input_ids) # input_shape=[ batch_size, seq_length, 1] + + output = tf.reshape(output, input_shape[0:-1] + [ + input_shape[-1] * embedding_size]) # output=[batch_size,sequence_length,embedding_size] + return (output, embedding_table) + + +def embedding_lookup_factorized(input_ids, # Factorized embedding parameterization provide by albert + vocab_size, + hidden_size, + embedding_size=128, + initializer_range=0.02, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=False): + """Looks up words embeddings for id tensor, but in a factorized style followed by albert. it is used to reduce much percentage of parameters previous exists. + Check "Factorized embedding parameterization" session in the paper. + + Args: + input_ids: int32 Tensor of shape [batch_size, seq_length] containing word + ids. + vocab_size: int. Size of the embedding vocabulary. 
+ embedding_size: int. Width of the word embeddings. + initializer_range: float. Embedding initialization range. + word_embedding_name: string. Name of the embedding table. + use_one_hot_embeddings: bool. If True, use one-hot method for word + embeddings. If False, use `tf.gather()`. + + Returns: + float Tensor of shape [batch_size, seq_length, embedding_size]. + """ + # This function assumes that the input is of shape [batch_size, seq_length, + # num_inputs]. + # + # If the input is a 2D tensor of shape [batch_size, seq_length], we + # reshape to [batch_size, seq_length, 1]. + + # 1.first project one-hot vectors into a lower dimensional embedding space of size E + print("embedding_lookup_factorized. factorized embedding parameterization is used.") + if input_ids.shape.ndims == 2: + input_ids = tf.expand_dims(input_ids, axis=[-1]) # shape of input_ids is:[ batch_size, seq_length, 1] + + embedding_table = tf.get_variable( # [vocab_size, embedding_size] + name=word_embedding_name, + shape=[vocab_size, embedding_size], + initializer=create_initializer(initializer_range)) + + flat_input_ids = tf.reshape(input_ids, [-1]) # one rank. shape as (batch_size * sequence_length,) + if use_one_hot_embeddings: + one_hot_input_ids = tf.one_hot(flat_input_ids, + depth=vocab_size) # one_hot_input_ids=[batch_size * sequence_length,vocab_size] + output_middle = tf.matmul(one_hot_input_ids, + embedding_table) # output=[batch_size * sequence_length,embedding_size] + else: + output_middle = tf.gather(embedding_table, + flat_input_ids) # [vocab_size, embedding_size]*[batch_size * sequence_length,]--->[batch_size * sequence_length,embedding_size] + + # 2. project vector(output_middle) to the hidden space + project_variable = tf.get_variable( # [embedding_size, hidden_size] + name=word_embedding_name + "_2", + shape=[embedding_size, hidden_size], + initializer=create_initializer(initializer_range)) + output = tf.matmul(output_middle, + project_variable) # ([batch_size * sequence_length, embedding_size] * [embedding_size, hidden_size])--->[batch_size * sequence_length, hidden_size] + # reshape back to 3 rank + input_shape = get_shape_list(input_ids) # input_shape=[ batch_size, seq_length, 1] + batch_size, sequene_length, _ = input_shape + output = tf.reshape(output, + (batch_size, sequene_length, hidden_size)) # output=[batch_size, sequence_length, hidden_size] + return (output, embedding_table, project_variable) + + +def embedding_postprocessor(input_tensor, + use_token_type=False, + token_type_ids=None, + token_type_vocab_size=16, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=0.02, + max_position_embeddings=512, + dropout_prob=0.1): + """Performs various post-processing on a word embedding tensor. + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, + embedding_size]. + use_token_type: bool. Whether to add embeddings for `token_type_ids`. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + Must be specified if `use_token_type` is True. + token_type_vocab_size: int. The vocabulary size of `token_type_ids`. + token_type_embedding_name: string. The name of the embedding table variable + for token type ids. + use_position_embeddings: bool. Whether to add position embeddings for the + position of each token in the sequence. + position_embedding_name: string. The name of the embedding table variable + for positional embeddings. + initializer_range: float. 
Range of the weight initialization. + max_position_embeddings: int. Maximum sequence length that might ever be + used with this model. This can be longer than the sequence length of + input_tensor, but cannot be shorter. + dropout_prob: float. Dropout probability applied to the final output tensor. + + Returns: + float tensor with same shape as `input_tensor`. + + Raises: + ValueError: One of the tensor shapes or input values is invalid. + """ + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + width = input_shape[2] + + output = input_tensor + + if use_token_type: + if token_type_ids is None: + raise ValueError("`token_type_ids` must be specified if" + "`use_token_type` is True.") + token_type_table = tf.get_variable( + name=token_type_embedding_name, + shape=[token_type_vocab_size, width], + initializer=create_initializer(initializer_range)) + # This vocab will be small so we always do one-hot here, since it is always + # faster for a small vocabulary. + flat_token_type_ids = tf.reshape(token_type_ids, [-1]) + one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) + token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) + token_type_embeddings = tf.reshape(token_type_embeddings, + [batch_size, seq_length, width]) + output += token_type_embeddings + + if use_position_embeddings: + assert_op = tf.assert_less_equal(seq_length, max_position_embeddings) + with tf.control_dependencies([assert_op]): + full_position_embeddings = tf.get_variable( + name=position_embedding_name, + shape=[max_position_embeddings, width], + initializer=create_initializer(initializer_range)) + # Since the position embedding table is a learned variable, we create it + # using a (long) sequence length `max_position_embeddings`. The actual + # sequence length might be shorter than this, for faster training of + # tasks that do not have long sequences. + # + # So `full_position_embeddings` is effectively an embedding table + # for position [0, 1, 2, ..., max_position_embeddings-1], and the current + # sequence has positions [0, 1, 2, ... seq_length-1], so we can just + # perform a slice. + position_embeddings = tf.slice(full_position_embeddings, [0, 0], + [seq_length, -1]) + num_dims = len(output.shape.as_list()) + + # Only the last two dimensions are relevant (`seq_length` and `width`), so + # we broadcast among the first dimensions, which is typically just + # the batch size. + position_broadcast_shape = [] + for _ in range(num_dims - 2): + position_broadcast_shape.append(1) + position_broadcast_shape.extend([seq_length, width]) + position_embeddings = tf.reshape(position_embeddings, + position_broadcast_shape) + output += position_embeddings + + output = layer_norm_and_dropout( + output, dropout_prob) + + return output + + +def create_attention_mask_from_input_mask(from_tensor, to_mask): + """Create 3D attention mask from a 2D tensor mask. + + Args: + from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. + to_mask: int32 Tensor of shape [batch_size, to_seq_length]. + + Returns: + float Tensor of shape [batch_size, from_seq_length, to_seq_length]. 
+ """ + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + batch_size = from_shape[0] + from_seq_length = from_shape[1] + + to_shape = get_shape_list(to_mask, expected_rank=2) + to_seq_length = to_shape[1] + + to_mask = tf.cast( + tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) + + # We don't assume that `from_tensor` is a mask (although it could be). We + # don't actually care if we attend *from* padding tokens (only *to* padding) + # tokens so we create a tensor of all ones. + # + # `broadcast_ones` = [batch_size, from_seq_length, 1] + broadcast_ones = tf.ones( + shape=[batch_size, from_seq_length, 1], dtype=tf.float32) + + # Here we broadcast along two dimensions to create the mask. + mask = broadcast_ones * to_mask + + return mask + + +def attention_layer(from_tensor, + to_tensor, + attention_mask=None, + num_attention_heads=1, + size_per_head=512, + query_act=None, + key_act=None, + value_act=None, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + do_return_2d_tensor=False, + batch_size=None, + from_seq_length=None, + to_seq_length=None): + """Performs multi-headed attention from `from_tensor` to `to_tensor`. + + This is an implementation of multi-headed attention based on "Attention + is all you Need". If `from_tensor` and `to_tensor` are the same, then + this is self-attention. Each timestep in `from_tensor` attends to the + corresponding sequence in `to_tensor`, and returns a fixed-with vector. + + This function first projects `from_tensor` into a "query" tensor and + `to_tensor` into "key" and "value" tensors. These are (effectively) a list + of tensors of length `num_attention_heads`, where each tensor is of shape + [batch_size, seq_length, size_per_head]. + + Then, the query and key tensors are dot-producted and scaled. These are + softmaxed to obtain attention probabilities. The value tensors are then + interpolated by these probabilities, then concatenated back to a single + tensor and returned. + + In practice, the multi-headed attention are done with transposes and + reshapes rather than actual separate tensors. + + Args: + from_tensor: float Tensor of shape [batch_size, from_seq_length, + from_width]. + to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. + attention_mask: (optional) int32 Tensor of shape [batch_size, + from_seq_length, to_seq_length]. The values should be 1 or 0. The + attention scores will effectively be set to -infinity for any positions in + the mask that are 0, and will be unchanged for positions that are 1. + num_attention_heads: int. Number of attention heads. + size_per_head: int. Size of each attention head. + query_act: (optional) Activation function for the query transform. + key_act: (optional) Activation function for the key transform. + value_act: (optional) Activation function for the value transform. + attention_probs_dropout_prob: (optional) float. Dropout probability of the + attention probabilities. + initializer_range: float. Range of the weight initializer. + do_return_2d_tensor: bool. If True, the output will be of shape [batch_size + * from_seq_length, num_attention_heads * size_per_head]. If False, the + output will be of shape [batch_size, from_seq_length, num_attention_heads + * size_per_head]. + batch_size: (Optional) int. If the input is 2D, this might be the batch size + of the 3D version of the `from_tensor` and `to_tensor`. + from_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `from_tensor`. 
+ to_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `to_tensor`. + + Returns: + float Tensor of shape [batch_size, from_seq_length, + num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is + true, this will be of shape [batch_size * from_seq_length, + num_attention_heads * size_per_head]). + + Raises: + ValueError: Any of the arguments or tensor shapes are invalid. + """ + + def transpose_for_scores(input_tensor, batch_size, num_attention_heads, + seq_length, width): + output_tensor = tf.reshape( + input_tensor, [batch_size, seq_length, num_attention_heads, width]) + + output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) + return output_tensor + + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) + + if len(from_shape) != len(to_shape): + raise ValueError( + "The rank of `from_tensor` must match the rank of `to_tensor`.") + + if len(from_shape) == 3: + batch_size = from_shape[0] + from_seq_length = from_shape[1] + to_seq_length = to_shape[1] + elif len(from_shape) == 2: + if (batch_size is None or from_seq_length is None or to_seq_length is None): + raise ValueError( + "When passing in rank 2 tensors to attention_layer, the values " + "for `batch_size`, `from_seq_length`, and `to_seq_length` " + "must all be specified.") + + # Scalar dimensions referenced here: + # B = batch size (number of sequences) + # F = `from_tensor` sequence length + # T = `to_tensor` sequence length + # N = `num_attention_heads` + # H = `size_per_head` + + from_tensor_2d = reshape_to_matrix(from_tensor) + to_tensor_2d = reshape_to_matrix(to_tensor) + + # `query_layer` = [B*F, N*H] + query_layer = tf.layers.dense( + from_tensor_2d, + num_attention_heads * size_per_head, + activation=query_act, + name="query", + kernel_initializer=create_initializer(initializer_range)) + + # `key_layer` = [B*T, N*H] + key_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=key_act, + name="key", + kernel_initializer=create_initializer(initializer_range)) + + # `value_layer` = [B*T, N*H] + value_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=value_act, + name="value", + kernel_initializer=create_initializer(initializer_range)) + + # `query_layer` = [B, N, F, H] + query_layer = transpose_for_scores(query_layer, batch_size, + num_attention_heads, from_seq_length, + size_per_head) + + # `key_layer` = [B, N, T, H] + key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, + to_seq_length, size_per_head) + + # Take the dot product between "query" and "key" to get the raw + # attention scores. + # `attention_scores` = [B, N, F, T] + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + attention_scores = tf.multiply(attention_scores, + 1.0 / math.sqrt(float(size_per_head))) + + if attention_mask is not None: + # `attention_mask` = [B, 1, F, T] + attention_mask = tf.expand_dims(attention_mask, axis=[1]) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 + + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
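+ # [Editor's note, illustrative numbers only] With the mask convention above,
+ # a masked position (mask value 0) receives adder = (1.0 - 0.0) * -10000.0,
+ # so its raw score becomes roughly score - 10000 and its softmax weight
+ # underflows to ~0; an attendable position (mask value 1) receives adder = 0.0
+ # and its score passes through unchanged.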
+ attention_scores += adder + + # Normalize the attention scores to probabilities. + # `attention_probs` = [B, N, F, T] + attention_probs = tf.nn.softmax(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = dropout(attention_probs, attention_probs_dropout_prob) + + # `value_layer` = [B, T, N, H] + value_layer = tf.reshape( + value_layer, + [batch_size, to_seq_length, num_attention_heads, size_per_head]) + + # `value_layer` = [B, N, T, H] + value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) + + # `context_layer` = [B, N, F, H] + context_layer = tf.matmul(attention_probs, value_layer) + + # `context_layer` = [B, F, N, H] + context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) + + if do_return_2d_tensor: + # `context_layer` = [B*F, N*H] + context_layer = tf.reshape( + context_layer, + [batch_size * from_seq_length, num_attention_heads * size_per_head]) + else: + # `context_layer` = [B, F, N*H] + context_layer = tf.reshape( + context_layer, + [batch_size, from_seq_length, num_attention_heads * size_per_head]) + + return context_layer + + +def transformer_model(input_tensor, + attention_mask=None, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + intermediate_act_fn=gelu, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + do_return_all_layers=False, + share_parameter_across_layers=True): + """Multi-headed, multi-layer Transformer from "Attention is All You Need". + + This is almost an exact implementation of the original Transformer encoder. + + See the original paper: + https://arxiv.org/abs/1706.03762 + + Also see: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. + attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, + seq_length], with 1 for positions that can be attended to and 0 in + positions that should not be. + hidden_size: int. Hidden size of the Transformer. + num_hidden_layers: int. Number of layers (blocks) in the Transformer. + num_attention_heads: int. Number of attention heads in the Transformer. + intermediate_size: int. The size of the "intermediate" (a.k.a., feed + forward) layer. + intermediate_act_fn: function. The non-linear activation function to apply + to the output of the intermediate/feed-forward layer. + hidden_dropout_prob: float. Dropout probability for the hidden layers. + attention_probs_dropout_prob: float. Dropout probability of the attention + probabilities. + initializer_range: float. Range of the initializer (stddev of truncated + normal). + do_return_all_layers: Whether to also return all layers or just the final + layer. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size], the final + hidden layer of the Transformer. + + Raises: + ValueError: A Tensor shape or parameter is invalid. 
+ """ + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads)) + + attention_head_size = int(hidden_size / num_attention_heads) + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + input_width = input_shape[2] + + # The Transformer performs sum residuals on all layers so the input needs + # to be the same as the hidden size. + if input_width != hidden_size: + raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % + (input_width, hidden_size)) + + # We keep the representation as a 2D tensor to avoid re-shaping it back and + # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on + # the GPU/CPU but may not be free on the TPU, so we want to minimize them to + # help the optimizer. + prev_output = reshape_to_matrix(input_tensor) + all_layer_outputs = [] + for layer_idx in range(num_hidden_layers): + if share_parameter_across_layers: + name_variable_scope = "layer_shared" + else: + name_variable_scope = "layer_%d" % layer_idx + # share all parameters across layers. add by brightmart, 2019-09-28. previous it is like this: "layer_%d" % layer_idx + with tf.variable_scope(name_variable_scope, + reuse=True if (share_parameter_across_layers and layer_idx > 0) else False): + + layer_input = prev_output + + with tf.variable_scope("attention"): + attention_heads = [] + with tf.variable_scope("self"): + attention_head = attention_layer( + from_tensor=layer_input, + to_tensor=layer_input, + attention_mask=attention_mask, + num_attention_heads=num_attention_heads, + size_per_head=attention_head_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + initializer_range=initializer_range, + do_return_2d_tensor=True, + batch_size=batch_size, + from_seq_length=seq_length, + to_seq_length=seq_length) + attention_heads.append(attention_head) + + attention_output = None + if len(attention_heads) == 1: + attention_output = attention_heads[0] + else: + # In the case where we have other sequences, we just concatenate + # them to the self-attention head before the projection. + attention_output = tf.concat(attention_heads, axis=-1) + + # Run a linear projection of `hidden_size` then add a residual + # with `layer_input`. + with tf.variable_scope("output"): + attention_output = tf.layers.dense( + attention_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + attention_output = dropout(attention_output, hidden_dropout_prob) + attention_output = layer_norm(attention_output + layer_input) + + # The activation is only applied to the "intermediate" hidden layer. + with tf.variable_scope("intermediate"): + intermediate_output = tf.layers.dense( + attention_output, + intermediate_size, + activation=intermediate_act_fn, + kernel_initializer=tf.glorot_normal_initializer()) + + # Down-project back to `hidden_size` then add the residual. 
+ with tf.variable_scope("output"): + layer_output = tf.layers.dense( + intermediate_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + + layer_output = dropout(layer_output, hidden_dropout_prob) + layer_output = layer_norm(layer_output + attention_output) + prev_output = layer_output + all_layer_outputs.append(layer_output) + + if do_return_all_layers: + final_outputs = [] + for layer_output in all_layer_outputs: + final_output = reshape_from_matrix(layer_output, input_shape) + final_outputs.append(final_output) + return final_outputs + else: + final_output = reshape_from_matrix(prev_output, input_shape) + return final_output + + +def get_shape_list(tensor, expected_rank=None, name=None): + """Returns a list of the shape of tensor, preferring static dimensions. + + Args: + tensor: A tf.Tensor object to find the shape of. + expected_rank: (optional) int. The expected rank of `tensor`. If this is + specified and the `tensor` has a different rank, and exception will be + thrown. + name: Optional name of the tensor for the error message. + + Returns: + A list of dimensions of the shape of tensor. All static dimensions will + be returned as python integers, and dynamic dimensions will be returned + as tf.Tensor scalars. + """ + if name is None: + name = tensor.name + + if expected_rank is not None: + assert_rank(tensor, expected_rank, name) + + shape = tensor.shape.as_list() + + non_static_indexes = [] + for (index, dim) in enumerate(shape): + if dim is None: + non_static_indexes.append(index) + + if not non_static_indexes: + return shape + + dyn_shape = tf.shape(tensor) + for index in non_static_indexes: + shape[index] = dyn_shape[index] + return shape + + +def reshape_to_matrix(input_tensor): + """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" + ndims = input_tensor.shape.ndims + if ndims < 2: + raise ValueError("Input tensor must have at least rank 2. Shape = %s" % + (input_tensor.shape)) + if ndims == 2: + return input_tensor + + width = input_tensor.shape[-1] + output_tensor = tf.reshape(input_tensor, [-1, width]) + return output_tensor + + +def reshape_from_matrix(output_tensor, orig_shape_list): + """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" + if len(orig_shape_list) == 2: + return output_tensor + + output_shape = get_shape_list(output_tensor) + + orig_dims = orig_shape_list[0:-1] + width = output_shape[-1] + + return tf.reshape(output_tensor, orig_dims + [width]) + + +def assert_rank(tensor, expected_rank, name=None): + """Raises an exception if the tensor rank is not of the expected rank. + + Args: + tensor: A tf.Tensor to check the rank of. + expected_rank: Python integer or list of integers, expected rank. + name: Optional name of the tensor for the error message. + + Raises: + ValueError: If the expected shape doesn't match the actual shape. 
+ """ + if name is None: + name = tensor.name + + expected_rank_dict = {} + if isinstance(expected_rank, six.integer_types): + expected_rank_dict[expected_rank] = True + else: + for x in expected_rank: + expected_rank_dict[x] = True + + actual_rank = tensor.shape.ndims + if actual_rank not in expected_rank_dict: + scope_name = tf.get_variable_scope().name + raise ValueError( + "For the tensor `%s` in scope `%s`, the actual rank " + "`%d` (shape = %s) is not equal to the expected rank `%s`" % + (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) + + +def prelln_transformer_model(input_tensor, + attention_mask=None, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + intermediate_act_fn=gelu, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + do_return_all_layers=False, + shared_type='all', # None, + adapter_fn=None): + """Multi-headed, multi-layer Transformer from "Attention is All You Need". + + This is almost an exact implementation of the original Transformer encoder. + + See the original paper: + https://arxiv.org/abs/1706.03762 + + Also see: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. + attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, + seq_length], with 1 for positions that can be attended to and 0 in + positions that should not be. + hidden_size: int. Hidden size of the Transformer. + num_hidden_layers: int. Number of layers (blocks) in the Transformer. + num_attention_heads: int. Number of attention heads in the Transformer. + intermediate_size: int. The size of the "intermediate" (a.k.a., feed + forward) layer. + intermediate_act_fn: function. The non-linear activation function to apply + to the output of the intermediate/feed-forward layer. + hidden_dropout_prob: float. Dropout probability for the hidden layers. + attention_probs_dropout_prob: float. Dropout probability of the attention + probabilities. + initializer_range: float. Range of the initializer (stddev of truncated + normal). + do_return_all_layers: Whether to also return all layers or just the final + layer. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size], the final + hidden layer of the Transformer. + + Raises: + ValueError: A Tensor shape or parameter is invalid. + """ + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads)) + + attention_head_size = int(hidden_size / num_attention_heads) + + input_shape = bert_utils.get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + input_width = input_shape[2] + + # The Transformer performs sum residuals on all layers so the input needs + # to be the same as the hidden size. + if input_width != hidden_size: + raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % + (input_width, hidden_size)) + + # We keep the representation as a 2D tensor to avoid re-shaping it back and + # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on + # the GPU/CPU but may not be free on the TPU, so we want to minimize them to + # help the optimizer. 
+ prev_output = bert_utils.reshape_to_matrix(input_tensor) + + all_layer_outputs = [] + + def layer_scope(idx, shared_type): + if shared_type == 'all': + tmp = { + "layer": "layer_shared", + 'attention': 'attention', + 'intermediate': 'intermediate', + 'output': 'output' + } + elif shared_type == 'attention': + tmp = { + "layer": "layer_shared", + 'attention': 'attention', + 'intermediate': 'intermediate_{}'.format(idx), + 'output': 'output_{}'.format(idx) + } + elif shared_type == 'ffn': + tmp = { + "layer": "layer_shared", + 'attention': 'attention_{}'.format(idx), + 'intermediate': 'intermediate', + 'output': 'output' + } + else: + tmp = { + "layer": "layer_{}".format(idx), + 'attention': 'attention', + 'intermediate': 'intermediate', + 'output': 'output' + } + + return tmp + + all_layer_outputs = [] + + for layer_idx in range(num_hidden_layers): + + idx_scope = layer_scope(layer_idx, shared_type) + + with tf.variable_scope(idx_scope['layer'], reuse=tf.AUTO_REUSE): + layer_input = prev_output + + with tf.variable_scope(idx_scope['attention'], reuse=tf.AUTO_REUSE): + attention_heads = [] + + with tf.variable_scope("output", reuse=tf.AUTO_REUSE): + layer_input_pre = layer_norm(layer_input) + + with tf.variable_scope("self"): + attention_head = attention_layer( + from_tensor=layer_input_pre, + to_tensor=layer_input_pre, + attention_mask=attention_mask, + num_attention_heads=num_attention_heads, + size_per_head=attention_head_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + initializer_range=initializer_range, + do_return_2d_tensor=True, + batch_size=batch_size, + from_seq_length=seq_length, + to_seq_length=seq_length) + attention_heads.append(attention_head) + + attention_output = None + if len(attention_heads) == 1: + attention_output = attention_heads[0] + else: + # In the case where we have other sequences, we just concatenate + # them to the self-attention head before the projection. + attention_output = tf.concat(attention_heads, axis=-1) + + # Run a linear projection of `hidden_size` then add a residual + # with `layer_input`. + with tf.variable_scope("output", reuse=tf.AUTO_REUSE): + attention_output = tf.layers.dense( + attention_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + attention_output = dropout(attention_output, hidden_dropout_prob) + + # attention_output = layer_norm(attention_output + layer_input) + attention_output = attention_output + layer_input + + with tf.variable_scope(idx_scope['output'], reuse=tf.AUTO_REUSE): + attention_output_pre = layer_norm(attention_output) + + # The activation is only applied to the "intermediate" hidden layer. + with tf.variable_scope(idx_scope['intermediate'], reuse=tf.AUTO_REUSE): + intermediate_output = tf.layers.dense( + attention_output_pre, + intermediate_size, + activation=intermediate_act_fn, + kernel_initializer=create_initializer(initializer_range)) + + # Down-project back to `hidden_size` then add the residual. 
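+ # [Editor's note] In this pre-LN block the residual below is added without a
+ # trailing layer_norm (the post-LN call is left commented out); normalization
+ # instead happens at the start of the next sub-layer.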
+ with tf.variable_scope(idx_scope['output'], reuse=tf.AUTO_REUSE): + layer_output = tf.layers.dense( + intermediate_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + layer_output = dropout(layer_output, hidden_dropout_prob) + + # layer_output = layer_norm(layer_output + attention_output) + layer_output = layer_output + attention_output + prev_output = layer_output + all_layer_outputs.append(layer_output) + + if do_return_all_layers: + final_outputs = [] + for layer_output in all_layer_outputs: + final_output = bert_utils.reshape_from_matrix(layer_output, input_shape) + final_outputs.append(final_output) + return final_outputs + else: + final_output = bert_utils.reshape_from_matrix(prev_output, input_shape) + return final_output diff --git a/Basic/Albert/albert_tiny_tf/albert/optimization.py b/Basic/Albert/albert_tiny_tf/albert/optimization.py new file mode 100644 index 0000000..ac7e79a --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/optimization.py @@ -0,0 +1,300 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions and classes related to optimization (weight updates).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re +import tensorflow as tf + + +def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): + """Creates an optimizer training op.""" + global_step = tf.train.get_or_create_global_step() + + learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) + + # Implements linear decay of the learning rate. + learning_rate = tf.train.polynomial_decay( + learning_rate, + global_step, + num_train_steps, + end_learning_rate=0.0, + power=1.0, + cycle=False) + + # Implements linear warmup. I.e., if global_step < num_warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. + if num_warmup_steps: + global_steps_int = tf.cast(global_step, tf.int32) + warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) + + global_steps_float = tf.cast(global_steps_int, tf.float32) + warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) + + warmup_percent_done = global_steps_float / warmup_steps_float + warmup_learning_rate = init_lr * warmup_percent_done + + is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) + learning_rate = ( + (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) + + # It is recommended that you use this optimizer for fine tuning, since this + # is how the model was trained (note that the Adam m/v variables are NOT + # loaded from init_checkpoint.) 
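+ # [Editor's note] Although the comment above refers to the Adam m/v variables,
+ # the optimizer constructed here is the LAMBOptimizer defined below; it reuses
+ # the same m/v slot variables and decoupled weight decay, and adds a per-layer
+ # trust-ratio rescaling of the step.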
+ optimizer = LAMBOptimizer( + learning_rate=learning_rate, + weight_decay_rate=0.01, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) + + if use_tpu: + optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) + + tvars = tf.trainable_variables() + grads = tf.gradients(loss, tvars) + + # This is how the model was pre-trained. + (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) + + train_op = optimizer.apply_gradients( + zip(grads, tvars), global_step=global_step) + + # Normally the global step update is done inside of `apply_gradients`. + # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use + # a different optimizer, you should probably take this line out. + new_global_step = global_step + 1 + train_op = tf.group(train_op, [global_step.assign(new_global_step)]) + return train_op + + +class AdamWeightDecayOptimizer(tf.train.Optimizer): + """A basic Adam optimizer that includes "correct" L2 weight decay.""" + + def __init__(self, + learning_rate, + weight_decay_rate=0.0, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=None, + name="AdamWeightDecayOptimizer"): + """Constructs a AdamWeightDecayOptimizer.""" + super(AdamWeightDecayOptimizer, self).__init__(False, name) + + self.learning_rate = learning_rate + self.weight_decay_rate = weight_decay_rate + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.exclude_from_weight_decay = exclude_from_weight_decay + + def apply_gradients(self, grads_and_vars, global_step=None, name=None): + """See base class.""" + assignments = [] + for (grad, param) in grads_and_vars: + if grad is None or param is None: + continue + + param_name = self._get_variable_name(param.name) + + m = tf.get_variable( + name=param_name + "/adam_m", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + v = tf.get_variable( + name=param_name + "/adam_v", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + + # Standard Adam update. + next_m = ( + tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) + next_v = ( + tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, + tf.square(grad))) + + update = next_m / (tf.sqrt(next_v) + self.epsilon) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want ot decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. 
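+ # [Editor's note, update sketch] The decoupled decay below amounts to
+ #   update = next_m / (sqrt(next_v) + epsilon) + weight_decay_rate * param
+ #   param  = param - learning_rate * update
+ # rather than folding an L2 penalty into the gradient itself.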
+ if self._do_use_weight_decay(param_name): + update += self.weight_decay_rate * param + + update_with_lr = self.learning_rate * update + + next_param = param - update_with_lr + + assignments.extend( + [param.assign(next_param), + m.assign(next_m), + v.assign(next_v)]) + return tf.group(*assignments, name=name) + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + def _get_variable_name(self, param_name): + """Get the variable name from the tensor name.""" + m = re.match("^(.*):\\d+$", param_name) + if m is not None: + param_name = m.group(1) + return param_name + + +# +class LAMBOptimizer(tf.train.Optimizer): + """ + LAMBOptimizer optimizer. + https://github.com/ymcui/LAMB_Optimizer_TF + # IMPORTANT NOTE + - This is NOT an official implementation. + - LAMB optimizer is changed from arXiv v1 ~ v3. + - We implement v3 version (which is the latest version on June, 2019.). + - Our implementation is based on `AdamWeightDecayOptimizer` in BERT (provided by Google). + + # References + - Large Batch Optimization for Deep Learning: Training BERT in 76 minutes. https://arxiv.org/abs/1904.00962v3 + - BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. https://arxiv.org/abs/1810.04805 + # Parameters + - There is nothing special, just the same as `AdamWeightDecayOptimizer`. + """ + + def __init__(self, + learning_rate, + weight_decay_rate=0.01, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=None, + name="LAMBOptimizer"): + """Constructs a LAMBOptimizer.""" + super(LAMBOptimizer, self).__init__(False, name) + + self.learning_rate = learning_rate + self.weight_decay_rate = weight_decay_rate + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.exclude_from_weight_decay = exclude_from_weight_decay + + def apply_gradients(self, grads_and_vars, global_step=None, name=None): + """See base class.""" + assignments = [] + for (grad, param) in grads_and_vars: + if grad is None or param is None: + continue + + param_name = self._get_variable_name(param.name) + + m = tf.get_variable( + name=param_name + "/lamb_m", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + v = tf.get_variable( + name=param_name + "/lamb_v", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + + # Standard Adam update. + next_m = ( + tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) + next_v = ( + tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, + tf.square(grad))) + + update = next_m / (tf.sqrt(next_v) + self.epsilon) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want ot decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. 
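+ # [Editor's note] The LAMB-specific scaling applied further below is
+ #   r = ||param||_2 / ||update||_2   (falling back to 1.0 when either norm is 0)
+ #   param = param - (learning_rate * r) * update
+ # i.e. the Adam-style step is rescaled per variable by this trust ratio.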
+ if self._do_use_weight_decay(param_name): + update += self.weight_decay_rate * param + + ############## BELOW ARE THE SPECIFIC PARTS FOR LAMB ############## + + # Note: Here are two choices for scaling function \phi(z) + # minmax: \phi(z) = min(max(z, \gamma_l), \gamma_u) + # identity: \phi(z) = z + # The authors does not mention what is \gamma_l and \gamma_u + # UPDATE: after asking authors, they provide me the code below. + # ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where( + # math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0), 1.0) + + r1 = tf.sqrt(tf.reduce_sum(tf.square(param))) + r2 = tf.sqrt(tf.reduce_sum(tf.square(update))) + + r = tf.where(tf.greater(r1, 0.0), + tf.where(tf.greater(r2, 0.0), + r1 / r2, + 1.0), + 1.0) + + eta = self.learning_rate * r + + update_with_lr = eta * update + + next_param = param - update_with_lr + + assignments.extend( + [param.assign(next_param), + m.assign(next_m), + v.assign(next_v)]) + return tf.group(*assignments, name=name) + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + def _get_variable_name(self, param_name): + """Get the variable name from the tensor name.""" + m = re.match("^(.*):\\d+$", param_name) + if m is not None: + param_name = m.group(1) + return param_name diff --git a/Basic/Albert/albert_tiny_tf/albert/optimization_finetuning.py b/Basic/Albert/albert_tiny_tf/albert/optimization_finetuning.py new file mode 100644 index 0000000..dd9311b --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/optimization_finetuning.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions and classes related to optimization (weight updates).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re +import tensorflow as tf + + +def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): + """Creates an optimizer training op.""" + global_step = tf.train.get_or_create_global_step() + + learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) + + # Implements linear decay of the learning rate. + learning_rate = tf.train.polynomial_decay( + learning_rate, + global_step, + num_train_steps, + end_learning_rate=0.0, + power=1.0, + cycle=False) + + # Implements linear warmup. I.e., if global_step < num_warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. 
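+ # [Editor's note, illustrative numbers] With init_lr = 5e-5 and a hypothetical
+ # num_warmup_steps = 1000, step 500 uses 0.5 * 5e-5 = 2.5e-5; once
+ # global_step >= num_warmup_steps the polynomial_decay above takes over and
+ # the rate decreases linearly to 0.0 at num_train_steps (power=1.0,
+ # end_learning_rate=0.0).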
+ if num_warmup_steps: + global_steps_int = tf.cast(global_step, tf.int32) + warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) + + global_steps_float = tf.cast(global_steps_int, tf.float32) + warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) + + warmup_percent_done = global_steps_float / warmup_steps_float + warmup_learning_rate = init_lr * warmup_percent_done + + is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) + learning_rate = ( + (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) + + # It is recommended that you use this optimizer for fine tuning, since this + # is how the model was trained (note that the Adam m/v variables are NOT + # loaded from init_checkpoint.) + optimizer = AdamWeightDecayOptimizer( + learning_rate=learning_rate, + weight_decay_rate=0.01, + beta_1=0.9, + beta_2=0.999, # 0.98 ONLY USED FOR PRETRAIN. MUST CHANGE AT FINE-TUNING 0.999, + epsilon=1e-6, + exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) + + if use_tpu: + optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) + + tvars = tf.trainable_variables() + grads = tf.gradients(loss, tvars) + + # This is how the model was pre-trained. + # (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) + + train_op = optimizer.apply_gradients( + zip(grads, tvars), global_step=global_step) + + # Normally the global step update is done inside of `apply_gradients`. + # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use + # a different optimizer, you should probably take this line out. + new_global_step = global_step + 1 + train_op = tf.group(train_op, [global_step.assign(new_global_step)]) + return train_op + + +class AdamWeightDecayOptimizer(tf.train.Optimizer): + """A basic Adam optimizer that includes "correct" L2 weight decay.""" + + def __init__(self, + learning_rate, + weight_decay_rate=0.0, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=None, + name="AdamWeightDecayOptimizer"): + """Constructs a AdamWeightDecayOptimizer.""" + super(AdamWeightDecayOptimizer, self).__init__(False, name) + + self.learning_rate = learning_rate + self.weight_decay_rate = weight_decay_rate + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.exclude_from_weight_decay = exclude_from_weight_decay + + def apply_gradients(self, grads_and_vars, global_step=None, name=None): + """See base class.""" + assignments = [] + for (grad, param) in grads_and_vars: + if grad is None or param is None: + continue + + param_name = self._get_variable_name(param.name) + + m = tf.get_variable( + name=param_name + "/adam_m", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + v = tf.get_variable( + name=param_name + "/adam_v", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + + # Standard Adam update. + next_m = ( + tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) + next_v = ( + tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, + tf.square(grad))) + + update = next_m / (tf.sqrt(next_v) + self.epsilon) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want ot decay the weights in a manner that doesn't interact + # with the m/v parameters. 
This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. + if self._do_use_weight_decay(param_name): + update += self.weight_decay_rate * param + + update_with_lr = self.learning_rate * update + + next_param = param - update_with_lr + + assignments.extend( + [param.assign(next_param), + m.assign(next_m), + v.assign(next_v)]) + return tf.group(*assignments, name=name) + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + def _get_variable_name(self, param_name): + """Get the variable name from the tensor name.""" + m = re.match("^(.*):\\d+$", param_name) + if m is not None: + param_name = m.group(1) + return param_name diff --git a/Basic/Albert/albert_tiny_tf/albert/run.sh b/Basic/Albert/albert_tiny_tf/albert/run.sh new file mode 100644 index 0000000..1e093a6 --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/run.sh @@ -0,0 +1,4 @@ +python run_classifier.py --task_name=lcqmc --do_train=true --do_eval=true --data_dir=../task_data/lcqmc --vocab_file=pre_trained_model/albert_tiny/vocab.txt --bert_config_file=pre_trained_model/albert_tiny/albert_config_tiny.json --max_seq_length=128 --train_batch_size=64 --learning_rate=1e-4 --num_train_epochs=5 --output_dir=output/lcqmc --init_checkpoint=pre_trained_model/albert_tiny/albert_model.ckpt + + +python run_classifier.py --task_name=tnews --do_train=true --do_eval=true --data_dir=../task_data/tnews --vocab_file=pre_trained_model/albert_large/vocab.txt --bert_config_file=pre_trained_model/albert_large/albert_config_large.json --max_seq_length=128 --train_batch_size=8 --learning_rate=2e-5 --num_train_epochs=5 --output_dir=output/tnews --init_checkpoint=pre_trained_model/albert_large/albert_model.ckpt diff --git a/Basic/Albert/albert_tiny_tf/albert/run_classifier.py b/Basic/Albert/albert_tiny_tf/albert/run_classifier.py new file mode 100644 index 0000000..84acb91 --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/run_classifier.py @@ -0,0 +1,1013 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT finetuning runner.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import csv +import os +import modeling +import optimization_finetuning as optimization +import tokenization +import tensorflow as tf + +# from loss import bi_tempered_logistic_loss + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "data_dir", None, + "The input data dir. Should contain the .tsv files (or other data files) " + "for the task.") + +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. 
" + "This specifies the model architecture.") + +flags.DEFINE_string("task_name", None, "The name of the task to train.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_bool( + "do_predict", False, + "Whether to run the model in inference mode on the test set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, + "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class PaddingInputExample(object): + """Fake example so the num input examples is a multiple of the batch size. + When running eval/predict on the TPU, we need to pad the number of examples + to be a multiple of the batch size, because the TPU requires a fixed batch + size. The alternative is to drop the last batch, which is bad because it means + the entire output data won't be generated. + We use this class instead of `None` because treating `None` as padding + battches could cause silent errors. + """ + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + input_ids, + input_mask, + segment_ids, + label_id, + is_real_example=True): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.is_real_example = is_real_example + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_test_examples(self, data_dir): + """Gets a collection of `InputExample`s for prediction.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with tf.gfile.Open(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + lines.append(line) + return lines + + +def convert_single_example(ex_index, example, label_list, max_seq_length, + tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + + if isinstance(example, PaddingInputExample): + return InputFeatures( + input_ids=[0] * max_seq_length, + input_mask=[0] * max_seq_length, + segment_ids=[0] * max_seq_length, + label_id=0, + is_real_example=False) + + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). 
This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + return feature + + +def file_based_convert_examples_to_features( + examples, label_list, max_seq_length, tokenizer, output_file): + """Convert a set of `InputExample`s to a TFRecord file.""" + + writer = tf.python_io.TFRecordWriter(output_file) + + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["label_ids"] = create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + writer.close() + + +def file_based_input_fn_builder(input_file, seq_length, is_training, + drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.FixedLenFeature([], tf.int64), + "is_real_example": tf.FixedLenFeature([], tf.int64), + } + + def _decode_record(record, name_to_features): + 
"""Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder)) + + return d + + return input_fn + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, + labels, num_labels, use_one_hot_embeddings): + """Creates a classification model.""" + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + # In the demo, we are doing a simple classification task on the entire + # segment. + # + # If you want to use the token-level output, use model.get_sequence_output() + # instead. + output_layer = model.get_pooled_output() + + hidden_size = output_layer.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) + + with tf.variable_scope("loss"): + ln_type = bert_config.ln_type + if ln_type == 'preln': # add by brightmart, 10-06. if it is preln, we need to an additonal layer: layer normalization as suggested in paper "ON LAYER NORMALIZATION IN THE TRANSFORMER ARCHITECTURE" + print("ln_type is preln. 
add LN layer.") + output_layer = layer_norm(output_layer) + else: + print("ln_type is postln or other,do nothing.") + + if is_training: + # I.e., 0.1 dropout + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + + logits = tf.matmul(output_layer, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + print("logits", logits) + probabilities = tf.nn.softmax(logits, axis=-1) + log_probs = tf.nn.log_softmax(logits, axis=-1) + # + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + # per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=one_hot_labels) + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) # todo 08-29 try temp-loss + ###############bi_tempered_logistic_loss############################################################################ + # print("##cross entropy loss is used...."); tf.logging.info("##cross entropy loss is used....") + # t1=0.9 #t1=0.90 + # t2=1.05 #t2=1.05 + # per_example_loss=bi_tempered_logistic_loss(log_probs,one_hot_labels,t1,t2,label_smoothing=0.1,num_iters=5) # TODO label_smoothing=0.0 + # tf.logging.info("per_example_loss:"+str(per_example_loss.shape)) + ##############bi_tempered_logistic_loss############################################################################# + + loss = tf.reduce_mean(per_example_loss) + print("loss", loss) + + return (loss, per_example_loss, logits, probabilities) + + +def layer_norm(input_tensor, name=None): + """Run layer normalization on the last dimension of the tensor.""" + return tf.contrib.layers.layer_norm( + inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) + + +def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + is_real_example = None + if "is_real_example" in features: + is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) + else: + is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, probabilities) = create_model( + bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + + tvars = tf.trainable_variables() + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == 
tf.estimator.ModeKeys.TRAIN: + + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(per_example_loss, label_ids, logits, is_real_example): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.metrics.accuracy( + labels=label_ids, predictions=predictions, weights=is_real_example) + loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } + + eval_metrics = (metric_fn, + [per_example_loss, label_ids, logits, is_real_example]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + predictions={"probabilities": probabilities}, + scaffold_fn=scaffold_fn) + return output_spec + + return model_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. +def input_fn_builder(features, seq_length, is_training, drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + all_input_ids = [] + all_input_mask = [] + all_segment_ids = [] + all_label_ids = [] + + for feature in features: + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_segment_ids.append(feature.segment_ids) + all_label_ids.append(feature.label_id) + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + num_examples = len(features) + + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. 
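A quick aside on the `metric_fn` in the EVAL branch above: every example is weighted by `is_real_example`, so the fake `PaddingInputExample`s added for TPU evaluation contribute nothing to accuracy or loss. A minimal numpy sketch of that weighted accuracy, with made-up predictions for two real examples plus one padding example:

    import numpy as np

    # Hypothetical mini-batch: the last entry is a PaddingInputExample (weight 0.0).
    predictions     = np.array([1, 0, 1])
    label_ids       = np.array([1, 1, 0])
    is_real_example = np.array([1.0, 1.0, 0.0])

    # Mirrors tf.metrics.accuracy(labels=..., predictions=..., weights=is_real_example).
    correct  = (predictions == label_ids).astype(np.float64)
    accuracy = np.sum(correct * is_real_example) / np.sum(is_real_example)
    print(accuracy)  # 0.5 -- the padding example contributes nothing
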
+ d = tf.data.Dataset.from_tensor_slices({ + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "segment_ids": + tf.constant( + all_segment_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + "label_ids": + tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), + }) + + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) + return d + + return input_fn + + +class TNEWSClassifierProcessor(object): + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_data(os.path.join(data_dir, "train.txt")), "train") + # dev_0827.tsv + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_data(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_data(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self, data_dir): + """See base class.""" + with open(os.path.join(data_dir, "labels.txt"), "r", encoding="utf8") as fr: + labels = [line.strip() for line in fr.readlines()] + return labels + + def _read_data(self, data_dir): + with open(data_dir, "r", encoding="utf8") as fr: + lines = [line.strip().split("") for line in fr.readlines()] + return lines + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + try: + label = tokenization.convert_to_unicode(line[1]) + text_a = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_a, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + + +class LCQMCPairClassificationProcessor(DataProcessor): # TODO NEED CHANGE2 + """Processor for the internal data set. 
sentence pair classification""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + # dev_0827.tsv + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + # return ["-1","0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + # print('#i:',i,line) + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + try: + label = tokenization.convert_to_unicode(line[2]) + text_a = tokenization.convert_to_unicode(line[0]) + text_b = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + + +class SentencePairClassificationProcessor(DataProcessor): + """Processor for the internal data set. sentence pair classification""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train_0827.tsv")), "train") + # dev_0827.tsv + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_0827.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test_0827.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + # return ["-1","0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + # print('#i:',i,line) + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + try: + label = tokenization.convert_to_unicode(line[0]) + text_a = tokenization.convert_to_unicode(line[1]) + text_b = tokenization.convert_to_unicode(line[2]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. 
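As an aside on the two pair processors above: both read tab-separated files but expect different column orders. `LCQMCPairClassificationProcessor` takes `text_a`, `text_b`, `label` per row, while `SentencePairClassificationProcessor` takes `label`, `text_a`, `text_b`; both skip the first row as a header. Illustrative rows (with invented sentences) matching the index usage in `_create_examples`:

    # Hypothetical LCQMC-style row (train.tsv): text_a \t text_b \t label
    line = ["今天天气怎么样", "今天天气如何", "1"]
    text_a, text_b, label = line[0], line[1], line[2]

    # Hypothetical SentencePair-style row (train_0827.tsv): label \t text_a \t text_b
    line = ["1", "今天天气怎么样", "今天天气如何"]
    label, text_a, text_b = line[0], line[1], line[2]
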
+def convert_examples_to_features(examples, label_list, max_seq_length, + tokenizer): + """Convert a set of `InputExample`s to a list of `InputFeatures`.""" + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + features.append(feature) + return features + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + # 数据处理类 + processors = { + "sentence_pair": SentencePairClassificationProcessor, + "lcqmc": LCQMCPairClassificationProcessor, + "tnews": TNEWSClassifierProcessor + + } + + tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, + FLAGS.init_checkpoint) + + if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: + raise ValueError( + "At least one of `do_train`, `do_eval` or `do_predict' must be True.") + + # 加载模型参数 + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + if FLAGS.max_seq_length > bert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the BERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, bert_config.max_position_embeddings)) + + # 创建output文件 + tf.gfile.MakeDirs(FLAGS.output_dir) + + task_name = FLAGS.task_name.lower() + + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + processor = processors[task_name]() + + label_list = processor.get_labels() + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + # Cloud TPU: Invalid TPU configuration, ensure ClusterResolver is passed to tpu. + print("###tpu_cluster_resolver:", tpu_cluster_resolver) + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + train_examples = processor.get_train_examples(FLAGS.data_dir) # TODO + print("###length of total train_examples:", len(train_examples)) + num_train_steps = int(len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + model_fn = model_fn_builder( + bert_config=bert_config, + num_labels=len(label_list), + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
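A quick aside on the step arithmetic a few lines up: the training schedule is derived from the data size, batch size, epoch count and warmup proportion. With purely hypothetical numbers it works out like this:

    # Hypothetical values -- the real ones come from FLAGS and the training set.
    num_examples      = 100000
    train_batch_size  = 64
    num_train_epochs  = 3.0
    warmup_proportion = 0.1

    num_train_steps  = int(num_examples / train_batch_size * num_train_epochs)  # 4687
    num_warmup_steps = int(num_train_steps * warmup_proportion)                 # 468
    print(num_train_steps, num_warmup_steps)
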
+ estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + train_file = os.path.join(FLAGS.output_dir, "train.tf_record") + train_file_exists = os.path.exists(train_file) + print("###train_file_exists:", train_file_exists, " ;train_file:", train_file) + if not train_file_exists: # if tf_record file not exist, convert from raw text file. # TODO + file_based_convert_examples_to_features(train_examples, label_list, FLAGS.max_seq_length, tokenizer, + train_file) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = file_based_input_fn_builder( + input_file=train_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + + tensors_to_log = {"train loss": "loss/Mean:0"} + logging_hook = tf.train.LoggingTensorHook( + tensors=tensors_to_log, every_n_iter=100) + estimator.train(input_fn=train_input_fn, hooks=[logging_hook], max_steps=num_train_steps) + + if FLAGS.do_eval: + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + num_actual_eval_examples = len(eval_examples) + if FLAGS.use_tpu: + # TPU requires a fixed batch size for all batches, therefore the number + # of examples must be a multiple of the batch size, or else examples + # will get dropped. So we pad with fake examples which are ignored + # later on. These do NOT count towards the metric (all tf.metrics + # support a per-instance weight, and these get a weight of 0.0). + while len(eval_examples) % FLAGS.eval_batch_size != 0: + eval_examples.append(PaddingInputExample()) + + eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d (%d actual, %d padding)", + len(eval_examples), num_actual_eval_examples, + len(eval_examples) - num_actual_eval_examples) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + # This tells the estimator to run through the entire set. + eval_steps = None + # However, if running eval on the TPU, you will need to specify the + # number of steps. 
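On TPU the eval set is padded with `PaddingInputExample`s until its length is a multiple of the eval batch size, and `eval_steps` is then simply the padded length divided by that batch size. A small sketch with hypothetical sizes:

    # Hypothetical sizes, illustrating the padding logic used for TPU evaluation.
    num_actual_eval_examples = 1005
    eval_batch_size          = 8

    num_padding  = (-num_actual_eval_examples) % eval_batch_size  # 3 PaddingInputExamples appended
    padded_total = num_actual_eval_examples + num_padding         # 1008
    eval_steps   = padded_total // eval_batch_size                # 126
    print(num_padding, padded_total, eval_steps)
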
+ if FLAGS.use_tpu: + assert len(eval_examples) % FLAGS.eval_batch_size == 0 + eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) + + eval_drop_remainder = True if FLAGS.use_tpu else False + eval_input_fn = file_based_input_fn_builder( + input_file=eval_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=eval_drop_remainder) + + ####################################################################################################################### + # evaluate all checkpoints; you can use the checkpoint with the best dev accuarcy + steps_and_files = [] + filenames = tf.gfile.ListDirectory(FLAGS.output_dir) + for filename in filenames: + if filename.endswith(".index"): + ckpt_name = filename[:-6] + cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) + global_step = int(cur_filename.split("-")[-1]) + tf.logging.info("Add {} to eval list.".format(cur_filename)) + steps_and_files.append([global_step, cur_filename]) + steps_and_files = sorted(steps_and_files, key=lambda x: x[0]) + + output_eval_file = os.path.join(FLAGS.data_dir, "eval_results_albert_zh.txt") + print("output_eval_file:", output_eval_file) + tf.logging.info("output_eval_file:" + output_eval_file) + with tf.gfile.GFile(output_eval_file, "w") as writer: + for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]): + result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=filename) + + tf.logging.info("***** Eval results %s *****" % (filename)) + writer.write("***** Eval results %s *****\n" % (filename)) + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + ####################################################################################################################### + + # result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) + # + # output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + # with tf.gfile.GFile(output_eval_file, "w") as writer: + # tf.logging.info("***** Eval results *****") + # for key in sorted(result.keys()): + # tf.logging.info(" %s = %s", key, str(result[key])) + # writer.write("%s = %s\n" % (key, str(result[key]))) + + if FLAGS.do_predict: + predict_examples = processor.get_test_examples(FLAGS.data_dir) + num_actual_predict_examples = len(predict_examples) + if FLAGS.use_tpu: + # TPU requires a fixed batch size for all batches, therefore the number + # of examples must be a multiple of the batch size, or else examples + # will get dropped. So we pad with fake examples which are ignored + # later on. 
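A note on the checkpoint sweep in the evaluation block above: instead of evaluating only the latest checkpoint, it lists every `.index` file in the output directory, recovers the global step from the filename, and evaluates the checkpoints in training order. A minimal sketch of that bookkeeping, with hypothetical filenames:

    import os

    # Hypothetical directory listing; only the ".index" files identify checkpoints.
    output_dir = "albert_lcqmc_checkpoints"
    filenames  = ["model.ckpt-1000.index", "model.ckpt-2000.index",
                  "model.ckpt-500.index", "model.ckpt-2000.data-00000-of-00001"]

    steps_and_files = []
    for filename in filenames:
        if filename.endswith(".index"):
            ckpt_name   = filename[:-len(".index")]
            global_step = int(ckpt_name.split("-")[-1])
            steps_and_files.append([global_step, os.path.join(output_dir, ckpt_name)])

    # Evaluate oldest to newest: 500, 1000, 2000.
    steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
    print(steps_and_files)
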
+ while len(predict_examples) % FLAGS.predict_batch_size != 0: + predict_examples.append(PaddingInputExample()) + + predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") + file_based_convert_examples_to_features(predict_examples, label_list, + FLAGS.max_seq_length, tokenizer, + predict_file) + + tf.logging.info("***** Running prediction*****") + tf.logging.info(" Num examples = %d (%d actual, %d padding)", + len(predict_examples), num_actual_predict_examples, + len(predict_examples) - num_actual_predict_examples) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + + predict_drop_remainder = True if FLAGS.use_tpu else False + predict_input_fn = file_based_input_fn_builder( + input_file=predict_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=predict_drop_remainder) + + result = estimator.predict(input_fn=predict_input_fn) + + output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") + with tf.gfile.GFile(output_predict_file, "w") as writer: + num_written_lines = 0 + tf.logging.info("***** Predict results *****") + for (i, prediction) in enumerate(result): + probabilities = prediction["probabilities"] + if i >= num_actual_predict_examples: + break + output_line = "\t".join( + str(class_probability) + for class_probability in probabilities) + "\n" + writer.write(output_line) + num_written_lines += 1 + assert num_written_lines == num_actual_predict_examples + + +if __name__ == "__main__": + flags.mark_flag_as_required("data_dir") + flags.mark_flag_as_required("task_name") + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/Basic/Albert/albert_tiny_tf/albert/run_pretraining.py b/Basic/Albert/albert_tiny_tf/albert/run_pretraining.py new file mode 100644 index 0000000..346d8bb --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/run_pretraining.py @@ -0,0 +1,501 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Run masked LM/next sentence masked_lm pre-training for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import modeling +import optimization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. 
" + "This specifies the model architecture.") + +flags.DEFINE_string( + "input_file", None, + "Input TF example files (can be a glob or comma separated).") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded. Must match data generation.") + +flags.DEFINE_integer( + "max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence. " + "Must match data generation.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_integer("num_train_steps", 100000, "Number of training steps.") + +flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. 
Total number of TPU cores to use.") + + +def model_fn_builder(bert_config, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + masked_lm_positions = features["masked_lm_positions"] + masked_lm_ids = features["masked_lm_ids"] + masked_lm_weights = features["masked_lm_weights"] + next_sentence_labels = features["next_sentence_labels"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + (masked_lm_loss, + masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output( + bert_config, model.get_sequence_output(), model.get_embedding_table(),model.get_embedding_table_2(), + masked_lm_positions, masked_lm_ids, masked_lm_weights) + + (next_sentence_loss, next_sentence_example_loss, + next_sentence_log_probs) = get_next_sentence_output( + bert_config, model.get_pooled_output(), next_sentence_labels) + + total_loss = masked_lm_loss + next_sentence_loss + + tvars = tf.trainable_variables() + + initialized_variable_names = {} + print("init_checkpoint:",init_checkpoint) + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels): + """Computes the loss and accuracy of the model.""" + masked_lm_log_probs = tf.reshape(masked_lm_log_probs,[-1, masked_lm_log_probs.shape[-1]]) + masked_lm_predictions = tf.argmax(masked_lm_log_probs, axis=-1, output_type=tf.int32) + masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) + masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) + masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) + masked_lm_accuracy = tf.metrics.accuracy( + labels=masked_lm_ids, + predictions=masked_lm_predictions, + weights=masked_lm_weights) + masked_lm_mean_loss = tf.metrics.mean( + values=masked_lm_example_loss, weights=masked_lm_weights) + + 
next_sentence_log_probs = tf.reshape( + next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]]) + next_sentence_predictions = tf.argmax( + next_sentence_log_probs, axis=-1, output_type=tf.int32) + next_sentence_labels = tf.reshape(next_sentence_labels, [-1]) + next_sentence_accuracy = tf.metrics.accuracy( + labels=next_sentence_labels, predictions=next_sentence_predictions) + next_sentence_mean_loss = tf.metrics.mean( + values=next_sentence_example_loss) + + return { + "masked_lm_accuracy": masked_lm_accuracy, + "masked_lm_loss": masked_lm_mean_loss, + "next_sentence_accuracy": next_sentence_accuracy, + "next_sentence_loss": next_sentence_mean_loss, + } + + # next_sentence_example_loss=0.0 TODO + # next_sentence_log_probs=0.0 # TODO + eval_metrics = (metric_fn, [ + masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels + ]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def get_masked_lm_output(bert_config, input_tensor, output_weights,project_weights, positions, + label_ids, label_weights): + """Get loss and log probs for the masked LM.""" + input_tensor = gather_indexes(input_tensor, positions) + + with tf.variable_scope("cls/predictions"): + # We apply one more non-linear transformation before the output layer. + # This matrix is not used after pre-training. + with tf.variable_scope("transform"): + input_tensor = tf.layers.dense( + input_tensor, + units=bert_config.hidden_size, + activation=modeling.get_activation(bert_config.hidden_act), + kernel_initializer=modeling.create_initializer( + bert_config.initializer_range)) + input_tensor = modeling.layer_norm(input_tensor) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + output_bias = tf.get_variable( + "output_bias", + shape=[bert_config.vocab_size], + initializer=tf.zeros_initializer()) + # logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + # input_tensor=[-1,hidden_size], project_weights=[embedding_size, hidden_size], project_weights_transpose=[hidden_size, embedding_size]--->[-1, embedding_size] + input_project = tf.matmul(input_tensor, project_weights, transpose_b=True) + logits = tf.matmul(input_project, output_weights, transpose_b=True) + # # input_project=[-1, embedding_size], output_weights=[vocab_size, embedding_size], output_weights_transpose=[embedding_size, vocab_size] ---> [-1, vocab_size] + + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + label_ids = tf.reshape(label_ids, [-1]) + label_weights = tf.reshape(label_weights, [-1]) + + one_hot_labels = tf.one_hot(label_ids, depth=bert_config.vocab_size, dtype=tf.float32) + + # The `positions` tensor might be zero-padded (if the sequence is too + # short to have the maximum number of predictions). The `label_weights` + # tensor has a value of 1.0 for every real prediction and 0.0 for the + # padding predictions. 
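Stepping back to the two matmuls above: this is where ALBERT's factorized embedding shows up in the masked-LM head. The gathered hidden states are first projected from `hidden_size` down to the embedding size via `project_weights`, and only then multiplied against the shared `output_weights` of shape `[vocab_size, embedding_size]`. A numpy shape sketch with hypothetical sizes:

    import numpy as np

    # Hypothetical ALBERT-style sizes (not read from any config file here).
    num_predictions, hidden_size, embedding_size, vocab_size = 20, 312, 128, 21128

    input_tensor    = np.zeros((num_predictions, hidden_size))  # gathered masked positions
    project_weights = np.zeros((embedding_size, hidden_size))   # hidden -> embedding projection
    output_weights  = np.zeros((vocab_size, embedding_size))    # shared word-embedding table

    input_project = input_tensor.dot(project_weights.T)         # [num_predictions, embedding_size]
    logits        = input_project.dot(output_weights.T)         # [num_predictions, vocab_size]
    print(logits.shape)                                         # (20, 21128)

Compared with an unfactorized head, the output projection costs roughly `vocab_size * embedding_size` instead of `vocab_size * hidden_size` parameters. The loss that follows uses `label_weights` to zero out the padded prediction slots described in the comment just above.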
+ per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) + numerator = tf.reduce_sum(label_weights * per_example_loss) + denominator = tf.reduce_sum(label_weights) + 1e-5 + loss = numerator / denominator + + return (loss, per_example_loss, log_probs) + + +def get_next_sentence_output(bert_config, input_tensor, labels): + """Get loss and log probs for the next sentence prediction.""" + + # Simple binary classification. Note that 0 is "next sentence" and 1 is + # "random sentence". This weight matrix is not used after pre-training. + with tf.variable_scope("cls/seq_relationship"): + output_weights = tf.get_variable( + "output_weights", + shape=[2, bert_config.hidden_size], + initializer=modeling.create_initializer(bert_config.initializer_range)) + output_bias = tf.get_variable( + "output_bias", shape=[2], initializer=tf.zeros_initializer()) + + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + labels = tf.reshape(labels, [-1]) + one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32) + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + return (loss, per_example_loss, log_probs) + + +def gather_indexes(sequence_tensor, positions): + """Gathers the vectors at the specific positions over a minibatch.""" + sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) + batch_size = sequence_shape[0] + seq_length = sequence_shape[1] + width = sequence_shape[2] + + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + return output_tensor + + +def input_fn_builder(input_files, + max_seq_length, + max_predictions_per_seq, + is_training, + num_cpu_threads=4): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + name_to_features = { + "input_ids": + tf.FixedLenFeature([max_seq_length], tf.int64), + "input_mask": + tf.FixedLenFeature([max_seq_length], tf.int64), + "segment_ids": + tf.FixedLenFeature([max_seq_length], tf.int64), + "masked_lm_positions": + tf.FixedLenFeature([max_predictions_per_seq], tf.int64), + "masked_lm_ids": + tf.FixedLenFeature([max_predictions_per_seq], tf.int64), + "masked_lm_weights": + tf.FixedLenFeature([max_predictions_per_seq], tf.float32), + "next_sentence_labels": + tf.FixedLenFeature([1], tf.int64), + } + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + if is_training: + d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files)) + d = d.repeat() + d = d.shuffle(buffer_size=len(input_files)) + + # `cycle_length` is the number of parallel files that get read. + cycle_length = min(num_cpu_threads, len(input_files)) + + # `sloppy` mode means that the interleaving is not exact. This adds + # even more randomness to the training pipeline. 
+ d = d.apply( + tf.contrib.data.parallel_interleave( + tf.data.TFRecordDataset, + sloppy=is_training, + cycle_length=cycle_length)) + d = d.shuffle(buffer_size=100) + else: + d = tf.data.TFRecordDataset(input_files) + # Since we evaluate for a fixed number of steps we don't want to encounter + # out-of-range exceptions. + d = d.repeat() + + # We must `drop_remainder` on training because the TPU requires fixed + # size dimensions. For eval, we assume we are evaluating on the CPU or GPU + # and we *don't* want to drop the remainder, otherwise we wont cover + # every sample. + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + num_parallel_batches=num_cpu_threads, + drop_remainder=True)) + return d + + return input_fn + + +def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + if not FLAGS.do_train and not FLAGS.do_eval: # 必须是训练或验证的类型 + raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) # 从json文件中获得配置信息 + + tf.gfile.MakeDirs(FLAGS.output_dir) + + input_files = [] # 输入可以是多个文件,以“逗号隔开”;可以是一个匹配形式的,如“input_x*” + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Input Files ***") + for input_file in input_files: + tf.logging.info(" %s" % input_file) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( # TODO + tpu=FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + print("###tpu_cluster_resolver:",tpu_cluster_resolver,";FLAGS.use_tpu:",FLAGS.use_tpu,";FLAGS.tpu_name:",FLAGS.tpu_name,";FLAGS.tpu_zone:",FLAGS.tpu_zone) + # ###tpu_cluster_resolver: ;FLAGS.use_tpu: True ;FLAGS.tpu_name: grpc://10.240.1.83:8470 + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + keep_checkpoint_max=20, # 10 + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + model_fn = model_fn_builder( + bert_config=bert_config, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=FLAGS.num_train_steps, + num_warmup_steps=FLAGS.num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
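One small note on the input handling in `main` above: `FLAGS.input_file` is a comma-separated list of glob patterns that gets flattened into a single file list. A tiny sketch of the same expansion using the standard `glob` module and made-up paths:

    import glob

    # Hypothetical flag value: two comma-separated glob patterns.
    input_file_flag = "data/pretrain_part1_*.tfrecord,data/pretrain_part2_*.tfrecord"

    input_files = []
    for input_pattern in input_file_flag.split(","):
        # The script uses tf.gfile.Glob; plain glob.glob behaves the same for local paths.
        input_files.extend(glob.glob(input_pattern))

    print(input_files)  # empty list here unless the made-up files actually exist
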
+ estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size) + + if FLAGS.do_train: + tf.logging.info("***** Running training *****") + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + train_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=True) + estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps) + + if FLAGS.do_eval: + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + eval_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=False) + + result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with tf.gfile.GFile(output_eval_file, "w") as writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/Basic/Albert/albert_tiny_tf/albert/similarity.py b/Basic/Albert/albert_tiny_tf/albert/similarity.py new file mode 100644 index 0000000..a944bab --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/similarity.py @@ -0,0 +1,274 @@ +""" +进行文本相似度预测的示例。可以直接运行进行预测。 +参考了项目:https://github.com/chdd/bert-utils + +""" + + +import tensorflow as tf +import args +import tokenization +import modeling +from run_classifier import InputFeatures, InputExample, DataProcessor, create_model, convert_examples_to_features + + +# os.environ['CUDA_VISIBLE_DEVICES'] = '1' + + +class SimProcessor(DataProcessor): + def get_sentence_examples(self, questions): + examples = [] + for index, data in enumerate(questions): + guid = 'test-%d' % index + text_a = tokenization.convert_to_unicode(str(data[0])) + text_b = tokenization.convert_to_unicode(str(data[1])) + label = str(0) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_labels(self): + return ['0', '1'] + + +""" +模型类,负责载入checkpoint初始化模型 +""" +class BertSim: + def __init__(self, batch_size=args.batch_size): + self.mode = None + self.max_seq_length = args.max_seq_len + self.tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=True) + self.batch_size = batch_size + self.estimator = None + self.processor = SimProcessor() + tf.logging.set_verbosity(tf.logging.INFO) + + + + #载入estimator,构造模型 + def start_model(self): + self.estimator = self.get_estimator() + + + def model_fn_builder(self, bert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, + use_one_hot_embeddings): + """Returns `model_fn` closurimport_tfe for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + from tensorflow.python.estimator.model_fn import EstimatorSpec + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + 
input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, probabilities) = create_model( + bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + + tvars = tf.trainable_variables() + initialized_variable_names = {} + + if init_checkpoint: + (assignment_map, initialized_variable_names) \ + = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + output_spec = EstimatorSpec(mode=mode, predictions=probabilities) + + return output_spec + + return model_fn + + def get_estimator(self): + + from tensorflow.python.estimator.estimator import Estimator + from tensorflow.python.estimator.run_config import RunConfig + + bert_config = modeling.BertConfig.from_json_file(args.config_name) + label_list = self.processor.get_labels() + if self.mode == tf.estimator.ModeKeys.TRAIN: + init_checkpoint = args.ckpt_name + else: + init_checkpoint = args.output_dir + + model_fn = self.model_fn_builder( + bert_config=bert_config, + num_labels=len(label_list), + init_checkpoint=init_checkpoint, + learning_rate=args.learning_rate, + num_train_steps=None, + num_warmup_steps=None, + use_one_hot_embeddings=False) + + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory_fraction + config.log_device_placement = False + + return Estimator(model_fn=model_fn, config=RunConfig(session_config=config), model_dir=args.output_dir, + params={'batch_size': self.batch_size}) + + def predict_sentences(self,sentences): + results= self.estimator.predict(input_fn=input_fn_builder(self,sentences), yield_single_examples=False) + #打印预测结果 + for i in results: + print(i) + + def _truncate_seq_pair(self, tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + def convert_single_example(self, ex_index, example, label_list, max_seq_length, tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. 
+ # Account for [CLS], [SEP], [SEP] with "- 3" + self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id) + return feature + + + + +def input_fn_builder(bertSim,sentences): + def predict_input_fn(): + return (tf.data.Dataset.from_generator( + generate_from_input, + output_types={ + 'input_ids': tf.int32, + 'input_mask': tf.int32, + 'segment_ids': tf.int32, + 'label_ids': tf.int32}, + output_shapes={ + 'input_ids': (None, bertSim.max_seq_length), + 'input_mask': (None, bertSim.max_seq_length), + 'segment_ids': (None, bertSim.max_seq_length), + 'label_ids': (1,)}).prefetch(10)) + + def generate_from_input(): + processor = bertSim.processor + predict_examples = processor.get_sentence_examples(sentences) + features = convert_examples_to_features(predict_examples, processor.get_labels(), args.max_seq_len, + bertSim.tokenizer) + yield { + 'input_ids': [f.input_ids for f in features], + 'input_mask': [f.input_mask for f in features], + 'segment_ids': [f.segment_ids for f in features], + 'label_ids': 
[f.label_id for f in features] + } + + return predict_input_fn + + +if __name__ == '__main__': + sim = BertSim() + sim.start_model() + sim.predict_sentences([("我喜欢妈妈做的汤", "妈妈做的汤我很喜欢喝")]) diff --git a/Basic/Albert/albert_tiny_tf/albert/test_changes.py b/Basic/Albert/albert_tiny_tf/albert/test_changes.py new file mode 100644 index 0000000..f5f1d2e --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/test_changes.py @@ -0,0 +1,87 @@ +# coding=utf-8 +import tensorflow as tf +from modeling import embedding_lookup_factorized,transformer_model +import os + +""" +测试albert主要的改进点:词嵌入的因式分解、层间参数共享、段落间连贯性 +test main change of albert from bert +""" +batch_size = 2048 +sequence_length = 512 +vocab_size = 30000 +hidden_size = 1024 +num_attention_heads = int(hidden_size / 64) + +def get_total_parameters(): + """ + get total parameters of a graph + :return: + """ + total_parameters = 0 + for variable in tf.trainable_variables(): + # shape is an array of tf.Dimension + shape = variable.get_shape() + # print(shape) + # print(len(shape)) + variable_parameters = 1 + for dim in shape: + # print(dim) + variable_parameters *= dim.value + # print(variable_parameters) + total_parameters += variable_parameters + return total_parameters + +def test_factorized_embedding(): + """ + test of Factorized embedding parameterization + :return: + """ + input_ids=tf.zeros((batch_size, sequence_length),dtype=tf.int32) + output, embedding_table, embedding_table_2=embedding_lookup_factorized(input_ids,vocab_size,hidden_size) + print("output:",output) + +def test_share_parameters(): + """ + test of share parameters across all layers: how many parameter after share parameter across layers of transformer. + :return: + """ + def total_parameters_transformer(share_parameter_across_layers): + input_tensor=tf.zeros((batch_size, sequence_length, hidden_size),dtype=tf.float32) + print("transformer_model. input:",input_tensor) + transformer_result=transformer_model(input_tensor,hidden_size=hidden_size,num_attention_heads=num_attention_heads,share_parameter_across_layers=share_parameter_across_layers) + print("transformer_result:",transformer_result) + total_parameters=get_total_parameters() + print('total_parameters(not share):',total_parameters) + + share_parameter_across_layers=False + total_parameters_transformer(share_parameter_across_layers) # total parameters, not share: 125,976,576 = 125 million + + tf.reset_default_graph() # Clears the default graph stack and resets the global default graph + share_parameter_across_layers=True + total_parameters_transformer(share_parameter_across_layers) # total parameters, share: 10,498,048 = 10.5 million + +def test_sentence_order_prediction(): + """ + sentence order prediction. + + check method of create_instances_from_document_albert from create_pretrining_data.py + + :return: + """ + # 添加运行权限 + os.system("chmod +x create_pretrain_data.sh") + + os.system("./create_pretrain_data.sh") + + +# 1.test of Factorized embedding parameterization +#test_factorized_embedding() + +# 2. test of share parameters across all layers: how many parameter after share parameter across layers of transformer. +# before share parameter: 125,976,576; after share parameter: +#test_share_parameters() + +# 3. 
test of sentence order prediction(SOP) +test_sentence_order_prediction() + diff --git a/Basic/Albert/albert_tiny_tf/albert/tokenization.py b/Basic/Albert/albert_tiny_tf/albert/tokenization.py new file mode 100644 index 0000000..f7020e8 --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert/tokenization.py @@ -0,0 +1,401 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import re +import unicodedata +import six +import tensorflow as tf + + +def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): + """Checks whether the casing config is consistent with the checkpoint name.""" + + # The casing has to be passed in by the user and there is no explicit check + # as to whether it matches the checkpoint. The casing information probably + # should have been stored in the bert_config.json file, but it's not, so + # we have to heuristically detect it to validate. + + if not init_checkpoint: + return + + m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint) + if m is None: + return + + model_name = m.group(1) + + lower_models = [ + "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12", + "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12" + ] + + cased_models = [ + "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16", + "multi_cased_L-12_H-768_A-12" + ] + + is_bad_config = False + if model_name in lower_models and not do_lower_case: + is_bad_config = True + actual_flag = "False" + case_name = "lowercased" + opposite_flag = "True" + + if model_name in cased_models and do_lower_case: + is_bad_config = True + actual_flag = "True" + case_name = "cased" + opposite_flag = "False" + + if is_bad_config: + raise ValueError( + "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. " + "However, `%s` seems to be a %s model, so you " + "should pass in `--do_lower_case=%s` so that the fine-tuning matches " + "how the model was pre-training. If this error is wrong, please " + "just comment out this check." 
% (actual_flag, init_checkpoint, + model_name, case_name, opposite_flag)) + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with tf.gfile.GFile(vocab_file, "r") as reader: + while True: + token = convert_to_unicode(reader.readline()) + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + #print("items:",items) #['[CLS]', '日', '##期', ',', '但', '被', '##告', '金', '##东', '##福', '载', '##明', '[MASK]', 'U', '##N', '##K', ']', '保', '##证', '本', '##月', '1', '##4', '[MASK]', '到', '##位', ',', '2', '##0', '##1', '##5', '年', '6', '[MASK]', '1', '##1', '日', '[', 'U', '##N', '##K', ']', ',', '原', '##告', '[MASK]', '认', '##可', '于', '2', '##0', '##1', '##5', '[MASK]', '6', '月', '[MASK]', '[MASK]', '日', '##向', '被', '##告', '主', '##张', '权', '##利', '。', '而', '[MASK]', '[MASK]', '自', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '年', '6', '月', '1', '##1', '日', '[SEP]', '原', '##告', '于', '2', '##0', '##1', '##6', '[MASK]', '6', '[MASK]', '2', '##4', '日', '起', '##诉', ',', '主', '##张', '保', '##证', '责', '##任', ',', '已', '超', '##过', '保', '##证', '期', '##限', '[MASK]', '保', '##证', '人', '依', '##法', '不', '##再', '承', '##担', '保', '##证', '[MASK]', '[MASK]', '[MASK]', '[SEP]'] + for i,item in enumerate(items): + #print(i,"item:",item) # ##期 + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def 
tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
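+    # For reference, the ranges checked below are: 0x4E00-0x9FFF (the base CJK
+    # Unified Ideographs block), 0x3400-0x4DBF plus the 0x20000-0x2CEAF ranges
+    # (Extensions A-E), and 0xF900-0xFAFF / 0x2F800-0x2FA1F (Compatibility
+    # Ideographs). Hangul, Hiragana and Katakana live outside these ranges.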
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenziation.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer. + + Returns: + A list of wordpiece tokens. + """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat in ("Cc", "Cf"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
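+  # The four ASCII ranges tested below are 33-47 (!"#$%&'()*+,-./),
+  # 58-64 (:;<=>?@), 91-96 ([\]^_`), and 123-126 ({|}~).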
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/Basic/Albert/albert_tiny_tf/albert_model/albert_tiny/albert_config.json b/Basic/Albert/albert_tiny_tf/albert_model/albert_tiny/albert_config.json new file mode 100644 index 0000000..dc97f5b --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/albert_model/albert_tiny/albert_config.json @@ -0,0 +1,23 @@ +{ + "attention_probs_dropout_prob": 0.0, + "directionality": "bidi", + "hidden_act": "gelu", + "hidden_dropout_prob": 0.0, + "hidden_size": 312, + "embedding_size": 128, + "initializer_range": 0.02, + "intermediate_size": 1248 , + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 4, + + "pooler_fc_size": 768, + "pooler_num_attention_heads": 12, + "pooler_num_fc_layers": 3, + "pooler_size_per_head": 128, + "pooler_type": "first_token_transform", + "type_vocab_size": 2, + "vocab_size": 21128, + "ln_type":"postln" + +} diff --git a/Basic/Albert/albert_tiny_tf/classifier_task/README.md b/Basic/Albert/albert_tiny_tf/classifier_task/README.md new file mode 100644 index 0000000..4804335 --- /dev/null +++ b/Basic/Albert/albert_tiny_tf/classifier_task/README.md @@ -0,0 +1,32 @@ +#### config文件解读 + +##### 以inews_config.json为例 + +* model_name:模型名称 +* epochs:迭代epoch的数量 +* checkpoint_every:间隔多少步保存一次模型 +* eval_every:间隔多少步验证一次模型 +* learning_rate:学习速率,推荐2e-5, 5e-5, 1e-4 +* sequence_length:序列长度,单GPU时不要超过128 +* batch_size:单GPU时不要超过32 +* num_classes:文本分类的类别数量,若是二分类设置为1 +* warmup_rate:训练时的预热比例,建议0.05, 0.1 +* output_path:输出文件夹,用来存储label_to_index等文件 +* bert_model_path:预训练模型文件夹路径 +* train_data:训练数据路径 +* eval_data:验证数据路径 +* ckpt_model_path:checkpoint模型文件保存路径 + + +######处理好的数据 +data文件夹下 +训练数据中,train:valid=4:1 + + +#########训练和测试 +cd到classifier文件夹目录下 +训练:sh run.sh +测试:CUDA_VISIBLE_DEVICES=2 python testnew.py + +######运行环境 +tensorflow-GPU-1.14-cp37(与代码不是最合适的版本,会有许多警告) \ No newline at end of file diff --git a/BertToSimple/textcnn/predict.py b/BertToSimple/textcnn/predict.py new file mode 100644 index 0000000..80c6d98 --- /dev/null +++ b/BertToSimple/textcnn/predict.py @@ -0,0 +1,83 @@ +# -*-coding:utf-8-*- + +import pickle, numpy as np +import time +from keras.layers import * +from keras.models import Model +from keras.initializers import Constant +from keras.preprocessing import sequence +from keras.models import load_model +from keras.utils.np_utils import to_categorical +from sklearn.metrics import accuracy_score +from utils import load_data + +def get_textcnn(x_len, v_size, embs): + x = Input(shape=(x_len,),dtype='int32') + # embed = Embedding(v_size,300)(x) + embed = Embedding(v_size,300,embeddings_initializer=Constant(embs),trainable=False)(x) + cnn1 = Convolution1D(256,3,padding='same',strides=1,activation='relu')(embed) + cnn1 = MaxPool1D(pool_size=4)(cnn1) + cnn2 = Convolution1D(256,4,padding='same',strides=1,activation='relu')(embed) + cnn2 = MaxPool1D(pool_size=4)(cnn2) + cnn3 = Convolution1D(256,5,padding='same',strides=1,activation='relu')(embed) + cnn3 = MaxPool1D(pool_size=4)(cnn3) + cnn = concatenate([cnn1,cnn2,cnn3],axis=-1) + flat = Flatten()(cnn) + drop = Dropout(0.2,name='drop')(flat) + y = Dense(3,activation='softmax')(drop) + model = Model(inputs=x,outputs=y) + return model + +def get_birnn(x_len, v_size, embs): + x = Input(shape=(x_len,),dtype='int32') + # embed = Embedding(v_size,300)(x) + embed = 
Embedding(v_size,300,embeddings_initializer=Constant(embs),trainable=False)(x) + # bi = Bidirectional(GRU(256,activation='tanh',recurrent_dropout=0.2,dropout=0.2,return_sequences=True))(embed) + bi = Bidirectional(GRU(256,activation='tanh',recurrent_dropout=0.2,dropout=0.2))(embed) + bi_1 = Bidirectional(GRU(256, activation='tanh', recurrent_dropout=0.2, dropout=0.2))(embed) + y = Dense(3,activation='softmax')(bi_1) + model = Model(inputs=x,outputs=y) + return model + + +def predict(): + x_len = 50 + + # ----- ----- ----- ----- ----- + # from keras.datasets import imdb + # (x_tr,y_tr),(x_te,y_te) = imdb.load_data(num_words=10000) + # ----- ----- ----- ----- ----- + + name = 'hotel' # clothing, fruit, hotel, pda, shampoo + (x_tr,y_tr,_),(x_de,y_de,_),(x_te,y_te,_),v_size,embs = load_data(name) + x_tr = sequence.pad_sequences(x_tr,maxlen=x_len) + x_de = sequence.pad_sequences(x_de,maxlen=x_len) + x_te = sequence.pad_sequences(x_te,maxlen=x_len) + y_tr = to_categorical(y_tr,3) + y_de = to_categorical(y_de,3) + y_te = to_categorical(y_te,3) + #with open('data/cache/t_tr','rb') as fin: y_tr = pickle.load(fin) + #with open('data/cache/t_de','rb') as fin: y_de = pickle.load(fin) + # y_tr = to_categorical(y_tr.argmax(axis=1),3) + # y_de = to_categorical(y_de.argmax(axis=1),3) + + # ----- ----- predict ----- ----- + # 模型的加载及使用 + + print("Using loaded model to predict...") + + load_model1 = load_model("model_weight(textcnn).h5") + start= time.time() + predicted = load_model1.predict(x_te) + predict_1 = np.argmax(predicted,axis=1) + print(predict_1.shape) + end = time.time() + print('time:',end-start,' acc:',accuracy_score(y_te, predict_1)) + + load_model1.summary() + + # ----- ----- ----- ----- ----- + +if __name__ == '__main__': + # run_small() + predict() diff --git a/BertToSimple/textcnn/readme.md b/BertToSimple/textcnn/readme.md new file mode 100644 index 0000000..c3775e3 --- /dev/null +++ b/BertToSimple/textcnn/readme.md @@ -0,0 +1,10 @@ +数据格式: +0 询问是否认识借款人,我是那个 +invalid -1 yes-2 no-3 + +时间,17000条数据预测时间为12s,每条8ms 准确率:87.44% + +训练:test.py +预测: predict.py + +测试集,验证集,训练集请放在与代码同级的data文件夹 data下设置一个hotel文件夹放置,分别test.txt dev.txt train.txt diff --git a/BertToSimple/textcnn/test.py b/BertToSimple/textcnn/test.py new file mode 100644 index 0000000..b0084b9 --- /dev/null +++ b/BertToSimple/textcnn/test.py @@ -0,0 +1,89 @@ +# -*-coding:utf-8-*- + +import pickle, numpy as np +from keras.layers import * +from keras.models import Model +from keras.initializers import Constant +from keras.preprocessing import sequence +from keras.models import load_model +from keras.utils.np_utils import to_categorical +from utils import load_data + +def get_textcnn(x_len, v_size, embs): + x = Input(shape=(x_len,),dtype='int32') + # embed = Embedding(v_size,300)(x) + embed = Embedding(v_size,300,embeddings_initializer=Constant(embs),trainable=False)(x) + cnn1 = Convolution1D(256,3,padding='same',strides=1,activation='relu')(embed) + cnn1 = MaxPool1D(pool_size=4)(cnn1) + cnn2 = Convolution1D(256,4,padding='same',strides=1,activation='relu')(embed) + cnn2 = MaxPool1D(pool_size=4)(cnn2) + cnn3 = Convolution1D(256,5,padding='same',strides=1,activation='relu')(embed) + cnn3 = MaxPool1D(pool_size=4)(cnn3) + cnn = concatenate([cnn1,cnn2,cnn3],axis=-1) + flat = Flatten()(cnn) + drop = Dropout(0.2,name='drop')(flat) + y = Dense(3,activation='softmax')(drop) + model = Model(inputs=x,outputs=y) + return model + +def get_birnn(x_len, v_size, embs): + x = Input(shape=(x_len,),dtype='int32') + # embed = Embedding(v_size,300)(x) + embed = 
Embedding(v_size,300,embeddings_initializer=Constant(embs),trainable=False)(x) + # bi = Bidirectional(GRU(256,activation='tanh',recurrent_dropout=0.2,dropout=0.2,return_sequences=True))(embed) + bi = Bidirectional(GRU(256,activation='tanh',recurrent_dropout=0.2,dropout=0.2))(embed) + bi_1 = Bidirectional(GRU(256, activation='tanh', recurrent_dropout=0.2, dropout=0.2))(embed) + y = Dense(3,activation='softmax')(bi_1) + model = Model(inputs=x,outputs=y) + return model + +def run_small(): + x_len = 50 + name = 'hotel' # clothing, fruit, hotel, pda, shampoo + (x_tr,y_tr,_),_,(x_te,y_te,_),v_size,embs = load_data(name) + x_tr = sequence.pad_sequences(x_tr,maxlen=x_len) + x_te = sequence.pad_sequences(x_te,maxlen=x_len) + y_tr = to_categorical(y_tr,3) + y_te = to_categorical(y_te,3) + # model = get_textcnn(x_len,v_size,embs) + model = get_birnn(x_len,v_size,embs) + model.compile(loss='softmax_crossentropy',optimizer='adam',metrics=['accuracy']) + model.fit(x_tr,y_tr,batch_size=32,epochs=5,validation_data=(x_te,y_te)) + +def run_distill(): + x_len = 50 + + # ----- ----- ----- ----- ----- + # from keras.datasets import imdb + # (x_tr,y_tr),(x_te,y_te) = imdb.load_data(num_words=10000) + # ----- ----- ----- ----- ----- + + name = 'hotel' # clothing, fruit, hotel, pda, shampoo + (x_tr,y_tr,_),(x_de,y_de,_),(x_te,y_te,_),v_size,embs = load_data(name) + x_tr = sequence.pad_sequences(x_tr,maxlen=x_len) + x_de = sequence.pad_sequences(x_de,maxlen=x_len) + x_te = sequence.pad_sequences(x_te,maxlen=x_len) + y_tr = to_categorical(y_tr,3) + y_de = to_categorical(y_de,3) + y_te = to_categorical(y_te,3) + #with open('data/cache/t_tr','rb') as fin: y_tr = pickle.load(fin) + #with open('data/cache/t_de','rb') as fin: y_de = pickle.load(fin) + # y_tr = to_categorical(y_tr.argmax(axis=1),2) + # y_de = to_categorical(y_de.argmax(axis=1),2) + + # ----- ----- distill ----- ----- + model = get_textcnn(x_len,v_size,embs) + #model = get_birnn(x_len,v_size,embs) + x_tr = np.vstack([x_tr,x_de]) + y_tr = np.vstack([y_tr,y_de]) + model.compile(loss='mse',optimizer='adam',metrics=['accuracy']) + print(x_tr.shape,y_tr.shape,x_te.shape,y_te.shape,x_de.shape,y_de.shape) + # model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy']) + model.summary() + model.fit(x_tr,y_tr,batch_size=32,epochs=5,validation_data=(x_te,y_te)) + model.save('model_weight.h5') # creates a HDF5 file 'my_model.h5' + # ----- ----- ----- ----- ----- + +if __name__ == '__main__': + # run_small() + run_distill() diff --git a/BertToSimple/textcnn/utils.py b/BertToSimple/textcnn/utils.py new file mode 100644 index 0000000..b7a4f47 --- /dev/null +++ b/BertToSimple/textcnn/utils.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- + +import jieba, random, fileinput, numpy as np +from keras.preprocessing.text import Tokenizer +from sklearn.model_selection import train_test_split + +def load_data(name): + def get_w2v(): + for line in open('data/cache/word2vec',encoding='utf-8').read().strip().split('\n'): + line = line.strip().split() + if not line: continue + yield line[0],np.array(list(map(float,line[1:]))) + tokenizer = Tokenizer(filters='',lower=True,split=' ',oov_token=1) + texts = [' '.join(jieba.cut(line.split('\t',1)[1].strip()))\ + for line in open('data/{}/train.txt'.format(name,name),encoding='utf-8' + ).read().strip().split('\n')] + tokenizer.fit_on_texts(texts) + # with open('word2vec','w') as out: + # for line in fileinput.input('sgns.sogou.word'): + # word = line.strip().split()[0] + # if word in tokenizer.word_index: + # 
out.write(line+'\n') + # fileinput.close() + x_train,y_train = [],[]; text_train = [] + for line in open('data/{}/train.txt'.format(name),encoding='utf-8').read().strip().split('\n'): + label,text = line.split('\t',1) + text_train.append(text.strip()) + x_train.append(' '.join(jieba.cut(text.strip()))) + y_train.append(int(label)) + x_train = tokenizer.texts_to_sequences(x_train) + x_dev,y_dev = [],[]; text_dev = [] + for line in open('data/{}/dev.txt'.format(name),encoding='utf-8').read().strip().split('\n'): + label,text = line.split('\t',1) + text_dev.append(text.strip()) + x_dev.append(' '.join(jieba.cut(text.strip()))) + y_dev.append(int(label)) + x_dev = tokenizer.texts_to_sequences(x_dev) + x_test,y_test = [],[]; text_test = [] + for line in open('data/{}/test.txt'.format(name),encoding='utf-8').read().strip().split('\n'): + label,text = line.split('\t',1) + text_test.append(text.strip()) + x_test.append(' '.join(jieba.cut(text.strip()))) + y_test.append(int(label)) + x_test = tokenizer.texts_to_sequences(x_test) + v_size = len(tokenizer.word_index)+1 + embs,w2v = np.zeros((v_size,300)),dict(get_w2v()) + for word,index in tokenizer.word_index.items(): + if word in w2v: embs[index] = w2v[word] + return (x_train,y_train,text_train),\ + (x_dev,y_dev,text_dev),\ + (x_test,y_test,text_test),\ + v_size,embs + +if __name__ == '__main__': + load_data(name='hotel') diff --git a/Preprocessor/InfoExtraction/caiyang.py b/Preprocessor/InfoExtraction/caiyang.py new file mode 100644 index 0000000..3fbf0e4 --- /dev/null +++ b/Preprocessor/InfoExtraction/caiyang.py @@ -0,0 +1,9 @@ +import pandas as pd +def split(path): + total = pd.read_excel(path, encoding='utf-8') + set_1 = total.sample(n=1000, random_state=0, axis=0) + set_2 =total.loc[~total.index.isin(set_1.index)] + set_1.to_csv("m2_1000.csv") + set_2.to_csv("m2_yu.csv") + +split("input_m2_4-10/哈银M2.xlsx") \ No newline at end of file diff --git a/Preprocessor/InfoExtraction/lexicon_external.txt b/Preprocessor/InfoExtraction/lexicon_external.txt new file mode 100644 index 0000000..813d6a9 --- /dev/null +++ b/Preprocessor/InfoExtraction/lexicon_external.txt @@ -0,0 +1,23 @@ +是 +能 +唉 +行 +啊 +哦 +嗯 +额 +零 +好的 +对的 +好嘞 +好了 +是的 +等一下 +不是 +不知道 +知道了 +有印象 +打错了 +没问题 +我知道 +我知道了 \ No newline at end of file diff --git a/Preprocessor/InfoExtraction/m1.py b/Preprocessor/InfoExtraction/m1.py new file mode 100644 index 0000000..7f06504 --- /dev/null +++ b/Preprocessor/InfoExtraction/m1.py @@ -0,0 +1,209 @@ +import pandas as pd +import numpy as np +from copy import deepcopy +from m1_dict import ai_map,in_dct,out_dct +import jieba +import os + + +def del_duplicate(info, info_name): + """ + # 删除重复的文本 在同一个数据集下 + """ + # ['processid', 'in_node', 'type_robot', 'msg', 'msg_del_dup', 'out_true', 'type', 'type_combine'] + set_only = set() + new_info = [] + for item in info: + msg = item[3] + if msg not in set_only: + set_only.add(msg) + new_info.append(item) + # print("\n对 {} 去重, 被删除的重复的语料的数量(以msg为准): {}. 
Final number: {}" + # .format(info_name, len(info) - len(new_info), len(new_info))) + return np.array(new_info) + + +def build_map_by_what(info, what_col): + # 按照意图节点切分: what_col = 2; 按照type切分: what_col = 6 + # print(info) + info = np.array(info) + nodes = np.unique(np.array(info[:, what_col])) + map_temp = dict(zip(nodes, [[] for _ in range(len(nodes))])) + for item in info: + map_temp[item[what_col]].append(item) + return map_temp + + +def read_data(file_dir): + cols = ["通话状态","通话记录"] + all_data = [] + for root, dirs, files in os.walk(file_dir): + for curr_file in files: + print("read file: {}".format(os.path.join(root, curr_file))) + temp = pd.read_csv(os.path.join(root, curr_file), usecols=cols).values + all_data += list(temp) + # print("------------------------------") + # print(temp) + all_data = [x for x in all_data if x[0] == '已接听' and isinstance(x[1], str) and len(x[1]) != 0 ] + return all_data + + +def split_ai_me(data,ai_map,wu_dic,out_dic): + in_node = [] + type_robot = [] + me = [] + out_node = [] + type = [] + AI = [] + S_id = [] #session_id + for line in data: + # ["通话记录ID","通话状态", "通话记录详情"] + texts = line[1].split('\n') + # print("***********************************") + # print(texts) + # s_id = str(line[1]).strip('') + + # 初始化index count + index= 0 + pre_ai, rear_ai = '', '' #in_node 和 out_node + pre_ai_key,rear_ai_key='*','**' + # 遍历文本:AI 和 ME + while index < len(texts): + temp = texts[index] + if not texts[index].startswith('ME'): #当前的ai问题是什么 in_node + if texts[index].startswith('AI'): + tt = texts[index] + for key_word in ai_map.keys(): + if key_word in texts[index]: + # print("//////////////////////////////////") + # print(key_word) + # print(texts[index]) + pre_ai = ai_map[key_word] + pre_ai_key=key_word + # print(wu_dic[pre_ai_key][1]) + break + pre_ai_key='*' #有AI说话无关键词 + index += 1 + continue + while texts[index].startswith('ME'): + kk = texts[index] + # 当前标签 + index_ai = index + while index_ai < len(texts) and (not texts[index_ai].startswith('AI')): + index_ai += 1 + if index_ai < len(texts) and texts[index_ai].startswith('AI'):# me 回答后的ai问题是什么 out_node + for key_word in ai_map.keys(): + if key_word in texts[index_ai]: + rear_ai = ai_map[key_word] + rear_ai_key = key_word + break + rear_ai_key = '*' #有AI说话但无关键字 + in_node.append(wu_dic[pre_ai_key][0]) + type_robot.append(wu_dic[pre_ai_key][1]) + me.append(texts[index][3:]) + out_node.append(out_dic[rear_ai_key][0]) + type.append(out_dic[rear_ai_key][1]) + + AI.append(tt) + # S_id.append(s_id) + rear_ai_key ='**' #重置 + # index加一:下一条AI或ME + index += 1 + + # 把 p_id in_node type_robot me out_node type 放在一起 + ans, temp = [], [] + for index in range(len(in_node)): + if type_robot[index] != "": + temp.append('hayinm1') + temp.append(in_node[index]) + temp.append(type_robot[index]) + temp.append(me[index]) + temp.append(out_node[index]) + temp.append(type[index]) + temp.append(AI[index]) + # temp.append(S_id[index]) + ans.append(deepcopy(temp)) + temp.clear() + return ans + + +def del_dul_word(dul_word): + new_string = [] + pre_ch = None + for ch in dul_word: + if ch != pre_ch: + new_string.append(ch) + pre_ch = ch + return new_string + + +def insert_cols(data): + jieba.load_userdict('./lexicon_external.txt') + ans = [] + for item in data: + msg = item[3].strip() + # 仅保留中文 + msg = ''.join([ch for ch in msg if ('\u4e00' <= ch <= '\u9fa5')]) + if len(msg) == 0: + continue + item[3] = msg + # 去叠词:msg_del_dul + msg_del_dul = del_dul_word([word for word in jieba.cut(msg)]) + item.append(''.join(msg_del_dul)) + # item += [''] + 
item.append('') + ans.append(item) + print('过滤 msg左右空格+符号+英文+数字 仅留中文汉字: {} -> {}'.format(len(data), len(ans))) + return ans + + +if __name__ == '__main__': + input_dir = './imput_m1_11+_1000' + high_data = read_data(input_dir) + + # 处理 "通话记录详情" + high_data = split_ai_me(high_data,ai_map,in_dct,out_dct) + print("6666666666666666666666666") + print(high_data) + + #插入新列:去叠词 + high_data = insert_cols(high_data) + print("777777777777777777777777777") + print(high_data) + + # 去重复 + high_data_by_node = build_map_by_what(high_data, 2) + del high_data + high_data = [] + for key_node, val_node in high_data_by_node.items(): + high_data += list(del_duplicate(val_node, key_node)) + # print("当前数量量:{}".format(len(high_data))) + # del high_data_by_node + # 保存 + str = 'hayinm1duolun_new1' # 直接更改文件名标识 + sava_path = './output_m1_1000/'+str+'_result.csv' + # original_col = ["processid", "in_node", "type_robot", "msg", "out_true", "type", "AI_Q","session_id", "msg_del_dup", "type_combine"] + original_col = ["processid", "in_node", "type_robot", "msg", "out_true", "type", "AI_Q", "msg_del_dup", "type_combine"] + new_col = ["session_id","processid", "AI_Q", "in_node", "type_robot", "msg", "msg_del_dup", "out_true", "type", "type_combine"] + #所有数据保存 + high_data = pd.DataFrame(high_data, columns=original_col) + high_data = high_data.reindex(columns=new_col) + high_data.to_csv(sava_path, encoding='utf-8-sig', index_label='id') + + # # 划分:需要标注 and 不需要标注/入结点知识库+表示结束 + tag ,un_tag = [], [] + for item in high_data.values: + if item[3] in ['无AI其他','知识库','敏感词','其他','4.1','4.4','5.1','5.2','5.3','5.4','2.1','3.2']: + un_tag.append(item) + else: + tag.append(item) + + # 需要标注 + tag = sorted(tag, key=lambda k: (k[4])) + tag=pd.DataFrame(tag,columns=new_col) + tag.to_csv('./output_m1_1000/'+str+'_tag.csv',encoding='utf-8-sig', index_label='id') + + # 不需要标注 + un_tag = sorted(un_tag, key=lambda k: (k[4])) + un_tag = pd.DataFrame(un_tag, columns=new_col) + un_tag.to_csv('./output_m1_1000/'+str+'_untag.csv', encoding='utf-8-sig', index_label='id') \ No newline at end of file diff --git a/Preprocessor/InfoExtraction/m1_dict.py b/Preprocessor/InfoExtraction/m1_dict.py new file mode 100644 index 0000000..73c7cef --- /dev/null +++ b/Preprocessor/InfoExtraction/m1_dict.py @@ -0,0 +1,150 @@ + +ai_map = {"本人吗": '身份确认', + "您是不是": '身份确认', + "原因": '询问未还款原因', + "联系方式": '询问本人是否方便接听电话', + "方便接听": '询问本人是否方便接听电话', + "今天赶紧想办法": "询问今日能否还款", + "今天能": "询问今日能否还款", + "麻烦他本人": "请本人接听电话", + "那让他本人": "请本人接听电话", + "请您让他": "请本人接听电话", + "朋友": "是否为亲属朋友", + "今天务必": "结束4.1", + "能想办法还": "询问明日能否还款", + "明天您能": "询问明日能否还款", + "办理了相关业务": "请求转告", + "请他务必尽快解决": "结束4.4", + "记录下来": "结束5.1", + "很遗憾": "结束5.2", + "您的配合": "结束5.3", + "尽量帮忙转告": "结束5.4", + "来电是提醒您": "询问案源机构", + "消费贷款目前已经": "询问欠款金额", + "一笔应还款未处理,": "不认可金额", + "稍后重新处理": "还款失败", + "减免利息": "减免利息", + "还款日期应为": "询问逾期时间", + "人工接待": "转人工", + "办理分期": "如何办理账单分期", + "定时划扣": "如何自动扣款", + "还错": "还错卡", + "目的是": "询问工号", + "诶": "打断", + "无法在线核实": "询问绑定卡号", + "疑问": "质疑身份", + "逾期时间太长": "过几天还", + "我们的客服热线": "客服热线", + "智能语音助手": "询问是否为机器人", + "首先很抱歉": "投诉/涉媒/公检法", + "延时": "已还款", + "先忙": "在忙", + "好的, 稍后": "持卡人吸毒", + "节哀": "持卡人死亡", + "不好意思,稍后": "持卡人重病/持卡人坐牢", + "(直接挂机)": "挂机", + "您如果想把逾期影响降到最低":"没钱还", + "不良影响就越大":"不配合还款", + "便捷还款方式": "还款不便", + } +in_dct = {"本人吗": ['1.1', '身份确认'], + "您是不是": ['1.1', '身份确认'], + "原因": ['2.1', '询问未还款原因'], + "联系方式": ['2.2', '询问本人是否方便接听电话'], + "方便接听": ['2.2', '询问本人是否方便接听电话'], + "今天赶紧想办法": ['3.1', "询问今日能否还款"], + "今天能": ['3.1', "询问今日能否还款"], + "麻烦他本人": ["3.2", "请本人接听电话"], + "那让他本人": ["3.2", "请本人接听电话"], + 
"请您让他": ["3.2", "请本人接听电话"], + "朋友": ['3.3', "是否为亲属朋友"], + "今天务必": ['4.1', "结束4.1"], + "能想办法还": ['4.2', "询问明日能否还款"], + "明天您能": ['4.2', "询问明日能否还款"], + "办理了相关业务": ['4.3', "请求转告"], + "请他务必尽快解决": ['4.4', "结束4.4"], + "记录下来": ['5.1', "结束5.1"], + "很遗憾": ['5.2', "结束5.2"], + "您的配合": ['5.3', "结束5.3"], + "尽量帮忙转告": ['5.4', "结束5.4"], + "**": ['无AI的其他', ''], + "*": ['其他', ''], + "来电是提醒您": ['知识库', "询问案源机构"], + "消费贷款目前已经": ['知识库', "询问欠款金额"], + "一笔应还款未处理": ['知识库', "不认可金额"], + "稍后重新处理": ['知识库', "还款失败"], + "减免利息": ['知识库', "减免利息"], + "还款日期应为": ['知识库', "询问逾期时间"], + "人工接待": ['知识库', "转人工"], + "办理分期": ['知识库', "如何办理账单分期"], + "定时划扣": ['知识库', "如何自动扣款"], + "还错": ['知识库', "还错卡"], + "目的是": ['知识库', "询问工号"], + "诶": ['知识库', "打断"], + "无法在线核实": ['知识库', "询问绑定卡号"], + "疑问": ['知识库', "质疑身份"], + "逾期时间太长": ['知识库', "过几天还"], + "我们的客服热线": ['知识库', "客服热线"], + "智能语音助手": ['知识库', "询问是否为机器人"], + "首先很抱歉": ['敏感词', "投诉/涉媒/公检法"], + "延时": ['敏感词', "已还款"], + "先忙": ['敏感词', "在忙"], + "好的, 稍后": ['敏感词', "持卡人吸毒"], + "节哀": ['敏感词', "持卡人死亡"], + "不好意思,稍后": ['敏感词', "持卡人重病/持卡人坐牢"], + "(直接挂机)": ["知识库", "挂机"], + "您如果想把逾期影响降到最低": ["没钱还3.2","没钱还"], + "不良影响就越大": ["不配合还款2.1","不配合还款"], + "便捷还款方式":["还款不便3.2","还款不便"], + } + +out_dct = {"本人吗": ['1.1', '身份确认'], + "您是不是": ['1.1', '身份确认'], + "原因": ['2.1', '本人'], + "联系方式": ['2.2', '非本人'], + "方便接听": ['2.2', '非本人'], + "今天赶紧想办法": ['3.1', "说出原因"], + "今天能": ['3.1', "说出原因"], + "麻烦他本人": ["3.2", "方便"], + "那让他本人": ["3.2", "方便"], + "请您让他": ["3.2", "方便"], + "朋友": ['3.3', "不方便"], + "今天务必": ['4.1', "同意今日还"], + "能想办法还": ['4.2', "今日还不了"], + "明天您能": ['4.2', "今日还不了"], + "办理了相关业务": ['4.3', "关联人"], + "请他务必尽快解决": ['4.4', "非关联人"], + "记录下来": ['5.1', "同意明日还"], + "很遗憾": ['5.2', "明日还不了"], + "您的配合": ['5.3', "同意转告"], + "尽量帮忙转告": ['5.4', "不同意转告"], + "**": ['无AI的其他', ''], + "*": ['其他', ''], + "来电是提醒您": ['知识库', "询问案源机构"], + "消费贷款目前已经": ['知识库', "询问欠款金额"], + "一笔应还款未处理": ['知识库', "不认可金额"], + "稍后重新处理": ['知识库', "还款失败"], + "减免利息": ['知识库', "减免利息"], + "还款日期应为": ['知识库', "询问逾期时间"], + "人工接待": ['知识库', "转人工"], + "办理分期": ['知识库', "如何办理账单分期"], + "定时划扣": ['知识库', "如何自动扣款"], + "还错": ['知识库', "还错卡"], + "目的是": ['知识库', "询问工号"], + "诶": ['知识库', "打断"], + "无法在线核实": ['知识库', "询问绑定卡号"], + "疑问": ['知识库', "质疑身份"], + "逾期时间太长": ['知识库', "过几天还"], + "我们的客服热线": ['知识库', "客服热线"], + "智能语音助手": ['知识库', "询问是否为机器人"], + "首先很抱歉": ['敏感词', "投诉/涉媒/公检法"], + "延时": ['敏感词', "已还款"], + "先忙": ['敏感词', "在忙"], + "好的, 稍后": ['敏感词', "持卡人吸毒"], + "节哀": ['敏感词', "持卡人死亡"], + "不好意思,稍后": ['敏感词', "持卡人重病/持卡人坐牢"], + "(直接挂机)": ["知识库", "挂机"], + "您如果想把逾期影响降到最低": ["没钱还3.2", "没钱还"], + "不良影响就越大": ["不配合还款2.1", "不配合还款"], + "便捷还款方式": ["还款不便3.2", "还款不便"], + } \ No newline at end of file diff --git a/Preprocessor/InfoExtraction/m2.py b/Preprocessor/InfoExtraction/m2.py new file mode 100644 index 0000000..5df807a --- /dev/null +++ b/Preprocessor/InfoExtraction/m2.py @@ -0,0 +1,201 @@ +import pandas as pd +import numpy as np +from copy import deepcopy +from m2_dict import ai_map,in_dct,out_dct +import jieba +import os + + +def del_duplicate(info, info_name): + """ + # 删除重复的文本 在同一个数据集下 + """ + # ['processid', 'in_node', 'type_robot', 'msg', 'msg_del_dup', 'out_true', 'type', 'type_combine'] + set_only = set() + new_info = [] + for item in info: + msg = item[3] + if msg not in set_only: + set_only.add(msg) + new_info.append(item) + print("\n对 {} 去重, 被删除的重复的语料的数量(以msg为准): {}. 
Final number: {}" + .format(info_name, len(info) - len(new_info), len(new_info))) + return np.array(new_info) + + +def build_map_by_what(info, what_col): + # 按照意图节点切分: what_col = 2; 按照type切分: what_col = 6 + # print(info) + info = np.array(info) + nodes = np.unique(np.array(info[:, what_col])) + map_temp = dict(zip(nodes, [[] for _ in range(len(nodes))])) + for item in info: + map_temp[item[what_col]].append(item) + return map_temp + + +def read_data(file_dir): + cols = ["通话状态", "通话记录id","通话记录"] + all_data = [] + for root, dirs, files in os.walk(file_dir): + for curr_file in files: + print("read file: {}".format(os.path.join(root, curr_file))) + temp = pd.read_csv(os.path.join(root, curr_file), usecols=cols).values + print("---------------------------------") + print(temp) + all_data += list(temp) + all_data = [x for x in all_data if x[1] == '已接听' and isinstance(x[2], str) and len(x[2]) != 0 ] + return all_data + + +def split_ai_me(data,ai_map,wu_dic,out_dic): + in_node = [] + type_robot = [] + me = [] + out_node = [] + type = [] + AI = [] + S_id = [] #session_id + for line in data: + # ["通话记录ID","通话状态", "通话记录详情"] + texts = line[2].split('\n') + s_id = str(line[0]).strip('') + + # 初始化index count + index= 0 + pre_ai, rear_ai = '', '' #in_node 和 out_node + pre_ai_key,rear_ai_key='*','**' + # 遍历文本:AI 和 ME + while index < len(texts): + temp = texts[index] + if not texts[index].startswith('ME'): #当前的ai问题是什么 in_node + if texts[index].startswith('AI'): + tt = texts[index] + for key_word in ai_map.keys(): + if key_word in texts[index]: + pre_ai = ai_map[key_word] + pre_ai_key=key_word + break + pre_ai_key='*' #有AI说话无关键词 + index += 1 + continue + while texts[index].startswith('ME'): + kk = texts[index] + # 当前标签 + index_ai = index + while index_ai < len(texts) and (not texts[index_ai].startswith('AI')): + index_ai += 1 + if index_ai < len(texts) and texts[index_ai].startswith('AI'):# me 回答后的ai问题是什么 out_node + for key_word in ai_map.keys(): + if key_word in texts[index_ai]: + rear_ai = ai_map[key_word] + rear_ai_key = key_word + break + rear_ai_key = '*' #有AI说话但无关键字 + in_node.append(wu_dic[pre_ai_key][0]) + type_robot.append(wu_dic[pre_ai_key][1]) + me.append(texts[index][3:]) + out_node.append(out_dic[rear_ai_key][0]) + type.append(out_dic[rear_ai_key][1]) + + AI.append(tt) + S_id.append(s_id) + rear_ai_key ='**' #重置 + # index加一:下一条AI或ME + index += 1 + + # 把 p_id in_node type_robot me out_node type 放在一起 + ans, temp = [], [] + for index in range(len(in_node)): + if type_robot[index] != "": + temp.append('hayinm2') + temp.append(in_node[index]) + temp.append(type_robot[index]) + temp.append(me[index]) + temp.append(out_node[index]) + temp.append(type[index]) + temp.append(AI[index]) + temp.append(S_id[index]) + ans.append(deepcopy(temp)) + temp.clear() + return ans + + +def del_dul_word(dul_word): + new_string = [] + pre_ch = None + for ch in dul_word: + if ch != pre_ch: + new_string.append(ch) + pre_ch = ch + return new_string + + +def insert_cols(data): + jieba.load_userdict('./lexicon_external.txt') + ans = [] + for item in data: + msg = item[3].strip() + # 仅保留中文 + msg = ''.join([ch for ch in msg if ('\u4e00' <= ch <= '\u9fa5')]) + if len(msg) == 0: + continue + item[3] = msg + # 去叠词:msg_del_dul + msg_del_dul = del_dul_word([word for word in jieba.cut(msg)]) + item.append(''.join(msg_del_dul)) + # item += [''] + item.append('') + ans.append(item) + print('过滤 msg左右空格+符号+英文+数字 仅留中文汉字: {} -> {}'.format(len(data), len(ans))) + return ans + + +if __name__ == '__main__': + input_dir = 
'./input_m2_4-10_1000' + high_data = read_data(input_dir) + + + # 处理 "通话记录详情" + high_data = split_ai_me(high_data,ai_map,in_dct,out_dct) + + + #插入新列:去叠词 + high_data = insert_cols(high_data) + + + # 去重复 + high_data_by_node = build_map_by_what(high_data, 2) + del high_data + high_data = [] + for key_node, val_node in high_data_by_node.items(): + high_data += list(del_duplicate(val_node, key_node)) + print("当前数量量:{}".format(len(high_data))) + # del high_data_by_node + # 保存 + str = 'hayinm2duolun' # 直接更改文件名标识 + sava_path = './output_m2_1000/'+str+'_result.csv' + original_col = ["processid", "in_node", "type_robot", "msg", "out_true", "type", "AI_Q","session_id", "msg_del_dup", "type_combine"] + new_col = ["session_id","processid", "AI_Q", "in_node", "type_robot", "msg", "msg_del_dup", "out_true", "type", "type_combine"] + #所有数据保存 + high_data = pd.DataFrame(high_data, columns=original_col) + high_data = high_data.reindex(columns=new_col) + high_data.to_csv(sava_path, encoding='utf-8-sig', index_label='id') + + # # 划分:需要标注 and 不需要标注/入结点知识库+表示结束 + tag ,un_tag = [], [] + for item in high_data.values: + if item[3] in ['无AI其他','知识库','敏感词','其他','3.1','3.3','4.4','5.1/5.3','5.2/5.4','5.5','5.6'] : + un_tag.append(item) + else: + tag.append(item) + + # 需要标注 + tag = sorted(tag, key=lambda k: (k[4])) + tag=pd.DataFrame(tag,columns=new_col) + tag.to_csv('./output_m2_1000/'+str+'_tag.csv',encoding='utf-8-sig', index_label='id') + + # 不需要标注 + un_tag = sorted(un_tag, key=lambda k: (k[4])) + un_tag = pd.DataFrame(un_tag, columns=new_col) + un_tag.to_csv('./output_m2_1000/'+str+'_untag.csv', encoding='utf-8-sig', index_label='id') \ No newline at end of file diff --git a/Preprocessor/InfoExtraction/m2_dict.py b/Preprocessor/InfoExtraction/m2_dict.py new file mode 100644 index 0000000..e886d4a --- /dev/null +++ b/Preprocessor/InfoExtraction/m2_dict.py @@ -0,0 +1,184 @@ +ai_map = {"本人吗": "身份确认", + "您是不是": "身份确认", + "今天处理一下": "询问今日能否还款", + "您今天能否处理一下": "询问今日能否还款", + "联系方式": "询问电话号码是否是本人在用", + "这个号码": "询问电话号码是否是本人在用", + "务必今天尽快还款": "结束3.1", + "今天处理不了呢": "询问是没时间还是没钱", + "没时间还": "询问是没时间还是没钱", + "我们再联系他": "稍后联系麻烦转告接听", + "朋友": "是否为亲属或者朋友", + "明天能想办法处理吗": "询问明日能否还入最低还款额", + "明天先处理一下": "询问明日能否还入最低还款额", + "明天能把钱还": "询问明日能否还入最低还款额", + "明天务必": "询问明日能否还款", + "明天能不能": "询问明日能否还款", + "注意接听电话": "请求转告", + "再次联系他": "结束4.4", + "那稍后我会跟进您的还": "结束5.1/5.3", + "很遗憾未能与您达成": "结束5.2/5.4", + "非常感谢您的配合": "结束5.5", + "尽量帮忙转告": "结束5.6", + "**": ['无AI的其他', ''], + "*": ['其他', ''], + "来电是提醒您": "询问案源机构", + "消费贷款目前已经": "询问欠款金额", + "一笔应还款未处理,": "不认可金额", + "稍后重新处理": "还款失败", + "减免利息": "减免利息", + "还款日期应为": "询问逾期时间", + "人工接待": "转人工", + "办理分期": "如何办理账单分期", + "定时划扣": "如何自动扣款", + "还错": "还错卡", + "目的是": "询问工号", + "诶": "打断", + "无法在线核实": "询问绑定卡号", + "疑问": "质疑身份", + "逾期时间太长": "过几天还", + "我们的客服热线": "客服热线", + "智能语音助手": "询问是否为机器人", + "首先很抱歉": "投诉/涉媒/公检法", + "延时": "已还款", + "先忙": "在忙", + "好的, 稍后": "持卡人吸毒", + "节哀": "持卡人死亡", + "不好意思,稍后": "持卡人重病/持卡人坐牢", + + "(直接挂机)": "挂机", + + "您如果想把逾期影响降到最低": "没钱还", + + + "不良影响就越大": "不配合还款", + + + "明天赶紧想办法": "还款不便询问明日能否还款", + "明天能不能想办法": "还款不便询问明日能否还款", + "明天能想办法": "还款不便询问明日能否还款", + } + +in_dct = {"本人吗": ['1.1', "身份确认"], + "您是不是": ['1.1', "身份确认"], + "今天处理一下": ['2.1', "询问今日能否还款"], + "您今天能否处理一下": ['2.1', "询问今日能否还款"], + "联系方式": ['2.2', "询问电话号码是否是本人在用"], + "这个号码": ['2.2', "询问电话号码是否是本人在用"], + "务必今天尽快还款": ['3.1', "结束3.1"], + "今天处理不了呢": ['3.2', "询问是没时间还是没钱"], + "没时间还": ['3.2', "询问是没时间还是没钱"], + "我们再联系他": ['3.3', "稍后联系麻烦转告接听"], + "朋友": ['3.4', "是否为亲属或者朋友"], + "明天能想办法处理吗": ['4.1', "询问明日能否还入最低还款额"], + "明天先处理一下": ['4.1', "询问明日能否还入最低还款额"], + 
"明天能把钱还": ['4.1', "询问明日能否还入最低还款额"], + "明天务必": ['4.2', "询问明日能否还款"], + "明天能不能": ['4.2', "询问明日能否还款"], + "注意接听电话": ['4.3', "请求转告"], + "再次联系他": ['4.4', "结束4.4"], + "那稍后我会跟进您的还": ['5.1/5.3', "结束5.1/5.3"], + "很遗憾未能与您达成": ['5.2/5.4', "结束5.2/5.4"], + "非常感谢您的配合": ['5.5', "结束5.5"], + "尽量帮忙转告": ['5.6', "结束5.6"], + "**": ['无AI的其他', ''], + "*": ['其他', ''], + "来电是提醒您": ['知识库', "询问案源机构"], + "消费贷款目前已经": ['知识库', "询问欠款金额"], + "一笔应还款未处理": ['知识库', "不认可金额"], + "稍后重新处理": ['知识库', "还款失败"], + "减免利息": ['知识库', "减免利息"], + "还款日期应为": ['知识库', "询问逾期时间"], + "人工接待": ['知识库', "转人工"], + "办理分期": ['知识库', "如何办理账单分期"], + "定时划扣": ['知识库', "如何自动扣款"], + "还错": ['知识库', "还错卡"], + "目的是": ['知识库', "询问工号"], + "诶": ['知识库', "打断"], + "无法在线核实": ['知识库', "询问绑定卡号"], + "疑问": ['知识库', "质疑身份"], + "逾期时间太长": ['知识库', "过几天还"], + "我们的客服热线": ['知识库', "客服热线"], + "智能语音助手": ['知识库', "询问是否为机器人"], + "首先很抱歉": ['敏感词', "投诉/涉媒/公检法"], + "延时": ['敏感词', "已还款"], + "先忙": ['敏感词', "在忙"], + "好的, 稍后": ['敏感词', "持卡人吸毒"], + "节哀": ['敏感词', "持卡人死亡"], + "不好意思,稍后": ['敏感词', "持卡人重病/持卡人坐牢"], + + "(直接挂机)": ["知识库", "挂机"], + + "您如果想把逾期影响降到最低": ["没钱还3.2","没钱还"], + + + "不良影响就越大": ["不配合还款2.1","不配合还款"], + + + "明天赶紧想办法":["还款不便3.2","还款不便"], + "明天能不能想办法": ["还款不便3.2", "还款不便"], + "明天能想办法": ["还款不便3.2", "还款不便"] + + + } + +out_dct = {"本人吗": ['1.1', "身份确认"], + "您是不是": ['1.1', "身份确认"], + "今天处理一下": ['2.1', "本人"], + "您今天能否处理一下": ['2.1', "本人"], + "联系方式": ['2.2', "非本人"], + "这个号码": ['2.2', "非本人"], + "务必今天尽快还款": ['3.1', "同意今日还"], + "今天处理不了呢": ['3.2', "今日还不了"], + "没时间还": ['3.2', "今日还不了"], + "我们再联系他": ['3.3', "号码为本人在用"], + "朋友": ['3.4', "号码不是本人在用"], + "明天能想办法处理吗": ['4.1', "没钱"], + "明天先处理一下": ['4.1', "没钱"], + "明天能把钱还": ['4.1', "没钱"], + "明天务必": ['4.2', "没时间"], + "明天能不能": ['4.2', "没时间"], + "注意接听电话": ['4.3', "关联人"], + "再次联系他": ['4.4', "非关联人"], + "那稍后我会跟进您的还": ['5.1/5.3', "同意明日还"], + "很遗憾未能与您达成": ['5.2/5.4', "明日还不了"], + "非常感谢您的配合": ['5.5', "同意转告"], + "尽量帮忙转告": ['5.6', "不同意转告"], + "**": ['无AI的其他', ''], + "*": ['其他', ''], + "来电是提醒您": ['知识库', "询问案源机构"], + "消费贷款目前已经": ['知识库', "询问欠款金额"], + "一笔应还款未处理": ['知识库', "不认可金额"], + "稍后重新处理": ['知识库', "还款失败"], + "减免利息": ['知识库', "减免利息"], + "还款日期应为": ['知识库', "询问逾期时间"], + "人工接待": ['知识库', "转人工"], + "办理分期": ['知识库', "如何办理账单分期"], + "定时划扣": ['知识库', "如何自动扣款"], + "还错": ['知识库', "还错卡"], + "目的是": ['知识库', "询问工号"], + "诶": ['知识库', "打断"], + "无法在线核实": ['知识库', "询问绑定卡号"], + "疑问": ['知识库', "质疑身份"], + "逾期时间太长": ['知识库', "过几天还"], + "我们的客服热线": ['知识库', "客服热线"], + "智能语音助手": ['知识库', "询问是否为机器人"], + "首先很抱歉": ['敏感词', "投诉/涉媒/公检法"], + "延时": ['敏感词', "已还款"], + "先忙": ['敏感词', "在忙"], + "好的, 稍后": ['敏感词', "持卡人吸毒"], + "节哀": ['敏感词', "持卡人死亡"], + "不好意思,稍后": ['敏感词', "持卡人重病/持卡人坐牢"], + + "(直接挂机)": ["知识库", "挂机"], + + "您如果想把逾期影响降到最低": ["没钱还3.2", "没钱还"], + + "不良影响就越大": ["不配合还款2.1", "不配合还款"], + + "明天赶紧想办法": ["还款不便3.2", "还款不便"], + "明天能不能想办法": ["还款不便3.2", "还款不便"], + "明天能想办法": ["还款不便3.2", "还款不便"] + } + + diff --git a/Preprocessor/InfoExtraction/nodel_m1.py b/Preprocessor/InfoExtraction/nodel_m1.py new file mode 100644 index 0000000..7283080 --- /dev/null +++ b/Preprocessor/InfoExtraction/nodel_m1.py @@ -0,0 +1,163 @@ +import pandas as pd +import numpy as np +from copy import deepcopy +from m1_dict import ai_map,in_dct,out_dct +import jieba +import os + + + + +def read_data(file_dir): + cols = ["通话状态", "通话记录id","通话记录"] + all_data = [] + for root, dirs, files in os.walk(file_dir): + for curr_file in files: + print("read file: {}".format(os.path.join(root, curr_file))) + temp = pd.read_csv(os.path.join(root, curr_file), usecols=cols).values + all_data += list(temp) + all_data = [x for x in all_data if x[0] == '已接听' and isinstance(x[2], str) 
and len(x[2]) != 0 ] + return all_data + + +def split_ai_me(data,ai_map,wu_dic,out_dic): + in_node = [] + type_robot = [] + me = [] + out_node = [] + type = [] + AI = [] + S_id = [] #session_id + for line in data: + # ["通话记录ID","通话状态", "通话记录详情"] + texts = line[2].split('\n') + s_id = str(line[1]).strip('') + + # 初始化index count + index= 0 + pre_ai, rear_ai = '', '' #in_node 和 out_node + pre_ai_key,rear_ai_key='*','**' + # 遍历文本:AI 和 ME + while index < len(texts): + temp = texts[index] + if not texts[index].startswith('ME'): #当前的ai问题是什么 in_node + if texts[index].startswith('AI'): + tt = texts[index] + for key_word in ai_map.keys(): + if key_word in texts[index]: + pre_ai = ai_map[key_word] + pre_ai_key=key_word + break + pre_ai_key='*' #有AI说话无关键词 + index += 1 + continue + while texts[index].startswith('ME'): + kk = texts[index] + # 当前标签 + index_ai = index + while index_ai < len(texts) and (not texts[index_ai].startswith('AI')): + index_ai += 1 + if index_ai < len(texts) and texts[index_ai].startswith('AI'):# me 回答后的ai问题是什么 out_node + for key_word in ai_map.keys(): + if key_word in texts[index_ai]: + rear_ai = ai_map[key_word] + rear_ai_key = key_word + break + rear_ai_key = '*' #有AI说话但无关键字 + in_node.append(wu_dic[pre_ai_key][0]) + type_robot.append(wu_dic[pre_ai_key][1]) + me.append(texts[index][3:]) + out_node.append(out_dic[rear_ai_key][0]) + type.append(out_dic[rear_ai_key][1]) + + AI.append(tt) + S_id.append(s_id) + rear_ai_key ='**' #重置 + # index加一:下一条AI或ME + index += 1 + + # 把 p_id in_node type_robot me out_node type 放在一起 + ans, temp = [], [] + for index in range(len(in_node)): + temp.append('hayinm1') + temp.append(in_node[index]) + temp.append(type_robot[index]) + temp.append(me[index]) + temp.append(out_node[index]) + temp.append(type[index]) + temp.append(AI[index]) + temp.append(S_id[index]) + ans.append(deepcopy(temp)) + temp.clear() + return ans + + +def del_dul_word(dul_word): + new_string = [] + pre_ch = None + for ch in dul_word: + if ch != pre_ch: + new_string.append(ch) + pre_ch = ch + return new_string + + +def insert_cols(data): + jieba.load_userdict('./lexicon_external.txt') + ans = [] + for item in data: + msg = item[3].strip() + # 仅保留中文 + msg = ''.join([ch for ch in msg if ('\u4e00' <= ch <= '\u9fa5')]) + if len(msg) == 0: + continue + item[3] = msg + # 去叠词:msg_del_dul + msg_del_dul = del_dul_word([word for word in jieba.cut(msg)]) + item.append(''.join(msg_del_dul)) + # item += [''] + item.append('') + ans.append(item) + print('过滤 msg左右空格+符号+英文+数字 仅留中文汉字: {} -> {}'.format(len(data), len(ans))) + return ans + + +if __name__ == '__main__': + input_dir = './imput_m1_11+_1000' + high_data = read_data(input_dir) + + # 处理 "通话记录详情" + high_data = split_ai_me(high_data,ai_map,in_dct,out_dct) + + + #插入新列:去叠词 + high_data = insert_cols(high_data) + + + + str = 'hayinm1duolun_nodel_new1' # 直接更改文件名标识 + sava_path = './output_m1_1000/'+str+'_result.csv' + original_col = ["processid", "in_node", "type_robot", "msg", "out_true", "type", "AI_Q","session_id", "msg_del_dup", "type_combine"] + new_col = ["session_id","processid", "AI_Q", "in_node", "type_robot", "msg", "msg_del_dup", "out_true", "type", "type_combine"] + #所有数据保存 + high_data = pd.DataFrame(high_data, columns=original_col) + high_data = high_data.reindex(columns=new_col) + high_data.to_csv(sava_path, encoding='utf-8-sig', index_label='id') + + # # 划分:需要标注 and 不需要标注/入结点知识库+表示结束 + tag ,un_tag = [], [] + for item in high_data.values: + if item[3] in ['无AI其他','知识库','敏感词','其他','4.1','4.4','5.1','5.2','5.3','5.4','2.1','3.2']: + 
un_tag.append(item) + else: + tag.append(item) + + # 需要标注 + tag = sorted(tag, key=lambda k: (k[4])) + tag=pd.DataFrame(tag,columns=new_col) + tag.to_csv('./output_m1_1000/'+str+'_tag.csv',encoding='utf-8-sig', index_label='id') + + # 不需要标注 + un_tag = sorted(un_tag, key=lambda k: (k[4])) + un_tag = pd.DataFrame(un_tag, columns=new_col) + un_tag.to_csv('./output_m1_1000/'+str+'_untag.csv', encoding='utf-8-sig', index_label='id') \ No newline at end of file diff --git a/Preprocessor/InfoExtraction/nodel_m2.py b/Preprocessor/InfoExtraction/nodel_m2.py new file mode 100644 index 0000000..72bb35c --- /dev/null +++ b/Preprocessor/InfoExtraction/nodel_m2.py @@ -0,0 +1,163 @@ +import pandas as pd +import numpy as np +from copy import deepcopy +from m2_dict import ai_map,in_dct,out_dct +import jieba +import os + + + + +def read_data(file_dir): + cols = ["通话状态", "通话记录id","通话记录"] + all_data = [] + for root, dirs, files in os.walk(file_dir): + for curr_file in files: + print("read file: {}".format(os.path.join(root, curr_file))) + temp = pd.read_csv(os.path.join(root, curr_file), usecols=cols).values + all_data += list(temp) + all_data = [x for x in all_data if x[0] == '已接听' and isinstance(x[2], str) and len(x[2]) != 0 ] + return all_data + + +def split_ai_me(data,ai_map,wu_dic,out_dic): + in_node = [] + type_robot = [] + me = [] + out_node = [] + type = [] + AI = [] + S_id = [] #session_id + for line in data: + # ["通话记录ID","通话状态", "通话记录详情"] + texts = line[2].split('\n') + s_id = str(line[1]).strip('') + + # 初始化index count + index= 0 + pre_ai, rear_ai = '', '' #in_node 和 out_node + pre_ai_key,rear_ai_key='*','**' + # 遍历文本:AI 和 ME + while index < len(texts): + temp = texts[index] + if not texts[index].startswith('ME'): #当前的ai问题是什么 in_node + if texts[index].startswith('AI'): + tt = texts[index] + for key_word in ai_map.keys(): + if key_word in texts[index]: + pre_ai = ai_map[key_word] + pre_ai_key=key_word + break + pre_ai_key='*' #有AI说话无关键词 + index += 1 + continue + while texts[index].startswith('ME'): + kk = texts[index] + # 当前标签 + index_ai = index + while index_ai < len(texts) and (not texts[index_ai].startswith('AI')): + index_ai += 1 + if index_ai < len(texts) and texts[index_ai].startswith('AI'):# me 回答后的ai问题是什么 out_node + for key_word in ai_map.keys(): + if key_word in texts[index_ai]: + rear_ai = ai_map[key_word] + rear_ai_key = key_word + break + rear_ai_key = '*' #有AI说话但无关键字 + in_node.append(wu_dic[pre_ai_key][0]) + type_robot.append(wu_dic[pre_ai_key][1]) + me.append(texts[index][3:]) + out_node.append(out_dic[rear_ai_key][0]) + type.append(out_dic[rear_ai_key][1]) + + AI.append(tt) + S_id.append(s_id) + rear_ai_key ='**' #重置 + # index加一:下一条AI或ME + index += 1 + + # 把 p_id in_node type_robot me out_node type 放在一起 + ans, temp = [], [] + for index in range(len(in_node)): + temp.append('hayinm2') + temp.append(in_node[index]) + temp.append(type_robot[index]) + temp.append(me[index]) + temp.append(out_node[index]) + temp.append(type[index]) + temp.append(AI[index]) + temp.append(S_id[index]) + ans.append(deepcopy(temp)) + temp.clear() + return ans + + +def del_dul_word(dul_word): + new_string = [] + pre_ch = None + for ch in dul_word: + if ch != pre_ch: + new_string.append(ch) + pre_ch = ch + return new_string + + +def insert_cols(data): + jieba.load_userdict('./lexicon_external.txt') + ans = [] + for item in data: + msg = item[3].strip() + # 仅保留中文 + msg = ''.join([ch for ch in msg if ('\u4e00' <= ch <= '\u9fa5')]) + if len(msg) == 0: + continue + item[3] = msg + # 去叠词:msg_del_dul + msg_del_dul = 
del_dul_word([word for word in jieba.cut(msg)]) + item.append(''.join(msg_del_dul)) + # item += [''] + item.append('') + ans.append(item) + print('过滤 msg左右空格+符号+英文+数字 仅留中文汉字: {} -> {}'.format(len(data), len(ans))) + return ans + + +if __name__ == '__main__': + input_dir = './input_m2_4-10_1000' + high_data = read_data(input_dir) + # 处理 "通话记录详情" + high_data = split_ai_me(high_data,ai_map,in_dct,out_dct) + + + #插入新列:去叠词 + high_data = insert_cols(high_data) + + + + # 保存 + str = 'hayinm2duolun_nodel_new' # 直接更改文件名标识 + sava_path = './output_m2_1000/'+str+'_result.xlsx' + original_col = ["processid", "in_node", "type_robot", "msg", "out_true", "type", "AI_Q","session_id", "msg_del_dup", "type_combine"] + new_col = ["session_id","processid", "AI_Q", "in_node", "type_robot", "msg", "msg_del_dup", "out_true", "type", "type_combine"] + #所有数据保存 + high_data = pd.DataFrame(high_data, columns=original_col) + high_data = high_data.reindex(columns=new_col) + high_data.to_csv(sava_path, encoding='utf-8-sig', index_label='id') + + # # 划分:需要标注 and 不需要标注/入结点知识库+表示结束 + tag ,un_tag = [], [] + for item in high_data.values: + if item[3] in ['无AI其他','知识库','敏感词','其他','3.1','3.3','4.4','5.1/5.3','5.2/5.4','5.5','5.6'] : + un_tag.append(item) + else: + tag.append(item) + + # 需要标注 + tag = sorted(tag, key=lambda k: (k[4])) + tag=pd.DataFrame(tag,columns=new_col) + tag.to_csv('./output_m2_1000/'+str+'_tag.xlsx',encoding='utf-8-sig', index_label='id') + + # 不需要标注 + un_tag = sorted(un_tag, key=lambda k: (k[4])) + un_tag = pd.DataFrame(un_tag, columns=new_col) + un_tag.to_csv('./output_m2_1000/'+str+'_untag.xlsx', encoding='utf-8-sig', index_label='id') \ No newline at end of file diff --git "a/Preprocessor/InfoExtraction/\345\223\210\351\223\266\346\225\260\346\215\256\345\244\204\347\220\206\350\257\264\346\230\216.md" "b/Preprocessor/InfoExtraction/\345\223\210\351\223\266\346\225\260\346\215\256\345\244\204\347\220\206\350\257\264\346\230\216.md" new file mode 100644 index 0000000..6c4c8b7 --- /dev/null +++ "b/Preprocessor/InfoExtraction/\345\223\210\351\223\266\346\225\260\346\215\256\345\244\204\347\220\206\350\257\264\346\230\216.md" @@ -0,0 +1,14 @@ +# ReadeMe + +input*文件夹是原始通话记录数据 input*_1000是 +output*是数据解析后的文件存放位置。 +caiyang.py 是对原始数据随机采样1000条的结果 +m1.py 对m1数据进行解析然后去重之后的结果 +m2.py 对m1数据进行解析然后去重之后的结果 + +nodel_m1.py 是对m1数据进行解析按照入节点去重等待后续标注 +nodel_m2.py 是对m2数据进行解析按照入节点去重等待后续标注 + +*dict*.py 存放特定流程关键字的字典,用于解析数据时确定出入节点。 + +备注:在处理数据时,只需要更换相应流程的字典即可。特殊情况时可以考虑修改代码。 \ No newline at end of file
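As a closing illustration, the sketch below condenses the keyword-matching logic that split_ai_me() applies in m1.py, m2.py, nodel_m1.py and nodel_m2.py: each AI turn is matched against the keyword dictionaries to find the current question node, and every ME turn is labelled with the node of the preceding AI turn (in node) and the following AI turn (out node). This is a minimal, self-contained sketch only; label_turns(), the trimmed dictionaries and the demo transcript are illustrative stand-ins, and the real scripts additionally strip non-Chinese characters, collapse repeated words, deduplicate by message and split the output into tagged/untagged CSV files.

# Minimal sketch (not part of the repository) of the in/out node extraction.
# ai_map/in_dct/out_dct here are tiny stand-ins for m1_dict.py / m2_dict.py.
ai_map = {"本人吗": "身份确认", "原因": "询问未还款原因"}
in_dct = {"本人吗": ["1.1", "身份确认"], "原因": ["2.1", "询问未还款原因"], "*": ["其他", ""]}
out_dct = {"本人吗": ["1.1", "身份确认"], "原因": ["2.1", "本人"],
           "*": ["其他", ""], "**": ["无AI的其他", ""]}

def label_turns(transcript):
    """Return (in_node, customer_text, out_node) rows from an AI/ME transcript."""
    lines = transcript.split("\n")
    rows, pre_key = [], "*"
    for i, line in enumerate(lines):
        if line.startswith("AI"):
            # remember which question the robot just asked (the in node)
            pre_key = next((k for k in ai_map if k in line), "*")
        elif line.startswith("ME"):
            # the next AI turn (if any) determines the out node
            rear_key = "**"
            for later in lines[i + 1:]:
                if later.startswith("AI"):
                    rear_key = next((k for k in ai_map if k in later), "*")
                    break
            rows.append((in_dct[pre_key][0], line[3:].strip(), out_dct[rear_key][0]))
    return rows

demo = "AI: 请问是张先生本人吗\nME: 是的\nAI: 请问您未还款的原因是什么\nME: 最近没钱"
for row in label_turns(demo):
    print(row)  # ('1.1', '是的', '2.1') then ('2.1', '最近没钱', '无AI的其他')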