# shuffle the data
index = list(range(total))
np.random.shuffle(index)
# all_texts and all_labels hold the shuffled data
all_texts = []
all_labels = []
for i in index:
    all_texts.append(texts[i])
    all_labels.append(labels[i])
# build a dict whose keys are labels and whose values are lists of the indices
# belonging to that class
label2id = {}
for i in range(total):
    label = str(all_labels[i])
    if label not in label2id:
        label2id[label] = [i]
    else:
        label2id[label].append(i)
# all_index is a list of fold_num (here 10) lists, one per fold; each stores the
# indices assigned to that fold
all_index = [[] for _ in range(fold_num)]
for label, data in label2id.items():
    # print(label, len(data))
    batch_size = int(len(data) / fold_num)
    # other is the remainder; it is always smaller than fold_num
    other = len(data) - batch_size * fold_num
    # distribute this class's indices across the folds
    for i in range(fold_num):
        # the first `other` folds each take one extra example
        cur_batch_size = batch_size + 1 if i < other else batch_size
        # print(cur_batch_size)
        # batch_data holds the indices assigned to fold i for this class
        batch_data = [data[i * batch_size + b] for b in range(cur_batch_size)]
        all_index[i].extend(batch_data)
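# Example with illustrative numbers only: if a class has 23 indices and fold_num
# is 10, then batch_size = 2 and other = 3, so folds 0-2 each receive
# batch_size + 1 = 3 indices and folds 3-9 each receive 2.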
# The folds built above can have slightly different sizes, so here we rebalance
# them: each fold keeps exactly batch_size examples, the surplus is pooled in
# other_texts / other_labels, and folds that are short are topped up from that pool
batch_size = int(total / fold_num)
other_texts = []
other_labels = []
other_num = 0
start = 0
for fold in range(fold_num):
    num = len(all_index[fold])
    texts = [all_texts[i] for i in all_index[fold]]
    labels = [all_labels[i] for i in all_index[fold]]
    if num > batch_size:
        # if the fold has more than batch_size examples, keep only batch_size of
        # them and move the surplus into the shared pool
        fold_texts = texts[:batch_size]
        other_texts.extend(texts[batch_size:])
        fold_labels = labels[:batch_size]
        other_labels.extend(labels[batch_size:])
        other_num += num - batch_size
    elif num < batch_size:
        # if the fold has fewer than batch_size examples, top it up from the pool
        end = start + batch_size - num
        fold_texts = texts + other_texts[start: end]
        fold_labels = labels + other_labels[start: end]
        start = end
    else:
        fold_texts = texts
        fold_labels = labels
    assert batch_size == len(fold_labels)
    # shuffle within the fold
    index = list(range(batch_size))
    np.random.shuffle(index)

    shuffle_fold_texts = []
    shuffle_fold_labels = []
    for i in index:
        shuffle_fold_texts.append(fold_texts[i])
        shuffle_fold_labels.append(fold_labels[i])
    data = {'label': shuffle_fold_labels, 'text': shuffle_fold_texts}
    fold_data.append(data)
logging.info("Fold lens %s", str([len(data['label']) for data in fold_data]))
return fold_data
# fold_data is a list with 10 elements; each element is a dict with the keys
# 'label' and 'text'
fold_data = all_data2fold(10)
for i in range(0, 10):
    data = fold_data[i]
    path = os.path.join(dir, "train_" + str(i))
    # open in write mode: the file is created if it does not exist,
    # and truncated then rewritten if it does
    my_open = open(path, 'w')
    for text in data['text']:
        my_open.write(text)
        my_open.write('\n')   # newline after the document
        my_open.write('\n')   # extra blank line as the separator between documents
    logging.info("complete train_" + str(i))
    my_open.close()
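The files written above are meant to follow the layout that the pretraining-data script expects later on: each document on its own line, with a blank line between documents. Below is a minimal sketch for spot-checking one of the generated files; the helper name, the max_docs parameter, and the hard-coded "train_0" path are mine, not part of the code above.

def preview_pretrain_file(path, max_docs=3):
    """Print a short summary of the first few blank-line-separated documents."""
    docs, current = [], []
    with open(path) as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:                 # blank line ends the current document
                if current:
                    docs.append(current)
                    current = []
                if len(docs) >= max_docs:
                    break
            else:
                current.append(line)
    for i, doc in enumerate(docs):
        print("document %d: %d line(s), starts with: %s" % (i, len(doc), doc[0][:50]))

# preview_pretrain_file(os.path.join(dir, "train_0"))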
from collections import Counter
import pandas as pd
import os.path as osp
import os
word_counter = Counter()   # counts how many times each word appears
data_file = osp.join('data', 'train_set.csv')
f = pd.read_csv(data_file, sep='\t', encoding='UTF-8')
data = f['text'].tolist()
for text in data:
    words = text.split()
    word_counter.update(words)
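With the counts in hand, a natural next step is to write a BERT-style vocab.txt: special tokens first, then the most frequent words. The sketch below is an assumption about that step; the output path, the vocab_size cutoff, and the min_count threshold are hypothetical parameters, not taken from the code above.

def build_vocab_file(counter, out_path='vocab.txt', vocab_size=5000, min_count=5):
    """Write a BERT-style vocabulary: special tokens first, then frequent words."""
    special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']
    words = [w for w, c in counter.most_common(vocab_size) if c >= min_count]
    with open(out_path, 'w', encoding='UTF-8') as f:
        for token in special_tokens + words:
            f.write(token + '\n')

# build_vocab_file(word_counter)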
# Input file format:
# (1) One sentence per line. These should ideally be actual sentences, not
#     entire paragraphs or arbitrary spans of text. (Because we use the
#     sentence boundaries for the "next sentence prediction" task).
# (2) Blank lines between documents. Document boundaries are needed so
#     that the "next sentence prediction" task doesn't span between documents.
for input_file in input_files:
    with tf.gfile.GFile(input_file, "r") as reader:
        while True:
            line = tokenization.convert_to_unicode(reader.readline())
            if not line:
                break
            line = line.strip()
            # Empty lines are used as document delimiters:
            # when we hit a blank line, start a new list for the next document
            if not line:
                all_documents.append([])
            tokens = tokenizer.tokenize(line)
            if tokens:
                all_documents[-1].append(tokens)
# Remove empty documents
all_documents = [x for x in all_documents if x]
rng.shuffle(all_documents)

# vocab_words is a list of vocabulary words
vocab_words = list(tokenizer.vocab.keys())
instances = []
# loop over the documents dupe_factor times to randomly generate the training set
for _ in range(dupe_factor):
    for document_index in range(len(all_documents)):
        instances.extend(
            create_instances_from_document_nsp(
                all_documents, document_index, max_seq_length, short_seq_prob,
                masked_lm_prob, max_predictions_per_seq, vocab_words, rng))
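To make the blank-line convention concrete, here is a toy run of the same parsing logic on an in-memory string. Both the sample text and the use of str.split as a stand-in for tokenizer.tokenize are illustrative only.

sample = """This is the first sentence of document one.
This is the second sentence of document one.

Document two has a single sentence.
"""

toy_documents = [[]]
for raw_line in sample.splitlines():
    line = raw_line.strip()
    if not line:                      # blank line: start a new document
        toy_documents.append([])
        continue
    tokens = line.split()             # stand-in for tokenizer.tokenize(line)
    if tokens:
        toy_documents[-1].append(tokens)
toy_documents = [d for d in toy_documents if d]

print(len(toy_documents))             # 2 documents
print(len(toy_documents[0]))          # 2 sentences in document one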
# tokens is a single sentence (a list of WordPiece tokens)
# returns the tokens after masking, the masked positions, and the original
# tokens (the labels)
def create_masked_lm_predictions(tokens, masked_lm_prob,
                                 max_predictions_per_seq, vocab_words, rng):
    """Creates the predictions for the masked LM objective."""
    cand_indexes = []
    for (i, token) in enumerate(tokens):
        if token == "[CLS]" or token == "[SEP]":
            continue
        # Whole Word Masking means that we mask all of the wordpieces
        # corresponding to an original word. When a word has been split into
        # WordPieces, the first token does not have any marker and any subsequent
        # tokens are prefixed with ##. So whenever we see the ## token, we
        # append it to the previous set of word indexes.
        #
        # Note that Whole Word Masking does *not* change the training code
        # at all -- we still predict each WordPiece independently, softmaxed
        # over the entire vocabulary.
        if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and
                token.startswith("##")):
            cand_indexes[-1].append(i)
        else:
            # here only this branch runs; what gets appended is a
            # single-element list
            cand_indexes.append([i])
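    # Illustration (assuming do_whole_word_mask is False): for
    # tokens = ["[CLS]", "w1", "w2", "w3", "[SEP]"], the loop above produces
    # cand_indexes = [[1], [2], [3]] -- one single-element list per maskable token.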
    masked_lms = []
    covered_indexes = set()
    for index_set in cand_indexes:
        # stop once the number of masked tokens reaches num_to_predict
        if len(masked_lms) >= num_to_predict:
            break
        # If adding a whole-word mask would exceed the maximum number of
        # predictions, then just skip this candidate.
        # (here each index_set is a single-element list)
        if len(masked_lms) + len(index_set) > num_to_predict:
            continue
        is_any_index_covered = False
        for index in index_set:
            if index in covered_indexes:
                is_any_index_covered = True
                break
        # skip the candidate if any of its indices is already covered
        if is_any_index_covered:
            continue
        for index in index_set:
            # covered_indexes is a set of already-masked positions
            covered_indexes.add(index)
            masked_token = None
            # 80% of the time, replace with [MASK]
            if rng.random() < 0.8:
                masked_token = "[MASK]"
            else:
                # the remaining 20% is split in half:
                # 10% of the time (20% * 0.5), keep the original token
                if rng.random() < 0.5:
                    masked_token = tokens[index]
                # 10% of the time (20% * 0.5), replace with a random token
                else:
                    masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
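The nested rng.random() calls implement the usual 80/10/10 masking rule: 80% [MASK], 10% keep the original token, 10% random token. A quick standalone simulation (with Python's random module standing in for rng) to sanity-check those proportions:

import random
from collections import Counter

rng = random.Random(12345)
outcomes = Counter()
for _ in range(100000):
    if rng.random() < 0.8:
        outcomes['[MASK]'] += 1
    elif rng.random() < 0.5:
        outcomes['keep original'] += 1
    else:
        outcomes['random token'] += 1

for name, count in outcomes.items():
    print(name, count / 100000)   # roughly 0.8 / 0.1 / 0.1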
total_written += 1
# log the contents of the first 20 examples
if inst_index < 20:
    tf.logging.info("*** Example ***")
    tf.logging.info("tokens: %s" % " ".join(
        [tokenization.printable_text(x) for x in instance.tokens]))
    for feature_name in features.keys():
        feature = features[feature_name]
        values = []
        if feature.int64_list.value:
            values = feature.int64_list.value
        elif feature.float_list.value:
            values = feature.float_list.value
        tf.logging.info(
            "%s: %s" % (feature_name, " ".join([str(x) for x in values])))
for writer in writers:
    writer.close()
tf.logging.info("Wrote %d total instances", total_written)