import tensorflow_datasets as tfds
import tensorflow as tf

print(tf.__version__)
Load the dataset
# Get the data
dataset, info = tfds.load('imdb_reviews/subwords8k', with_info=True, as_supervised=True)
train_dataset, test_dataset = dataset['train'], dataset['test']
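The subwords8k variant is convenient because `info` carries the dataset's pre-built subword tokenizer. A minimal sketch of how it can be used in the TFDS versions this notebook targets (the sample string is just an illustration):

# The subwords8k variant ships with a pre-trained SubwordTextEncoder
tokenizer = info.features['text'].encoder
print(tokenizer.vocab_size)           # roughly 8k subword tokens

sample = 'TensorFlow is fun'          # arbitrary example string
tokenized = tokenizer.encode(sample)  # list of subword ids
print(tokenized)
print(tokenizer.decode(tokenized))    # round-trips back to the original text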
import json
import tensorflow as tf
import csv
import random
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers
# Note that I cleaned the Stanford dataset to remove LATIN1 encoding to make it
# easier for the Python CSV reader. You can do that yourself with:
#   iconv -f LATIN1 -t UTF8 training.1600000.processed.noemoticon.csv -o training_cleaned.csv
# I then hosted it on my site to make it easier to use in this notebook.
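The next cell iterates over a `corpus` list and a `training_size` count built from this CSV. A minimal sketch of that loading step, assuming the standard Sentiment140 column layout (label in column 0, tweet text in column 5); the file name matches the iconv output above, and the `training_size` value is illustrative:

# Build `corpus` as a list of [text, label] pairs from the cleaned CSV
corpus = []
with open('training_cleaned.csv') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        text = row[5]
        label = 0 if row[0] == '0' else 1  # map '0' -> negative, '4' -> positive
        corpus.append([text, label])

training_size = 160000  # illustrative value, not from the original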
sentences = []
labels = []
random.shuffle(corpus)
for x in range(training_size):
    # Each element of the list is [text, label]
    sentences.append(corpus[x][0])
    labels.append(corpus[x][1])
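The GloVe cells below reference `word_index`, `vocab_size`, and `embedding_dim`, which come from a tokenization step like the following sketch. The hyperparameter values and the 90/10 split are assumptions; `embedding_dim` must match the 100-dimension GloVe file used below:

# Hyperparameters (illustrative values, not from the original)
embedding_dim = 100
max_length = 16
trunc_type = 'post'
padding_type = 'post'

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
vocab_size = len(word_index)

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=max_length,
                       padding=padding_type, truncating=trunc_type)

# Assumed 90/10 train/validation split
split = int(training_size * 0.9)
training_padded = np.array(padded[:split])
testing_padded = np.array(padded[split:])
training_labels = np.array(labels[:split])
testing_labels = np.array(labels[split:])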
# Note this is the 100-dimension version of GloVe from Stanford.
# I unzipped and hosted it on my site to make this notebook easier.
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt \
    -O glove.6B.100d.txt
# Build a dict mapping each word (key) to its embedding vector (value)
embeddings_index = {}
with open('glove.6B.100d.txt') as f:
    for line in f:
        values = line.split()
        # The first element on each line is the word
        word = values[0]
        # The remaining elements are the embedding vector
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
# Use vocab_size+1 because one extra word index must be reserved for the OOV token
embeddings_matrix = np.zeros((vocab_size + 1, embedding_dim))
# Note what the for loop iterates over: (word, index) pairs
for word, i in word_index.items():
    # Look up the embedding vector for this word
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Place the vector at the word's index in the matrix
        embeddings_matrix[i] = embedding_vector
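The matrix is typically loaded into a frozen Embedding layer. The sketch below shows one way to do that and produces the `history` object the plotting cells read from; the layer sizes and epoch count are assumptions, not taken from the original:

# Sketch of a model consuming the pre-trained matrix (architecture is assumed)
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size + 1, embedding_dim,
                              input_length=max_length,
                              weights=[embeddings_matrix],
                              trainable=False),  # keep the GloVe vectors frozen
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# `history` is what the accuracy/loss plots below are drawn from
history = model.fit(training_padded, training_labels,
                    epochs=10,
                    validation_data=(testing_padded, testing_labels))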
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
#-----------------------------------------------------------
# Retrieve a list of list results on training and test data
# sets for each training epoch
#-----------------------------------------------------------
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(len(acc))  # Get number of epochs
#------------------------------------------------
# Plot training and validation accuracy per epoch
#------------------------------------------------
plt.plot(epochs, acc, 'r')
plt.plot(epochs, val_acc, 'b')
plt.title('Training and validation accuracy')
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend(["Accuracy", "Validation Accuracy"])
plt.figure()
#------------------------------------------------
# Plot training and validation loss per epoch
#------------------------------------------------
plt.plot(epochs, loss, 'r')
plt.plot(epochs, val_loss, 'b')
plt.title('Training and validation loss')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend(["Loss", "Validation Loss"])
plt.figure()
# Expected Output:
# A chart where the validation loss does not increase sharply!