使用Python的Keras库来学习深度学习中的二分类问题 ------ IMDB。
IMDB它包含来自互联网电影数据库的50000条严重两级分化的评论,数据集被分为用于训练的25000条评论和用于测试的25000条评论,训练集和测试集都包含50%的正面评论和50%的负面评论。
将评论解码为英文单词
用下面的代码解析第一条train_data[0]中的评论
# word_index is a dictionary mapping words to an integer index
word_index = imdb.get_word_index()
# We reverse it, mapping integer indices to words
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# We decode the review; note that our indices were offset by 3
# because 0, 1 and 2 are reserved indices for "padding", "start of sequence", and "unknown".
decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])
print(decoded_review)
训练网络
from keras.datasets import imdb
import numpy as np
from keras import models
from keras import layers
from keras import optimizers
from keras import losses
from keras import metrics
import matplotlib.pyplot as plt
from keras.utils import to_categorical
#加载IMDB数据集
(train_data,train_labels),(test_data,test_labels) = imdb.load_data(num_words=10000)
#将整数序列编码为二进制矩阵
def vectorize_sequences(sequences, dimension = 10000):
results = np.zeros((len(sequences),dimension))
for i, sequences in enumerate(sequences):
results[i, sequences] = 1.
return results
#将数据向量化
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
#将标签向量化
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')
#模型定义
model =models.Sequential()
model.add((layers.Dense(16, activation='relu', input_shape=(10000,))))
model.add(layers.Dense(16,activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
#编译模型
model.compile(optimizer='rmsprop',
loss='binary_crossentropy',
metrics=['accuracy'])
#配置优化器
model.compile(optimizer=optimizers.RMSprop(lr=0.001),
loss='binary_crossentropy',
metrics=['accuracy'])
#使用自定义的损失和指标
model.compile(optimizer=optimizers.RMSprop(lr=0.001),
loss=losses.binary_crossentropy,
metrics=[metrics.binary_accuracy])
#留出验证集
x_val = x_train[:10000]
partial_x_train = x_train[10000:]
y_val = y_train[:10000]
partial_y_train = y_train[10000:]
#训练模型
model.compile(optimizer='rmsprop',
loss='binary_crossentropy',
metrics=['acc'])
history = model.fit(partial_x_train,
partial_y_train,
epochs=20,
batch_size=512,
validation_data=(x_val,y_val))
#绘制训练损失和验证损失
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
epochs = range(1, len(loss_values) + 1)
plt.plot(epochs,loss_values,'bo', label='Training loss')
plt.plot(epochs,val_loss_values,'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
#绘制训练精度和验证精度
plt.clf()
acc = history_dict['acc']
val_acc = history_dict['val_acc']
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()