HBU-NNDL Lab 7 Recurrent Neural Networks (3): LSTM Memory Capability Experiment

Contents

6.3 LSTM Memory Capability Experiment

6.3.1 Model Construction

6.3.1.1 The LSTM Layer

6.3.2 Model Training

6.3.2.1 Training the digit-prediction model on a fixed sequence length

6.3.2.2 Training on multiple sequence lengths

[Question 1] Compare the experimental results of the LSTM and the SRN and share your thoughts.

6.3.3 Model Evaluation

6.3.3.1 Evaluating the model on the test set

6.3.3.2 Accuracy of the model on datasets of different lengths

[Question 2] Compare the accuracy of the LSTM and the SRN on datasets of different lengths and share your thoughts.

6.3.3.3 Changes in the LSTM's gate states and cell state

[Question 3] Analyze the plots of the LSTM's cell state and gate values and explain them in your own words.

Overall summary of RNNs


 

6.3 LSTM Memory Capability Experiment

The Long Short-Term Memory network (LSTM) is a recurrent neural network that can effectively alleviate the long-range dependency problem.

 

6.3.1 Model Construction

In this experiment, we reuse the Model_RNN4SeqClass model defined in Section 6.1.2.4 and implement an LSTM operator. We only need to instantiate the LSTM operator and pass it into Model_RNN4SeqClass to run the digit-sum experiment with an LSTM.

6.3.1.1 The LSTM Layer

The code of the LSTM layer is structurally similar to that of the SRN layer; on top of the SRN layer it only adds the definition and computation of the internal (cell) state and of the input, forget and output gates. As before, the output of the LSTM layer is the hidden-state vector at the last position of the sequence. The code is as follows:
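For reference, the per-step computation implemented below follows the standard LSTM update equations, where σ is the logistic sigmoid and ⊙ denotes element-wise multiplication:

$$
\begin{aligned}
i_t &= \sigma(x_t W_i + h_{t-1} U_i + b_i) \\
f_t &= \sigma(x_t W_f + h_{t-1} U_f + b_f) \\
o_t &= \sigma(x_t W_o + h_{t-1} U_o + b_o) \\
\tilde{c}_t &= \tanh(x_t W_c + h_{t-1} U_c + b_c) \\
c_t &= f_t \odot c_{t-1} + i_t \odot \tilde{c}_t \\
h_t &= o_t \odot \tanh(c_t)
\end{aligned}
$$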

import torch
import torch.nn as nn
import torch.nn.functional as F


# Declare the LSTM and its parameters
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, Wi_attr=None, Wf_attr=None, Wo_attr=None, Wc_attr=None,
                 Ui_attr=None, Uf_attr=None, Uo_attr=None, Uc_attr=None, bi_attr=None, bf_attr=None,
                 bo_attr=None, bc_attr=None):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Build a parameter from the given attr (zero-initialized if attr is None)
        def to_param(attr, shape):
            if attr is None:
                tensor = torch.zeros(size=shape, dtype=torch.float32)
            else:
                tensor = torch.tensor(attr, dtype=torch.float32)
            return nn.Parameter(tensor)

        # Input-to-hidden weights of the input, forget, output gates and candidate state
        self.W_i = to_param(Wi_attr, [input_size, hidden_size])
        self.W_f = to_param(Wf_attr, [input_size, hidden_size])
        self.W_o = to_param(Wo_attr, [input_size, hidden_size])
        self.W_c = to_param(Wc_attr, [input_size, hidden_size])
        # Hidden-to-hidden weights
        self.U_i = to_param(Ui_attr, [hidden_size, hidden_size])
        self.U_f = to_param(Uf_attr, [hidden_size, hidden_size])
        self.U_o = to_param(Uo_attr, [hidden_size, hidden_size])
        self.U_c = to_param(Uc_attr, [hidden_size, hidden_size])
        # Biases
        self.b_i = to_param(bi_attr, [1, hidden_size])
        self.b_f = to_param(bf_attr, [1, hidden_size])
        self.b_o = to_param(bo_attr, [1, hidden_size])
        self.b_c = to_param(bc_attr, [1, hidden_size])

    # Initialize the cell state and hidden state
    def init_state(self, batch_size):
        hidden_state = torch.zeros(size=[batch_size, self.hidden_size], dtype=torch.float32)
        cell_state = torch.zeros(size=[batch_size, self.hidden_size], dtype=torch.float32)
        return hidden_state, cell_state

    # Define the forward computation
    def forward(self, inputs, states=None):
        # inputs: input data of shape batch_size x seq_len x input_size
        batch_size, seq_len, input_size = inputs.shape

        # Initial cell state and hidden state, each of shape batch_size x hidden_size
        if states is None:
            states = self.init_state(batch_size)
        hidden_state, cell_state = states

        # Run the LSTM: input/forget/output gates, candidate state, cell state and hidden state
        for step in range(seq_len):
            # Input at the current step, shape: batch_size x input_size
            step_input = inputs[:, step, :]
            # Input, forget and output gates, each of shape batch_size x hidden_size
            I_gate = F.sigmoid(torch.matmul(step_input, self.W_i) + torch.matmul(hidden_state, self.U_i) + self.b_i)
            F_gate = F.sigmoid(torch.matmul(step_input, self.W_f) + torch.matmul(hidden_state, self.U_f) + self.b_f)
            O_gate = F.sigmoid(torch.matmul(step_input, self.W_o) + torch.matmul(hidden_state, self.U_o) + self.b_o)
            # Candidate cell state, shape: batch_size x hidden_size
            C_tilde = F.tanh(torch.matmul(step_input, self.W_c) + torch.matmul(hidden_state, self.U_c) + self.b_c)
            # Cell state, shape: batch_size x hidden_size
            cell_state = F_gate * cell_state + I_gate * C_tilde
            # Hidden state, shape: batch_size x hidden_size
            hidden_state = O_gate * F.tanh(cell_state)
        return hidden_state


Wi_attr = torch.tensor([[0.1, 0.2], [0.1, 0.2]])
Wf_attr = torch.tensor([[0.1, 0.2], [0.1, 0.2]])
Wo_attr = torch.tensor([[0.1, 0.2], [0.1, 0.2]])
Wc_attr = torch.tensor([[0.1, 0.2], [0.1, 0.2]])
Ui_attr = torch.tensor([[0.0, 0.1], [0.1, 0.0]])
Uf_attr = torch.tensor([[0.0, 0.1], [0.1, 0.0]])
Uo_attr = torch.tensor([[0.0, 0.1], [0.1, 0.0]])
Uc_attr = torch.tensor([[0.0, 0.1], [0.1, 0.0]])
bi_attr = torch.tensor([[0.1, 0.1]])
bf_attr = torch.tensor([[0.1, 0.1]])
bo_attr = torch.tensor([[0.1, 0.1]])
bc_attr = torch.tensor([[0.1, 0.1]])

lstm = LSTM(2, 2, Wi_attr=Wi_attr, Wf_attr=Wf_attr, Wo_attr=Wo_attr, Wc_attr=Wc_attr,
            Ui_attr=Ui_attr, Uf_attr=Uf_attr, Uo_attr=Uo_attr, Uc_attr=Uc_attr,
            bi_attr=bi_attr, bf_attr=bf_attr, bo_attr=bo_attr, bc_attr=bc_attr)

inputs = torch.tensor([[[1, 0]]], dtype=torch.float32)
hidden_state = lstm(inputs)
print(hidden_state)

tensor([[0.0594, 0.0952]], grad_fn=<MulBackward0>)

Here we can print the results returned by our own LSTM and by PyTorch's built-in LSTM for comparison. The code is as follows.

# Create a random array as test data; shape: batch_size x seq_len x input_size
batch_size, seq_len, input_size = 8, 20, 32
inputs = torch.randn(size=[batch_size, seq_len, input_size])

# Set the model's hidden_size
hidden_size = 32
# batch_first=True so that nn.LSTM also takes input of shape batch_size x seq_len x input_size
torch_lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
self_lstm = LSTM(input_size, hidden_size)

self_hidden_state = self_lstm(inputs)
torch_outputs, (torch_hidden_state, torch_cell_state) = torch_lstm(inputs)

print("self_lstm hidden_state: ", self_hidden_state.shape)
print("torch_lstm outputs:", torch_outputs.shape)
print("torch_lstm hidden_state:", torch_hidden_state.shape)
print("torch_lstm cell_state:", torch_cell_state.shape)

self_lstm hidden_state:  torch.Size([8, 32])
torch_lstm outputs: torch.Size([8, 20, 32])
torch_lstm hidden_state: torch.Size([1, 8, 32])
torch_lstm cell_state: torch.Size([1, 8, 32])
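As a quick sanity check on these shapes (a small sketch reusing the variables from the cell above): for a single-layer, unidirectional LSTM with batch_first=True, the last time step of the built-in LSTM's output sequence equals its final hidden state, which is exactly what our implementation returns.

# The self-implemented LSTM returns only the final hidden state; for the built-in LSTM this
# corresponds to the last time step of torch_outputs, i.e. torch_hidden_state with the
# layer dimension squeezed out.
print(torch.allclose(torch_outputs[:, -1, :], torch_hidden_state.squeeze(0)))  # True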

 

For this check, we first define the input data `inputs`, then feed it to both PyTorch's built-in LSTM and our own LSTM, and finally compare the hidden-state vectors output by the two models. The code is as follows:

torch.manual_seed(0)

# Create a random array as test data; shape: batch_size x seq_len x input_size
batch_size, seq_len, input_size, hidden_size = 2, 5, 10, 10
inputs = torch.randn([batch_size, seq_len, input_size])

# Instantiate the built-in LSTM (batch_first=True so its input layout matches ours)
torch_lstm = nn.LSTM(input_size, hidden_size, bias=True, batch_first=True)

# Fetch the parameters of torch_lstm and use them to initialize our own LSTM
print(torch_lstm.weight_ih_l0.T.shape)
chunked_W = torch.split(torch_lstm.weight_ih_l0.T, split_size_or_sections=hidden_size, dim=-1)
chunked_U = torch.split(torch_lstm.weight_hh_l0.T, split_size_or_sections=hidden_size, dim=-1)
# nn.LSTM has two bias vectors (bias_ih and bias_hh); our implementation uses a single one,
# so we take their sum
chunked_b = torch.split(torch_lstm.bias_ih_l0 + torch_lstm.bias_hh_l0, split_size_or_sections=hidden_size, dim=-1)

# nn.LSTM stacks the gate parameters in the order: input, forget, cell (candidate), output
Wi_attr = chunked_W[0]
Wf_attr = chunked_W[1]
Wc_attr = chunked_W[2]
Wo_attr = chunked_W[3]
Ui_attr = chunked_U[0]
Uf_attr = chunked_U[1]
Uc_attr = chunked_U[2]
Uo_attr = chunked_U[3]
bi_attr = chunked_b[0]
bf_attr = chunked_b[1]
bc_attr = chunked_b[2]
bo_attr = chunked_b[3]

self_lstm = LSTM(input_size, hidden_size, Wi_attr=Wi_attr, Wf_attr=Wf_attr, Wo_attr=Wo_attr, Wc_attr=Wc_attr,
                 Ui_attr=Ui_attr, Uf_attr=Uf_attr, Uo_attr=Uo_attr, Uc_attr=Uc_attr,
                 bi_attr=bi_attr, bf_attr=bf_attr, bo_attr=bo_attr, bc_attr=bc_attr)

# Run the forward pass and print the hidden states of both models
self_hidden_state = self_lstm(inputs)
torch_outputs, (torch_hidden_state, _) = torch_lstm(inputs)
print("torch LSTM:\n", torch_hidden_state.detach().numpy().squeeze(0))
print("self LSTM:\n", self_hidden_state.detach().numpy())

torch.Size([10, 40])
torch LSTM:
 [[ 0.15448314  0.1058649   0.15513758  0.10020707 -0.09891309  0.24404514
  -0.10054383  0.02534968 -0.09427626  0.19086668]
 [ 0.1005525  -0.10019652  0.09727433  0.06858747 -0.12354212 -0.0772252
  -0.06487443  0.07479167  0.05020904 -0.06898089]
 [-0.20555076 -0.10189811 -0.21259196  0.15717389 -0.0481133  -0.03887707
  -0.0319021   0.10424501 -0.12051586 -0.03992889]
 [-0.1919373   0.1589279  -0.00179314  0.11847072 -0.25509602 -0.0959354
  -0.36128283  0.029731   -0.06177601 -0.24320896]
 [-0.11731672  0.20817927  0.00452011  0.14359923  0.0507274  -0.03588643
  -0.10982763  0.3008023  -0.14857852 -0.10586609]]
self LSTM:
 [[-0.05930986  0.07991815  0.16814934 -0.03768372 -0.18313237  0.08653796
  -0.2073734   0.1649277  -0.01274812 -0.13361353]
 [-0.15101278  0.12001547 -0.16750741  0.22780265  0.03029063 -0.02781414
  -0.31316137  0.35433516 -0.09388448 -0.09385198]]

As we can see, the outputs of the two are essentially the same. We can also compare the running speed of the two implementations. The code is as follows:

import time

# Create a random array as test data; shape: batch_size x seq_len x input_size
batch_size, seq_len, input_size = 8, 20, 32
inputs = torch.randn([batch_size, seq_len, input_size])

# Set the model's hidden_size
hidden_size = 32
self_lstm = LSTM(input_size, hidden_size)
torch_lstm = nn.LSTM(input_size, hidden_size, batch_first=True)

# Measure the running speed of our own LSTM
model_time = 0
for i in range(100):
    start_time = time.time()
    hidden_state = self_lstm(inputs)
    # Treat the first 10 runs as warm-up and exclude them from the statistics
    if i < 10:
        continue
    end_time = time.time()
    model_time += (end_time - start_time)
avg_model_time = model_time / 90
print('self_lstm speed:', avg_model_time, 's')

# Measure the running speed of PyTorch's built-in LSTM
model_time = 0
for i in range(100):
    start_time = time.time()
    outputs, (hidden_state, cell_state) = torch_lstm(inputs)
    # Treat the first 10 runs as warm-up and exclude them from the statistics
    if i < 10:
        continue
    end_time = time.time()
    model_time += (end_time - start_time)
avg_model_time = model_time / 90
print('torch_lstm speed:', avg_model_time, 's')

 self_lstm speed: 0.003036334779527452 s
torch_lstm speed: 0.0007313781314425998 s

As we can see, because PyTorch's built-in LSTM is implemented and optimized in C++ under the hood, it runs far more efficiently than our own LSTM implementation.
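For a more robust timing comparison, torch.utils.benchmark can be used instead of hand-written time.time() loops. The following is a minimal sketch reusing self_lstm, torch_lstm and inputs from the cell above (it assumes torch.utils.benchmark is available in your PyTorch installation):

import torch.utils.benchmark as benchmark

t_self = benchmark.Timer(stmt="self_lstm(inputs)",
                         globals={"self_lstm": self_lstm, "inputs": inputs})
t_torch = benchmark.Timer(stmt="torch_lstm(inputs)",
                          globals={"torch_lstm": torch_lstm, "inputs": inputs})
# Each Timer runs its statement the given number of times and reports the mean time
print(t_self.timeit(100))
print(t_torch.timeit(100))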

6.3.2 Model Training

6.3.2.1 Training the digit-prediction model on a fixed sequence length

In this section we train with the RunnerV3 class. We first define the training hyperparameters, keeping them identical to those used for the simple recurrent network. We then define a `train` function that trains on a dataset of a specified length: it loads the data of length `length`, instantiates the components, creates the corresponding Runner, and trains it. We again use the Accuracy metric defined in Section 4.5.4 as the evaluation metric. The code is as follows:

import os
import random
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset
from srn import load_data, DigitSumDataset, Model_RNN4SeqClass
from metric import Accuracy
from Runner import RunnerV3

# Number of training epochs
num_epochs = 500
# Learning rate
lr = 0.001
# Number of input digit classes
num_digits = 10
# Dimension of the vector each digit is embedded into
input_size = 32
# Dimension of the hidden-state vector
hidden_size = 32
# Number of classes for the predicted digit sum
num_classes = 19
# Batch size
batch_size = 8
# Directory for saving models
save_dir = "./checkpoints"


# length can be changed to run the prediction experiment on data of different lengths
def train(length):
    print(f"\n====> Training LSTM with data of length {length}.")
    np.random.seed(0)
    random.seed(0)
    # Load the data of length `length`
    data_path = f"./datasets/{length}"
    train_examples, dev_examples, test_examples = load_data(data_path)
    train_set, dev_set, test_set = DigitSumDataset(train_examples), DigitSumDataset(dev_examples), DigitSumDataset(test_examples)
    train_loader = DataLoader(train_set, batch_size=batch_size)
    dev_loader = DataLoader(dev_set, batch_size=batch_size)
    test_loader = DataLoader(test_set, batch_size=batch_size)
    # Instantiate the model
    base_model = LSTM(input_size, hidden_size)
    model = Model_RNN4SeqClass(base_model, num_digits, input_size, hidden_size, num_classes)
    # Specify the optimizer
    optimizer = torch.optim.Adam(lr=lr, params=model.parameters())
    # Define the evaluation metric
    metric = Accuracy()
    # Define the loss function
    loss_fn = torch.nn.CrossEntropyLoss()
    # Instantiate the Runner from the components above
    runner = RunnerV3(model, optimizer, loss_fn, metric)
    # Train the model
    model_save_path = os.path.join(save_dir, f"best_lstm_model_{length}.pdparams")
    runner.train(train_loader, dev_loader, num_epochs=num_epochs, eval_steps=100, log_steps=100, save_path=model_save_path)
    return runner

6.3.2.2 Training on multiple sequence lengths

lstm_runners = {}
lengths = [10, 15, 20, 25, 30, 35]
for length in lengths:
    runner = train(length)
    lstm_runners[length] = runner

[Train] epoch: 489/500, step: 18600/19000, loss: 0.01708
[Evaluate]  dev score: 0.54000, dev loss: 2.41391
[Train] epoch: 492/500, step: 18700/19000, loss: 0.09806
[Evaluate]  dev score: 0.53000, dev loss: 2.46762
[Train] epoch: 494/500, step: 18800/19000, loss: 0.04476
[Evaluate]  dev score: 0.54000, dev loss: 2.51601
[Train] epoch: 497/500, step: 18900/19000, loss: 0.01534
[Evaluate]  dev score: 0.52000, dev loss: 2.52584
[Evaluate]  dev score: 0.54000, dev loss: 2.52936
[Train] Training done!

[Figures: train/dev loss curves of the LSTM for sequence lengths L=10, L=15, L=20, L=25, L=30 and L=35]

The figures above show how the LSTM's loss evolves when trained on datasets of different lengths. As with the SRN, the training loss becomes increasingly unstable and the dev loss tends to grow as the sequence length increases, which indicates that the ability to preserve long-range dependencies also weakens as sequences get longer. However, compared with the results of the previous experiment (figure below), the LSTM converges better than the SRN as the sequence length grows.
 

[Question 1] Compare the experimental results of the LSTM and the SRN and share your thoughts.

Loss curves from the previous SRN experiment:

Judging from the dev loss, the LSTM clearly outperforms the SRN. The SRN can handle some short-range dependencies but cannot handle long-range ones: when the sequence is long, gradients from the later part of the sequence can hardly propagate back to the earlier part (for example, beyond roughly 15 steps), which is the vanishing-gradient problem.
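To make the vanishing-gradient argument concrete, here is a small self-contained sketch (using PyTorch's built-in nn.RNN and nn.LSTM rather than the models trained above) that measures how much gradient from the last time step reaches the first input of a length-35 sequence; in a typical run the simple RNN's gradient at step 0 is markedly smaller than the LSTM's:

import torch
import torch.nn as nn

torch.manual_seed(0)
seq_len, input_size, hidden_size = 35, 32, 32
x = torch.randn(1, seq_len, input_size, requires_grad=True)

for name, model in [("SRN", nn.RNN(input_size, hidden_size, batch_first=True)),
                    ("LSTM", nn.LSTM(input_size, hidden_size, batch_first=True))]:
    if x.grad is not None:
        x.grad = None
    outputs, _ = model(x)
    # The loss depends only on the last time step
    outputs[:, -1, :].sum().backward()
    # Gradient magnitude that reaches the first and the last input position
    g_first = x.grad[:, 0, :].norm().item()
    g_last = x.grad[:, -1, :].norm().item()
    print(f"{name}: grad norm at step 0 = {g_first:.2e}, at step {seq_len - 1} = {g_last:.2e}")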

 

 

6.3.3 Model Evaluation

6.3.3.1 Evaluating the model on the test set

lstm_dev_scores = []
lstm_test_scores = []
for length in lengths:
    print(f"Evaluate LSTM with data length {length}.")
    runner = lstm_runners[length]
    # Load the best model obtained during training
    model_path = os.path.join(save_dir, f"best_lstm_model_{length}.pdparams")
    runner.load_model(model_path)
    # Load the data of length `length`
    data_path = f"./datasets/{length}"
    train_examples, dev_examples, test_examples = load_data(data_path)
    test_set = DigitSumDataset(test_examples)
    test_loader = DataLoader(test_set, batch_size=batch_size)
    # Evaluate the model on the test set and record the test accuracy
    score, _ = runner.evaluate(test_loader)
    lstm_test_scores.append(score)
    lstm_dev_scores.append(max(runner.dev_scores))

for length, dev_score, test_score in zip(lengths, lstm_dev_scores, lstm_test_scores):
    print(f"[LSTM] length:{length}, dev_score: {dev_score}, test_score: {test_score: .5f}")

 Evaluate LSTM with data length 10.
Evaluate LSTM with data length 15.
Evaluate LSTM with data length 20.
Evaluate LSTM with data length 25.
Evaluate LSTM with data length 30.
Evaluate LSTM with data length 35.
[LSTM] length:10, dev_score: 0.89, test_score:  0.90000
[LSTM] length:15, dev_score: 0.89, test_score:  0.93000
[LSTM] length:20, dev_score: 0.9, test_score:  0.92000
[LSTM] length:25, dev_score: 0.92, test_score:  0.86000
[LSTM] length:30, dev_score: 0.74, test_score:  0.70000
[LSTM] length:35, dev_score: 0.59, test_score:  0.47000

 

6.3.3.2 Accuracy of the model on datasets of different lengths

Next, we plot the LSTM's accuracy on the dev and test sets for the different sequence lengths so that the trend is easier to observe (the SRN curves from the previous experiment can be plotted in the same way for comparison).

import matplotlib.pyplot as plt

plt.plot(lengths, lstm_dev_scores, '-o', color='#e8609b', label="LSTM Dev Accuracy")
plt.plot(lengths, lstm_test_scores, '-o', color='#000000', label="LSTM Test Accuracy")
# Axis labels and legend
plt.ylabel("accuracy", fontsize='large')
plt.xlabel("sequence length", fontsize='large')
plt.legend(loc='lower left', fontsize='x-large')
fig_name = "./images/6.12.pdf"
plt.savefig(fig_name)
plt.show()

[Question 2] Compare the accuracy of the LSTM and the SRN on datasets of different lengths and share your thoughts.

SRN accuracy (from the previous experiment):

As we can see, for lengths below 35 the LSTM's accuracy is clearly higher than the SRN's. Both models show the same overall trend of accuracy dropping as the sequence length grows, but thanks to its gating mechanism the LSTM still remains more accurate than the SRN on medium-length sequences.

6.3.3.3 Changes in the LSTM's gate states and cell state

The LSTM controls the update of the cell state through its gating mechanism. Here we observe how the gates and the cell state change while the LSTM processes a digit sequence. To do so, we first add lists to the LSTM implementation above that store the gate and cell-state vectors at every time step.

# Declare the LSTM and its parameters, this time also recording the gate and cell states
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, Wi_attr=None, Wf_attr=None, Wo_attr=None, Wc_attr=None,
                 Ui_attr=None, Uf_attr=None, Uo_attr=None, Uc_attr=None, bi_attr=None, bf_attr=None,
                 bo_attr=None, bc_attr=None):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Build a parameter from the given attr (zero-initialized if attr is None)
        def to_param(attr, shape):
            if attr is None:
                tensor = torch.zeros(size=shape, dtype=torch.float32)
            else:
                tensor = torch.tensor(attr, dtype=torch.float32)
            return nn.Parameter(tensor)

        # Input-to-hidden weights of the input, forget, output gates and candidate state
        self.W_i = to_param(Wi_attr, [input_size, hidden_size])
        self.W_f = to_param(Wf_attr, [input_size, hidden_size])
        self.W_o = to_param(Wo_attr, [input_size, hidden_size])
        self.W_c = to_param(Wc_attr, [input_size, hidden_size])
        # Hidden-to-hidden weights
        self.U_i = to_param(Ui_attr, [hidden_size, hidden_size])
        self.U_f = to_param(Uf_attr, [hidden_size, hidden_size])
        self.U_o = to_param(Uo_attr, [hidden_size, hidden_size])
        self.U_c = to_param(Uc_attr, [hidden_size, hidden_size])
        # Biases
        self.b_i = to_param(bi_attr, [1, hidden_size])
        self.b_f = to_param(bf_attr, [1, hidden_size])
        self.b_o = to_param(bo_attr, [1, hidden_size])
        self.b_c = to_param(bc_attr, [1, hidden_size])

    # Initialize the cell state and hidden state
    def init_state(self, batch_size):
        hidden_state = torch.zeros(size=[batch_size, self.hidden_size], dtype=torch.float32)
        cell_state = torch.zeros(size=[batch_size, self.hidden_size], dtype=torch.float32)
        return hidden_state, cell_state

    # Define the forward computation
    def forward(self, inputs, states=None):
        # inputs: input data of shape batch_size x seq_len x input_size
        batch_size, seq_len, input_size = inputs.shape

        # Initial cell state and hidden state, each of shape batch_size x hidden_size
        if states is None:
            states = self.init_state(batch_size)
        hidden_state, cell_state = states

        # Lists that store the gate-state and cell-state vectors at every time step
        self.Is = []
        self.Fs = []
        self.Os = []
        self.Cs = []

        # Run the LSTM: input/forget/output gates, candidate state, cell state and hidden state
        for step in range(seq_len):
            input_step = inputs[:, step, :]
            I_gate = F.sigmoid(torch.matmul(input_step, self.W_i) + torch.matmul(hidden_state, self.U_i) + self.b_i)
            F_gate = F.sigmoid(torch.matmul(input_step, self.W_f) + torch.matmul(hidden_state, self.U_f) + self.b_f)
            O_gate = F.sigmoid(torch.matmul(input_step, self.W_o) + torch.matmul(hidden_state, self.U_o) + self.b_o)
            C_tilde = F.tanh(torch.matmul(input_step, self.W_c) + torch.matmul(hidden_state, self.U_c) + self.b_c)
            cell_state = F_gate * cell_state + I_gate * C_tilde
            hidden_state = O_gate * F.tanh(cell_state)
            # Store the gate-state and cell-state vectors of this step
            self.Is.append(I_gate.detach().numpy().copy())
            self.Fs.append(F_gate.detach().numpy().copy())
            self.Os.append(O_gate.detach().numpy().copy())
            self.Cs.append(cell_state.detach().numpy().copy())
        return hidden_state

Next, we instantiate a new Runner with this modified LSTM. For this experiment we use the model trained on sequences of length 10, so we load the length-10 checkpoint.

# Instantiate the model
base_model = LSTM(input_size, hidden_size)
model = Model_RNN4SeqClass(base_model, num_digits, input_size, hidden_size, num_classes)
# Specify the optimizer
optimizer = torch.optim.Adam(lr=lr, params=model.parameters())
# Define the evaluation metric
metric = Accuracy()
# Define the loss function
loss_fn = torch.nn.CrossEntropyLoss()
# Re-instantiate the Runner from the components above
runner = RunnerV3(model, optimizer, loss_fn, metric)

length = 10
# Load the best model obtained during training
model_path = os.path.join(save_dir, f"best_lstm_model_{length}.pdparams")
runner.load_model(model_path)

Next, we feed a digit sequence to the digit-prediction model; during the forward pass the gate and cell-state vectors are stored in the model. We then retrieve these vectors from the model and visualize them. The code is as follows:

import seaborn as sns
import matplotlib.pyplot as plt


def plot_tensor(inputs, tensor, save_path, vmin=0, vmax=1):
    tensor = np.stack(tensor, axis=0)
    tensor = np.squeeze(tensor, 1).T
    plt.figure(figsize=(16, 6))
    # vmin and vmax define the lower and upper bounds of the colour map
    ax = sns.heatmap(tensor, vmin=vmin, vmax=vmax)
    ax.set_xticklabels(inputs)
    ax.figure.savefig(save_path)


# Define the model input
inputs = [6, 7, 0, 0, 1, 0, 0, 0, 0, 0]
X = torch.as_tensor(inputs.copy())
X = X.unsqueeze(0)
# Run prediction and print the result
logits = runner.predict(X)
predict_label = torch.argmax(logits, dim=-1)
print(f"predict result: {predict_label.numpy()[0]}")

# Input gate
Is = runner.model.rnn_model.Is
plot_tensor(inputs, Is, save_path="./images/6.13_I.pdf")
# Forget gate
Fs = runner.model.rnn_model.Fs
plot_tensor(inputs, Fs, save_path="./images/6.13_F.pdf")
# Output gate
Os = runner.model.rnn_model.Os
plot_tensor(inputs, Os, save_path="./images/6.13_O.pdf")
# Cell state
Cs = runner.model.rnn_model.Cs
plot_tensor(inputs, Cs, save_path="./images/6.13_C.pdf", vmin=-5, vmax=5)

Input gate:

Output gate:

Forget gate:

Cell state:

[Question 3] Analyze the plots of the LSTM's cell state and gate values and explain them in your own words.

These gates control how much of the past the network forgets, how much of the current input it writes in, and how much of the cell state it exposes as output. In the plots, the horizontal axis is the position in the sequence, each column is the gate (or cell-state) vector at that step, and the colour encodes the value of each dimension. The values at the positions of 6 and 7 are clearly different from those at later positions, while the later positions (mostly zero inputs) look similar to one another; the small differences among them reflect the gradual forgetting of some dimensions associated with the earlier inputs.
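As a small numeric illustration of the cell-state update c_t = f_t ⊙ c_{t-1} + i_t ⊙ c̃_t (with made-up numbers, not values taken from the trained model): when the forget gate is close to 1 and the input gate is close to 0, the sum stored in the cell state is carried forward almost unchanged, which is exactly what the later, uninformative zero inputs call for.

import torch

c_prev = torch.tensor([2.0])    # cell state currently "remembering" a partial sum of 2
f_gate = torch.tensor([0.95])   # forget gate close to 1: keep almost all of the memory
i_gate = torch.tensor([0.05])   # input gate close to 0: write almost nothing new
c_tilde = torch.tensor([1.0])   # candidate state computed from the current input

c_new = f_gate * c_prev + i_gate * c_tilde
print(c_new)  # tensor([1.9500]): the stored value is largely preserved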

Overall summary of RNNs

RNNs are considerably harder to learn than the FNNs and CNNs covered earlier, and the LSTM especially so; the material is already difficult, and taking the course online made studying less efficient. After this experiment I have a general grasp of the LSTM framework, but the detailed formulas will still take more time to digest.

 

Author: 不是蒋承翰. Original post: https://blog.csdn.net/m0_57215376/article/details/128073222
