This time, let's actually implement an LSTM in PyTorch.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1)
You can create an LSTM with `torch.nn` as follows:
* `input_size` : The number of expected features in the input x
* `hidden_size` : The number of features in the hidden state h
lstm = nn.LSTM(input_size, hidden_size)
# Create an LSTM with input_size = 3 and hidden_size = 3.
lstm = nn.LSTM(3, 3)
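As a quick sanity check (a small sketch added here, not part of the original tutorial), the LSTM's input-to-hidden weight matrix stacks the four gates, so its shape should be (4 * hidden_size, input_size):

# The four gates (input, forget, cell, output) are stacked along dim 0,
# so weight_ih_l0 has shape (4 * hidden_size, input_size) = (12, 3).
print(lstm.weight_ih_l0.shape)  # torch.Size([12, 3])
print(lstm.weight_hh_l0.shape)  # torch.Size([12, 3]), i.e. (4 * hidden_size, hidden_size)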
After creating the LSTM, we need to create the input x, the hidden state h, and the cell state c that will be fed into it.
Let's create inputs and hidden (h and c) according to the input_size and hidden_size chosen above.
# Create an input with sequence length 5.
# Since input_size was set to 3, we need five 3-dimensional vectors.
inputs = [torch.randn(1, 3) for _ in range(5)]
# Since the LSTM takes the input x and the hidden state h as inputs, we also create the hidden state.
# Since hidden_size was set to 3, we create 3-dimensional vectors.
# The h passed to the LSTM consists of the hidden state (as in an RNN) and the cell state,
# a concept introduced by the LSTM, so hidden must consist of two 3-dimensional tensors.
hidden = (torch.randn(1, 1, 3),
          torch.randn(1, 1, 3))
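Each tensor in hidden uses PyTorch's (num_layers * num_directions, batch_size, hidden_size) layout, which is why the shape is (1, 1, 3) here. A quick check (added for illustration):

h0, c0 = hidden
# Single unidirectional layer, batch size 1, hidden_size 3
print(h0.shape, c0.shape)  # torch.Size([1, 1, 3]) torch.Size([1, 1, 3])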
Method 1: For an input with sequence length 5, pass one element at a time through the LSTM.
Method 2: Alternatively, pass the entire sequence through at once.
The first value returned by the LSTM contains the hidden state at every step of the sequence; the second is the hidden state (and cell state) of the final step only. Compare the sizes of out and hidden below.
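Method 1 looks roughly like this (a short sketch in the spirit of the referenced tutorial): each element is reshaped to (1, 1, 3) and the hidden state from the previous step is fed back in.

# Method 1: step through the sequence one element at a time.
for i in inputs:
    # view(1, 1, -1) adds the (seq_len, batch) dimensions nn.LSTM expects.
    out, hidden = lstm(i.view(1, 1, -1), hidden)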
inputs = torch.cat(inputs).view(len(inputs), 1, -1)  # For Method 2, concatenate the inputs into a single tensor instead of a list.
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))  # For Method 2, reinitialize the hidden state.
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)
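Comparing the sizes: out covers all five steps, while hidden holds only the final step. For this single-layer, unidirectional LSTM, the last slice of out matches h_n (a small check added here):

print(out.shape)        # torch.Size([5, 1, 3]): hidden state at every step
print(hidden[0].shape)  # torch.Size([1, 1, 3]): h_n, the last step
print(hidden[1].shape)  # torch.Size([1, 1, 3]): c_n, the last step
print(torch.allclose(out[-1:], hidden[0]))  # True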
To do Part-of-Speech (PoS) tagging with an LSTM, we prepare the training data.
- training_data: word sequences paired with the PoS tag of each word.
- word_to_ix: maps each word to an id so it can be used as model input.
- tag_to_ix: maps each PoS tag to an id as well.
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)
training_data = [
    # Tags are: DET - determiner; NN - noun; V - verb
    # For example, the word "The" is a determiner
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
word_to_ix = {}
# For each words-list (sentence) and tags-list in each tuple of training_data
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:  # word has not been assigned an index yet
            word_to_ix[word] = len(word_to_ix)  # Assign each word with a unique index
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2} # Assign each tag with a unique index
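With these mappings in place, prepare_sequence turns a sentence or a tag list into a tensor of indices. A small usage example (added for illustration):

print(prepare_sequence("The dog ate the apple".split(), word_to_ix))  # tensor([0, 1, 2, 3, 4])
print(prepare_sequence(["DET", "NN", "V"], tag_to_ix))                # tensor([0, 1, 2])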
# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6
Output of print(word_to_ix):
{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}
We define the LSTMTagger module, which contains an embedding layer, an LSTM, and an output layer.
- embeds: encodes the input ids through the embedding layer to produce the embedding for each input word.
- lstm_out: the hidden states for the entire sequence, obtained by passing the embeddings through the LSTM.
- tag_space: a linear transform of the LSTM hidden states into the space of existing tags (DET, NN, V).
- tag_scores: the score of each tag, obtained by applying a log softmax afterwards.
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores
We build the model and declare the loss function and optimizer needed for training.
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
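Before training, we can look at the scores the untrained model assigns (this check follows the referenced tutorial); element (i, j) is the score of tag j for word i.

# Scores before training. No gradients are needed here, so we wrap it in torch.no_grad().
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)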
Now we train the model on the training data: pass the input through the LSTMTagger to predict the PoS tag of each word, compute the loss against the ground-truth tags, and backpropagate the loss to update the model parameters.
for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
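After training, we can check the predictions on the first training sentence (the no_grad block follows the referenced tutorial; ix_to_tag is a small helper added here to print tag names instead of indices):

# Scores after training: the argmax of each row is the predicted tag for that word.
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)
    ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}
    # Expected once the model has fit the toy data: ['DET', 'NN', 'V', 'DET', 'NN']
    print([ix_to_tag[i.item()] for i in tag_scores.argmax(dim=1)])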
If you want to use a GRU instead of an LSTM, you can use nn.GRU.
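As a sketch (a hypothetical GRUTagger, not part of the original post), the only structural change is swapping nn.LSTM for nn.GRU; a GRU has no cell state, so its hidden state is a single tensor rather than an (h, c) tuple.

class GRUTagger(nn.Module):
    # Same structure as LSTMTagger, with the recurrent layer swapped for a GRU.
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(GRUTagger, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim)  # no cell state, only h
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        gru_out, _ = self.gru(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(gru_out.view(len(sentence), -1))
        return F.log_softmax(tag_space, dim=1)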
If you evaluate the model only on the training data as above, it is hard to tell how well it generalizes. You should evaluate it on new data it has not seen during training, either by splitting the given data into train and test sets or by using cross-validation.
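For example, with a larger dataset you could hold out part of the data before training and measure tagging accuracy on the held-out part. A minimal sketch (the 80/20 ratio, the shuffling, and the tagging_accuracy helper are assumptions added here, not from the original post):

import random

random.seed(1)
data = list(training_data)      # with a real dataset this would be much larger
random.shuffle(data)
split = int(0.8 * len(data))    # e.g. 80% train / 20% test
train_data, test_data = data[:split], data[split:]

def tagging_accuracy(model, data):
    # Fraction of words whose predicted tag matches the gold tag.
    correct, total = 0, 0
    with torch.no_grad():
        for sentence, tags in data:
            preds = model(prepare_sequence(sentence, word_to_ix)).argmax(dim=1)
            gold = prepare_sequence(tags, tag_to_ix)
            correct += (preds == gold).sum().item()
            total += len(tags)
    return correct / total

# Train only on train_data, then report tagging_accuracy(model, test_data).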
Reference: https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html