Team size: 1 / Duration: 1 day / Environment: Google Colab, Python, KoBERT / Data: news article text and its category
Accuracy: 0.847, F1 score: 0.845
Many different models are used for text classification in NLP, from simple RNNs and LSTMs up to BERT. This time I fine-tuned KoBERT, a model obtained by further training BERT on Korean data; its main strength is better performance on Korean text. You can find the model at the link below.
https://github.com/SKTBrain/KoBERT
Before running anything, change the Colab runtime type to GPU.
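As a quick, optional sanity check that the GPU runtime is actually active (the exact GPU listed will vary):
!nvidia-smi   # should list an attached GPU; if this errors, the runtime type is not set to GPU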
1. Import the required libraries and download the model
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3.0.2
!pip install torch
# Download and load KoBERT
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import gluonnlp as nlp
# Use the GPU if available; `device` is used when moving the model and batches
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Fix random seeds for reproducibility
RANDOM_SEED = 2022
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
# Load the pretrained BERT model and vocabulary
bertmodel, vocab = get_pytorch_kobert_model()
2. Preprocessing (keep only Korean, English letters, and digits; everything else is removed)
import re
# Keep only English letters, digits, and Korean syllables; strip everything else
df['text'] = [re.sub('[^A-Za-z0-9가-힣]', '', s) for s in df['text']]
+ Using data augmentation, I brought the collected data up to roughly 8,000–8,500 samples per category. Reference:
https://getacherryontop.tistory.com/109
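For illustration only, here is a minimal random-deletion augmentation sketch in the spirit of EDA-style text augmentation; it is not necessarily the method used in the linked post, and the augment_text name and deletion probability p are my own choices:
import random

def augment_text(text, p=0.1):
    # Randomly drop each word with probability p to create a new variant
    words = text.split()
    kept = [w for w in words if random.random() > p]
    return ' '.join(kept) if kept else text

# Example: generate extra samples for an under-represented category
# new_texts = [augment_text(t) for t in df.loc[df['category'] == 'some_label', 'text']]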
3. Label encoding
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(df['category'])
df['category'] = encoder.transform(df['category'])
mapping = dict(zip(range(len(encoder.classes_)), encoder.classes_))
This converts the string categories into integers; the mapping dictionary lets us recover the original category name from each integer label later.
data_list = []
for q, label in zip(df['text'], df['category']):
    data = []
    data.append(q)
    data.append(str(label))
    data_list.append(data)
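To confirm the encoding, it can help to print the mapping and a sample entry (the category names in the comment below are placeholders, not my actual labels):
print(mapping)                          # e.g. {0: 'economy', 1: 'politics', ...}
print(encoder.inverse_transform([0]))   # integer label -> original category name
print(data_list[0])                     # [article text, label as a string]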
4. Parameter settings
One thing to keep in mind about BERT is that it accepts at most 512 tokens; anything beyond the chosen maximum length is simply ignored during training, no matter how long the input text is. I therefore truncate each text from the beginning to max_len tokens (128 tokens is roughly two to three sentences). The batch size was set as large as the GPU memory allocated by Colab would allow.
max_len = 128
batch_size = 32
class CustomDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))
__init__ initializes the dataset, __len__ returns the number of items (inputs) in the dataset, and __getitem__ returns an item by index.
5. Splitting off a validation set
from sklearn.model_selection import train_test_split
dataset_train, dataset_val = train_test_split(data_list, test_size=0.1, random_state=42)
The split gives 44,066 training samples and 4,897 validation samples.
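Since the categories were balanced to roughly equal sizes this matters less here, but train_test_split also supports stratified splitting if you want the category proportions preserved exactly in both splits (an optional variation, not what I used above):
# Optional: stratified split that preserves the category distribution
dataset_train, dataset_val = train_test_split(
    data_list, test_size=0.1, random_state=42,
    stratify=[label for _, label in data_list])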
6. Tokenization
# Build the SentencePiece tokenizer and wrap the datasets
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
data_train = CustomDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_val = CustomDataset(dataset_val, 0, 1, tok, max_len, True, False)
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, shuffle=True, num_workers=5)
valid_dataloader = torch.utils.data.DataLoader(data_val, batch_size=batch_size, shuffle=True, num_workers=5)
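To see what the transform produces, it helps to inspect one training sample: each item is a tuple of (token_ids, valid_length, segment_ids, label), with token_ids padded or truncated to max_len. This is just an illustrative check; the printed values depend on your data.
token_ids, valid_length, segment_ids, label = data_train[0]
print(token_ids.shape)    # (128,) -> padded/truncated to max_len
print(valid_length)       # number of real (non-padding) tokens
print(segment_ids[:10])   # all zeros for single-sentence inputs
print(label)              # integer category label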
7. Hyperparameters and the classification model
learning_rate = 2e-5
num_epochs = 4
log_interval = 200
warmup_ratio = 0.1
# Path settings (checkpoints are saved to Google Drive)
ROOT_PATH = '/content/drive/MyDrive'
DATA_DIR = '/content/drive/MyDrive'
class BertClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=6,  ## adjust to your number of classes ##
                 dr_rate=None,
                 params=None):
        super(BertClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        # 1 over the real tokens, 0 over the padding positions
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(input_ids=token_ids,
                              token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask.float().to(token_ids.device))
        # Apply dropout only when dr_rate is set; otherwise use the pooled output directly
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)
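As a standalone illustration of what gen_attention_mask builds (dummy tensors, not real model inputs): for a padded batch of two sequences with 3 and 5 valid tokens, the mask is 1 over the valid positions and 0 over the padding.
token_ids = torch.zeros(2, 6, dtype=torch.long)   # dummy padded batch, shape (batch, max_len)
valid_length = [3, 5]                              # number of real tokens in each sequence
mask = torch.zeros_like(token_ids)
for i, v in enumerate(valid_length):
    mask[i][:v] = 1
print(mask)
# tensor([[1, 1, 1, 0, 0, 0],
#         [1, 1, 1, 1, 1, 0]])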
# Instantiate the classifier and move it to the GPU
model = BertClassifier(bertmodel, dr_rate=0.5).to(device)
# Optimizer and learning-rate schedule
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)
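For reference, with the split above (44,066 training samples and batch size 32, so roughly 1,378 batches per epoch) this works out to about t_total = 5,512 training steps and warmup_step = 551. A quick check (print statements only, no effect on training):
print(len(train_dataloader))   # ~1,378 batches per epoch
print(t_total, warmup_step)    # ~5,512 total steps, ~551 warmup steps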
# Helper for measuring accuracy
def calc_accuracy(X, Y):
    max_vals, max_indices = torch.max(X, 1)
    acc = (max_indices == Y).sum().data.cpu().numpy() / max_indices.size()[0]
    return acc
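A tiny check of calc_accuracy with dummy logits (illustrative only):
logits = torch.tensor([[2.0, 0.1], [0.3, 1.5], [1.2, 0.4]])
labels = torch.tensor([0, 1, 1])
print(calc_accuracy(logits, labels))   # 2 of 3 correct -> ~0.667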
8. Training
criterion = 0  # best validation accuracy seen so far
print('Training Start...')
for e in range(num_epochs):
    train_acc = 0.0
    val_acc = 0.0

    # Training phase
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        scheduler.step()  # update the learning-rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train loss {} train acc {}".format(e+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))

    # Validation phase (no gradients needed)
    model.eval()
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(valid_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            loss = loss_fn(out, label)
            val_acc += calc_accuracy(out, label)
    print("epoch {} valid loss {} valid acc {}".format(e+1, loss.data.cpu().numpy(), val_acc / (batch_id+1)))

    if val_acc / (batch_id+1) > criterion:
        # Model improved -> update the best validation score
        criterion = val_acc / (batch_id+1)
        check_point = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict()
        }
        # Model improved -> save the checkpoint to Drive
        print("Update model save...")
        torch.save(check_point, os.path.join(ROOT_PATH, 'model_best.pt'))
print('Training Finish!')
9. Applying the trained model
class TestDataset(Dataset):
    def __init__(self, dataset, sent_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        self.categories = mapping
        self.num_labels = len(self.categories)
        self.sentences = [transform([i]) for i in dataset]

    def label_decoder(self, labels):
        # Map integer predictions back to the original category names
        try:
            return list(map(lambda x: self.categories[x], labels))
        except KeyError:
            raise ValueError('Invalid label')

    def __len__(self):
        return (len(self.sentences))

    def __getitem__(self, i):
        return (self.sentences[i])
dataset_test = testdf['text'].tolist()
data_test = TestDataset(dataset_test, 0, tok, max_len, True, False)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)
#MODEL_DIR = os.path.join(ROOT_PATH, 'model_best.pt')
#model = BertClassifier(bertmodel, dr_rate=0.5).to(device)
#model.load_state_dict(torch.load(MODEL_DIR)['model'])
model.eval()
pred = []
with torch.no_grad():
    for batch_id, (token_ids, valid_length, segment_ids) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        pred.extend(out.argmax(dim=1).tolist())
        # Print progress
        if batch_id % 10 == 0:
            print(f'Prediction: {batch_id}/{len(test_dataloader)} completed')
pred = data_test.label_decoder(pred)
print('decode Completed!')
Results
The model performed reasonably well (accuracy 0.847, F1 score 0.845, as noted at the top).
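To reproduce the accuracy and F1 figures, one option is scikit-learn's metrics. This sketch assumes the test dataframe also has a ground-truth 'category' column holding the original string labels (a hypothetical column; adjust to your data), and uses macro averaging for F1, which may differ from how the original figure was computed.
from sklearn.metrics import accuracy_score, f1_score

y_true = testdf['category'].tolist()   # assumed ground-truth string labels
print('Accuracy:', accuracy_score(y_true, pred))
print('F1 (macro):', f1_score(y_true, pred, average='macro'))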