https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertForSequenceClassification
Almost every NLP task these days is handled with a Transformer model.
This time, the plan is to get a taste of the transformers library by running a simple classification example with the pretrained BERT model.
First, let's take a quick look at BERT's MLM (masked language modeling) head; a small check is sketched below, right after the imports.
Training a classifier on movie review data
pip install transformers
pip install datasets
from datasets import load_dataset
data = load_dataset("imdb")
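To see what was downloaded, a quick peek at one record is enough (the index 0 below is just an arbitrary example):

print(data)               # DatasetDict with train / test / unsupervised splits
sample = data['train'][0]
print(sample.keys())      # dict_keys(['text', 'label'])
print(sample['label'])    # 0 = negative, 1 = positive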
import torch
from transformers import BertTokenizer, BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
# bert-base-uncased was pretrained with MLM only, so the weights that are specific to
# BertForSequenceClassification (the classification head) are newly initialized and still have to be trained
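A minimal, purely illustrative check that the head really is a fresh linear layer sitting on top of the pretrained encoder:

print(model.classifier)   # Linear(in_features=768, out_features=2, bias=True), randomly initialized
print(model.num_labels)   # 2 by default; pass num_labels=... to from_pretrained for more classes

This is also why from_pretrained prints a warning that classifier.weight and classifier.bias were newly initialized.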
from pprint import pprint
from transformers import BertConfig, BertForMaskedLM
from transformers.models.bert.tokenization_bert_fast import BertTokenizerFast
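A minimal sketch of the MLM check mentioned above, reusing the imports just loaded; the example sentence is made up and the exact top predictions depend on the checkpoint:

mlm_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
mlm_model = BertForMaskedLM.from_pretrained("bert-base-uncased")
mlm_model.eval()

inputs = mlm_tokenizer("The movie was really [MASK].", return_tensors="pt")
with torch.no_grad():
    logits = mlm_model(**inputs).logits

# locate the [MASK] position and list the 5 most likely fillers
mask_pos = (inputs["input_ids"][0] == mlm_tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
top_ids = logits[0, mask_pos].topk(5, dim=-1).indices[0].tolist()
pprint(mlm_tokenizer.convert_ids_to_tokens(top_ids))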
# Preprocessing: strip HTML tags left in the raw reviews (e.g. <br />) and collapse whitespace
import re
def preprocess(sample):
    return {
        'text': ' '.join(re.sub(r'<[^(?:/>)]+/>', ' ', sample['text']).split()),
        'label': sample['label'],
    }
preprocessed = data.map(preprocess)
preprocessed
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")  # the fast (Rust-backed) BERT tokenizer
# or
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)  # also resolves to BertTokenizerFast
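A quick look at what the tokenizer returns (the sentence below is only an illustrative example):

encoded = tokenizer("This movie was great!")
print(encoded.keys())  # dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
print(tokenizer.convert_ids_to_tokens(encoded['input_ids']))
# ['[CLS]', 'this', 'movie', 'was', 'great', '!', '[SEP]']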
preprocessed = preprocessed.map(
    lambda sample: tokenizer(sample['text'], truncation=True),  # truncation: clip reviews longer than the 512-token limit
    remove_columns=['text'],
    batched=True
)
# batched=True feeds the examples to the tokenizer in batches (1,000 examples at a time by default)
# padding is deliberately skipped here; sequences only need to be padded to a common length when each batch is built
from transformers import DataCollatorWithPadding
collator = DataCollatorWithPadding(tokenizer)
from torch.utils.data import DataLoader
train_loader = DataLoader(preprocessed['train'],batch_size=16, collate_fn=collator, shuffle=True)
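To confirm that the collator pads dynamically, one batch can be inspected like this (the exact sequence length varies from batch to batch):

batch = next(iter(train_loader))
print({key: value.shape for key, value in batch.items()})
# e.g. {'input_ids': torch.Size([16, 317]), 'token_type_ids': torch.Size([16, 317]),
#       'attention_mask': torch.Size([16, 317]), 'labels': torch.Size([16])}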
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
#or
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
import torch
# When fine-tuning, we want the randomly initialized classifier head (which never saw the MLM pretraining)
# to learn faster than the pretrained encoder, so the two parameter groups get different learning rates.
optimizer = torch.optim.AdamW(
    [
        {"params": model.bert.parameters(), "lr": 3e-5},        # pretrained encoder: small learning rate
        {"params": model.classifier.parameters(), "lr": 1e-3},  # fresh classification head: larger learning rate
    ]
)
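A small, purely illustrative sanity check that these two groups together cover the whole model:

n_encoder = sum(p.numel() for p in model.bert.parameters())
n_head = sum(p.numel() for p in model.classifier.parameters())
n_total = sum(p.numel() for p in model.parameters())
print(n_encoder, n_head, n_total)  # roughly 110M encoder parameters vs. a ~1.5K-parameter head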
model.cuda()   # the batches are moved to the GPU below, so the model has to live there too
model.train()
for epoch in range(3):
    print(f"Epoch: {epoch}")
    for encodings in train_loader:
        encodings = {key: value.cuda() for key, value in encodings.items()}
        outputs = model(**encodings)  # 'labels' is in the batch, so outputs.loss is computed for us
        outputs.loss.backward()
        print('\rLoss: ', outputs.loss.item(), end='')
        optimizer.step()
        optimizer.zero_grad(set_to_none=False)
Check that the loss decreases steadily as training runs.
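Once the loss has come down, a quick spot check on a made-up review (purely illustrative; IMDB uses 0 = negative, 1 = positive):

model.eval()
with torch.no_grad():
    enc = tokenizer("One of the best films I have seen in years.", return_tensors="pt").to(model.device)
    pred = model(**enc).logits.argmax(dim=-1).item()
print(pred)  # expect 1 (positive) if fine-tuning worked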
This whole training loop can be done much more easily with Hugging Face's Trainer.
https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments
# Run everything above up through the DataCollatorWithPadding and
# AutoModelForSequenceClassification steps first, then hand things over to the Trainer.
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    num_train_epochs=3.0,
    per_device_train_batch_size=16,
    output_dir='dump/test'
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed['train'],
    eval_dataset=preprocessed['test'],
    data_collator=collator
)
trainer.train()
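Because eval_dataset was passed in, the test split can be scored with one extra call once training finishes (only the loss is reported unless a compute_metrics function is supplied):

metrics = trainer.evaluate()
print(metrics)  # includes eval_loss, eval_runtime, eval_samples_per_second, ...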