BertForSequenceClassification docs: https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertForSequenceClassification
These days almost every NLP task is handled with a transformer model.
This time the plan is to get a taste of the transformers library by running a simple classification example with the pretrained BERT model.
First, let's take a quick look at BERT's MLM.
Training a classifier on movie review data
pip install transformers
pip install datasets
from datasets import load_dataset
data = load_dataset("imdb")
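The returned DatasetDict can be inspected right away; a quick sketch (in the IMDB split, label 0 is negative and 1 is positive):
print(data)              # shows the train / test / unsupervised splits
print(data['train'][0])  # one raw review and its label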
import torch
from transformers import BertTokenizer, BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
# bert-base-uncased was pretrained with MLM; the weights specific to BertForSequenceClassification (the classifier head) are not pretrained and are newly initialized
from pprint import pprint
from transformers import BertConfig, BertForMaskedLM
from transformers.models.bert.tokenization_bert_fast import BertTokenizerFast
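The imports above are for probing the MLM head. A minimal sketch of what that check could look like (the example sentence and the top-5 display are my own assumptions, not from the original post):
# load the MLM head and a fast tokenizer for bert-base-uncased
mlm_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
mlm_model = BertForMaskedLM.from_pretrained("bert-base-uncased")

inputs = mlm_tokenizer("The movie was really [MASK].", return_tensors="pt")
with torch.no_grad():
    logits = mlm_model(**inputs).logits

# take the top-5 token predictions at the [MASK] position
mask_idx = (inputs["input_ids"] == mlm_tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
top5 = logits[0, mask_idx].topk(5).indices[0].tolist()
pprint(mlm_tokenizer.convert_ids_to_tokens(top5))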
# Preprocessing: strip the <br /> tags that IMDB reviews contain and collapse repeated whitespace
import re

def preprocess(sample):
    return {
        'text': ' '.join(re.sub(r'<[^(?:/>)]+/>', ' ', sample['text']).split()),
        'label': sample['label']
    }
preprocessed = data.map(preprocess)
preprocessed
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})
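To confirm the <br /> tags are really gone, comparing one review before and after the map is enough (a quick sketch, output truncated to 200 characters):
print(data['train'][0]['text'][:200])          # raw review, may contain <br /> tags
print(preprocessed['train'][0]['text'][:200])  # cleaned review with the tags stripped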
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # the slow (pure-Python) tokenizer
# or
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)  # returns a BertTokenizerFast
preprocessed = preprocessed.map(
    lambda sample: tokenizer(sample['text'], truncation=True),  # truncation: cut sequences longer than the model max length (512 tokens)
    remove_columns=['text'],
    batched=True
)
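After this map each sample carries the tokenizer outputs instead of the raw text; a quick check of the resulting columns (a sketch, column order may differ):
print(preprocessed['train'].column_names)
# expected something like: ['label', 'input_ids', 'token_type_ids', 'attention_mask']
print(len(preprocessed['train'][0]['input_ids']))  # length varies per review (at most 512)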
# batched=True tokenizes the examples in batches (1,000 examples at a time by default)
# the tokenized sequences have different lengths, so they need padding whenever a single batch is formed
from transformers import DataCollatorWithPadding
collator = DataCollatorWithPadding(tokenizer)
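The collator pads a list of variable-length samples into one rectangular batch of tensors; a small sketch of calling it directly on a few samples:
batch = collator([preprocessed['train'][i] for i in range(4)])
print(batch['input_ids'].shape)       # (4, longest_sequence_in_this_batch)
print(batch['attention_mask'].shape)  # padded positions are masked out with 0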
from torch.utils.data import DataLoader
train_loader = DataLoader(preprocessed['train'], batch_size=16, collate_fn=collator, shuffle=True)
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
#or
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
import torch  # when fine-tuning, we want the part that was not learned during MLM pretraining (the classifier head) to be trained, so it gets a larger learning rate
optimizer = torch.optim.AdamW(
    [
        {"params": model.bert.parameters(), "lr": 3e-5},
        {"params": model.classifier.parameters(), "lr": 1e-3},
    ]
)
model.cuda()  # the batches are moved to the GPU below, so the model has to be as well
model.train()
for epoch in range(3):
    print(f"Epoch: {epoch}")
    for encodings in train_loader:
        encodings = {key: value.cuda() for key, value in encodings.items()}
        outputs = model(**encodings)
        outputs.loss.backward()
        print('\rLoss: ', outputs.loss.item(), end='')
        optimizer.step()
        optimizer.zero_grad(set_to_none=False)
Check that the loss gradually decreases.
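Beyond watching the training loss, a quick accuracy pass over the test split is a useful sanity check; a sketch, assuming the model is still on the GPU and the same collator is reused:
from torch.utils.data import DataLoader

test_loader = DataLoader(preprocessed['test'], batch_size=32, collate_fn=collator)

model.eval()
correct = 0
with torch.no_grad():
    for encodings in test_loader:
        encodings = {key: value.cuda() for key, value in encodings.items()}
        preds = model(**encodings).logits.argmax(dim=-1)  # predicted class per review
        correct += (preds == encodings['labels']).sum().item()
print('\nAccuracy:', correct / len(preprocessed['test']))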
This whole process can be done much more easily with Hugging Face's Trainer.
Trainer / TrainingArguments docs: https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments
# run everything up through the DataCollatorWithPadding and
# AutoModelForSequenceClassification setup above first, then:
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    num_train_epochs=3.0,
    per_device_train_batch_size=16,
    output_dir='dump/test'
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed['train'],
    eval_dataset=preprocessed['test'],
    data_collator=collator
)
trainer.train()
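Since an eval_dataset was passed, the same Trainer can also run evaluation; a small sketch (note that without a compute_metrics function it only reports the evaluation loss and runtime stats, not accuracy):
metrics = trainer.evaluate()
print(metrics)  # e.g. eval_loss; add compute_metrics to Trainer for accuracy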