Movie Review Sentiment Classification with a Pre-trained BERT
There are two ways to do this:
1. Use the Trainer class from the transformers library.
2. Write the training loop yourself in PyTorch.
It is worth being comfortable with both. With option 1, fine-grained customization of the training process can be a bit awkward, so option 2 is reportedly the more common preference; the sketch right below illustrates the difference.
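To make that point concrete: with the Trainer, changing something as basic as the loss function means subclassing it, whereas in a plain PyTorch loop you simply edit the loop body. A rough sketch, assuming a binary task; the class name and the class weighting are hypothetical and not from the original post:

import torch
from transformers import Trainer

class WeightedLossTrainer(Trainer):
    # Hypothetical tweak: up-weight one class in the cross-entropy loss.
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0], device=logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss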
from datasets import load_dataset
from transformers import AutoTokenizer

# Assumed setup (not shown in the original post): an IMDB-style movie-review dataset and a BERT tokenizer.
raw_datasets = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer.model_max_length = 512

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

small_train_dataset = tokenized_datasets['train'].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets['test'].shuffle(seed=42).select(range(1000))
full_train_dataset = tokenized_datasets['train']
full_eval_dataset = tokenized_datasets['test']
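A quick look at what map() produced helps catch tokenization mistakes early; a minimal check, assuming the dataset and tokenizer loaded above:

# Each example should now carry input_ids, token_type_ids and attention_mask
# alongside the original text and label columns.
print(tokenized_datasets['train'].column_names)
print(small_train_dataset[0]['input_ids'][:10])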
First method
# Training a movie-review classifier with the Transformers library
from transformers import TrainingArguments, Trainer, BertForSequenceClassification

training_args = TrainingArguments("test_trainer")

# Start from a pre-trained sentiment-analysis checkpoint.
model = BertForSequenceClassification.from_pretrained('finiteautomata/beto-sentiment-analysis')

# To train/evaluate on the full dataset, pass full_train_dataset and full_eval_dataset instead.
trainer = Trainer(model=model, args=training_args,
                  train_dataset=small_train_dataset, eval_dataset=small_eval_dataset)
trainer.train()
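The TrainingArguments above only set the output directory and otherwise rely on defaults. If more control is wanted, the usual knobs can be passed explicitly; the values below are illustrative, not taken from the post:

training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,
    evaluation_strategy="epoch",  # may be named eval_strategy in newer transformers versions
)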
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Rebuild the Trainer with compute_metrics so that evaluate() reports accuracy.
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=small_train_dataset,
                  eval_dataset=small_eval_dataset,
                  compute_metrics=compute_metrics)
trainer.evaluate()
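Once evaluate() is done, the fine-tuned model can be queried directly. A minimal sketch: the review text is made up, and what each label id means depends on the checkpoint's config:

import torch

text = "This movie was a complete waste of time."
inputs = tokenizer(text, return_tensors="pt", truncation=True)
inputs = {k: v.to(model.device) for k, v in inputs.items()}
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.argmax(dim=-1).item())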
Second method
# Training a movie-review classifier with a plain PyTorch loop
from transformers import AdamW

model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
optimizer = AdamW(model.parameters(), lr=5e-5)

# Keep only the columns the model expects, rename the label column, and return PyTorch tensors.
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
# As before, we will use only 1,000 examples each for training and evaluation.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
from torch.utils.data import DataLoader
train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=32)
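Before training, it is worth peeking at one batch to confirm that remove_columns / rename_column / set_format produced what the model expects:

batch = next(iter(train_dataloader))
# Expect labels, input_ids, token_type_ids and attention_mask, each shaped [8, 512].
print({k: v.shape for k, v in batch.items()})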
import torch
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))
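The loop below keeps the learning rate fixed at 5e-5. The Hugging Face fine-tuning tutorial additionally uses a linear decay schedule; if wanted, it could be created like this and stepped right after optimizer.step() (optional, not part of the original code):

from transformers import get_scheduler

lr_scheduler = get_scheduler("linear", optimizer=optimizer,
                             num_warmup_steps=0, num_training_steps=num_training_steps)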
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move the batch to the GPU (if available) and take one optimization step.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        progress_bar.update(1)
metric = load_metric("accuracy")

model.eval()
all_pred = []
all_ref = []
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    all_pred.append(predictions.cpu().detach().numpy())
    all_ref.append(batch['labels'].cpu().detach().numpy())
    metric.add_batch(predictions=predictions, references=batch['labels'])

metric.compute()
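The all_pred / all_ref lists collected above are not strictly needed for the metric, but they make it easy to recompute or inspect the results by hand, for example:

import numpy as np

accuracy = (np.concatenate(all_pred) == np.concatenate(all_ref)).mean()
print(accuracy)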