# defect-detection.py
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tqdm import tqdm
# https://huggingface.co/datasets/mcanoglu/defect-detection
def load_split(path):
    # Keep only Java samples whose source is under 512 characters; character
    # length is used here as a cheap proxy for the model's 512-token limit.
    df = pd.read_json(path, lines=True)
    df = df[df['code'].str.len() < 512]
    return df[df['programming_language'] == 'Java']

train_df = load_split('./data/train.jsonl')
val_df = load_split('./data/validation.jsonl')
test_df = load_split('./data/test.jsonl')

x_train, y_train = list(train_df['code']), list(train_df['label'])
x_val, y_val = list(val_df['code']), list(val_df['label'])
x_test, y_test = list(test_df['code']), list(test_df['label'])
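# Optional sanity check (not in the original script): defect datasets are
# frequently imbalanced, which is why precision/recall/F1 are reported
# alongside accuracy at the end.
print('train label counts:')
print(train_df['label'].value_counts())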
tokenizer = RobertaTokenizer.from_pretrained('neulab/codebert-java')
# The checkpoint is a masked-LM model, so the sequence-classification head is
# freshly initialized here and learned during fine-tuning (num_labels defaults to 2).
model = RobertaForSequenceClassification.from_pretrained('neulab/codebert-java')
# To freeze all layers except the classifier head, uncomment:
# for name, param in model.named_parameters():
#     if 'classifier' not in name:
#         param.requires_grad = False
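# Optional check (added, not in the original): if the freeze above is enabled,
# this should report only the classifier head's parameters as trainable.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f'trainable parameters: {trainable:,} / {total:,}')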
class PreEncodedCustomDataset(Dataset):
    """Tokenizes all texts up front so __getitem__ is a cheap lookup."""

    def __init__(self, texts, labels, tokenizer, max_len):
        self.labels = [int(label) for label in labels]
        self.encodings = [
            tokenizer(text, truncation=True, padding='max_length',
                      max_length=max_len, return_tensors='pt')
            for text in texts
        ]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Squeeze away the batch dimension added by return_tensors='pt'.
        encoding = {key: val.squeeze() for key, val in self.encodings[idx].items()}
        encoding['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return encoding
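# Quick shape check (added sketch): after squeezing the batch dimension, each
# item should be a dict of 512-long tensors plus a scalar label.
_sample = PreEncodedCustomDataset([x_train[0]], [y_train[0]], tokenizer, max_len=512)[0]
print(_sample['input_ids'].shape, _sample['attention_mask'].shape, _sample['labels'])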
train_dataset = PreEncodedCustomDataset(x_train, y_train, tokenizer, max_len=512)
val_dataset = PreEncodedCustomDataset(x_val, y_val, tokenizer, max_len=512)
# The Trainer below consumes these Dataset objects directly; the DataLoaders
# are only needed by the manual-loop sketch further down.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
# RobertaForSequenceClassification (like BertForSequenceClassification) has a
# built-in classifier head; this custom head is an alternative that can be
# swapped in by uncommenting the assignment below.
num_hidden_units = model.config.hidden_size
num_classes = 2  # binary classification: defective vs. non-defective
classifier_head = nn.Sequential(
    nn.Linear(num_hidden_units, 128),
    nn.ReLU(),
    nn.Dropout(0.1),
    nn.Linear(128, num_classes)
)
# model.classifier = classifier_head

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The optimizer, loss, and epoch count are only used by the manual-loop sketch
# below; the Trainer manages its own (adafactor) optimizer during training.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()
num_epochs = 1
# model.to(device)  # not needed here: Trainer moves the model to the device itself
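# Reference sketch (added; never called): the manual training loop that the
# optimizer, criterion, num_epochs, and train_loader above would serve. The
# Trainer below is what actually performs training in this script.
def manual_train(model, loader, optimizer, criterion, device, num_epochs):
    model.to(device)
    model.train()
    for epoch in range(num_epochs):
        for batch in tqdm(loader, desc=f'Epoch {epoch + 1}'):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            logits = model(input_ids, attention_mask=attention_mask).logits
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()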
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=2,   # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    logging_dir='./logs',            # directory for storing logs
    fp16=True,                       # mixed-precision training
    gradient_accumulation_steps=4,   # effective train batch size = 2 * 4 = 8 per device
    evaluation_strategy="epoch",     # run validation at the end of every epoch
    optim="adafactor",
    weight_decay=0.001,
    warmup_steps=20,
    # resume_from_checkpoint='./results/checkpoint-500',
    save_steps=2000000,              # effectively disables intermediate checkpoints
)
trainer = Trainer(
    model=model,                  # the instantiated Transformers model to be trained
    args=training_args,           # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=val_dataset,     # evaluation dataset
)
trainer.train()
# trainer.evaluate()
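# Optional (added; './results/final' is an assumed path): persist the
# fine-tuned weights so evaluation can be rerun later without retraining.
trainer.save_model('./results/final')
tokenizer.save_pretrained('./results/final')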
test_dataset = PreEncodedCustomDataset(x_test, y_test, tokenizer, max_len=512)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

predictions = []
true_labels = []
model.to(device)  # no-op after GPU training, but makes standalone runs robust
model.eval()      # disable dropout for deterministic evaluation
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Test'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)
print(f'Test Accuracy: {accuracy * 100:.2f}%')
print(f'Test Precision: {precision * 100:.2f}%')
print(f'Test Recall: {recall * 100:.2f}%')
print(f'Test F1: {f1 * 100:.2f}%')
cm = confusion_matrix(true_labels, predictions)
print("Confusion matrix:")
print(cm)
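# Cross-check (added): sklearn's binary confusion matrix is laid out as
# [[TN, FP], [FN, TP]], so precision = TP/(TP+FP) and recall = TP/(TP+FN).
tn, fp, fn, tp = cm.ravel()
print(f'TN={tn}  FP={fp}  FN={fn}  TP={tp}')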