-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain_test_split_model.py
43 lines (27 loc) · 1.53 KB
/
train_test_split_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import json
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import os
import json
# Prepare the train/test artifacts and a local copy of the base model.
#
# Steps, in order:
#   1. Load the raw inputs and labels from ``artifact/inputs.json`` and
#      ``artifact/outputs.json``.
#   2. Load the pretrained sequence-classification model and the tokenizer
#      stored under ``artifact/tokenizer``.
#   3. Split the data 80/20 with a fixed seed, tokenize both splits, and turn
#      the labels into tensors.
#   4. Save the four tensors and the model back into ``artifact/``.

ARTIFACT_DIR = "artifact"
BASE_MODEL_NAME = "sbcBI/sentiment_analysis_model"
NUM_LABELS = 9

# Not referenced again in this script; kept so the module-level name still exists.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

file_path1 = os.path.join(ARTIFACT_DIR, "inputs.json")
file_path2 = os.path.join(ARTIFACT_DIR, "outputs.json")

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale, which can mis-decode or reject non-ASCII text.
with open(file_path1, "r", encoding="utf-8") as f:
    X = json.load(f)
with open(file_path2, "r", encoding="utf-8") as f:
    Y = json.load(f)

# ignore_mismatched_sizes=True lets the checkpoint load even though the
# requested number of labels may differ from the checkpoint's own head.
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL_NAME,
    num_labels=NUM_LABELS,
    ignore_mismatched_sizes=True,
    problem_type="multi_label_classification",
)
tokenizer_path = os.path.join(ARTIFACT_DIR, "tokenizer")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Fixed random_state keeps the split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Same tokenizer settings for both splits so their tensors are built alike.
tokenize_kwargs = {
    "padding": "max_length",
    "truncation": True,
    "return_tensors": "pt",
}
X_train_tokenized = tokenizer(X_train, **tokenize_kwargs)
X_test_tokenized = tokenizer(X_test, **tokenize_kwargs)

# NOTE(review): torch.tensor() infers the dtype from the JSON values, so
# integer 0/1 labels become an integer tensor. The multi-label problem type
# presumably needs float labels for its loss -- confirm the training script
# casts them, or cast here.
Y_train_tensor = torch.tensor(Y_train)
Y_test_tensor = torch.tensor(Y_test)

torch.save(X_test_tokenized, os.path.join(ARTIFACT_DIR, "X_test_tokenized.pt"))
torch.save(Y_test_tensor, os.path.join(ARTIFACT_DIR, "Y_test_tensor.pt"))
torch.save(X_train_tokenized, os.path.join(ARTIFACT_DIR, "X_train_tokenized.pt"))
torch.save(Y_train_tensor, os.path.join(ARTIFACT_DIR, "Y_train_tensor.pt"))

model.save_pretrained(os.path.join(ARTIFACT_DIR, "downloaded_model"))
print("Done splitting the data and downloaded the model")