#Principal Component Analysis (PCA)
#It is used to reduce the number of columns (features) in a dataset
from sklearn.datasets import load_digits
import pandas as pd
dataset=load_digits()
#print(dataset.keys())
#print(dataset.data.shape) #(1797,64)
#print(dataset.data[0])
#print(dataset.data[0].reshape(8,8)) #for reshaping the data
import matplotlib.pyplot as plt
plt.gray()
plt.matshow(dataset.data[0].reshape(8,8)) #Displays the 0th sample as an 8x8 image
plt.matshow(dataset.data[9].reshape(8,8)) #Displays the 9th sample as an 8x8 image
#plt.show()
#print(dataset.target[:5]) #Prints the first 5 targets (0,1,2,3,4)
df=pd.DataFrame(dataset.data,columns=dataset.feature_names)
#Converting the whole dataset into a DataFrame (df)
#Let's look at the target values
#print(dataset.target) #digits 0 to 9
#Let's see the basic statistics of the dataset
#print(df.describe())
X=df
y=dataset.target
from sklearn.preprocessing import StandardScaler
#Let's preprocess the data (scaling)
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X) #Each feature is scaled to zero mean and unit variance
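#A minimal sanity check (illustrative): after scaling, each feature should have mean ~0 and std ~1
#(pixels that are constant in the data end up as all zeros, since StandardScaler guards against dividing by zero variance)
#print(X_scaled.mean(axis=0).round(2))
#print(X_scaled.std(axis=0).round(2))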
#Let's split the data into train and test sets
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_scaled,y,test_size=0.2,random_state=30) #Splitting the scaled features
#Applying Logistic Regression
from sklearn.linear_model import LogisticRegression
model=LogisticRegression(max_iter=1000) #max_iter raised so the solver converges on all 64 features
model.fit(X_train,y_train) #Training on the training set
print(model.score(X_test,y_test)) #Accuracy of the model on the test set (~96%)
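#A quick illustrative check of the trained model: compare a few predictions against the true labels
#print(model.predict(X_test[:5])) #predicted digits for the first 5 test samples
#print(y_test[:5]) #their actual labels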
#Let's use PCA to reduce dimensions
#The original shape of the dataset is (1797, 64)
from sklearn.decomposition import PCA
pca=PCA(0.95) #Keep enough components to retain 95% of the variance, i.e. the components with the most impact on prediction
x_pca=pca.fit_transform(X) #PCA is fit on the unscaled features here, which is what gives the 29 components below
#print(x_pca.shape) #After the transform we get (1797, 29)
#print(pca.explained_variance_ratio_) #Variance explained by each component
print(pca.n_components_) #Number of components (columns) kept
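#A minimal sketch of how PCA(0.95) arrives at that count: it keeps adding components until the cumulative explained variance crosses 95%
#import numpy as np
#print(np.cumsum(pca.explained_variance_ratio_)) #the last value is the first to reach 0.95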
X_train_pca, X_test_pca, y_train, y_test = train_test_split(x_pca, y, test_size=0.2, random_state=30)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_pca, y_train)
print(model.score(X_test_pca, y_test)) #The score is still ~96%, but with much less computational cost because we dropped most of the columns
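#Illustrative sketch: inverse_transform maps the 29 components back to the original 64-pixel space,
#so you can visually compare how much image detail 95% of the variance preserves
#reconstructed = pca.inverse_transform(x_pca)
#plt.matshow(reconstructed[0].reshape(8,8)) #compare with the plot of dataset.data[0] above
#plt.show()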
#But this doesn't mean PCA works this well on every dataset; sometimes the accuracy may degrade
#Let's see what happens if we select only 2 components (the most impactful ones)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
print(X_pca.shape) #(1797,2)
#Let's look at the variance ratio
print(pca.explained_variance_ratio_)
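#With just 2 components the whole dataset can be plotted; an illustrative scatter colored by digit class:
#plt.figure()
#plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='tab10', s=10)
#plt.colorbar(label='digit')
#plt.show()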
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=30)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_pca, y_train)
print(model.score(X_test_pca, y_test))
#We get much lower accuracy (~60%) because only 2 components don't retain enough of the feature information
#However, in real life you will find many cases where 2 or a few PCA components still give pretty good accuracy