-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathml13.py
72 lines (59 loc) · 2.33 KB
/
ml13.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#K-means Clustering
#New Copy (Continue)
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
# ----------------------------------------------------------------------
# K-means clustering of customers on the Age vs Income($) plane.
# Expects income.csv with (at least) columns 'Age' and 'Income($)'.
# ----------------------------------------------------------------------
df = pd.read_csv("income.csv")

# --- First pass: cluster the RAW (unscaled) data -----------------------
# BUG FIX: the original called km.fit(...), which returns the fitted
# estimator itself (not labels), and then stored that estimator object in
# df['cluster'].  fit_predict(...) returns one cluster label per row.
km = KMeans(n_clusters=3, n_init=10)  # k chosen as 3 (see elbow plot below)
y_predicted = km.fit_predict(df[['Age', 'Income($)']])
df['cluster'] = y_predicted  # which cluster each (Age, Income) row fell into
# NOTE: on raw data the centroids (km.cluster_centers_) look shifted from
# the visual clusters because Income($) is in the tens of thousands while
# Age is ~20-40, so Income completely dominates the Euclidean distance.

# --- Second pass: rescale features to [0, 1] before clustering ---------
# FIX: the original scaled only Income($); scale BOTH features so each
# contributes comparably to the distance metric.
scaler = MinMaxScaler()
df['Income($)'] = scaler.fit_transform(df[['Income($)']])
df['Age'] = scaler.fit_transform(df[['Age']])

km = KMeans(n_clusters=3, n_init=10)
y_predicted = km.fit_predict(df[['Age', 'Income($)']])
df['cluster'] = y_predicted  # overwrite labels with the scaled-data result
# Visualize the three clusters on the scaled Age/Income plane, one colour
# per cluster label, with the fitted centroids overlaid as purple stars.
for label, colour in enumerate(['green', 'red', 'black']):
    members = df[df.cluster == label]
    plt.scatter(members.Age, members['Income($)'], color=colour)
centres = km.cluster_centers_
plt.scatter(centres[:, 0], centres[:, 1],
            color='purple', marker='*', label='centroid')
plt.xlabel('Age')
plt.ylabel('Income ($)')
plt.legend()
plt.show()
# --- Elbow plot: pick the optimal K ------------------------------------
# Inertia (SSE) always decreases as K grows; the "elbow" where the curve
# flattens marks the best trade-off value of K.
sse = []  # sum of squared errors (km.inertia_) for each candidate K
k_rng = range(1, 10)
for k in k_rng:
    # BUG FIX: the original built KMeans(n_init=10) WITHOUT n_clusters=k,
    # so the loop variable was never used — every iteration fitted the
    # default K=8 and the elbow curve was flat and meaningless.
    km = KMeans(n_clusters=k, n_init=10)
    km.fit(df[['Age', 'Income($)']])
    sse.append(km.inertia_)
plt.xlabel('K')
plt.ylabel('Sum of Squared Error')
plt.plot(k_rng, sse)
plt.show()