What is K-means
物以類聚的概念,K-means
的 K
就是幾群的意思。利用距離和群心的計算去完成聚類的任務。
K-means Algorithm
- 先設定 K 要分為幾群
- 輸入特徵
- 為 K 個群心計算 Euclidean distance
- 把每個資料分群至距離最短的該群心
- 重新計算各群的群心
- 不斷重複 3-5,直到收斂
Python sklearn example
import pandas as pd
import numpy as np
import sklearn.metrics as sm
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import datasets
from sklearn import preprocessing
url = "https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv"
data = pd.read_csv(url)
le = preprocessing.LabelEncoder()
data['species'] = le.fit_transform(data.iloc[:,-1])
X = data.iloc[:, 0:4].to_numpy()
y = data.iloc[:,-1].to_numpy()
'''
n_clusters 分幾群
n_init 運行幾次
init 選擇質心的演算法
verbose 為 True 顯示訓練過程
'''
model = KMeans(n_clusters=3, n_init=3, init='k-means++', verbose=True)
model.fit(X)
from sklearn.metrics import accuracy_score, confusion_matrix
print(accuracy_score(y, model.labels_))
print(confusion_matrix(y, model.labels_))
colormap=np.array(['Red','green','blue'])
plt.scatter(data.petal_length, data.petal_width, c=colormap[y], s=50)
plt.title('Classification Actually')
plt.show()
plt.scatter(data.petal_length, data.petal_width, c=colormap[model.labels_], s=50)
plt.title('Classification Prediction')
plt.show()
Implement K-means
# Function: K Means
# -------------
# K-Means is an algorithm that takes in a dataset and a constant
# k and returns k centroids (which define clusters of data in the
# dataset which are similar to one another).
import numpy as np
import matplotlib.pyplot as plt
MAX_ITERATIONS = 10000
# Function: K Means
# -------------
# K-Means is an algorithm that takes in a dataset and a constant
# k and returns k centroids (which define clusters of data in the
# dataset which are similar to one another).
def kmeans(dataSet, k):
# Initialize centroids randomly
# numFeatures = dataSet.getNumFeatures()
numFeatures = dataSet.shape[1]
centroids = getRandomCentroids(numFeatures, k)
# Initialize book keeping vars.
iterations = 0
oldCentroids = None
# Run the main k-means algorithm
while not shouldStop(oldCentroids, centroids, iterations):
# Save old centroids for convergence test. Book keeping.
oldCentroids = centroids
iterations += 1
# Assign labels to each datapoint based on centroids
labels = getLabels(dataSet, centroids)
# Assign centroids based on datapoint labels
centroids = getCentroids(dataSet, labels, k)
# We can get the labels too by calling getLabels(dataSet, centroids)
return centroids
# Function: Should Stop
# -------------
# Returns True or False if k-means is done. K-means terminates either
# because it has run a maximum number of iterations OR the centroids
# stop changing.
def shouldStop(oldCentroids, centroids, iterations):
if iterations > MAX_ITERATIONS: return True
return (oldCentroids == centroids).any()
# Function: Get Labels
# -------------
# Returns a label for each piece of data in the dataset.
def getLabels(dataSet, centroids):
# For each element in the dataset, chose the closest centroid.
# Make that centroid the element's label.
label_list = np.zeros(len(dataSet))
for i in range(len(dataSet)):
old_distance = getDistance(dataSet[i], centroids[0])
label = 0
for j in range(len(centroids)):
distance = getDistance(dataSet[i], centroids[j])
if old_distance > distance:
label = j
label_list[i] = label
return label_list
# Function: Get Centroids
# -------------
# Returns k random centroids, each of dimension n.
def getCentroids(dataSet, labels, k):
# Each centroid is the geometric mean of the points that
# have that centroid's label. Important: If a centroid is empty (no points have
# that centroid's label) you should randomly re-initialize it.
col_num = dataSet.shape[1]
sum_ = np.zeros([k, col_num])
for i in range(k):
index_k = np.where(labels == i)[0]
sum_[i] = np.sum(dataSet[index_k], axis = 0) /(len(index_k))
return sum_
def getRandomCentroids(numFeatures, k):
centroids = np.random.normal(0,1,(k,numFeatures))
return centroids
def getDistance(d1, d2):
vec1 = np.array(d1)
vec2 = np.array(d2)
return np.sqrt(np.sum(np.square(vec1-vec2)))
from sklearn.datasets import make_blobs
if __name__ == '__main__' :
X_varied, y = make_blobs(n_samples=200,
centers=5,
n_features=2,
# cluster_std=[1.0, 2.5, 0.5],
random_state=123)
print(X_varied.shape)
k = 5
c = kmeans(X_varied, k)
print(c)
for i in range(len(X_varied)):
plt.plot(X_varied[i, 0], X_varied[i, 1], marker='o', markersize = 5)
mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
for i in range(k):
plt.plot(c[i, 0], c[i, 1], mark[i], markersize = 12)
plt.show()
更新日期
- 2010/01/01 新增實作 k-means 部分(還會繼續修正)