ML Day 1: K-means

December 31, 2019

What is K-means

K-means is built on the idea that "birds of a feather flock together": the K in K-means is the number of clusters. The algorithm completes the clustering task by computing distances between data points and cluster centroids.

K-means Algorithm

  1. Choose K, the number of clusters.
  2. Feed in the features.
  3. Compute the Euclidean distance from each data point to each of the K centroids (see the formulas after this list).
  4. Assign each data point to the cluster with the nearest centroid.
  5. Recompute the centroid of each cluster.
  6. Repeat steps 3-5 until convergence.
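
Concretely, step 3 uses the Euclidean distance between a point and a centroid, and step 5 recomputes each centroid as the mean of its assigned points (standard definitions, written here in LaTeX):

d(\mathbf{x}, \boldsymbol{\mu}_j) = \sqrt{\sum_{i=1}^{n} (x_i - \mu_{j,i})^2}
\qquad
\boldsymbol{\mu}_j = \frac{1}{|C_j|} \sum_{\mathbf{x} \in C_j} \mathbf{x}

where C_j is the set of points currently assigned to cluster j.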

Python sklearn example

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import preprocessing

url = "https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv"
data = pd.read_csv(url)
# Encode the species strings as integers (0, 1, 2)
le = preprocessing.LabelEncoder()
data['species'] = le.fit_transform(data.iloc[:, -1])

X = data.iloc[:, 0:4].to_numpy()
y = data.iloc[:,-1].to_numpy()

'''
n_clusters: number of clusters
n_init: how many times to run the algorithm with different centroid seeds
init: method for choosing the initial centroids
verbose: if True, print the training process
'''
model = KMeans(n_clusters=3, n_init=3, init='k-means++', verbose=True)
model.fit(X)
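
After fitting, the model exposes the learned centroids, the per-sample cluster labels (model.labels_, used below), and the inertia, i.e. the sum of squared distances from each sample to its closest centroid:

print(model.cluster_centers_)  # learned centroids, one row per cluster
print(model.inertia_)          # sum of squared distances to the closest centroids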

from sklearn.metrics import accuracy_score, confusion_matrix
print(accuracy_score(y, model.labels_))
print(confusion_matrix(y, model.labels_))
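
Note that k-means assigns arbitrary cluster IDs, so comparing model.labels_ directly against y is only meaningful when the cluster numbering happens to match the label encoding; the confusion matrix above makes any permutation visible. A common workaround is to map each cluster to the majority true class among its members before scoring (a minimal sketch; align_labels is a helper added here, not part of scikit-learn):

def align_labels(y_true, cluster_labels):
    # Map each cluster id to the majority true class among its members
    aligned = np.zeros_like(cluster_labels)
    for c in np.unique(cluster_labels):
        mask = cluster_labels == c
        aligned[mask] = np.bincount(y_true[mask]).argmax()
    return aligned

print(accuracy_score(y, align_labels(y, model.labels_)))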

colormap = np.array(['red', 'green', 'blue'])
plt.scatter(data.petal_length, data.petal_width, c=colormap[y], s=50)
plt.title('Actual classes')
plt.show()

plt.scatter(data.petal_length, data.petal_width, c=colormap[model.labels_], s=50)
plt.title('Predicted clusters')
plt.show()

For the full parameter list, see the scikit-learn KMeans documentation.
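
For example, a more fully specified configuration might look like this (all parameter names are from scikit-learn's KMeans; the values shown are illustrative):

model = KMeans(
    n_clusters=3,      # number of clusters
    init='k-means++',  # smarter initial-centroid selection
    n_init=10,         # number of runs with different centroid seeds
    max_iter=300,      # maximum iterations per run
    tol=1e-4,          # convergence tolerance on centroid movement
    random_state=42,   # fixed seed for reproducible results
)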

Implement K-means

Reference

import numpy as np
import matplotlib.pyplot as plt

MAX_ITERATIONS = 10000
# Function: K Means
# -------------
# K-Means is an algorithm that takes in a dataset and a constant
# k and returns k centroids (which define clusters of data in the
# dataset which are similar to one another).
def kmeans(dataSet, k):
    # Initialize centroids randomly
    numFeatures = dataSet.shape[1]
    centroids = getRandomCentroids(numFeatures, k)
    # Initialize book keeping vars.
    iterations = 0
    oldCentroids = None
    
    # Run the main k-means algorithm
    while not shouldStop(oldCentroids, centroids, iterations):
        # Save old centroids for convergence test. Book keeping.
        oldCentroids = centroids
        iterations += 1
        
        # Assign labels to each datapoint based on centroids
        labels = getLabels(dataSet, centroids)
        
        # Assign centroids based on datapoint labels
        centroids = getCentroids(dataSet, labels, k)
        
    # We can get the labels too by calling getLabels(dataSet, centroids)
    return centroids
# Function: Should Stop
# -------------
# Returns True or False if k-means is done. K-means terminates either
# because it has run a maximum number of iterations OR the centroids
# stop changing.
def shouldStop(oldCentroids, centroids, iterations):
    if iterations > MAX_ITERATIONS:
        return True
    if oldCentroids is None:
        # First iteration: nothing to compare against yet
        return False
    # Stop only when every centroid coordinate is unchanged
    return np.array_equal(oldCentroids, centroids)
# Function: Get Labels
# -------------
# Returns a label for each piece of data in the dataset. 
def getLabels(dataSet, centroids):
    # For each element in the dataset, choose the closest centroid.
    # Make that centroid the element's label.
    label_list = np.zeros(len(dataSet))
    for i in range(len(dataSet)):
        best_distance = getDistance(dataSet[i], centroids[0])
        label = 0
        for j in range(1, len(centroids)):
            distance = getDistance(dataSet[i], centroids[j])
            if distance < best_distance:
                best_distance = distance  # track the running minimum
                label = j
        label_list[i] = label
    return label_list
    
# Function: Get Centroids
# -------------
# Returns the k centroids recomputed as the mean of each cluster.
def getCentroids(dataSet, labels, k):
    # Each centroid is the mean of the points that have that
    # centroid's label. Important: if a cluster is empty (no points
    # have that centroid's label), its centroid is randomly re-initialized.
    col_num = dataSet.shape[1]
    centroids = np.zeros([k, col_num])
    for i in range(k):
        index_k = np.where(labels == i)[0]
        if len(index_k) == 0:
            # Empty cluster: re-initialize its centroid randomly
            centroids[i] = getRandomCentroids(col_num, 1)[0]
        else:
            centroids[i] = np.sum(dataSet[index_k], axis=0) / len(index_k)
    return centroids

def getRandomCentroids(numFeatures, k):
    # Draw k centroids from a standard normal distribution
    centroids = np.random.normal(0, 1, (k, numFeatures))
    return centroids

def getDistance(d1, d2):
    # Euclidean distance between two vectors
    vec1 = np.array(d1)
    vec2 = np.array(d2)
    return np.sqrt(np.sum(np.square(vec1 - vec2)))
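
As an aside, the double loop in getLabels can be replaced by NumPy broadcasting; a minimal sketch (getLabelsVectorized is a name added here, not part of the original code):

def getLabelsVectorized(dataSet, centroids):
    # distances[i, j] = Euclidean distance from point i to centroid j
    distances = np.linalg.norm(dataSet[:, None, :] - centroids[None, :, :], axis=2)
    # Label each point with the index of its nearest centroid
    return np.argmin(distances, axis=1)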

from sklearn.datasets import make_blobs

if __name__ == '__main__':
    X_varied, y = make_blobs(n_samples=200,
                             centers=5,
                             n_features=2,
                             random_state=123)
    print(X_varied.shape)
    k = 5
    c = kmeans(X_varied, k)
    print(c)
    # Plot the raw data points
    for i in range(len(X_varied)):
        plt.plot(X_varied[i, 0], X_varied[i, 1], marker='o', markersize=5)
    # Mark each learned centroid with a large symbol
    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    for i in range(k):
        plt.plot(c[i, 0], c[i, 1], mark[i], markersize=12)
    plt.show()

Last updated