from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

먼저 필요한 도구들을 불러옵니다. 당연히 KMeans를 불러오고, 가우시안 분포를 따르는 샘플 데이터셋을 생성해 주는 make_blobs도 함께 불러옵니다.

# Seed NumPy's global random generator so repeated runs draw the same numbers.
np.random.seed(0)

# Centre coordinates of the three blobs to sample around.
centers = [[1, 1], [0, 0], [2, -1]]

# 2000 two-dimensional samples plus the index of the blob each one came from.
data, labels_true = make_blobs(
    n_samples=2000,
    centers=centers,
    cluster_std=0.7,
)

난수 시드를 고정한 뒤 군집의 중심점 3개를 정하고, make_blobs로 2000개의 샘플로 이루어진 군집 대상 데이터(data)와 각 샘플의 실제 라벨(labels_true)을 생성합니다. 시드 값이란 난수를 생성할 때 사용되는 초기값으로, 같은 시드 값을 설정하면 항상 같은 난수들이 생성됩니다.

# Dimensions of the sample array: (number of samples, number of features).
np.shape(data)

(2000, 2)

# Sample coordinates, followed by one blank separator line.
print(data, end="\n\n")
# True label of every sample, then the distinct label values.
print(labels_true)
print(np.unique(labels_true))

[[ 2.88735684  0.94825273]
 [ 0.00712986  1.53880744]
 [ 0.3264657  -0.06607475]
 ...
 [ 0.53901292  0.64003622]
 [ 1.65065358  1.40755721]
 [ 0.74131908 -0.71579507]]

[0 1 1 ... 0 0 1]
[0 1 2]

np.unique는 배열에서 중복을 제거하고 서로 다른 값들만 추출해 줍니다. 여기서는 실제 라벨이 0, 1, 2 세 가지임을 확인할 수 있습니다.

# Plot the unlabelled samples: first feature on x, second feature on y.
plt.figure(figsize=(15, 10))
x_values, y_values = data[:, 0], data[:, 1]
plt.scatter(x_values, y_values)

<matplotlib.collections.PathCollection at 0x1996bb790b8>

이제 데이터들을 산점도로 확인해봅니다.

# Build a 3-cluster k-means estimator (k-means++ initialisation, n_init=10)
# and fit it to the samples in one step.
estimator = KMeans(n_clusters=3, init="k-means++", n_init=10).fit(data)
# Echo the fitted estimator so the notebook shows its parameters.
estimator

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

이제 모델을 학습시킬 차례입니다. KMeans 클래스로 추정기를 만들고 fit 메서드로 학습시켜 줍니다. 앞에서 중심점을 3개로 정했으니 n_clusters=3으로 설정해 3개의 군집으로 나눠보겠습니다.

# Cluster index assigned to each sample by the fitted estimator.
labels_predict = estimator.labels_
# Distinct cluster ids that appear in the prediction.
np.unique(estimator.labels_)

array([0, 1, 2])

# Look up the 'jet' colormap. plt.get_cmap is used instead of plt.cm.get_cmap
# because matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed
# in 3.9; plt.get_cmap is available in both old and new releases.
cm = plt.get_cmap('jet')

# Min-max scale the integer cluster labels into [0, 1] so they can be fed to
# the colormap (label 0 -> 0.0, the largest label -> 1.0).
label_min = np.min(labels_predict)
label_span = np.max(labels_predict) - label_min
# Labels are integers, so the span is either 0 or >= 1. Dividing by
# max(span, 1) is identical for the normal case and avoids a 0/0 -> NaN result
# when every sample landed in the same cluster.
scaled_labels = (labels_predict - label_min) / max(label_span, 1)
np.unique(scaled_labels)

array([0. , 0.5, 1. ])

'jet'는 matplotlib 컬러맵의 이름이고, 예측한 라벨 데이터(0, 1, 2)를 컬러맵에 넣을 수 있도록 0~1 범위로 정규화해 줍니다.

# Redraw the scatter plot, colouring each sample by its predicted cluster.
plt.figure(figsize=(15, 10))
point_colors = cm(scaled_labels)
plt.scatter(data[:, 0], data[:, 1], c=point_colors)

<matplotlib.collections.PathCollection at 0x1996bba2e48>

결과를 확인해보면 중심값들을 기준으로 군집들이 형성된 것을 확인할 수 있습니다. 군집분석은 아주 유용한 분석도구 중의 하나이기 때문에 조금 더 심화된 내용들을 꾸준히 업로드 할 것입니다. 참고해주세요 :)

Justkeepitsteady

K-means clustering에 대해 알아봅시다 :)

티스토리툴바