# Imports
from sklearn import cluster, datasets
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn import metrics
# Load the wine dataset
wines = datasets.load_wine()
# Scale the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(wines.data)
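# K-Means relies on Euclidean distances, so standardizing the features matters: otherwise
# features on large scales (e.g. proline, whose values reach into the hundreds) would
# dominate the distance. A minimal sanity check (a sketch, not part of the original
# analysis): after StandardScaler, every column should have mean ~0 and std ~1.
print("column means:", data_scaled.mean(axis=0).round(2))
print("column stds: ", data_scaled.std(axis=0).round(2))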
# Apply k-means. We know that 3 target classes exist, so we expect n_clusters=3 clusters.
# For data without target classes this parameter has to be varied (see the sketch after the final benchmark table below)!
# We fix random_state so that the randomly chosen initial centroids are reproducible.
kmeans = cluster.KMeans(n_clusters=3, random_state=0, n_init=10).fit(data_scaled)
# Which cluster was assigned to each instance? (See the cross-tabulation sketch after this output.)
print(kmeans.labels_)
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 0 1 1 1 1 1 1 1 1 1 1 1 2
1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
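# The cluster IDs 0/1/2 assigned by k-means are arbitrary and need not coincide with the
# class IDs in wines.target. As a rough sketch (not part of the original analysis), a
# cross-tabulation of true class vs. cluster label shows which cluster corresponds to
# which wine cultivar:
print(metrics.confusion_matrix(wines.target, kmeans.labels_))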
# How large are the clusters? We expect to see 3 clusters.
for i in range(3):
    print("Cluster:", i, "size:", list(kmeans.labels_).count(i))
# We can also inspect the centroids - they reveal the characteristics of the clusters
# and are therefore useful for interpreting them (see the sketch below the centroid output).
print(kmeans.cluster_centers_)
Cluster: 0 size: 51
Cluster: 1 size: 66
Cluster: 2 size: 61
[[ 0.16490746 0.87154706 0.18689833 0.52436746 -0.07547277 -0.97933029
-1.21524764 0.72606354 -0.77970639 0.94153874 -1.16478865 -1.29241163
-0.40708796]
[-0.93900326 -0.39196582 -0.43920097 0.20898793 -0.46377382 -0.05334831
0.06690377 -0.01982215 0.06479192 -0.88207529 0.45298189 0.28973833
-0.75602559]
[ 0.87809728 -0.30457633 0.31894179 -0.66452366 0.56488825 0.87650546
0.94363903 -0.58558981 0.58178294 0.16718842 0.48372814 0.76705349
1.15834713]]
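# The centroids above are given in standardized units (z-scores per feature). For
# interpretation it can help to map them back to the original feature scale. A small
# sketch (assuming the scaler fitted above and the feature names provided by load_wine):
centers_original = scaler.inverse_transform(kmeans.cluster_centers_)
for feature, values in zip(wines.feature_names, centers_original.T):
    print(f"{feature:30s}", values.round(2))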
def bench_k_means(kmeans, name, data, labels):
    """Benchmark to evaluate the KMeans initialization methods.

    Parameters
    ----------
    kmeans : KMeans instance
        A :class:`~sklearn.cluster.KMeans` instance that has already been trained.
    name : str
        Name given to the strategy. It will be used to show the results in a
        table.
    data : ndarray of shape (n_samples, n_features)
        The data whose clusters should be evaluated.
    labels : ndarray of shape (n_samples,)
        The labels used to compute the supervised clustering metrics.
    """
    # Inertia (within-cluster sum of squared distances)
    results = [name, kmeans.inertia_]
    # The silhouette score requires the full dataset
    results += [
        metrics.silhouette_score(data, kmeans.labels_, metric="euclidean")
    ]
    # Supervised metrics which require the true labels and the cluster labels
    clustering_metrics = [
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
    ]
    results += [m(labels, kmeans.labels_) for m in clustering_metrics]
    # Show the results
    formatter_result = "{:9s}\t{:.0f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}"
    print(formatter_result.format(*results))
# Run the benchmark with our trained k-means models and, for comparison, with a k-means
# that uses random centroid initialization
print(82 * '_')
print('init\t\tinertia\tsil\thom\tcom\tv-meas')
# our k-means model (retrained here just to be safe)
kmeans = cluster.KMeans(n_clusters=3, n_init=10, random_state=0).fit(data_scaled)
bench_k_means(kmeans=kmeans, name="k-means_0", data=data_scaled, labels=wines["target"])
# our k-means model (random_state 42)
kmeans = cluster.KMeans(n_clusters=3, n_init=10, random_state=42).fit(data_scaled)
bench_k_means(kmeans=kmeans, name="k-means_42", data=data_scaled, labels=wines["target"])
# our k-means model (no fixed random_state; of the 10 initializations, the run with the best inertia wins)
kmeans = cluster.KMeans(n_clusters=3, n_init=10).fit(data_scaled)
bench_k_means(kmeans=kmeans, name="k-means_opti", data=data_scaled, labels=wines["target"])
# k-means with random centroid initialization (init="random") instead of the default k-means++, as a comparison point
kmeans_rand = cluster.KMeans(init="random", n_init=10, n_clusters=3, random_state=0).fit(data_scaled)
bench_k_means(kmeans=kmeans_rand, name="random", data=data_scaled, labels=wines["target"])
print(82 * '_')
__________________________________________________________________________________
init inertia sil hom com v-meas
k-means_0 1279 0.286 0.895 0.890 0.893
k-means_42 1278 0.285 0.879 0.873 0.876
k-means_opti 1278 0.285 0.879 0.873 0.876
random 1278 0.285 0.879 0.873 0.876
__________________________________________________________________________________
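# Here n_clusters=3 was known from the target classes. Without such prior knowledge the
# parameter has to be varied, as noted above. A sketch (not part of the original analysis)
# of such a sweep, plotting the inertia ("elbow" method) and the silhouette score for
# several values of n_clusters, using the matplotlib import from the top:
ks = range(2, 11)
inertias, silhouettes = [], []
for k in ks:
    km = cluster.KMeans(n_clusters=k, n_init=10, random_state=0).fit(data_scaled)
    inertias.append(km.inertia_)
    silhouettes.append(metrics.silhouette_score(data_scaled, km.labels_))
fig, (ax_inertia, ax_sil) = plt.subplots(1, 2, figsize=(10, 4))
ax_inertia.plot(list(ks), inertias, marker="o")
ax_inertia.set_xlabel("n_clusters")
ax_inertia.set_ylabel("inertia")
ax_sil.plot(list(ks), silhouettes, marker="o")
ax_sil.set_xlabel("n_clusters")
ax_sil.set_ylabel("silhouette score")
plt.tight_layout()
plt.show()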