# Imports

from sklearn import cluster, datasets

from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

from sklearn import metrics
# Load the wine dataset bundled with scikit-learn
wines = datasets.load_wine()

# Standardize all features to zero mean and unit variance so that
# no single feature dominates the Euclidean distances used by k-means
scaler = StandardScaler()
data_scaled = scaler.fit_transform(wines.data)

# Apply k-means. We know there are 3 target classes, so we expect
# n_clusters=3 clusters. For data without known classes this
# parameter would have to be varied!
# random_state fixes how the random initial centroids are chosen.
kmeans = cluster.KMeans(n_clusters=3, random_state=0, n_init=10).fit(data_scaled)

# Which cluster was found for each instance?
print(kmeans.labels_)
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 0 1 1 1 1 1 1 1 1 1 1 1 2
 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# How large are the clusters? We expect to see 3 clusters.
all_labels = list(kmeans.labels_)
for i in range(3):
    cluster_size = all_labels.count(i)
    print("Name:", i, "Größe: ", cluster_size)

# We can also display the centroids - they reveal the characteristics
# of the clusters and are therefore useful for interpreting them.
print(kmeans.cluster_centers_)
Name: 0 Größe:  51
Name: 1 Größe:  66
Name: 2 Größe:  61
[[ 0.16490746  0.87154706  0.18689833  0.52436746 -0.07547277 -0.97933029
  -1.21524764  0.72606354 -0.77970639  0.94153874 -1.16478865 -1.29241163
  -0.40708796]
 [-0.93900326 -0.39196582 -0.43920097  0.20898793 -0.46377382 -0.05334831
   0.06690377 -0.01982215  0.06479192 -0.88207529  0.45298189  0.28973833
  -0.75602559]
 [ 0.87809728 -0.30457633  0.31894179 -0.66452366  0.56488825  0.87650546
   0.94363903 -0.58558981  0.58178294  0.16718842  0.48372814  0.76705349
   1.15834713]]
def bench_k_means(kmeans, name, data, labels):
    """Benchmark to evaluate the KMeans initialization methods.

    Prints one tab-separated table row:
    name, inertia, silhouette, homogeneity, completeness, v-measure.

    Parameters
    ----------
    kmeans : KMeans instance
        A :class:`~sklearn.cluster.KMeans` instance that has already been trained
    name : str
        Name given to the strategy. It will be used to show the results in a
        table.
    data : ndarray of shape (n_samples, n_features)
        The data whose clusters should be evaluated.
    labels : ndarray of shape (n_samples,)
        The labels used to compute the supervised clustering metrics.
    """
    # Inertia: sum of squared distances of samples to their closest centroid.
    results = [name, kmeans.inertia_]

    # The silhouette score requires the full dataset.
    # BUG FIX: score the `data` parameter instead of the module-level global
    # `data_scaled`; otherwise benchmarking any other dataset would silently
    # evaluate the wrong data.
    results += [
        metrics.silhouette_score(data, kmeans.labels_,
                                 metric="euclidean")
    ]

    # Supervised metrics comparing cluster labels against the true labels.
    clustering_metrics = [
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
    ]
    results += [m(labels, kmeans.labels_) for m in clustering_metrics]

    # Show the results as one formatted table row.
    formatter_result = "{:9s}\t{:.0f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}"
    print(formatter_result.format(*results))
# Run the benchmark with our trained kmeans models and with a random
# centroid initialization (for comparison).

print(82 * '_')
print('init\t\tinertia\tsil\thom\tcom\tv-meas')

# Our k-means model (retrained here to be safe), seeded with random_state=0.
kmeans = cluster.KMeans(n_clusters=3, n_init=10, random_state=0).fit(data_scaled)
bench_k_means(kmeans=kmeans, name="k-means_0", data=data_scaled, labels=wines["target"])

# The same model with a different seed (random_state=42).
kmeans = cluster.KMeans(n_clusters=3, n_init=10, random_state=42).fit(data_scaled)
bench_k_means(kmeans=kmeans, name="k-means_42", data=data_scaled, labels=wines["target"])

# Trained 10 times without a fixed seed; the run with the best inertia wins.
# NOTE(review): without random_state this row is not reproducible across runs.
kmeans = cluster.KMeans(n_clusters=3, n_init=10).fit(data_scaled)
bench_k_means(kmeans=kmeans, name="k-means_opti", data=data_scaled, labels=wines["target"])

# Random initial centroids (init="random") instead of the default k-means++.
kmeans_rand = cluster.KMeans(init="random", n_init=10, n_clusters=3, random_state=0).fit(data_scaled)
bench_k_means(kmeans=kmeans_rand, name="random", data=data_scaled, labels=wines["target"])

print(82 * '_')
__________________________________________________________________________________
init		inertia	sil	hom	com	v-meas
k-means_0	1279	0.286	0.895	0.890	0.893
k-means_42	1278	0.285	0.879	0.873	0.876
k-means_opti	1278	0.285	0.879	0.873	0.876
random   	1278	0.285	0.879	0.873	0.876
__________________________________________________________________________________