Refer to the code below:
.
import mlflow
from sklearn.metrics import adjusted_rand_score, silhouette_score
from sklearn.cluster import KMeans
from sklearn import datasets
import json
import matplotlib.pyplot as plt
# Sweep K-means over k = 2..9 on the Iris dataset, log per-k metrics to the
# MLflow experiment "iris_experiment", and log a histogram of the cluster
# assignments produced by the best-scoring k.

# create_experiment returns the new experiment's id; it raises
# MlflowException when an experiment with this name already exists, in which
# case the existing experiment is looked up instead.
try:
    experiment_id = mlflow.create_experiment(name="iris_experiment")
except mlflow.exceptions.MlflowException:
    print("The experement already exist")
    experiment_id = mlflow.get_experiment_by_name("iris_experiment").experiment_id

with mlflow.start_run(experiment_id=experiment_id):
    iris = datasets.load_iris()
    X = iris['data']
    true_labels = iris['target']

    # NOTE: "length"/"width" are the two dimensions of the data matrix
    # (rows, columns), not flower measurements.
    length, width = X.shape
    shape_dict = {"length": length, "width": width}
    mlflow.log_dict(shape_dict, "shape.json")
    mlflow.log_param(key="param", value=json.dumps(shape_dict))

    # Start below any reachable score: both ARI and the silhouette score can
    # be negative, so 0 is not a safe lower bound.
    best_score = float("-inf")
    best_k = None
    best_predict_labels = None
    best_centroids = None  # kept for inspection; not logged

    for step, k in enumerate(range(2, 10)):
        print(f'---- f:{k}')

        # Fit the model to the data
        kmeans = KMeans(n_clusters=k, random_state=0)
        kmeans.fit(X)
        centroids = kmeans.cluster_centers_

        # Cluster assignment for each data point
        predicted_labels = kmeans.labels_

        # Agreement with the true species labels
        ari = adjusted_rand_score(true_labels, predicted_labels)
        print("Adjusted Rand Index:", ari)

        # Cluster quality computed from the features and assignments alone
        sil_score = silhouette_score(X, predicted_labels)
        print("Silhouette Score:", sil_score)

        # Model selection criterion: plain average of the two metrics.
        combined_score = (ari + sil_score) / 2
        if best_score < combined_score:
            best_score = combined_score
            best_k = k
            best_predict_labels = predicted_labels
            best_centroids = centroids

        # MLflow metric values must be single numbers, so the two scores are
        # logged as separate metrics rather than as one dict.
        mlflow.log_metric(key="kmean_ari", value=ari, step=step)
        mlflow.log_metric(key="silhouette_score", value=sil_score, step=step)

    print('+++++++++')
    print(f'best score:{best_score}, k:{best_k}')

    # Plot the assignments of the BEST k. The loop variable predicted_labels
    # still holds the labels of the last k tried, which is not what we want.
    fig, ax = plt.subplots()
    ax.hist(best_predict_labels)
    mlflow.log_figure(fig, artifact_file="lables_hist.png")
    plt.close(fig)  # release the figure once it has been saved as an artifact
..
The code you provided is a Python script that uses the MLflow library to perform K-means clustering on the Iris dataset, logs the model parameters and metrics, and identifies the best K value (number of clusters) based on the Adjusted Rand Index (ARI) and Silhouette Score. It also generates a histogram plot of the predicted labels for the best K value and saves it as an artifact.
Here's an overview of the code:
Import the necessary libraries, including MLflow, scikit-learn, and matplotlib.
Create a new MLflow experiment called "iris_experiment" if it doesn't already exist.
Start an MLflow run in the context of the "iris_experiment".
Load the Iris dataset and log its shape as a dictionary.
Iterate through different K values (number of clusters) from 2 to 9 and perform the following steps:
a. Instantiate and fit a KMeans model with the current K value.
b. Obtain the predicted cluster assignments and compare them with the true labels using ARI and Silhouette Score.
c. Log the ARI and Silhouette Score metrics for each K value using MLflow.
d. Keep track of the best K value based on the average of ARI and Silhouette Score.
Print the best K value and its corresponding score.
Create a histogram plot of the predicted labels for the best K value and save it as an artifact using MLflow.
Thank you.
www.marearts.com
🙇🏻‍♂️
No comments:
Post a Comment