{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 下載 seeds.csv\n",
    "! wget \"https://raw.githubusercontent.com/TA-aiacademy/course_3.0/main/02_ML/part7/Chapter9/excercise/datasets/seeds.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from sklearn.datasets import make_blobs\n",
    "# dataset: https://archive.ics.uci.edu/ml/datasets/seeds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('seeds.csv', header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 利用以上的seeds dataset，請利用Kmeans 進行clustering, 並設法找到最適合的number of clusters\n",
    "請比照教材中的繪圖，將inertial 對 number of cluster的折線圖劃出 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "from sklearn.metrics import silhouette_score\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "ks = range(1, 6)\n",
    "inertias = []\n",
    "samples = df.values\n",
    "for k in ks:\n",
    "    # Create a KMeans instance with k clusters: model\n",
    "    model = KMeans(n_clusters=k)\n",
    "\n",
    "    # Fit model to samples\n",
    "    model.fit(samples)\n",
    "\n",
    "    # Append the inertia to the list of inertias\n",
    "    inertias.append(model.inertia_)\n",
    "\n",
    "# Plot ks vs inertias\n",
    "plt.plot(ks, inertias, '-o')\n",
    "plt.xlabel('number of clusters, k')\n",
    "plt.ylabel('inertia')\n",
    "plt.xticks(ks)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 先指定centroids的位置及各cluster的標準差\n",
    "blob_centers = np.array([[-0.2, 2.3], [1.5, 2.3], [2.8, 2.8], [2.8, 1.3],\n",
    "                         [-1, -2], [-2, 2]])\n",
    "blob_std = np.array([0.4, 0.3, 0.1, 0.1, 0.1, 0.2])\n",
    "X, y = make_blobs(n_samples=300,\n",
    "                  centers=blob_centers,\n",
    "                  cluster_std=blob_std,\n",
    "                  random_state=7)\n",
    "# samples = df1.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 利用以上的dataset，請利用Kmeans 進行clustering, 並設法找到最適合的number of clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# k = 1~12 做十二次kmeans, 並將每次結果的inertia收集在一個list裡\n",
    "kmeans_per_k = [\n",
    "    KMeans(n_clusters=k, random_state=42).fit(X) for k in range(1, 12)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "silhouette_scores = [\n",
    "    silhouette_score(X, model.labels_) for model in kmeans_per_k[1:]\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "silhouette_scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(8, 3))\n",
    "plt.plot(range(2, 12), silhouette_scores, \"bo-\")\n",
    "plt.xlabel(\"$k$\", fontsize=14)\n",
    "plt.ylabel(\"Silhouette score\", fontsize=14)\n",
    "# plt.axis([1.8, 8.5, 0.55, 0.75])\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.2"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}