{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 下載 seeds.csv\n",
    "! wget \"https://raw.githubusercontent.com/TA-aiacademy/course_3.0/main/02_ML/part7/Chapter9/excercise/datasets/seeds.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "from scipy.stats import pearsonr\n",
    "import numpy as np\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('seeds.csv', header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**這個dataset 是穀物的資料，其中column 3 and column 4 分別是穀物的長與寬**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**練習一: 請取出這兩個feature並計算其pearson correlation**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from scipy.stats import pearsonr\n",
    "\n",
    "# 寬度\n",
    "width = df.iloc[:, 4].values\n",
    "\n",
    "# 長度\n",
    "length = df.iloc[:, 3].values\n",
    "\n",
    "# 畫出scatter plot, width vs length\n",
    "plt.scatter(width, length)\n",
    "plt.axis('equal')\n",
    "plt.show()\n",
    "\n",
    "# 算出 Pearson correlation\n",
    "correlation, pvalue = pearsonr(width, length)\n",
    "\n",
    "# Display the correlation\n",
    "print(correlation)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "**練習2: 請利用pca 將上述長與寬兩個feature 進行轉換以去除其correlation, 並顯示其轉換後的correlation**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import PCA\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "# Create PCA instance: model\n",
    "model = PCA()\n",
    "grains = df.iloc[:, [3, 4]]\n",
    "# Apply the fit_transform method of model to grains: pca_features\n",
    "pca_features = # [your code here] #\n",
    "\n",
    "# Assign 0th column of pca_features: xs\n",
    "xs = # [your code here] #\n",
    "\n",
    "# Assign 1st column of pca_features: ys\n",
    "ys = # [your code here] #\n",
    "\n",
    "# Scatter plot xs vs ys\n",
    "plt.scatter(xs, ys)\n",
    "plt.axis('equal')\n",
    "plt.show()\n",
    "\n",
    "# Calculate the Pearson correlation of xs and ys\n",
    "correlation, pvalue = pearsonr(xs, ys)\n",
    "\n",
    "# Display the correlation\n",
    "print(correlation)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**練習3: 檢視explained variance**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.pipeline import make_pipeline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "samples = df.values\n",
    "# 先對資料進行標準化, 宣告一個名為scaler的StandardScaler\n",
    "scaler = # [your code here] #\n",
    "\n",
    "# 宣告一個名為 pca的PCA\n",
    "pca = # [your code here] #\n",
    "\n",
    "# 以make_pipeline將前述兩個模型依序串一起\n",
    "pipeline = ## [your code here] ##\n",
    "\n",
    "# 將模型fit資料samples\n",
    "# [your code here] #\n",
    "\n",
    "# 將各feature 的explained_variance_ 畫出\n",
    "features = range(pca.n_components_)\n",
    "plt.bar(features, pca.explained_variance_)\n",
    "plt.xlabel('PCA feature')\n",
    "plt.ylabel('variance')\n",
    "plt.xticks(features)\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grains = np.zeros((data.shape[0], 2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grains[:, 0] = width\n",
    "grains[:, 1] = length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 畫出width 與 length的散佈圖\n",
    "plt.scatter(grains[:, 0], grains[:, 1])\n",
    "\n",
    "# Create a PCA instance: model\n",
    "model = PCA()\n",
    "\n",
    "# Fit model to points\n",
    "model.fit(grains)\n",
    "\n",
    "# Get the mean of the grain samples: mean\n",
    "mean = model.mean_\n",
    "\n",
    "# Get the first principal component: first_pc\n",
    "first_pc = model.components_[0, :]\n",
    "\n",
    "# Plot first_pc as an arrow, starting at mean\n",
    "plt.arrow(mean[0], mean[1], first_pc[0], first_pc[1], color='red', width=0.01)\n",
    "\n",
    "# Keep axes on same scale\n",
    "plt.axis('equal')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.2"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}