{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download data.txt (pima-indians-diabetes data).\n",
    "# -nc skips the download when the file already exists, so re-running\n",
    "# this cell does not create data.txt.1, data.txt.2, ...\n",
    "!wget -nc \"https://raw.githubusercontent.com/TA-aiacademy/course_3.0/ML/02_ML/part6/Chapter7/data.txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# All imports used by this notebook live in this cell\n",
    "import matplotlib.pyplot as plt\n",
    "from numpy import loadtxt\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "from xgboost import XGBClassifier, plot_importance, plot_tree\n",
    "\n",
    "# Render figures inline. This replaces `%pylab inline`, which is\n",
    "# discouraged because it star-imports numpy and pyplot into the\n",
    "# global namespace.\n",
    "%matplotlib inline\n",
    "\n",
    "# load pima-indians-diabetes data, it's a binary classification task\n",
    "dataset = loadtxt(\"data.txt\", delimiter=\",\")\n",
    "\n",
    "# split data into X (features: columns 0-7) and y (label: column 8)\n",
    "X = dataset[:, 0:8]\n",
    "y = dataset[:, 8]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "print(X.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "print(y.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# we have two labels, 0 with no diabetes, 1 with diabetes\n",
    "y[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out the final test set first\n",
    "# (train + validation) : test = 0.67 : 0.33\n",
    "X_trainval, X_test, y_trainval, y_test = train_test_split(X,\n",
    "                                                          y,\n",
    "                                                          random_state=7,\n",
    "                                                          test_size=0.33)\n",
    "\n",
    "# Carve a validation set out of the remaining data. It is used only to\n",
    "# drive early stopping, so the test set stays unseen until the final\n",
    "# evaluation and the reported accuracy is not optimistically biased.\n",
    "X_train, X_valid, y_train, y_valid = train_test_split(X_trainval,\n",
    "                                                      y_trainval,\n",
    "                                                      random_state=7,\n",
    "                                                      test_size=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# eval_metric can be rmse, logloss, error, auc, merror, mlogloss,\n",
    "# or a custom-defined metric\n",
    "# Early stopping monitors the validation set, not the test set:\n",
    "# training stops once validation AUC has not improved for 10 rounds\n",
    "eval_set = [(X_valid, y_valid)]\n",
    "model = XGBClassifier(early_stopping_rounds=10,\n",
    "                      eval_metric=\"auc\")\n",
    "model.fit(X_train,\n",
    "          y_train,\n",
    "          eval_set=eval_set,\n",
    "          verbose=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# model.predict_proba will return the probability of each class\n",
    "# model.predict will return the predicted label (use 0.5 as threshold)\n",
    "y_proba = model.predict_proba(X_test)\n",
    "y_proba[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# we use model.predict to get the label, then score it on the\n",
    "# held-out test set\n",
    "y_pred = model.predict(X_test)\n",
    "accuracy = accuracy_score(y_test, y_pred)\n",
    "print(\"Accuracy: %.2f%%\" % (accuracy * 100.0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# we can show the feature importances for our features\n",
    "print(model.feature_importances_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# visualize the feature importance with xgboost's plot_importance\n",
    "plot_importance(model)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# plot_tree needs the graphviz package; %pip (unlike !pip) installs\n",
    "# into the environment of the running kernel\n",
    "%pip install graphviz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# draw the boosted tree at index 1\n",
    "plot_tree(model, num_trees=1)\n",
    "plt.show()"
   ]
  },
  {
"cell_type": "markdown", "metadata": {}, "source": [ "## Reference for XGBoost\n", "\n", "[Well explained for Gradient Boosting](http://blog.kaggle.com/2017/01/23/a-kaggle-master-explains-gradient-boosting/)\n", "\n", "[Analytics Vidhya parameter tuning](https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/)\n", "\n", "[How parallel XGBoost works](http://zhanpengfang.github.io/418home.html)\n", "\n", "[How to tune gamma](https://medium.com/data-design/xgboost-hi-im-gamma-what-can-i-do-for-you-and-the-tuning-of-regularization-a42ea17e6ab6)\n", "\n", "[slides for xgboost by tqchen](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf)\n", "\n", "[slides for xgboost by kaggle winner](https://www.slideshare.net/ShangxuanZhang/kaggle-winning-solution-xgboost-algorithm-let-us-learn-from-its-author)\n", "\n", "[xgboost 理論與參數介紹](https://medium.com/jameslearningnote/%E8%B3%87%E6%96%99%E5%88%86%E6%9E%90-%E6%A9%9F%E5%99%A8%E5%AD%B8%E7%BF%92-%E7%AC%AC5-2%E8%AC%9B-kaggle%E6%A9%9F%E5%99%A8%E5%AD%B8%E7%BF%92%E7%AB%B6%E8%B3%BD%E7%A5%9E%E5%99%A8xgboost%E4%BB%8B%E7%B4%B9-1c8f55cffcc)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.2" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 4 }