{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/TA-aiacademy/course_3.0/blob/v2-5_nlp/09_v2-5_NLP/Part2/06-2_predict_ans.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "9T3doV20AG30"
      },
      "source": [
        "# Build prediction models and compare feature sets"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "clkVOQ0OAG4F"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import xgboost as xgb\n",
        "import pickle\n",
        "import numpy as np\n",
        "import os\n",
        "\n",
        "from gensim.models import Doc2Vec, doc2vec"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Ub2JM2pFAG4K"
      },
      "outputs": [],
      "source": [
        "# Download the course data archive and unpack it into the working directory.\n",
        "# -nc skips the download when the archive is already present, and -o overwrites\n",
        "# previously extracted files without prompting, so the cell can be re-run safely.\n",
        "!wget -q -nc https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part2_6.zip\n",
        "!unzip -q -o NLP_part2_6.zip"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "P9AeqAH_AG4O"
      },
      "outputs": [],
      "source": [
        "# Read the preprocessed article table.\n",
        "# Later cells rely on its 'idx', 'push' and 'boo' columns.\n",
        "df = pd.read_csv('Data/article_preprocessed.csv')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "r-sw-X9GAG4Q"
      },
      "outputs": [],
      "source": [
        "# Discard articles whose push and boo counts are too close together:\n",
        "# only rows with |push - boo| above the threshold are kept.\n",
        "diff_threshold = 20\n",
        "df = df.loc[(df['push'] - df['boo']).abs() > diff_threshold].copy()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "uZ8Jwu7HAG4S"
      },
      "outputs": [],
      "source": [
        "# Define the label y by clipping (push - boo) into [0, 1].\n",
        "# After the threshold filter the difference is either above 20 or below -20,\n",
        "# so the clipped value is 1 or 0. The index is then renumbered from 0.\n",
        "df = (\n",
        "    df.assign(type=(df['push'] - df['boo']).clip(lower=0, upper=1))\n",
        "      .reset_index(drop=True)\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "9sjj_b20AG4U"
      },
      "outputs": [],
      "source": [
        "# Show how many articles fall into each label value (class balance)\n",
        "df['type'].value_counts()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2rYJnUtXAG4X"
      },
      "outputs": [],
      "source": [
        "# Zero-filled template with one row per article and 258 columns:\n",
        "# column 0 holds the article idx, column 1 the label, and columns 2.. the features.\n",
        "basic_data = np.zeros(shape=(len(df), 258))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "9-u8wW5aAG4Z"
      },
      "outputs": [],
      "source": [
        "# Write the article idx and the label into the first two columns in one step\n",
        "basic_data[:, :2] = df[['idx', 'type']].to_numpy()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fdg7qJcWAG4b"
      },
      "source": [
        "## bag of words"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "gwIUQ0gcAG4c"
      },
      "outputs": [],
      "source": [
        "# Load the pickled bag-of-words result. The file holds a two-item object and only\n",
        "# the second item is kept as `count`; the first is presumably the fitted\n",
        "# vectorizer (TODO confirm).\n",
        "# NOTE(review): pickle.load can execute arbitrary code, so only load trusted files.\n",
        "with open('Data/article_count', 'rb') as file:\n",
        "    _, count = pickle.load(file)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "MZz3YBdnAG4e"
      },
      "outputs": [],
      "source": [
        "# Rank words by document frequency (number of rows with a non-zero count)\n",
        "# and keep the column ids of the 256 most frequent ones.\n",
        "count_doc_freq = np.asarray((count > 0).sum(axis=0))[0]\n",
        "most_count_id = np.argsort(count_doc_freq)[::-1][:256]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "KOKmqPEoAG4f"
      },
      "outputs": [],
      "source": [
        "# Keep only the selected word columns.\n",
        "# NOTE(review): this rebinds `count`, so re-running the cell without reloading\n",
        "# applies the original column ids to the already-reduced matrix.\n",
        "count = count[:, most_count_id]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "AJT_hTXkAG4g"
      },
      "outputs": [],
      "source": [
        "# Integer working copy of the template (astype already returns a new array)\n",
        "count_data = basic_data.astype('int')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "K7wRViPOAG4i"
      },
      "outputs": [],
      "source": [
        "# Fill the feature columns with the bag-of-words rows selected by the\n",
        "# article idx stored in column 0.\n",
        "count_data[:, 2:] = count[count_data[:, 0], :].toarray()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "8gN2gDufAG4j"
      },
      "source": [
        "## TF-IDF"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "nARDCwHaAG4k"
      },
      "outputs": [],
      "source": [
        "# Load the pickled tf-idf result. The file holds a two-item object and only the\n",
        "# second item is kept as `tfidf`; the first is presumably the fitted\n",
        "# vectorizer (TODO confirm).\n",
        "# NOTE(review): pickle.load can execute arbitrary code, so only load trusted files.\n",
        "with open('Data/article_tfidf', 'rb') as file:\n",
        "    _, tfidf = pickle.load(file)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "mx74pYbQAG4l"
      },
      "outputs": [],
      "source": [
        "# Rank words by document frequency (number of rows with a non-zero weight)\n",
        "# and keep the column ids of the 256 most frequent ones.\n",
        "tfidf_doc_freq = np.asarray((tfidf > 0).sum(axis=0))[0]\n",
        "most_tfidf_id = np.argsort(tfidf_doc_freq)[::-1][:256]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fcqkGOGfAG4m"
      },
      "outputs": [],
      "source": [
        "# Keep only the selected word columns.\n",
        "# NOTE(review): this rebinds `tfidf`, so re-running the cell without reloading\n",
        "# applies the original column ids to the already-reduced matrix.\n",
        "tfidf = tfidf[:, most_tfidf_id]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "NwPLB-2vAG4m"
      },
      "outputs": [],
      "source": [
        "# Integer working copy of the template (astype already returns a new array).\n",
        "# NOTE(review): fractional values written into an integer array are truncated.\n",
        "tfidf_data = basic_data.astype('int')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "9RBDqESBAG4n"
      },
      "outputs": [],
      "source": [
        "# subset tf-idf matrix\n",
        "# TF-IDF weights are fractional, so the destination array must be float:\n",
        "# writing them into an integer array silently truncates the weights (mostly to 0).\n",
        "# The article idx in column 0 is taken as an integer index before the conversion.\n",
        "tfidf_rows = tfidf_data[:, 0].astype('int')\n",
        "tfidf_data = tfidf_data.astype('float')\n",
        "tfidf_data[:, 2:] = tfidf[tfidf_rows].toarray()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "lH5KmIaCAG4n"
      },
      "source": [
        "## average word2vec"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "BdIJ29AvAG4o"
      },
      "outputs": [],
      "source": [
        "# Load the pickled average-word2vec result; it is indexed by article idx below.\n",
        "# NOTE(review): pickle.load can execute arbitrary code, so only load trusted files.\n",
        "with open('Data/avg_article_vector', 'rb') as file:\n",
        "    avg_vector = pickle.load(file)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "SgxD_8YpAG4p"
      },
      "outputs": [],
      "source": [
        "# Independent float copy of the template for the averaged word vectors\n",
        "avg_data = np.array(basic_data)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "7LyCK7qOAG4p"
      },
      "outputs": [],
      "source": [
        "# Copy each article's averaged vector into the feature columns of its row.\n",
        "# `i` is the DataFrame index label and is used directly as the row position\n",
        "# (the index was reset to 0..n-1 when the label was defined).\n",
        "# Assumes every vector has 256 entries so it fits columns 2.. (TODO confirm).\n",
        "for i, row in df.iterrows():\n",
        "    avg_data[i, 2:] = avg_vector[row['idx']]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OSaIM7KtAG4q"
      },
      "source": [
        "## doc2vec"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Sx-7qmUnAG4q"
      },
      "outputs": [],
      "source": [
        "# Load the saved Doc2Vec model from the relative path word2vec_model/doc2vec\n",
        "model = Doc2Vec.load('word2vec_model/doc2vec')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "zBZWY26tAG4q"
      },
      "outputs": [],
      "source": [
        "# Independent float copy of the template for the doc2vec vectors\n",
        "doc2vec_data = np.array(basic_data)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "VtOFWmsmAG4r"
      },
      "outputs": [],
      "source": [
        "# Copy each article's document vector into the feature columns of its row.\n",
        "# The vector is looked up with the idx converted to a string, so documents were\n",
        "# presumably tagged with str(idx) when the model was trained (TODO confirm).\n",
        "# `i` is the DataFrame index label and is used directly as the row position.\n",
        "for i, row in df.iterrows():\n",
        "    doc2vec_data[i, 2:] = model.docvecs[str(row['idx'])]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "HVexRhi2AG4s"
      },
      "source": [
        "# prediction model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "HioP_zRDAG4t"
      },
      "outputs": [],
      "source": [
        "from sklearn.model_selection import train_test_split"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "NKbl6KvzAG4t"
      },
      "outputs": [],
      "source": [
        "# Split the articles into training and testing sets (80/20, stratified on the label).\n",
        "# A fixed random_state keeps the split identical across runs, so the AUC values\n",
        "# of the feature sets are reproducible and comparable.\n",
        "train, test = train_test_split(df, test_size=0.2, stratify=df['type'], random_state=42)\n",
        "# Row positions of each subset; they index the rows of the feature arrays.\n",
        "train_idx = np.array(train.index)\n",
        "test_idx = np.array(test.index)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "RzbtLrEAAG4t"
      },
      "outputs": [],
      "source": [
        "# Dictionary that collects one score per feature set (feature-set name -> test AUC)\n",
        "result = {}"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "-aDpdPyyAG4u"
      },
      "source": [
        "## Train models with XGBoost"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "TiQc2g9jAG4u"
      },
      "outputs": [],
      "source": [
        "# bag of words\n",
        "# eval_metric is set on the estimator: passing it to fit() is deprecated since\n",
        "# XGBoost 1.6 and rejected by XGBoost 2.x.\n",
        "# The classifier gets its own name so the Doc2Vec `model` is not overwritten.\n",
        "clf = xgb.XGBClassifier(eval_metric='auc')\n",
        "clf.fit(count_data[train_idx, 2:], count_data[train_idx, 1],\n",
        "        eval_set=[(count_data[test_idx, 2:], count_data[test_idx, 1])])\n",
        "\n",
        "# testing auc: AUC on the held-out rows after the final boosting round\n",
        "result['bag_of_words'] = clf.evals_result()['validation_0']['auc'][-1]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "N6IHir6NAG4v"
      },
      "outputs": [],
      "source": [
        "# tf-idf\n",
        "# eval_metric is set on the estimator: passing it to fit() is deprecated since\n",
        "# XGBoost 1.6 and rejected by XGBoost 2.x.\n",
        "# The classifier gets its own name so the Doc2Vec `model` is not overwritten.\n",
        "clf = xgb.XGBClassifier(eval_metric='auc')\n",
        "clf.fit(tfidf_data[train_idx, 2:], tfidf_data[train_idx, 1],\n",
        "        eval_set=[(tfidf_data[test_idx, 2:], tfidf_data[test_idx, 1])])\n",
        "\n",
        "# testing auc: AUC on the held-out rows after the final boosting round\n",
        "result['tf-idf'] = clf.evals_result()['validation_0']['auc'][-1]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "Yrk26aBUAG4v"
      },
      "outputs": [],
      "source": [
        "# average word2vec\n",
        "# eval_metric is set on the estimator: passing it to fit() is deprecated since\n",
        "# XGBoost 1.6 and rejected by XGBoost 2.x.\n",
        "# The classifier gets its own name so the Doc2Vec `model` is not overwritten.\n",
        "clf = xgb.XGBClassifier(eval_metric='auc')\n",
        "clf.fit(avg_data[train_idx, 2:], avg_data[train_idx, 1],\n",
        "        eval_set=[(avg_data[test_idx, 2:], avg_data[test_idx, 1])])\n",
        "\n",
        "# testing auc: AUC on the held-out rows after the final boosting round\n",
        "result['avg_word2vec'] = clf.evals_result()['validation_0']['auc'][-1]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "LworZW9AAG4w"
      },
      "outputs": [],
      "source": [
        "# doc2vec\n",
        "# eval_metric is set on the estimator: passing it to fit() is deprecated since\n",
        "# XGBoost 1.6 and rejected by XGBoost 2.x.\n",
        "# The classifier gets its own name so the Doc2Vec `model` is not overwritten.\n",
        "clf = xgb.XGBClassifier(eval_metric='auc')\n",
        "clf.fit(doc2vec_data[train_idx, 2:], doc2vec_data[train_idx, 1],\n",
        "        eval_set=[(doc2vec_data[test_idx, 2:], doc2vec_data[test_idx, 1])])\n",
        "\n",
        "# testing auc: AUC on the held-out rows after the final boosting round\n",
        "result['doc2vec'] = clf.evals_result()['validation_0']['auc'][-1]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Q69o6KD_AG4x"
      },
      "source": [
        "## plot result"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "LUJTEsUCAG40"
      },
      "outputs": [],
      "source": [
        "import matplotlib.pyplot as plt"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "XGpWGEtbAG41"
      },
      "outputs": [],
      "source": [
        "# Bar chart of the test AUC collected for each feature set.\n",
        "# The number of bars follows len(result) instead of a hard-coded 4, and the dict\n",
        "# views are turned into lists, so the plot still works when a feature set is\n",
        "# added or skipped.\n",
        "bar_positions = np.arange(len(result))\n",
        "plt.bar(bar_positions, list(result.values()))\n",
        "plt.xticks(bar_positions, list(result.keys()))\n",
        "plt.ylabel('test AUC')\n",
        "plt.title('XGBoost test AUC by feature set')\n",
        "plt.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "VJ5ykAHgAG41"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.12"
    },
    "colab": {
      "provenance": [],
      "include_colab_link": true
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}