{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# build model prediction and compare each features set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import xgboost as xgb\n",
    "import pickle\n",
    "import numpy as np\n",
    "import os\n",
    "\n",
    "from gensim.models import Doc2Vec, doc2vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 上傳資料\n",
    "!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part2_6.zip\n",
    "!unzip -q NLP_part2_6.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## read preprocess article df\n",
    "df = pd.read_csv('Data/article_preprocessed.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## drop data\n",
    "diff_threshold = 20\n",
    "df = df[abs(df['push']-df['boo']) > diff_threshold].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## define y\n",
    "df['type'] = np.clip(df['push']-df['boo'], 0, 1)\n",
    "df = df.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['type'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## create a numpy format data\n",
    "basic_data = np.zeros((df.shape[0], 258))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "basic_data[:, 0] = df['idx']\n",
    "basic_data[:, 1] = df['type']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## bag of words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## load bag of words result\n",
    "with open('Data/article_count', 'rb') as file:\n",
    "    _, count = pickle.load(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## select top 256 words (counts of document) \n",
    "most_count_id = np.array((count > 0).sum(axis=0))[0].argsort()[::-1][:256]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## subset data\n",
    "count = count[:, most_count_id]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "count_data = basic_data.copy().astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## subset bag of words matrix\n",
    "count_data[:, 2:] = count[count_data[:, 0]].toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 請產生 tf-idf, average word2vec, doc2vec features, 並分別建立 prediction model (xgboost, svm or linear regression...) , 比較 4 組 features 的結果 (AUC)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## code"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}