{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "hmDNzNm_VlrR" }, "source": [ "# RNN text classification" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "g-_VNikcVlra" }, "outputs": [], "source": [ "import numpy as np\n", "import os\n", "import pandas as pd\n", "import time\n", "\n", "from pprint import pprint\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import classification_report, confusion_matrix\n", "\n", "import tensorflow as tf\n", "import tensorflow_datasets as tfds\n", "print('tensorflow version: ', tf.__version__)\n", "\n", "# 指定使用第一張GPU\n", "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "4F52YnIHVlrg" }, "outputs": [], "source": [ "# 上傳資料\n", "!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part3.zip\n", "!unzip -q NLP_part3.zip" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "LA6wWwcqVlri" }, "outputs": [], "source": [ "output_dir = \"Data\"\n", "zh_vocab_file = os.path.join(output_dir, \"zh_vocab\")\n", "checkpoint_path = os.path.join(output_dir, \"checkpoints.h5\")" ] }, { "cell_type": "markdown", "metadata": { "id": "D8kAKoGIVlrk" }, "source": [ "## Load Data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "4_fjI1w6Vlrm" }, "outputs": [], "source": [ "ptt_gossip = pd.read_csv('Data/ptt_gossip.csv')\n", "ptt_gossip.drop(columns='idx', inplace=True)\n", "print(ptt_gossip.shape)\n", "ptt_gossip.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "n35_p8zLVlro" }, "source": [ "## Filter sentence length\n", "\n", "依照句子長度過濾" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "be0ezjxqVlrq" }, "outputs": [], "source": [ "max_length = 256\n", "\n", "ptt_gossip = ptt_gossip[ptt_gossip.sentence.str.len() < max_length]\n", "ptt_gossip.reset_index(drop=True, inplace=True)\n", "print(ptt_gossip.shape)\n", "ptt_gossip.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "R1diPCggVlrr" }, "source": [ "## Train validation split" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zxeDWZi0Vlrt" }, "outputs": [], "source": [ "valid_size = 0.2\n", "X_train, X_valid, y_train, y_valid = train_test_split(ptt_gossip['sentence'],\n", " ptt_gossip['label'],\n", " test_size=valid_size,\n", " shuffle=True)" ] }, { "cell_type": "markdown", "metadata": { "id": "FrC7e-huVlru" }, "source": [ "## Pre-processing\n", "\n", "1. 將資料轉換成`tf.tensor`格式。\n", "2. 
{ "cell_type": "markdown", "metadata": { "id": "FrC7e-huVlru" }, "source": [ "## Pre-processing\n", "\n", "1. Convert the data into `tf.tensor` format.\n", "2. Tokenize with `tfds.deprecated.text.SubwordTextEncoder`; setting `max_subword_length=1` makes the tokenization `character-level`." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "-LxkN9eFVlrv" }, "outputs": [], "source": [ "train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))\n", "valid_dataset = tf.data.Dataset.from_tensor_slices((X_valid, y_valid))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "nKq_GeRnVlrv" }, "outputs": [], "source": [ "%%time\n", "try:\n", "    tokenizer_zh = tfds.deprecated.text.SubwordTextEncoder.load_from_file(zh_vocab_file)\n", "    print('Load Chinese vocabulary: %s' % zh_vocab_file)\n", "except Exception:\n", "    print('Build Chinese vocabulary: %s' % zh_vocab_file)\n", "    tokenizer_zh = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(\n", "        (x.numpy() for x, y in train_dataset),\n", "        max_subword_length=1,\n", "        target_vocab_size=2**13)\n", "    tokenizer_zh.save_to_file(zh_vocab_file)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "M39QBBfHVlrw" }, "outputs": [], "source": [ "print('Vocabulary size: ', tokenizer_zh.vocab_size)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "QLCDja4WVlrw" }, "outputs": [], "source": [ "tokenizer_zh" ] }, { "cell_type": "markdown", "metadata": { "id": "xolgu3JsVlrx" }, "source": [ "### Example" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "6eaAyc8DVlrx" }, "outputs": [], "source": [ "sentence = '文瑋助教真壯'\n", "token_id = tokenizer_zh.encode(sentence)\n", "\n", "print('Sentence token_id: ', token_id)\n", "print('Tokenization: ', [tokenizer_zh.decode([t]) for t in token_id])" ] }, { "cell_type": "markdown", "metadata": { "id": "nI7c8XT1Vlry" }, "source": [ "## Convert to token_id\n", "\n", "Training needs every character mapped to its `token_id`, so we use `.map` to convert the sentences in `train_dataset` into `token_id` sequences." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "wplXs61MVlry" }, "outputs": [], "source": [ "def encode(sentence, label):\n", "    zh_id = tokenizer_zh.encode(sentence.numpy())\n", "    return (tf.cast(zh_id, tf.int32), tf.cast(label, tf.int32))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ekO4fYrPVlrz" }, "outputs": [], "source": [ "def tf_encode(sentence, label):\n", "    \"\"\"\n", "    encode calls .numpy(), which only works on eager tensors,\n", "    but Dataset.map passes symbolic tensors; wrapping encode in\n", "    tf.py_function makes it run eagerly.\n", "    \"\"\"\n", "    return tf.py_function(encode, [sentence, label], [tf.int32, tf.int32])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "dQfez7KXVlrz" }, "outputs": [], "source": [ "train_dataset = train_dataset.map(tf_encode)\n", "valid_dataset = valid_dataset.map(tf_encode)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mA0B2zJVVlr0" }, "outputs": [], "source": [ "tmp_valid = next(iter(valid_dataset))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "l2jPcls-Vlr0" }, "outputs": [], "source": [ "pprint(tmp_valid)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Lk6IqiIWVlr1" }, "outputs": [], "source": [ "pprint(tokenizer_zh.decode(tmp_valid[0].numpy()))" ] },
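{ "cell_type": "markdown", "metadata": {}, "source": [ "To see why the `tf.py_function` wrapper is necessary, here is a toy sketch with a made-up one-sentence dataset: the function handed to `Dataset.map` is traced in graph mode, so it receives symbolic tensors that have no `.numpy()` method." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Toy sketch: what a function passed to Dataset.map actually receives.\n", "demo = tf.data.Dataset.from_tensor_slices((['一個句子'], [0]))\n", "\n", "def peek(sentence, label):\n", "    print('map sees:', sentence)  # a symbolic tf.Tensor; .numpy() would fail here\n", "    return sentence, label\n", "\n", "_ = demo.map(peek)  # the print fires once, while the function is traced" ] },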
{ "cell_type": "markdown", "metadata": { "id": "nqbQ1PDuVlr2" }, "source": [ "## Input pipeline\n", "\n", "We use `tf.data.Dataset.from_tensor_slices` to build a `generator` that yields `batch_size` sentences per training step. A `generator` is usually built because the data is too large to load into memory at once; here it is used for demonstration.\n", "\n", "1. `.shuffle()`: shuffles with a buffer of `buffer_size` elements; each training example is drawn at random from the `buffer`, which is then refilled from the dataset. An appropriate `buffer_size` matters: if it is too small, the `buffer` may end up holding examples of a single class. The safest choice is to set `buffer_size` to the number of training examples (`len(X_train)`), which guarantees a full shuffle.\n", "\n", "2. `.padded_batch()`: pads each `batch` to match the model's input format (see the toy sketch after the example batch below).\n", "\n", "3. `.repeat()`: repeats the dataset `epochs` times, one full pass per training epoch." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "vjbCoFQuVlr2" }, "outputs": [], "source": [ "buffer_size = len(X_train)\n", "\n", "embedding_size = 256\n", "rnn_units = 512\n", "\n", "batch_size = 64\n", "epochs = 10" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "LwhtmM1sVlr3" }, "outputs": [], "source": [ "train_dataset = (train_dataset\n", "                 .shuffle(buffer_size)\n", "                 .padded_batch(batch_size, padded_shapes=([-1], []), drop_remainder=True)\n", "                 .repeat(epochs))\n", "valid_dataset = valid_dataset.padded_batch(batch_size, padded_shapes=([-1], []))" ] }, { "cell_type": "markdown", "metadata": { "id": "Tx6XOnx8Vlr3" }, "source": [ "### Example\n", "\n", "Call the `generator` with `iter` to inspect a single `batch`." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "I_53vtQSVlr4" }, "outputs": [], "source": [ "tmp_generator = iter(train_dataset)\n", "tmp_x, tmp_y = next(tmp_generator)\n", "\n", "print('Sentence.shape: ', tmp_x.shape)\n", "print(tmp_x)\n", "print('-'*20)\n", "print('Label.shape: ', tmp_y.shape)\n", "print(tmp_y)" ] },
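{ "cell_type": "markdown", "metadata": {}, "source": [ "The toy sketch below (made-up integer sequences, not the real data) shows what `.padded_batch` does: every batch is zero-padded up to the length of its longest sequence." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Toy sketch: padded_batch zero-pads each batch to its longest element.\n", "toy = tf.data.Dataset.from_generator(\n", "    lambda: iter([[1, 2], [3, 4, 5], [6]]),\n", "    output_signature=tf.TensorSpec(shape=[None], dtype=tf.int32))\n", "\n", "for padded in toy.padded_batch(3, padded_shapes=[None]):\n", "    print(padded.numpy())  # -> [[1 2 0] [3 4 5] [6 0 0]]" ] },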
{ "cell_type": "markdown", "metadata": { "id": "HUFubn7vVlr5" }, "source": [ "## Define LSTM model\n", "\n", "`tensorflow 2` defaults to `eager mode`, which helps with debugging and inspecting intermediate values while writing the model.\n", "\n", "We build the model on top of `tf.keras`. Note that the `lstm` layer expects inputs of shape `(timesteps, feature_size)`, and a few common arguments deserve attention:\n", "\n", "1. `embedding_size`: the size of each character's embedding vector.\n", "2. `rnn_units`: the number of units in the `lstm` layer.\n", "3. `return_sequences`: whether to output every `timestep`'s result (`hidden_state`); the output shape becomes `(batch_size, timesteps, rnn_units)`.\n", "4. `return_state`: whether to also return the last `timestep`'s result (`hidden_state` and `cell_state`).\n", "\n", "`3.` and `4.` overlap somewhat; usually only the last `timestep` is used as the output. Here we set `return_sequences` to `True` and `slice` out the last `hidden_state`.\n", "\n", "Finally, `tf.keras.layers.Dense` outputs the probabilities of the `2` classes." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "QjouEid4Vlr5" }, "outputs": [], "source": [ "def rnn_model(batch_size, rnn_units):\n", "    input_layer = tf.keras.Input(shape=[None], batch_size=batch_size)\n", "    embedding_layer = tf.keras.layers.Embedding(tokenizer_zh.vocab_size, embedding_size)(input_layer)\n", "\n", "    lstm = tf.keras.layers.LSTM(units=rnn_units,\n", "                                activation='tanh',\n", "                                recurrent_activation='sigmoid',\n", "                                use_bias=True,\n", "                                return_sequences=True,\n", "                                return_state=False,\n", "                                recurrent_initializer='glorot_uniform')\n", "\n", "    # (batch_size, timesteps, rnn_units)\n", "    lstm_hidden_states = lstm(embedding_layer)\n", "\n", "    # keep only the hidden state of the last timestep\n", "    lstm_last_state = lstm_hidden_states[:, -1, :]\n", "\n", "    output = tf.keras.layers.Dense(2, activation='softmax', name='output')(lstm_last_state)\n", "\n", "    return input_layer, output" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "YWffKEy2Vlr6" }, "outputs": [], "source": [ "input_layer, output = rnn_model(batch_size, rnn_units)\n", "model = tf.keras.Model(inputs=input_layer, outputs=output)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "cTGyZ2GYVlr6" }, "outputs": [], "source": [ "model.compile(loss='sparse_categorical_crossentropy',\n", "              optimizer=tf.keras.optimizers.Adam(1e-4),\n", "              metrics=['accuracy'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "M4X-xfSrVlr7" }, "outputs": [], "source": [ "model.summary()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "qCyxBbzcVlr7" }, "outputs": [], "source": [ "history = model.fit(train_dataset,\n", "                    epochs=epochs,\n", "                    steps_per_epoch=len(X_train) // batch_size,\n", "                    validation_data=valid_dataset,\n", "                    validation_steps=len(X_valid) // batch_size)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Rh9I1MgMVlr8" }, "outputs": [], "source": [ "model.save(checkpoint_path)" ] }, { "cell_type": "markdown", "metadata": { "id": "W5Ns98khVlr8" }, "source": [ "## Testing prediction\n", "\n", "Inspect `precision`, `recall`, `f1-score`, and the `confusion matrix` on the held-out validation set." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "pYFcD5boVlr9" }, "outputs": [], "source": [ "valid_pred = model.predict(valid_dataset)\n", "valid_pred_id = np.argmax(valid_pred, axis=-1)\n", "valid_true_id = np.array(y_valid)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "NtywlaRtVlr9" }, "outputs": [], "source": [ "print(classification_report(y_pred=valid_pred_id, y_true=valid_true_id))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "4omYfzzqVlsG" }, "outputs": [], "source": [ "confm = confusion_matrix(y_pred=valid_pred_id, y_true=valid_true_id)\n", "pd.DataFrame(confm, index=['Actual_0', 'Actual_1'], columns=['Pred_0', 'Pred_1'])" ] },
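{ "cell_type": "markdown", "metadata": {}, "source": [ "As a last check, a single new sentence can be classified by hand. A minimal sketch with a made-up sentence: because the model was built with a fixed `batch_size`, the encoded sentence is tiled to fill one batch and only the first row of predictions is read back." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch: classify one new sentence (the sentence below is made up).\n", "sample = '今天發生了什麼事'\n", "sample_ids = tokenizer_zh.encode(sample)\n", "\n", "# The Input layer has a fixed batch_size, so tile the ids to fill one batch.\n", "sample_batch = tf.constant([sample_ids] * batch_size)\n", "\n", "probs = model.predict(sample_batch)[0]\n", "print('P(label=0) = %.3f, P(label=1) = %.3f' % (probs[0], probs[1]))" ] },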
"include_colab_link": true }, "accelerator": "GPU" }, "nbformat": 4, "nbformat_minor": 0 }