{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/TA-aiacademy/course_3.0/blob/v2-5_nlp/09_v2-5_NLP/Part2/04-1_word2vec_application.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "iFxoavs6v6Gw"
      },
      "source": [
        "# the application of word2vec"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "NoC3oHV8v6G4"
      },
      "outputs": [],
      "source": [
        "from gensim.models import word2vec\n",
        "from sklearn.cluster import KMeans\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import os"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "6OEo-1a4v6G6"
      },
      "outputs": [],
      "source": [
        "# 上傳資料\n",
        "!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part2_4.zip\n",
        "!unzip -q NLP_part2_4.zip"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "XvLfZDvHv6G8"
      },
      "outputs": [],
      "source": [
        "# load word2vec model\n",
        "model = word2vec.Word2Vec.load('word2vec_model/CBOW')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "JFs4ADhHv6G8"
      },
      "source": [
        "## similarity"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "9Mg-VH-cv6G9"
      },
      "outputs": [],
      "source": [
        "# get most similarity with given words\n",
        "# 可以替換關鍵字\n",
        "model.wv.most_similar('KMT')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "5i99ojIGv6G-"
      },
      "outputs": [],
      "source": [
        "# get most similarity with given words's relationship\n",
        "# 可以替換關鍵字\n",
        "model.wv.most_similar(positive=['KMT', '綠吱'], negative=['DPP'])"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "-MYpP-jlv6G-"
      },
      "source": [
        "## clustering"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "3AoozOTOv6G_"
      },
      "outputs": [],
      "source": [
        "# create a dictionary: words as key ; count as values\n",
        "words = {word: model.wv.get_vecattr(word, \"count\") for word in list(model.wv.index_to_key)}"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "PNlJH1fiv6HA"
      },
      "outputs": [],
      "source": [
        "# sort and select the top 10000 count of words\n",
        "words = sorted(words.items(), key=lambda x: x[1], reverse=True)\n",
        "words = words[:10000]\n",
        "words = np.array(words)[:, 0]\n",
        "words"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2u7DZB3rv6HA"
      },
      "outputs": [],
      "source": [
        "# extract the word vectors\n",
        "vecs = model.wv[words]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "z7bOrbd0v6HB"
      },
      "outputs": [],
      "source": [
        "# run clustering algorithm\n",
        "kmeans = KMeans(n_clusters=50)\n",
        "cluster = kmeans.fit_predict(vecs)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "onbvWYxxv6HB"
      },
      "outputs": [],
      "source": [
        "# print the result\n",
        "df = pd.DataFrame([words.tolist(), cluster.tolist()], index=['words', 'no. cluster']).T\n",
        "df.head(n=5)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Z4berA4Tv6HC"
      },
      "outputs": [],
      "source": [
        "## print every cluster of words\n",
        "data = pd.concat([d[['words']].reset_index(drop=True).rename(columns={'words': k}) for k, d in df.groupby('no. cluster')],\n",
        "                 axis=1)\n",
        "data"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "JtDvXsKbv6HD"
      },
      "outputs": [],
      "source": [
        "# 可以嘗試調整 Kmeans 的參數 , etc. 分個 100 群如何 ?"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.12"
    },
    "colab": {
      "provenance": [],
      "include_colab_link": true
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}