{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/TA-aiacademy/course_3.0/blob/v2-5_nlp/09_v2-5_NLP/Part2/04-2_word2vec_application_ans.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "heoTQhHLdngf"
      },
      "source": [
        "# the application of word2vec"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "sUE9zPQZdngs"
      },
      "outputs": [],
      "source": [
        "from gensim.models import word2vec\n",
        "from sklearn.cluster import KMeans\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import os"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "N9n6O1FWdng3"
      },
      "outputs": [],
      "source": [
        "# Download the course archive and extract it into the working directory.\n",
        "# -nc skips the download when the zip is already present; -o overwrites previously\n",
        "# extracted files without prompting, so this cell can be re-run safely.\n",
        "!wget -q -nc https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part2_4.zip\n",
        "!unzip -q -o NLP_part2_4.zip"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "glutUXLsdng5"
      },
      "outputs": [],
      "source": [
        "# restore the saved word2vec model stored at word2vec_model/CBOW\n",
        "MODEL_PATH = 'word2vec_model/CBOW'\n",
        "model = word2vec.Word2Vec.load(MODEL_PATH)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "_-LCLFuldng6"
      },
      "source": [
        "## similarity"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "YQvRxcLxdng7"
      },
      "outputs": [],
      "source": [
        "# list the vocabulary entries most similar to the word 'KMT'\n",
        "model.wv.most_similar(positive=['KMT'])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "df8USRcedng9"
      },
      "outputs": [],
      "source": [
        "# analogy query: start from 'KMT', add '綠吱' and subtract 'DPP',\n",
        "# i.e. look for the word that is to 'KMT' what '綠吱' is to 'DPP'\n",
        "model.wv.most_similar(\n",
        "    positive=['KMT', '綠吱'],\n",
        "    negative=['DPP'],\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Im2jqQSTdng_"
      },
      "source": [
        "## clustering"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "U3xzZXwZdnhA"
      },
      "outputs": [],
      "source": [
        "# build a lookup table: vocabulary word -> its stored \"count\" attribute\n",
        "words = {}\n",
        "for token in model.wv.index_to_key:\n",
        "    words[token] = model.wv.get_vecattr(token, \"count\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "KJ8tyX9DdnhC"
      },
      "outputs": [],
      "source": [
        "# rank the vocabulary by count (highest first) and keep the most frequent words\n",
        "# NOTE: this cell rebinds `words` from a dict to an array of strings, so re-run the\n",
        "# previous cell first if this one has to be executed again\n",
        "TOP_N_WORDS = 10000\n",
        "ranked_words = sorted(words.items(), key=lambda item: item[1], reverse=True)\n",
        "# build the array from the word strings only; a 2-D array of (word, count) pairs\n",
        "# would coerce every count to a string just to throw that column away\n",
        "words = np.array([word for word, _ in ranked_words[:TOP_N_WORDS]])\n",
        "words"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "DqFOiALYdnhD"
      },
      "outputs": [],
      "source": [
        "# look up the embedding of every selected word in a single indexing call\n",
        "keyed_vectors = model.wv\n",
        "vecs = keyed_vectors[words]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "mC3MfYS0dnhD"
      },
      "outputs": [],
      "source": [
        "# cluster the word vectors with k-means\n",
        "# the seed is fixed because k-means starts from random centroids; without it the\n",
        "# cluster ids (and possibly the memberships) change on every run\n",
        "# n_init is spelled out because its default differs between scikit-learn versions\n",
        "N_CLUSTERS = 50\n",
        "RANDOM_SEED = 42\n",
        "kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=RANDOM_SEED)\n",
        "cluster = kmeans.fit_predict(vecs)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "yDQ-N2xZdnhE"
      },
      "outputs": [],
      "source": [
        "# tabulate each word next to the cluster id assigned to it\n",
        "# building the frame column-wise keeps the cluster ids as integers; stacking the two\n",
        "# lists as rows and transposing would leave both columns with dtype object\n",
        "df = pd.DataFrame({'words': words, 'no. cluster': cluster})\n",
        "df.head(n=5)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "weVNxbbtdnhF"
      },
      "outputs": [],
      "source": [
        "# groupby returns a lazy DataFrameGroupBy object; displaying it shows only its repr,\n",
        "# not the groups themselves (the next cell iterates over it to reveal them)\n",
        "df.groupby(by='no. cluster')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "VVWK2NxodnhG"
      },
      "outputs": [],
      "source": [
        "# iterating a groupby yields (cluster id, sub-frame holding that cluster's rows)\n",
        "for cluster_id, members in df.groupby('no. cluster'):\n",
        "    print(cluster_id, members, sep='\\n')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "WE7XG3IVdnhG"
      },
      "outputs": [],
      "source": [
        "# lay the clusters out side by side: one column per cluster id, its words listed\n",
        "# from row 0 downwards; shorter clusters are padded with NaN by the column-wise concat\n",
        "cluster_columns = []\n",
        "for cluster_id, members in df.groupby('no. cluster'):\n",
        "    column = members[['words']].reset_index(drop=True)\n",
        "    cluster_columns.append(column.rename(columns={'words': cluster_id}))\n",
        "data = pd.concat(cluster_columns, axis=1)\n",
        "data"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "tMKamMsrdnhH"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.12"
    },
    "colab": {
      "provenance": [],
      "include_colab_link": true
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}