{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/TA-aiacademy/course_3.0/blob/v2-5_nlp/09_v2-5_NLP/Part2/03-2_word2vec_build_ans.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "pbFhiF22dmV0"
      },
      "source": [
        "# how to build words relationship ? word2vec !!"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "xlwq11QGdmXA"
      },
      "outputs": [],
      "source": [
        "import pickle\n",
        "from gensim.models import word2vec\n",
        "import random\n",
        "import logging\n",
        "import os"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ooVpnA2sdmXb"
      },
      "outputs": [],
      "source": [
        "logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')\n",
        "logging.root.setLevel(level=logging.INFO)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "HQVh1AYHdmXe"
      },
      "outputs": [],
      "source": [
        "# 上傳資料\n",
        "!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part2_3.zip\n",
        "!unzip -q NLP_part2_3.zip"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "kux4hc81dmXh"
      },
      "outputs": [],
      "source": [
        "## load 'article_cutted'\n",
        "with open('Data/article_cutted', 'rb') as file:\n",
        "    data = pickle.load(file)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "SjMGOMBfdmXl"
      },
      "outputs": [],
      "source": [
        "# build word2vec\n",
        "# sg=0 CBOW ; sg=1 skip-gram\n",
        "model = word2vec.Word2Vec(vector_size=256, min_count=5, window=5, sg=0)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "oZ1DRT7odmXn"
      },
      "outputs": [],
      "source": [
        "# build vocabulary\n",
        "model.build_vocab(data)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "292VISfvdmXq"
      },
      "outputs": [],
      "source": [
        "# train word2vec model ; shuffle data every epoch\n",
        "for i in range(20):\n",
        "    random.shuffle(data)\n",
        "    model.train(data, total_examples=len(data), epochs=1)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "SGxSIw9qdmXs"
      },
      "outputs": [],
      "source": [
        "## print an example\n",
        "model.wv['人工智慧']"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "v6rE7_AodmXy"
      },
      "outputs": [],
      "source": [
        "## save model\n",
        "os.mkdir('word2vec_model')\n",
        "model.save('word2vec_model/CBOW')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Uj1K4lREdmX4"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.12"
    },
    "colab": {
      "provenance": [],
      "include_colab_link": true
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}