{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/TA-aiacademy/course_3.0/blob/v2-5_nlp/09_v2-5_NLP/Part2/05-1_document_vector.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "NwMyKRYg8nuN"
      },
      "source": [
        "# How to generate document vectors"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "IZimxra88nuS"
      },
      "outputs": [],
      "source": [
        "import pickle\n",
        "from gensim.models import Doc2Vec, doc2vec\n",
        "from gensim.models import word2vec\n",
        "import random\n",
        "import numpy as np\n",
        "import logging\n",
        "import os"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "knXkrlip8nuV"
      },
      "outputs": [],
      "source": [
        "logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')\n",
        "logging.root.setLevel(level=logging.INFO)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "QuQfp8lL8nuW"
      },
      "outputs": [],
      "source": [
        "# download and unzip the course data\n",
        "!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part2_5.zip\n",
        "!unzip -q NLP_part2_5.zip"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "4OMQF3E08nuW"
      },
      "outputs": [],
      "source": [
        "# load 'article_cutted'\n",
        "with open('Data/article_cutted', 'rb') as file:\n",
        "    data = pickle.load(file)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ampiXgH48nuX"
      },
      "source": [
        "## Average word vector"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "v-9d9QlL8nuY"
      },
      "outputs": [],
      "source": [
        "# load word2vec model\n",
        "model = word2vec.Word2Vec.load('word2vec_model/CBOW')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "g9Q4dP4z8nuZ"
      },
      "outputs": [],
      "source": [
        "# keep only the words that exist in the word2vec vocabulary,\n",
        "# preserving document order and word order\n",
        "data_filtered = [\n",
        "    [word for word in doc if word in model.wv]\n",
        "    for doc in data\n",
        "]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "l1aZQOlz8nuZ"
      },
      "outputs": [],
      "source": [
        "# compute average word vector\n",
        "# Average the word2vec vectors of the in-vocabulary words of every document\n",
        "# and collect one vector per document in a list (same order as `data_filtered`).\n",
        "# A document with no in-vocabulary words has nothing to average, so it\n",
        "# falls back to a zero vector instead of producing NaN.\n",
        "zero_vector = np.zeros(model.wv.vector_size, dtype=np.float32)\n",
        "\n",
        "avg_vector = [\n",
        "    np.mean([model.wv[word] for word in words], axis=0) if words else zero_vector.copy()\n",
        "    for words in data_filtered\n",
        "]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "N-ebEmrl8nua"
      },
      "outputs": [],
      "source": [
        "# save result\n",
        "with open('Data/avg_article_vector', 'wb') as file:\n",
        "    pickle.dump(avg_vector, file)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "iY8nyoMr8nub"
      },
      "source": [
        "## doc2vec"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "jw49o9sr8nuc"
      },
      "outputs": [],
      "source": [
        "## create a document id map\n",
        "# every document is tagged with its position in `data`, stored as a string\n",
        "sentence_list = [\n",
        "    doc2vec.TaggedDocument(words=words, tags=[str(doc_id)])\n",
        "    for doc_id, words in enumerate(data)\n",
        "]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "CyeKcYMG8nue"
      },
      "outputs": [],
      "source": [
        "## print result\n",
        "sentence_list[0]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "twXkK81H8nue"
      },
      "outputs": [],
      "source": [
        "## define the Doc2Vec model\n",
        "model = Doc2Vec(vector_size=256, min_count=5, window=4)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "xPLc73qS8nuf"
      },
      "outputs": [],
      "source": [
        "## build vocabulary\n",
        "model.build_vocab(sentence_list)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "YYa7DMZU8nuf"
      },
      "outputs": [],
      "source": [
        "# train the doc2vec model for 20 epochs\n",
        "# A single train() call lets gensim decay the learning rate once, from\n",
        "# `alpha` down to `min_alpha`, across all epochs. Calling train() with\n",
        "# epochs=1 inside a loop restarts that decay on every call.\n",
        "random.shuffle(sentence_list)\n",
        "model.train(sentence_list, total_examples=model.corpus_count, epochs=20)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "m-cKdcFq8nuf"
      },
      "outputs": [],
      "source": [
        "## print result\n",
        "model.docvecs['0']"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "_cD4hJrR8nuf"
      },
      "outputs": [],
      "source": [
        "# save result\n",
        "model.save('word2vec_model/doc2vec')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "IinsfWjh8nuf"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.12"
    },
    "colab": {
      "provenance": [],
      "include_colab_link": true
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}