{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "view-in-github",
    "colab_type": "text"
   },
   "source": [
    "\"Open"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "NwMyKRYg8nuN"
   },
   "source": [
    "# How to generate document vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "IZimxra88nuS"
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "from gensim.models import Doc2Vec, doc2vec\n",
    "from gensim.models import word2vec\n",
    "import random\n",
    "import numpy as np\n",
    "import logging\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "knXkrlip8nuV"
   },
   "outputs": [],
   "source": [
    "logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')\n",
    "logging.root.setLevel(level=logging.INFO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "QuQfp8lL8nuW"
   },
   "outputs": [],
   "source": [
    "# download and unpack the course data\n",
    "# -nc / -o keep this cell safe to re-run (no duplicate zip, no overwrite prompt)\n",
    "!wget -q -nc https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part2_5.zip\n",
    "!unzip -q -o NLP_part2_5.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "4OMQF3E08nuW"
   },
   "outputs": [],
   "source": [
    "# load 'article_cutted'\n",
    "# NOTE: pickle.load can execute arbitrary code -- only unpickle files from a trusted source\n",
    "with open('Data/article_cutted', 'rb') as file:\n",
    "    data = pickle.load(file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ampiXgH48nuX"
   },
   "source": [
    "## Average word vector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "v-9d9QlL8nuY"
   },
   "outputs": [],
   "source": [
    "# load word2vec model\n",
    "model = word2vec.Word2Vec.load('word2vec_model/CBOW')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "g9Q4dP4z8nuZ"
   },
   "outputs": [],
   "source": [
    "# drop words that are not in word2vec's vocab\n",
    "data_filtered = [[w for w in words if w in model.wv] for words in data]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "l1aZQOlz8nuZ"
   },
   "outputs": [],
   "source": [
    "# compute average word vector\n",
    "avg_vector = []\n",
    "\n",
    "# Compute the mean word2vec vector of every document and store it in the list.\n",
    "# A document with no in-vocabulary words gets a zero vector so that\n",
    "# avg_vector stays aligned one-to-one with data.\n",
    "for words in data_filtered:\n",
    "    if words:\n",
    "        avg_vector.append(np.mean([model.wv[w] for w in words], axis=0))\n",
    "    else:\n",
    "        avg_vector.append(np.zeros(model.wv.vector_size, dtype=np.float32))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "N-ebEmrl8nua"
   },
   "outputs": [],
   "source": [
    "# save result\n",
    "with open('Data/avg_article_vector', 'wb') as file:\n",
    "    pickle.dump(avg_vector, file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "iY8nyoMr8nub"
   },
   "source": [
    "## doc2vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "jw49o9sr8nuc"
   },
   "outputs": [],
   "source": [
    "## create a document id map: each document is tagged with its index as a string\n",
    "sentence_list = [\n",
    "    doc2vec.TaggedDocument(words=words, tags=[str(i)])\n",
    "    for i, words in enumerate(data)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "CyeKcYMG8nue"
   },
   "outputs": [],
   "source": [
    "## print result\n",
    "sentence_list[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "twXkK81H8nue"
   },
   "outputs": [],
   "source": [
    "## define the Doc2Vec model\n",
    "## (own name, so the word2vec `model` loaded earlier is not overwritten)\n",
    "doc2vec_model = Doc2Vec(vector_size=256, min_count=5, window=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true,
    "id": "xPLc73qS8nuf"
   },
   "outputs": [],
   "source": [
    "## build vocabulary\n",
    "doc2vec_model.build_vocab(sentence_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true,
    "id": "YYa7DMZU8nuf"
   },
   "outputs": [],
   "source": [
    "# train doc2vec model\n",
    "# One train() call lets gensim decay the learning rate once across all 20\n",
    "# epochs; calling train(epochs=1) in a loop restarts the decay on every call.\n",
    "random.shuffle(sentence_list)\n",
    "doc2vec_model.train(\n",
    "    sentence_list,\n",
    "    total_examples=doc2vec_model.corpus_count,\n",
    "    epochs=20,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "m-cKdcFq8nuf"
   },
   "outputs": [],
   "source": [
    "## print result (`docvecs` is exposed as `dv` in gensim >= 4.0)\n",
    "doc2vec_model.docvecs['0']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "_cD4hJrR8nuf"
   },
   "outputs": [],
   "source": [
    "# save result\n",
    "doc2vec_model.save('word2vec_model/doc2vec')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "IinsfWjh8nuf"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
"display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.12" }, "colab": { "provenance": [], "include_colab_link": true } }, "nbformat": 4, "nbformat_minor": 0 }