{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "\"Open"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "s-c_LNpRn0JA"
      },
      "source": [
        "# How to generate document vectors"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "VXLfRSZQn0JM"
      },
      "outputs": [],
      "source": [
        "import pickle\n",
        "from gensim.models import Doc2Vec, doc2vec\n",
        "from gensim.models import word2vec\n",
        "import random\n",
        "import numpy as np\n",
        "import logging\n",
        "import os"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "1YuEgVBCn0JR"
      },
      "outputs": [],
      "source": [
        "logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')\n",
        "logging.root.setLevel(level=logging.INFO)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Ku1P9z49n0JS"
      },
      "outputs": [],
      "source": [
        "# Fetch the course data archive and unzip it\n",
        "!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part2_5.zip\n",
        "!unzip -q NLP_part2_5.zip"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "okz6S7m1n0JT"
      },
      "outputs": [],
      "source": [
        "# Load the pickled corpus 'article_cutted' (used below as a list of word lists).\n",
        "# NOTE: pickle.load can execute arbitrary code -- only unpickle files from a trusted source.\n",
        "with open('Data/article_cutted', 'rb') as file:\n",
        "    data = pickle.load(file)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "eArg85Iqn0JU"
      },
      "source": [
        "## Average word vector"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-ORzQPDrn0JV"
      },
      "outputs": [],
      "source": [
        "# Load the saved word2vec model. It gets its own name so the Doc2Vec model\n",
        "# defined further down cannot shadow it when cells are re-run out of order.\n",
        "w2v_model = word2vec.Word2Vec.load('word2vec_model/CBOW')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "lOGLtm41n0JX"
      },
      "outputs": [],
      "source": [
        "# Keep only the words that are present in the word2vec vocabulary\n",
        "data_filtered = [[w for w in doc if w in w2v_model.wv] for doc in data]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "gx_e_dg6n0JZ"
      },
      "outputs": [],
      "source": [
        "# Compute the average word vector of every document\n",
        "vector_size = w2v_model.wv.vector_size\n",
        "avg_vector = []\n",
        "\n",
        "for doc in data_filtered:\n",
        "    if len(doc) == 0:\n",
        "        # No in-vocabulary word: fall back to a zero vector that has the model's own\n",
        "        # dimensionality and a float dtype (instead of a hard-coded 256 integers).\n",
        "        avg_vector.append(np.zeros(vector_size, dtype=np.float32))\n",
        "    else:\n",
        "        avg_vector.append(np.mean([w2v_model.wv[w] for w in doc], axis=0))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Zk3JeScnn0Ja"
      },
      "outputs": [],
      "source": [
        "# Show the averaged vector of the first document\n",
        "avg_vector[0]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ZVPbbnlrn0Jb"
      },
      "outputs": [],
      "source": [
        "# Save the averaged document vectors\n",
        "with open('Data/avg_article_vector', 'wb') as file:\n",
        "    pickle.dump(avg_vector, file)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "67joC_w6n0Jc"
      },
      "source": [
        "## Doc2Vec"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "3GKQ_Xpmn0Jd"
      },
      "outputs": [],
      "source": [
        "# Wrap every document in a TaggedDocument tagged with its index (as a string)\n",
        "sentence_list = [\n",
        "    doc2vec.TaggedDocument(words=doc, tags=[str(i)])\n",
        "    for i, doc in enumerate(data)\n",
        "]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-fCrjQ6rn0Jd"
      },
      "outputs": [],
      "source": [
        "# Show the first tagged document\n",
        "sentence_list[0]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "37Nmfehsn0Jd"
      },
      "outputs": [],
      "source": [
        "# Define the Doc2Vec model\n",
        "model = Doc2Vec(vector_size=256, min_count=5, window=4)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "daY-4cg5n0Je"
      },
      "outputs": [],
      "source": [
        "# Build the vocabulary from the tagged documents\n",
        "model.build_vocab(sentence_list)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "rudH0wAjn0Je"
      },
      "outputs": [],
      "source": [
        "# Train the Doc2Vec model.\n",
        "# train() is called once with epochs=20 so gensim decays the learning rate over the\n",
        "# whole run; calling it 20 times with epochs=1 restarts the schedule on every pass.\n",
        "# The documents are shuffled once into a copy so sentence_list keeps its order.\n",
        "shuffled_docs = random.sample(sentence_list, len(sentence_list))\n",
        "model.train(shuffled_docs, total_examples=model.corpus_count, epochs=20)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "r1uOIgD9n0Jf"
      },
      "outputs": [],
      "source": [
        "# Look up the learned vector of the document tagged '0'.\n",
        "# gensim >= 4 exposes the document vectors as `dv`; older releases only have `docvecs`.\n",
        "doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs\n",
        "doc_vectors['0']"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "cOWP7q0tn0Jf"
      },
      "outputs": [],
      "source": [
        "# Save the trained Doc2Vec model\n",
        "model.save('word2vec_model/doc2vec')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "0p6ZHkk1n0Jg"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.12"
    },
    "colab": {
      "provenance": [],
      "include_colab_link": true
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}