{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "\"Open"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "pbFhiF22dmV0"
      },
      "source": [
        "# How to build word relationships? word2vec!\n",
        "\n",
        "Train a CBOW word2vec model with gensim on the corpus stored in `Data/article_cutted`, look up the vector of one example word, and save the trained model to disk."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "xlwq11QGdmXA"
      },
      "outputs": [],
      "source": [
        "import logging\n",
        "import os\n",
        "import pickle\n",
        "import random\n",
        "\n",
        "from gensim.models import word2vec"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ooVpnA2sdmXb"
      },
      "outputs": [],
      "source": [
        "# Route INFO-level log records (including gensim's progress messages) to the cell output.\n",
        "logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')\n",
        "logging.root.setLevel(level=logging.INFO)\n",
        "\n",
        "# Settings a reader might want to tune.\n",
        "DATA_PATH = 'Data/article_cutted'\n",
        "MODEL_DIR = 'word2vec_model'\n",
        "N_EPOCHS = 20\n",
        "SEED = 42\n",
        "\n",
        "# Seed Python's RNG so the per-epoch shuffle order is repeatable.\n",
        "random.seed(SEED)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "HQVh1AYHdmXe"
      },
      "outputs": [],
      "source": [
        "# Download and unpack the course data.\n",
        "# -nc skips the download when the archive is already present; -o overwrites\n",
        "# previously extracted files without prompting, so this cell is safe to re-run.\n",
        "!wget -q -nc https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part2_3.zip\n",
        "!unzip -q -o NLP_part2_3.zip"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "kux4hc81dmXh"
      },
      "outputs": [],
      "source": [
        "# Load the pickled corpus.\n",
        "# NOTE: pickle.load can execute arbitrary code; only unpickle files from a trusted source.\n",
        "# Presumably a list of token lists (it is fed to build_vocab/train below) - confirm.\n",
        "with open(DATA_PATH, 'rb') as file:\n",
        "    data = pickle.load(file)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "SjMGOMBfdmXl"
      },
      "outputs": [],
      "source": [
        "# Create the (still untrained) word2vec model.\n",
        "# sg=0 CBOW ; sg=1 skip-gram\n",
        "model = word2vec.Word2Vec(vector_size=256, min_count=5, window=5, sg=0)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "oZ1DRT7odmXn"
      },
      "outputs": [],
      "source": [
        "# Build the vocabulary from the corpus.\n",
        "model.build_vocab(data)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "292VISfvdmXq"
      },
      "outputs": [],
      "source": [
        "# Train for N_EPOCHS passes, reshuffling the corpus before each pass.\n",
        "# Calling train() once per epoch restarts gensim's learning-rate decay on every\n",
        "# call, so each call is given its own slice of a single linear decay from the\n",
        "# model's initial alpha down to its min_alpha.\n",
        "alpha_start, alpha_end = model.alpha, model.min_alpha\n",
        "alpha_step = (alpha_start - alpha_end) / N_EPOCHS\n",
        "for epoch in range(N_EPOCHS):\n",
        "    random.shuffle(data)\n",
        "    model.train(\n",
        "        data,\n",
        "        total_examples=model.corpus_count,\n",
        "        epochs=1,\n",
        "        start_alpha=alpha_start - epoch * alpha_step,\n",
        "        end_alpha=alpha_start - (epoch + 1) * alpha_step,\n",
        "    )"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "SGxSIw9qdmXs"
      },
      "outputs": [],
      "source": [
        "# Show the learned vector of one example word.\n",
        "model.wv['人工智慧']"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "scrolled": true,
        "id": "v6rE7_AodmXy"
      },
      "outputs": [],
      "source": [
        "# Save the trained model.\n",
        "# exist_ok=True lets this cell be re-run without raising FileExistsError.\n",
        "os.makedirs(MODEL_DIR, exist_ok=True)\n",
        "model.save(os.path.join(MODEL_DIR, 'CBOW'))"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.12"
    },
    "colab": {
      "provenance": [],
      "include_colab_link": true
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}