{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "view-in-github",
    "colab_type": "text"
   },
   "source": [
    "\"Open"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "heoTQhHLdngf"
   },
   "source": [
    "# The application of word2vec\n",
    "\n",
    "This notebook loads a saved word2vec model and uses it for two tasks:\n",
    "\n",
    "1. **Similarity** - query the words closest to a given word, and run a vector-arithmetic (analogy) query.\n",
    "2. **Clustering** - run k-means on the vectors of the most frequent words and list the words in each cluster."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "sUE9zPQZdngs"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from gensim.models import word2vec\n",
    "from sklearn.cluster import KMeans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# settings used by the clustering section\n",
    "TOP_N_WORDS = 10000  # how many of the most frequent words are clustered\n",
    "N_CLUSTERS = 50      # number of k-means clusters\n",
    "SEED = 42            # fixed seed so the cluster assignment is the same on every run"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "N9n6O1FWdng3"
   },
   "outputs": [],
   "source": [
    "# download and unpack the course data\n",
    "# -nc (wget) and -n (unzip) skip files that already exist, so re-running this cell is safe\n",
    "!wget -q -nc https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part2_4.zip\n",
    "!unzip -q -n NLP_part2_4.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "glutUXLsdng5"
   },
   "outputs": [],
   "source": [
    "# load the saved word2vec model\n",
    "# NOTE: Word2Vec.load unpickles the file, so only load models from a source you trust\n",
    "model = word2vec.Word2Vec.load('word2vec_model/CBOW')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "_-LCLFuldng6"
   },
   "source": [
    "## Similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "YQvRxcLxdng7"
   },
   "outputs": [],
   "source": [
    "# find the words most similar to the given word\n",
    "model.wv.most_similar('KMT')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "df8USRcedng9"
   },
   "outputs": [],
   "source": [
    "# analogy query (vector arithmetic): 'KMT' + '綠吱' - 'DPP'\n",
    "# i.e. which word relates to 'KMT' the way '綠吱' relates to 'DPP'\n",
    "model.wv.most_similar(positive=['KMT', '綠吱'], negative=['DPP'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Im2jqQSTdng_"
   },
   "source": [
    "## Clustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true,
    "id": "U3xzZXwZdnhA"
   },
   "outputs": [],
   "source": [
    "# build a dictionary: word as key, its count attribute as value\n",
    "word_counts = {word: model.wv.get_vecattr(word, \"count\") for word in model.wv.index_to_key}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "KJ8tyX9DdnhC"
   },
   "outputs": [],
   "source": [
    "# rank the words by count (highest first) and keep the TOP_N_WORDS most frequent ones\n",
    "# each step gets its own name so this cell can be re-run without re-running the previous one\n",
    "ranked = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)\n",
    "words = np.array([word for word, _count in ranked[:TOP_N_WORDS]])\n",
    "words[:20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "DqFOiALYdnhD"
   },
   "outputs": [],
   "source": [
    "# look up the vector of every selected word (one row per word)\n",
    "vecs = model.wv[words]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "mC3MfYS0dnhD"
   },
   "outputs": [],
   "source": [
    "# run k-means on the word vectors\n",
    "# random_state makes the cluster ids reproducible; n_init is set explicitly because\n",
    "# its default differs between scikit-learn versions\n",
    "kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=SEED)\n",
    "cluster = kmeans.fit_predict(vecs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "yDQ-N2xZdnhE"
   },
   "outputs": [],
   "source": [
    "# pair every word with the id of the cluster it was assigned to\n",
    "# building from a column dict keeps the cluster ids as integers instead of generic objects\n",
    "df = pd.DataFrame({'words': words, 'no. cluster': cluster})\n",
    "df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "weVNxbbtdnhF"
   },
   "outputs": [],
   "source": [
    "# groupby alone only returns a DataFrameGroupBy object; the groups are read by iterating over it (next cell)\n",
    "df.groupby('no. cluster')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "VVWK2NxodnhG"
   },
   "outputs": [],
   "source": [
    "# iterating a groupby yields (cluster id, rows of that cluster) pairs\n",
    "for k, d in df.groupby('no. cluster'):\n",
    "    print(k)\n",
    "    print(d)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "WE7XG3IVdnhG"
   },
   "outputs": [],
   "source": [
    "# one column per cluster (named by the cluster id), listing the words assigned to it\n",
    "# if clusters differ in size, the shorter columns are padded with NaN\n",
    "data = pd.concat(\n",
    "    [\n",
    "        d[['words']].reset_index(drop=True).rename(columns={'words': k})\n",
    "        for k, d in df.groupby('no. cluster')\n",
    "    ],\n",
    "    axis=1,\n",
    ")\n",
    "data"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  },
  "colab": {
   "provenance": [],
   "include_colab_link": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}