Adapted from Sergey Smetanin's "Google News and Leo Tolstoy" post on Medium (2018). Read that first for the full explanation, then come back here to execute the (updated) code.

Updates by Scott H. Hawley (2020):

  • Automatically installs packages, downloads model and data.
  • Support for (local) interactive plots
  • Speed: Enabled parallel computation of t-SNE in sklearn.

    • TODO/Abandoned: RAPIDS' cuML GPU-powered t-SNE is potentially faster than sklearn's, but it's still really spotty: RAPIDS' examples work great with my laptop's RTX 2070 but not at all on Colab (despite advertisements). My tests of cuML on this-here dataset (on my laptop) also show plenty of instances where its t-SNE hangs indefinitely for no 'good' reason (e.g., increase n_iter from 1750 to 1800 and execution time suddenly goes from 600 ms to never-finishing). Plus, it only does random init (not PCA) and only outputs 2D (no 3D). So... not gonna mess with it anymore. (Might check back in 6 months.)
  • A few updates as per package changes (e.g. matplotlib color spec)

  • TODO: Probably will ditch t-SNE & UMAP, and just go with PCA for starters, then maybe sklearn.manifold.LocallyLinearEmbedding -- see the sketch just below this list.
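
For the curious, here's a minimal sketch of what that swap might look like, using sklearn's PCA and LocallyLinearEmbedding as drop-in 2D projectors. (Here, embeddings is a stand-in for any (n_words, n_dims) array of word vectors like the ones built later in this notebook.)

# Hypothetical drop-in replacements for TSNE; `embeddings` is a placeholder
# for an (n_words, n_dims) array of word vectors.
from sklearn.decomposition import PCA
from sklearn.manifold import LocallyLinearEmbedding

embeddings_pca_2d = PCA(n_components=2).fit_transform(embeddings)
embeddings_lle_2d = LocallyLinearEmbedding(n_neighbors=10, n_components=2).fit_transform(embeddings)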

NOTE: I'm posting this on Colab for convenience, but if you've got a decent machine at home, I recommend downloading this notebook and running it locally -- Colab only offers 2 CPU cores, so the crucial t-SNE calculations will likely run 5x to 10x slower there than on your local machine.
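
(Not sure how many cores your machine gives you? A quick standard-library check:)

import multiprocessing
print(f"This machine has {multiprocessing.cpu_count()} CPU cores")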

0. Preliminary Setup: Installs and Imports

# Install required Python packages:
#     gensim offers a fully-featured set of Word2Vec routines
#     nltk provides the text tokenizer we'll use
#     Other required packages are typically already installed with Jupyter
!python -m pip install gensim nltk | grep -v 'already satisfied'

# Python imports
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import FAST_VERSION
print(f"gensim FAST_VERSION = {FAST_VERSION}. <-- Hopefully that says '1' not '-1'")

from sklearn.manifold import TSNE   # actually t-SNE is the speed bottleneck, not Word2Vec
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm   
import nltk
import re
import codecs
import multiprocessing
import os
import sys
import matplotlib.patheffects as PathEffects
import tempfile
import imageio
import shutil

#  plot_type: 'notebook' allows for interactive plots (=better!), but Colab 
#       only supports 'inline'.
#       For interactive plots, best to execute one cell at a time (manually), rather 
#       than Kernel > Run All, because interactives will appear blank until all 
#       code cells have executed (whereas inline plots render immediately).
plot_type = 'inline' if 'google.colab' in sys.modules else 'notebook' # Auto-detect Colab
%matplotlib $plot_type
gensim FAST_VERSION = 1. <-- Hopefully that says '1' not '-1'

1. Visualizing Similar Words from Google News

First, download the word vectors for Google News

if not os.path.exists("GoogleNews-vectors-negative300.bin"):
    !wget -c -nc "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
    !gunzip GoogleNews-vectors-negative300.bin.gz
else:
    print("\nWord2Vec vectors already there; not retrieving.\n")
--2020-05-30 10:44:22--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.186.213
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.186.213|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’

GoogleNews-vectors- 100%[===================>]   1.53G  35.7MB/s    in 45s     

2020-05-30 10:45:07 (35.1 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]

Read in the model (may take a while)

model_gn = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:253: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL

For a sample set of key words, generate clusters of nearby similar words.

WARNING: Memory-intensive operation. 8GB of RAM will not be sufficient (you will start swapping).
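
If RAM is tight, one workaround (not used in this notebook) is gensim's limit argument to load_word2vec_format, which reads only the first N vectors in the file; the GoogleNews .bin is ordered roughly by descending word frequency, so the most useful words survive:

# Optional memory-saving alternative to the full load above:
# model_gn = gensim.models.KeyedVectors.load_word2vec_format(
#     'GoogleNews-vectors-negative300.bin', binary=True, limit=500000)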

keys = ['Paris', 'Python', 'Sunday', 'Tolstoy', 'Twitter', 'bachelor', 'delivery', 'election', 'expensive',
        'experience', 'financial', 'food', 'iOS', 'peace', 'release', 'war']

embedding_clusters = []
word_clusters = []
for word in keys:
    print(f"Key = {word}")
    embeddings = []
    words = []
    for similar_word, _ in model_gn.most_similar(word, topn=30):
        words.append(similar_word)
        embeddings.append(model_gn[similar_word])
    embedding_clusters.append(embeddings)
    word_clusters.append(words)
Key = Paris
/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):
Key = Python
Key = Sunday
Key = Tolstoy
Key = Twitter
Key = bachelor
Key = delivery
Key = election
Key = expensive
Key = experience
Key = financial
Key = food
Key = iOS
Key = peace
Key = release
Key = war

Take these clusters and generate points for a t-SNE embedding

embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape
tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32, n_jobs=-1)
embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)

def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    figsize = (9.5, 6) if (matplotlib.get_backend() == 'nbAgg') else (20, 12)  # interactive plot should be smaller
    plt.figure(figsize=figsize)
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        plt.scatter(x, y, c=[color], alpha=a, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:  # save a copy to disk if a filename was given
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()


tsne_plot_similar_words('Similar words from Google News', keys, embeddings_en_2d, word_clusters, 0.7,
                        'similar_words.png')

2. Visualizing Word2Vec Vectors from Leo Tolstoy Books

First, download the punkt tokenizer and the Russian texts

# PUNKT tokenizer for Russian
nltk.download('punkt')

# Download Tolstoy texts
!wget -c -nc 'https://raw.githubusercontent.com/sismetanin/word2vec-tsne/master/data/Anna%20Karenina%20by%20Leo%20Tolstoy%20(ru).txt'
!wget -c -nc 'https://raw.githubusercontent.com/sismetanin/word2vec-tsne/master/data/War%20and%20Peace%20by%20Leo%20Tolstoy%20(ru).txt'
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
--2020-05-30 10:47:28--  https://raw.githubusercontent.com/sismetanin/word2vec-tsne/master/data/Anna%20Karenina%20by%20Leo%20Tolstoy%20(ru).txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1880828 (1.8M) [text/plain]
Saving to: ‘Anna Karenina by Leo Tolstoy (ru).txt’

Anna Karenina by Le 100%[===================>]   1.79M  --.-KB/s    in 0.07s   

2020-05-30 10:47:28 (27.2 MB/s) - ‘Anna Karenina by Leo Tolstoy (ru).txt’ saved [1880828/1880828]

--2020-05-30 10:47:29--  https://raw.githubusercontent.com/sismetanin/word2vec-tsne/master/data/War%20and%20Peace%20by%20Leo%20Tolstoy%20(ru).txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3114987 (3.0M) [text/plain]
Saving to: ‘War and Peace by Leo Tolstoy (ru).txt’

War and Peace by Le 100%[===================>]   2.97M  --.-KB/s    in 0.1s    

2020-05-30 10:47:30 (24.4 MB/s) - ‘War and Peace by Leo Tolstoy (ru).txt’ saved [3114987/3114987]

Define routines for preprocessing texts

def preprocess_text(text):
    # keep only Latin/Cyrillic letters (incl. ё/Ё) and digits; everything else becomes a space
    text = re.sub('[^a-zA-Zа-яА-ЯёЁ0-9]+', ' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip()


def prepare_for_w2v(filename_from, filename_to, lang):
    raw_text = codecs.open(filename_from, "r", encoding='windows-1251').read()
    with open(filename_to, 'w', encoding='utf-8') as f:
        for sentence in nltk.sent_tokenize(raw_text, lang):
            print(preprocess_text(sentence.lower()), file=f)
            

def train_word2vec(filename):
    data = gensim.models.word2vec.LineSentence(filename)
    return Word2Vec(data, size=200, window=5, min_count=3, workers=multiprocessing.cpu_count())
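
Heads-up: this notebook uses the gensim 3.x API. In gensim 4.x, Word2Vec's size argument was renamed to vector_size, and the model.wv.vocab dict (looped over below) became model.wv.key_to_index. A gensim-4 version of the trainer would look like this (an untested sketch; the name train_word2vec_g4 is mine):

# gensim 4.x sketch of the same trainer
def train_word2vec_g4(filename):
    data = gensim.models.word2vec.LineSentence(filename)
    return Word2Vec(data, vector_size=200, window=5, min_count=3,
                    workers=multiprocessing.cpu_count())
# ...and later, iterate over model.wv.key_to_index instead of model.wv.vocab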

2.1. Visualizing Word2Vec Vectors from Anna Karenina

prepare_for_w2v('Anna Karenina by Leo Tolstoy (ru).txt', 'train_anna_karenina_ru.txt', 'russian')
model_ak = train_word2vec('train_anna_karenina_ru.txt')
/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:253: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL

Note: On Colab, the call to TSNE() in the next cell is slow because Colab only gives you 2 cores. If you download & run locally it will go faster.

# run the words through the model to get their embedding vectors
words_ak = []
embeddings_ak = []
for word in list(model_ak.wv.vocab):
    embeddings_ak.append(model_ak.wv[word])
    words_ak.append(word)

# now project that into 2d using t-SNE
tsne_ak_2d = TSNE(perplexity=30, n_components=2, init='pca', n_iter=3500, random_state=32, n_jobs=-1)
embeddings_ak_2d = tsne_ak_2d.fit_transform(embeddings_ak)
def tsne_plot_2d(label, embeddings, words=[], a=1, interactive=(matplotlib.get_backend() == 'nbAgg')):
    figsize = (9.5,6) if interactive else (16,9)  # interactive plot should be smaller
    plt.figure(figsize=figsize)
    colors = cm.rainbow(np.linspace(0, 1, 1))
    x = embeddings[:,0]
    y = embeddings[:,1]
    plt.scatter(x, y, c=colors, alpha=a, label=label)
    # if words is non-empty, write out each word next to its data point
    for i, word in enumerate(words):
        plt.annotate(word, alpha=0.3, xy=(x[i], y[i]), xytext=(5, 2), 
                     textcoords='offset points', ha='right', va='bottom', size=10)
    plt.legend(loc=4)
    plt.grid(True)
    plt.show()
    
tsne_plot_2d('Anna Karenina by Leo Tolstoy', embeddings_ak_2d, a=0.1)
# writes out the words next to the data points - this takes more computation
tsne_plot_2d('Anna Karenina by Leo Tolstoy', embeddings_ak_2d, words_ak, 0.1) 

2.2. War and Peace

War & Peace is roughly two-thirds longer than Anna Karenina (by file size)
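
(A quick check of that claim from the files we just downloaded:)

ak_bytes = os.path.getsize('Anna Karenina by Leo Tolstoy (ru).txt')
wp_bytes = os.path.getsize('War and Peace by Leo Tolstoy (ru).txt')
print(f"War and Peace is {100*(wp_bytes/ak_bytes - 1):.0f}% longer by file size")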

prepare_for_w2v('War and Peace by Leo Tolstoy (ru).txt', 'train_war_and_peace_ru.txt', 'russian')
model_wp = train_word2vec('train_war_and_peace_ru.txt')
/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:253: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL

Now generate the word vectors

words_wp = []
embeddings_wp = []
for word in list(model_wp.wv.vocab):
    embeddings_wp.append(model_wp.wv[word])
    words_wp.append(word)
print(f"{len(words_wp)} word vectors generated")
16812 word vectors generated

Create the t-SNE points

WARNING: The following call to TSNE() has a lot of calculations to make, and will take 'several minutes' to run on any system. On Colab it will run 5x to 10x slower -- i.e., "go for a hike" -- because there are only 2 CPU cores; downloading this notebook & running it locally will likely be much faster. (If you'd settle for a rougher picture, see the subsampling sketch below.)
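
One hedged speed-up (not what's done below) is to run t-SNE on a random subsample of the vocabulary:

# Optional speed-up sketch (not used below): t-SNE on a random 5000-word
# subsample of the ~17k-word vocabulary instead of the whole thing.
idx = np.random.choice(len(embeddings_wp), 5000, replace=False)
embeddings_wp_sub = np.array(embeddings_wp)[idx]
words_wp_sub = [words_wp[i] for i in idx]
# ...then pass embeddings_wp_sub to fit_transform() in place of embeddings_wp.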

tsne_wp_3d = TSNE(perplexity=30, n_components=3, init='pca', n_iter=3500, random_state=12, n_jobs=-1)
embeddings_wp_3d = tsne_wp_3d.fit_transform(embeddings_wp)

And plot it in 3D

from mpl_toolkits.mplot3d import Axes3D

def tsne_plot_3d(title, label, embeddings, a=1):
    print(f"Plotting {embeddings.shape[0]} points.")
    if matplotlib.get_backend() == 'nbAgg':          # Instructions for interactive plots
        print("Zoom by moving the mouse up & down while holding down the right mouse button.")
    fig = plt.figure()
    ax = Axes3D(fig)
    colors = cm.rainbow(np.linspace(0, 1, 1))
    # call scatter on the 3D axes explicitly, rather than relying on pyplot dispatch
    ax.scatter(embeddings[:, 0], embeddings[:, 1], embeddings[:, 2], c=colors, alpha=a, label=label)
    plt.legend(loc=4)
    plt.title(title)
    plt.show()

tsne_plot_3d('Visualizing Embeddings using t-SNE', 'War and Peace', embeddings_wp_3d, a=0.1)

What does it mean that these 3D points all lie on basically a 2D manifold? I have no idea. -SHH
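
One quick way to quantify 'basically 2D' (a simple check, not from the original post): run PCA on the 3D t-SNE output and see how little variance the third principal component carries.

from sklearn.decomposition import PCA
pca = PCA(n_components=3).fit(embeddings_wp_3d)
print("Explained variance ratios:", pca.explained_variance_ratio_)
# If the third ratio is near zero, the points really do lie close to a plane.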

Optional Ending: Generate GIFs

Sergey Smetanin's web page features some animated GIFs.

Note: The following will involve one t-SNE calculation per GIF frame. If you're running this on Colab (which only has 2 CPU cores), you can start this and walk away for at least an hour.

GIF #1: t-SNE for different perplexity values, using Google News data (2D)

# Revisit the Google News data and re-generate the clusters and embedding
keys_gif = ['Paris', 'Python', 'Sunday', 'Tolstoy', 'Twitter', 'bachelor', 'delivery', 'election', 'expensive',
        'experience', 'financial', 'food', 'iOS', 'peace', 'release', 'war']

embedding_clusters_gif = []
word_clusters_gif = []
for word in keys_gif:
    embeddings = []
    words = []
    for similar_word, _ in model_gn.most_similar(word, topn=200):
        words.append(similar_word)
        embeddings.append(model_gn[similar_word])
    embedding_clusters_gif.append(embeddings)
    word_clusters_gif.append(words)

embedding_clusters_gif = np.array(embedding_clusters_gif)
n, m, k = embedding_clusters_gif.shape
# Generate a bunch of frames for the GIF

# This GIF-generating code does not play well with interactive plots
%matplotlib inline 

def tsne_plot_similar_words_png(title, embedding_clusters, a, filename):
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, len(embedding_clusters)))
    for i, (embeddings, color) in enumerate(zip(embedding_clusters, colors), start=1):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        plt.scatter(x, y, c=[color], alpha=a)
        # number each cluster at its centroid
        plt.text(x.mean(), y.mean(), str(i), color='white', weight='bold', fontsize=13,
                 path_effects=[PathEffects.withStroke(linewidth=3, foreground="black", alpha=0.7)])
    plt.title(title)
    plt.grid(True)
    plt.xlim(-200, 200)
    plt.ylim(-200, 200)
    plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.close()

dirpath = tempfile.mkdtemp()

# loop over perplexity values
images = []
for i in range(1, 31):
    print(f"Generating frame {i}/30")
    fname = os.path.join(dirpath, str(i) + '.png')
    tsne_model_en_2d_gif = TSNE(perplexity=i, n_components=2, init='pca', n_iter=3500, random_state=32)
    embeddings_en_2d_gif = np.array(tsne_model_en_2d_gif.fit_transform(embedding_clusters_gif.reshape(n * m, k))).reshape(n, m, 2)
    tsne_plot_similar_words_png('Visualizing similar words from Google News using t-SNE (perplexity={})'.format(i), embeddings_en_2d_gif, 0.6, fname)
    images.append(imageio.imread(fname))
imageio.mimsave("tsne_perplexity.gif", images, duration = 0.5)
shutil.rmtree(dirpath)
# Display the GIF
from IPython.display import Image
Image(open('tsne_perplexity.gif','rb').read())

(Optional) GIF #2: Spinning 3D plot of War & Peace

If you have interactive plots enabled, and already created the interactive 3D plot for War & Peace above, then you don't 'need' this.

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.patches as mpatches

def tsne_plot_3d_gif(title, label, embeddings, filename, a):
    fig = plt.figure()
    ax = Axes3D(fig)
    colors = cm.rainbow(np.linspace(0, 1, 1))
    ax.scatter(embeddings[:, 0], embeddings[:, 1], embeddings[:, 2], c=colors, alpha=a)

    plt.title(title)

    red_patch = mpatches.Patch(color=colors[0], label=label)
    plt.legend(handles=[red_patch], loc=4)

    dirpath = tempfile.mkdtemp()
    images = []
    for angle in range(0, 360, 5):
        print(f"Angle = {angle} of 360")
        ax.view_init(30, angle)   # rotate the camera for this frame
        fname = os.path.join(dirpath, str(angle) + '.png')
        fig.savefig(fname, dpi=120, format='png', bbox_inches='tight')  # save via fig, not pyplot state
        images.append(imageio.imread(fname))
    plt.close(fig)   # close once *after* the loop; closing inside it blanked every frame past the first
    imageio.mimsave(filename, images)   # use the filename argument instead of a hardcoded name
    shutil.rmtree(dirpath)

    
tsne_plot_3d_gif('Visualizing Word Embeddings using t-SNE', 'War and Peace\nby Leo Tolstoy',
                                  embeddings_wp_3d, '3d.gif', 0.1)
from IPython.display import Image
Image(open('3d.gif','rb').read())