Visualizing Word2Vec Embeddings with t-SNE
A first foray into "Digital Humanities"
- 0. Preliminary Setup: Installs and Imports
- 1. Visualizing Similar Words from Google News
- 2. Visualizing Word2Vec Vectors from Leo Tolstoy Books
- Optional Ending: Generate GIFs
Adapted from Sergey Smetanin's "Google News and Leo Tolstoy" post on Medium (2018). Read that first for instruction, then come back here to execute the (updated) code.
Updates by Scott H. Hawley (2020):
- Automatically installs packages, downloads model and data.
- Support for (local) interactive plots.
- Speed: Enabled parallel computation of t-SNE in sklearn.
- TODO/Abandoned: RAPIDS' cuML GPU-powered t-SNE is potentially faster than sklearn's, but it's still really spotty: RAPIDS' examples work great with my laptop's RTX 2070 but not at all on Colab (despite advertisements). Also, my tests of cuML on this dataset on my laptop show plenty of instances where cuML's t-SNE will hang indefinitely for no 'good' reason (e.g. increase n_iter from 1750 to 1800 and suddenly execution time goes from 600 ms to never-finishing). Plus, it only does random init (not PCA) and only outputs 2D (no 3D). So... not gonna mess with it anymore. (Might check back in 6 months.)
- A few updates as per package changes (e.g. matplotlib color spec).
- TODO: Probably will ditch t-SNE & UMAP, and just go with PCA for starters, then maybe sklearn.manifold.LocallyLinearEmbedding (see the sketch below).
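As a rough illustration of that TODO, here's a minimal sketch (my addition, not working code from this notebook) of the sklearn calls; 'vectors' stands in for any (n_samples, n_features) array of word embeddings used later in the notebook:
# Hypothetical sketch of the PCA / LocallyLinearEmbedding alternatives mentioned above
from sklearn.decomposition import PCA
from sklearn.manifold import LocallyLinearEmbedding
# vectors_2d = PCA(n_components=2).fit_transform(vectors)
# vectors_2d = LocallyLinearEmbedding(n_neighbors=10, n_components=2).fit_transform(vectors)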
NOTE: I'm posting this on Colab for convenience, but if you've got a decent machine at home, I recommend downloading this notebook and running it locally -- Colab only offers 2 CPU cores, so the crucial t-SNE calculations will likely run 5x to 10x slower on Colab than they would on your local machine.
0. Preliminary Setup: Installs and Imports
# Install required Python packages:
# gensim offers a fully-featured set of Word2Vec routines
# nltk provides the sentence tokenizer used for the Tolstoy texts
# Other required packages are typically already installed with Jupyter
!python -m pip install gensim nltk | grep -v 'already satisfied'
# Python imports
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import FAST_VERSION
print(f"gensim FAST_VERSION = {FAST_VERSION}. <-- Hopefully that says '1' not '-1'")
from sklearn.manifold import TSNE # actually t-SNE is the speed bottleneck, not Word2Vec
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import nltk
import re
import codecs
import multiprocessing
import os
import sys
import matplotlib.patheffects as PathEffects
import tempfile
import imageio
import shutil
# plot_type: 'notebook' allows for interactive plots (=better!), but Colab
# only supports 'inline'.
# For interactive plots, best to execute one cell at a time (manually), rather
# than Kernel > Run All, because interactives will appear blank until all
# code cells have executed (whereas inline plots render immediately).
plot_type = 'inline' if 'google.colab' in sys.modules else 'notebook' # Auto-detect Colab
%matplotlib $plot_type
1. Visualizing Similar Words from Google News
First, download the word vectors for Google News:
if not os.path.exists("GoogleNews-vectors-negative300.bin"):
!wget -c -nc "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
!gunzip GoogleNews-vectors-negative300.bin.gz
else:
print("\nWord2Vec vectors already there; not retrieving.\n")
Read in the model (may take a while):
model_gn = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
For a sample set of key words, generate clusters of nearby similar words.
WARNING: This is a memory-intensive operation. 8 GB of RAM will not be sufficient (you will start swapping).
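If you're short on RAM, one possible workaround (my suggestion, not part of the original post) is to reload the model with gensim's limit argument so that only the most frequent vectors are kept in memory, at the cost of some rarer neighbors going missing:
# Optional, memory-saving variant (assumption: keeping the top 200,000 vectors is enough for these keys)
# model_gn = gensim.models.KeyedVectors.load_word2vec_format(
#     'GoogleNews-vectors-negative300.bin', binary=True, limit=200000)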
keys = ['Paris', 'Python', 'Sunday', 'Tolstoy', 'Twitter', 'bachelor', 'delivery', 'election', 'expensive',
'experience', 'financial', 'food', 'iOS', 'peace', 'release', 'war']
embedding_clusters = []
word_clusters = []
for word in keys:
print(f"Key = {word}")
embeddings = []
words = []
for similar_word, _ in model_gn.most_similar(word, topn=30):
words.append(similar_word)
embeddings.append(model_gn[similar_word])
embedding_clusters.append(embeddings)
word_clusters.append(words)
Take these clusters and generate points for a t-SNE embedding.
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape
tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32, n_jobs=-1)
embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)
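As a quick sanity check on the shapes (my addition; the expected values follow from the 16 keys above, topn=30 neighbors each, and 300-dimensional Google News vectors):
print(embedding_clusters.shape)  # expect (16, 30, 300): 16 keys x 30 neighbors x 300 dims
print(embeddings_en_2d.shape)    # expect (16, 30, 2) after t-SNE and reshaping back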
def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
figsize = (9.5,6) if (matplotlib.get_backend() == 'nbAgg') else (20,12) # interactive plot should be smaller
plt.figure(figsize=(figsize))
colors = cm.rainbow(np.linspace(0, 1, len(labels)))
for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
x = embeddings[:, 0]
y = embeddings[:, 1]
plt.scatter(x, y, c=[color], alpha=a, label=label)
for i, word in enumerate(words):
plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
textcoords='offset points', ha='right', va='bottom', size=8)
plt.legend(loc=4)
plt.title(title)
plt.grid(True)
if filename: plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')  # actually save the plot when a filename is given
plt.show()
tsne_plot_similar_words('Similar words from Google News', keys, embeddings_en_2d, word_clusters, 0.7,
'similar_words.png')
2. Visualizing Word2Vec Vectors from Leo Tolstoy Books
First, download the punkt tokenizer and the Russian texts:
# PUNKT tokenizer for Russian
nltk.download('punkt')
# Download Tolstoy texts
!wget -c -nc 'https://raw.githubusercontent.com/sismetanin/word2vec-tsne/master/data/Anna%20Karenina%20by%20Leo%20Tolstoy%20(ru).txt'
!wget -c -nc 'https://raw.githubusercontent.com/sismetanin/word2vec-tsne/master/data/War%20and%20Peace%20by%20Leo%20Tolstoy%20(ru).txt'
Define routines for preprocessing texts
def preprocess_text(text):
text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', text)
text = re.sub(' +', ' ', text)
return text.strip()
def prepare_for_w2v(filename_from, filename_to, lang):
raw_text = codecs.open(filename_from, "r", encoding='windows-1251').read()
with open(filename_to, 'w', encoding='utf-8') as f:
for sentence in nltk.sent_tokenize(raw_text, lang):
print(preprocess_text(sentence.lower()), file=f)
def train_word2vec(filename):
data = gensim.models.word2vec.LineSentence(filename)
# gensim < 4.0 API: gensim >= 4.0 renamed 'size' to 'vector_size'
return Word2Vec(data, size=200, window=5, min_count=3, workers=multiprocessing.cpu_count())
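To see what preprocess_text actually does (a toy example of mine, not from the original post): it keeps only Latin/Cyrillic letters and the digits 1-9, collapsing everything else into single spaces.
print(preprocess_text('Анна  -- Каренина!?'))  # expected output: 'Анна Каренина'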
2.1. Visualizing Word2Vec Vectors from Anna Karenina
prepare_for_w2v('Anna Karenina by Leo Tolstoy (ru).txt', 'train_anna_karenina_ru.txt', 'russian')
model_ak = train_word2vec('train_anna_karenina_ru.txt')
Note: On Colab, the call to TSNE() in the next cell is slow because Colab only gives you 2 cores. If you download and run this notebook locally, it will go faster.
# run the words through the model to get their embedding vectors
words_ak = []
embeddings_ak = []
for word in list(model_ak.wv.vocab):  # gensim < 4.0 API; gensim >= 4.0 uses model_ak.wv.key_to_index
embeddings_ak.append(model_ak.wv[word])
words_ak.append(word)
# now project that into 2d using t-SNE
tsne_ak_2d = TSNE(perplexity=30, n_components=2, init='pca', n_iter=3500, random_state=32, n_jobs=-1)
embeddings_ak_2d = tsne_ak_2d.fit_transform(embeddings_ak)
def tsne_plot_2d(label, embeddings, words=[], a=1, interactive=(matplotlib.get_backend() == 'nbAgg')):
figsize = (9.5,6) if interactive else (16,9) # interactive plot should be smaller
plt.figure(figsize=figsize)
colors = cm.rainbow(np.linspace(0, 1, 1))
x = embeddings[:,0]
y = embeddings[:,1]
plt.scatter(x, y, c=colors, alpha=a, label=label)
# if words is non-empty, write out each word next to its data point
for i, word in enumerate(words):
plt.annotate(word, alpha=0.3, xy=(x[i], y[i]), xytext=(5, 2),
textcoords='offset points', ha='right', va='bottom', size=10)
plt.legend(loc=4)
plt.grid(True)
plt.show()
tsne_plot_2d('Anna Karenina by Leo Tolstoy', embeddings_ak_2d, a=0.1)
# writes out the words next to the data points - this takes more computation
tsne_plot_2d('Anna Karenina by Leo Tolstoy', embeddings_ak_2d, words_ak, 0.1)
2.2. War and Peace
War & Peace is about 60% longer than Anna Karenina.
prepare_for_w2v('War and Peace by Leo Tolstoy (ru).txt', 'train_war_and_peace_ru.txt', 'russian')
model_wp = train_word2vec('train_war_and_peace_ru.txt')
Now generate the word vectors.
words_wp = []
embeddings_wp = []
for word in list(model_wp.wv.vocab):
embeddings_wp.append(model_wp.wv[word])
words_wp.append(word)
print(f"{len(words_wp)} word vectors generated")
Create the t-SNE points.
WARNING: The following call to TSNE() has a lot of calculations to make and will take several minutes to run on any system. On Colab it will run 5x to 10x slower -- i.e., "go for a hike" -- because there are only 2 CPU cores. Downloading this notebook and running it locally will likely be much faster.
tsne_wp_3d = TSNE(perplexity=30, n_components=3, init='pca', n_iter=3500, random_state=12, n_jobs=-1)
embeddings_wp_3d = tsne_wp_3d.fit_transform(embeddings_wp)
And plot it in 3D.
from mpl_toolkits.mplot3d import Axes3D
def tsne_plot_3d(title, label, embeddings, a=1):
print(f"Plotting {embeddings.shape[0]} points.")
if matplotlib.get_backend() == 'nbAgg': # Instructions for interactive plots
print("Zoom by moving the mouse up & down while holding down the right mouse button.")
fig = plt.figure()
ax = Axes3D(fig)
colors = cm.rainbow(np.linspace(0, 1, 1))
ax.scatter(embeddings[:, 0], embeddings[:, 1], embeddings[:, 2], c=colors, alpha=a, label=label)  # scatter on the 3D axes; plt.scatter would misread the z values as marker sizes
plt.legend(loc=4)
plt.title(title)
plt.show()
tsne_plot_3d('Visualizing Embeddings using t-SNE', 'War and Peace', embeddings_wp_3d, a=0.1)
What does it mean that these 3D points all lie on basically a 2D manifold? I have no idea. -SHH
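One way to put a number on that impression (my addition, not from the original post) is to run PCA on the 3D t-SNE output and check how much variance the first two components capture; a near-planar cloud leaves almost nothing for the third component.
from sklearn.decomposition import PCA
pca_check = PCA(n_components=3).fit(embeddings_wp_3d)
print(pca_check.explained_variance_ratio_)  # a tiny third value means the points are nearly coplanar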
Optional Ending: Generate GIFs
Sergey Smetanin's web page features some animated GIFs.
Note: The following will involve one t-SNE calculation per GIF frame. If you're running this on Colab (which only has 2 CPU cores), you can start this and walk away for at least an hour.
GIF #1: t-SNE for different perplexity values, using Google News data (2D)
# Revisit the Google News data and re-generate the clusters and embedding
keys_gif = ['Paris', 'Python', 'Sunday', 'Tolstoy', 'Twitter', 'bachelor', 'delivery', 'election', 'expensive',
'experience', 'financial', 'food', 'iOS', 'peace', 'release', 'war']
embedding_clusters_gif = []
word_clusters_gif = []
for word in keys_gif:
embeddings = []
words = []
for similar_word, _ in model_gn.most_similar(word, topn=200):
words.append(similar_word)
embeddings.append(model_gn[similar_word])
embedding_clusters_gif.append(embeddings)
word_clusters_gif.append(words)
embedding_clusters_gif = np.array(embedding_clusters_gif)
n, m, k = embedding_clusters_gif.shape
# Generate a bunch of frames for the GIF
# This GIF-generating code does not play well with interactive plots
%matplotlib inline
def tsne_plot_similar_words_png(title, embedding_clusters, a, filename):
plt.figure(figsize=(16, 9))
colors = cm.rainbow(np.linspace(0, 1, len(embedding_clusters)))
i = 1
for embeddings, color in zip(embedding_clusters, colors):
x = embeddings[:, 0]
y = embeddings[:, 1]
plt.scatter(x, y, c=[color], alpha=a)
plt.text(x.mean(), y.mean(), str(i), color='white', weight='bold', fontsize=13, path_effects=[PathEffects.withStroke(linewidth=3,
foreground="black", alpha=0.7)])
i += 1
plt.title(title)
plt.grid(True)
plt.xlim(-200, 200)
plt.ylim(-200, 200)
plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
plt.close()
dirpath = tempfile.mkdtemp()
# loop over perplexity values
images = []
for i in range(1, 31):
print(f"Generating frame {i}/30")
fname = os.path.join(dirpath, str(i) + '.png')
tsne_model_en_2d_gif = TSNE(perplexity=i, n_components=2, init='pca', n_iter=3500, random_state=32, n_jobs=-1)  # n_jobs=-1 added to use all available cores, as above
embeddings_en_2d_gif = np.array(tsne_model_en_2d_gif.fit_transform(embedding_clusters_gif.reshape(n * m, k))).reshape(n, m, 2)
tsne_plot_similar_words_png('Visualizing similar words from Google News using t-SNE (perplexity={})'.format(i), embeddings_en_2d_gif, 0.6, fname)
images.append(imageio.imread(fname))
imageio.mimsave("tsne_perplexity.gif", images, duration = 0.5)
shutil.rmtree(dirpath)
# Display the GIF
from IPython.display import Image
Image(open('tsne_perplexity.gif','rb').read())
(Optional) GIF #2: Spinning 3D plot of War & Peace
If you have interactive plots enabled and already created the interactive 3D plot for War & Peace above, then you don't 'need' this.
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.patches as mpatches
def tsne_plot_3d_gif(title, label, embeddings, filename, a):
fig = plt.figure()
ax = Axes3D(fig)
colors = cm.rainbow(np.linspace(0, 1, 1))
ax.scatter(embeddings[:, 0], embeddings[:, 1], embeddings[:, 2], c=colors, alpha=a)  # scatter on the 3D axes, as in the 3D plot above
plt.title(title)
red_patch = mpatches.Patch(color=colors[0], label=label)
plt.legend(handles=[red_patch], loc=4)
dirpath = tempfile.mkdtemp()
images = []
for angle in range(0, 360, 5):
print(f"Angle = {angle} of 360")
ax.view_init(30, angle)
fname = os.path.join(dirpath, str(angle) + '.png')
plt.savefig(fname, dpi=120, format='png', bbox_inches='tight')
plt.close()
images.append(imageio.imread(fname))
imageio.mimsave('3d.gif', images)
shutil.rmtree(dirpath)
tsne_plot_3d_gif('Visualizing Word Embeddings using t-SNE', 'War and Peace\nby Leo Tolstoy',
embeddings_wp_3d, '3d.gif', 0.1)
from IPython.display import Image
Image(open('3d.gif','rb').read())