Visualizing Word2Vec Embeddings with t-SNE
A first foray into "Digital Humanities"
- 0. Preliminary Setup: Installs and Imports
- 1. Visualizing Similar Words from Google News
- 2. Visualizing Word2Vec Vectors from Leo Tolstoy Books
- Optional Ending: Generate GIFs
Adapted from Sergey Smetanin's "Google News and Leo Tolstoy" post on Medium (2018). Read that first for instruction, then come back here to execute the (updated) code.
Updates by Scott H. Hawley (2020):
- Automatically installs packages, downloads model and data.
- Support for (local) interactive plots.
- Speed: Enabled parallel computation of t-SNE in sklearn.
- TODO/Abandoned: RAPIDS' cuML GPU-powered t-SNE is potentially faster than sklearn's, but it's still really spotty: RAPIDS' examples work great with my laptop's RTX 2070 but not at all on Colab (despite advertisements). Also, my tests of cuML for this-here dataset on my laptop show plenty of instances where cuML's t-SNE will hang indefinitely for no 'good' reason (e.g. increase n_iter from 1750 to 1800 and suddenly execution time goes from 600 ms to never finishing). Plus, it only does random init (not PCA) and only outputs 2D (no 3D). So... not gonna mess with it anymore. (Might check back in 6 months.)
- A few updates as per package changes (e.g. matplotlib color spec).
- TODO: Probably will ditch t-SNE & UMAP and just go with PCA for starters, then maybe sklearn.manifold.LocallyLinearEmbedding (a minimal PCA sketch appears right after the note below).

NOTE: I'm posting this on Colab for convenience, but if you've got a decent machine at home, I recommend downloading this notebook and running it locally -- Colab only offers 2 CPU cores, so the crucial t-SNE calculations will likely run 5x to 10x slower on Colab than they would on your local machine.
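For reference, here is a minimal sketch of that PCA-for-starters idea. It assumes the embedding_clusters array built in Section 1 below (shape (n, m, k)); sklearn's PCA is linear and fast, so it makes a handy sanity check before the much slower t-SNE run. The function name pca_project_2d is just an illustrative name, not something from the original post.

# Minimal PCA sketch (assumes embedding_clusters from Section 1, shape (n, m, k))
from sklearn.decomposition import PCA

def pca_project_2d(embedding_clusters):
    n, m, k = embedding_clusters.shape
    flat = embedding_clusters.reshape(n * m, k)         # PCA wants (n_samples, n_features)
    flat_2d = PCA(n_components=2).fit_transform(flat)   # linear projection down to 2-d
    return flat_2d.reshape(n, m, 2)                     # back to per-keyword clusters

# e.g.: embeddings_en_2d = pca_project_2d(embedding_clusters)
# then plot with tsne_plot_similar_words() exactly as in Section 1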
0. Preliminary Setup: Installs and Imports
# Install required Python packages:
# gensim offers a fully-featured set of Word2Vec routines
# nltk is a text tokenizer
# Other required packages are typically already installed with Jupyter
!python -m pip install gensim nltk | grep -v 'already satisfied'
# Python imports
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import FAST_VERSION
print(f"gensim FAST_VERSION = {FAST_VERSION}. <-- Hopefully that says '1' not '-1'")
from sklearn.manifold import TSNE  # actually TSNE is the speed bottleneck, not Word2Vec
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import nltk
import re
import codecs
import multiprocessing
import os
import sys
import matplotlib.patheffects as PathEffects
import tempfile
import imageio
import shutil
# plot_type: 'notebook' allows for interactive plots (=better!), but Colab
# only supports 'inline'.
# For interactive plots, best to execute one cell at a time (manually), rather
# than Kernel > Run All, because interactive plots will appear blank until all
# code cells have executed (whereas inline plots render immediately).
plot_type = 'inline' if 'google.colab' in sys.modules else 'notebook' # Auto-detect Colab
%matplotlib $plot_type
1. Visualizing Similar Words from Google News
First, download the word vectors for Google News.
if not os.path.exists("GoogleNews-vectors-negative300.bin"):
    !wget -c -nc "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
    !gunzip GoogleNews-vectors-negative300.bin.gz
else:
    print("\nWord2Vec vectors already there; not retrieving.\n")
Read in the model (may take a while):
model_gn = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
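(Optional) A quick sanity check that the vectors loaded correctly. This is my addition, not part of the original post; both calls are standard KeyedVectors methods/attributes and should work with the gensim 3.x API used here (and, as far as I know, with 4.x as well).

# Optional sanity check of the loaded vectors
print("vector dimensionality:", model_gn.vector_size)                  # should be 300
print("neighbours of 'Paris':", model_gn.most_similar('Paris', topn=5))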
For a sample set of key words, generate clusters of nearby similar words.
WARNING: Memory-intensive operation. 8 GB of RAM will not be sufficient (you will start swapping).
keys = ['Paris', 'Python', 'Sunday', 'Tolstoy', 'Twitter', 'bachelor', 'delivery', 'election', 'expensive',
        'experience', 'financial', 'food', 'iOS', 'peace', 'release', 'war']
embedding_clusters = []
word_clusters = []
for word in keys:
    print(f"Key = {word}")
    embeddings = []
    words = []
    for similar_word, _ in model_gn.most_similar(word, topn=30):
        words.append(similar_word)
        embeddings.append(model_gn[similar_word])
    embedding_clusters.append(embeddings)
    word_clusters.append(words)
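If the memory warning above bites (e.g. the cell swaps or crashes), one workaround is to go back and reload the model with gensim's limit= argument, which reads only the first N vectors from the file (reportedly roughly sorted by frequency), trading vocabulary coverage for RAM. A sketch, not part of the original post; 500000 is an arbitrary choice:

# Memory-saving alternative: read only the first 500k word vectors.
# Rarer words will then simply be missing from most_similar() results.
model_gn = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True, limit=500000)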
Take these clusters and generate points for a t-SNE embedding.
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape  # n keys, m similar words each, k-dimensional vectors
tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32, n_jobs=-1)
# t-SNE expects a flat (n_samples, n_features) array, so flatten the clusters, embed, then reshape back
embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)
def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    figsize = (9.5, 6) if (matplotlib.get_backend() == 'nbAgg') else (20, 12)  # interactive plot should be smaller
    plt.figure(figsize=figsize)
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        plt.scatter(x, y, c=[color], alpha=a, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:  # save the figure when a filename is given (one is passed in below)
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()
tsne_plot_similar_words('Similar words from Google News', keys, embeddings_en_2d, word_clusters, 0.7,
'similar_words.png')
2. Visualizing Word2Vec Vectors from Leo Tolstoy Books
First, download the punkt tokenizer and the Russian texts.
# PUNKT tokenizer for Russian
nltk.download('punkt')
# Download Tolstoy texts
!wget -c -nc 'https://raw.githubusercontent.com/sismetanin/word2vec-tsne/master/data/Anna%20Karenina%20by%20Leo%20Tolstoy%20(ru).txt'
!wget -c -nc 'https://raw.githubusercontent.com/sismetanin/word2vec-tsne/master/data/War%20and%20Peace%20by%20Leo%20Tolstoy%20(ru).txt'
Define routines for preprocessing texts:
def preprocess_text(text):
    # keep only Latin/Cyrillic letters and digits 1-9; everything else becomes a space
    text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip()

def prepare_for_w2v(filename_from, filename_to, lang):
    # read the windows-1251-encoded source, split it into sentences, and write one
    # preprocessed, lower-cased sentence per line (UTF-8)
    raw_text = codecs.open(filename_from, "r", encoding='windows-1251').read()
    with open(filename_to, 'w', encoding='utf-8') as f:
        for sentence in nltk.sent_tokenize(raw_text, lang):
            print(preprocess_text(sentence.lower()), file=f)

def train_word2vec(filename):
    # one sentence per line -> LineSentence iterator -> Word2Vec (gensim 3.x API)
    data = gensim.models.word2vec.LineSentence(filename)
    return Word2Vec(data, size=200, window=5, min_count=3, workers=multiprocessing.cpu_count())
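Note: train_word2vec above (and the model_ak.wv.vocab loop below) uses the gensim 3.x API that was current when this notebook was written. In gensim 4.x the Word2Vec argument size was renamed to vector_size, and wv.vocab was replaced by wv.key_to_index. If pip pulled in gensim 4.x for you, either pin gensim<4 in the install cell or adapt along these lines (a sketch, not the original code):

# gensim >= 4.0 variant of train_word2vec (size -> vector_size); a sketch
def train_word2vec_gensim4(filename):
    data = gensim.models.word2vec.LineSentence(filename)
    return Word2Vec(data, vector_size=200, window=5, min_count=3,
                    workers=multiprocessing.cpu_count())

# and iterate the vocabulary with:
#     for word in model_ak.wv.key_to_index:   # instead of list(model_ak.wv.vocab)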
2.1. Visualizing Word2Vec Vectors from Anna Karenina
prepare_for_w2v('Anna Karenina by Leo Tolstoy (ru).txt', 'train_anna_karenina_ru.txt', 'russian')
model_ak = train_word2vec('train_anna_karenina_ru.txt')
Note: On Colab, the call to TSNE() in the next cell is slow because Colab only gives you 2 cores. If you download & run this notebook locally, it will go faster.
# run the words through the model to get their embedding vectors
words_ak = []
embeddings_ak = []
for word in list(model_ak.wv.vocab):
    embeddings_ak.append(model_ak.wv[word])
    words_ak.append(word)
# now project that into 2d using t-SNE
tsne_ak_2d = TSNE(perplexity=30, n_components=2, init='pca', n_iter=3500, random_state=32, n_jobs=-1)
embeddings_ak_2d = tsne_ak_2d.fit_transform(embeddings_ak)
def tsne_plot_2d(label, embeddings, words=[], a=1, interactive=(matplotlib.get_backend() == 'nbAgg')):
    figsize = (9.5, 6) if interactive else (16, 9)  # interactive plot should be smaller
    plt.figure(figsize=figsize)
    colors = cm.rainbow(np.linspace(0, 1, 1))
    x = embeddings[:, 0]
    y = embeddings[:, 1]
    plt.scatter(x, y, c=colors, alpha=a, label=label)
    # if words is non-empty, write out each word next to its data point
    for i, word in enumerate(words):
        plt.annotate(word, alpha=0.3, xy=(x[i], y[i]), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom', size=10)
    plt.legend(loc=4)
    plt.grid(True)
    plt.show()
tsne_plot_2d('Anna Karenina by Leo Tolstoy', embeddings_ak_2d, a=0.1)
# writes out the words next to the data points - this takes more computation
tsne_plot_2d('Anna Karenina by Leo Tolstoy', embeddings_ak_2d, words_ak, 0.1)
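The War and Peace text downloaded above can be pushed through the identical pipeline. A minimal sketch; the intermediate filename train_war_and_peace_ru.txt and the _wp variable names are my own choices:

# Same pipeline for the second Tolstoy text (gensim 3.x API, as above)
prepare_for_w2v('War and Peace by Leo Tolstoy (ru).txt', 'train_war_and_peace_ru.txt', 'russian')
model_wp = train_word2vec('train_war_and_peace_ru.txt')

words_wp, embeddings_wp = [], []
for word in list(model_wp.wv.vocab):
    embeddings_wp.append(model_wp.wv[word])
    words_wp.append(word)

tsne_wp_2d = TSNE(perplexity=30, n_components=2, init='pca', n_iter=3500, random_state=32, n_jobs=-1)
embeddings_wp_2d = tsne_wp_2d.fit_transform(embeddings_wp)
tsne_plot_2d('War and Peace by Leo Tolstoy', embeddings_wp_2d, a=0.1)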