This notebook does the following using the eleven chapters of Frederick Douglass's Narrative of the Life of Frederick Douglass (1845):
- compares Euclidean distance scores produced by different minimum document frequency (min_df) values
- compares cosine similarity scores with the Euclidean distance metric
- uses MDS to show (dis)similarity between chapters
- visualizes the distance matrix
- clusters the chapters using Seaborn
- calculates variance between selected chapters and displays the top fifty most variant terms (see the sketch at the end of this notebook)
Revisions:
- 12/05/2018: James E. Dobson (james.e.dobson@dartmouth.edu): Created initial notebook.
- 12/06/2018: Added cosine similarity comparison.
import sklearn
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from operator import itemgetter
import nltk
import pandas
import seaborn as sn
sn.set(style="white")
%matplotlib inline
texts=["Douglass/01",
"Douglass/02",
"Douglass/03",
"Douglass/04",
"Douglass/05",
"Douglass/06",
"Douglass/07",
"Douglass/08",
"Douglass/09",
"Douglass/10",
"Douglass/11"]
similarity_chart=list()
# sweep minimum document frequency (min_df) values from 1 to 11
for df_factor in range(1, 12):
    vectorizer = CountVectorizer(input='filename', stop_words='english',
                                 strip_accents='unicode', lowercase=True,
                                 min_df=df_factor)
    dtm_matrix = vectorizer.fit_transform(texts).toarray()

    # report the shape of the document-term matrix
    documents, vocabulary = dtm_matrix.shape
    print("DF: {0} Documents: {1} Vocabulary: {2}".format(df_factor, documents, vocabulary))

    # calculate pairwise Euclidean distances between chapters
    dist_matrix = euclidean_distances(dtm_matrix)

    # record distances from the first chapter, sorted nearest first
    row = [df_factor]
    for x, y in sorted(enumerate(np.round(dist_matrix[0], 3)), key=itemgetter(1)):
        chapter = texts[x].partition('/')[2]
        row.append('{0} ({1:.1f})'.format(chapter, y))
    similarity_chart.append(row)
pandas.DataFrame(similarity_chart)
# recalculate with no minimum document frequency (min_df=1, the default)
df_factor = 1
vectorizer = CountVectorizer(input='filename', stop_words='english',
                             strip_accents='unicode', lowercase=True,
                             min_df=df_factor)
dtm_matrix = vectorizer.fit_transform(texts).toarray()
# report the shape of the document-term matrix
documents, vocabulary = dtm_matrix.shape
print("DF: {0} Documents: {1} Vocabulary: {2}".format(df_factor,documents,vocabulary))
# calculate distances between texts
dist_matrix = euclidean_distances(dtm_matrix)
# display distances from the first chapter, sorted nearest first
for x, y in sorted(enumerate(np.round(dist_matrix[0], 5)), key=itemgetter(1)):
    print('{0} ({1})'.format(texts[x], y))
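Raw Euclidean distances are sensitive to chapter length: a long chapter has larger counts in every dimension and so sits far from everything else. One way to see how much length drives the ranking above is to L2-normalize each chapter vector first; on unit-length vectors, squared Euclidean distance equals 2 minus twice the cosine similarity, so this ranking should match the cosine ranking computed later in the notebook. A sketch:
# sketch: repeat the ranking on length-normalized (unit) chapter vectors
from sklearn.preprocessing import normalize
norm_dist = euclidean_distances(normalize(dtm_matrix))
for x, y in sorted(enumerate(np.round(norm_dist[0], 5)), key=itemgetter(1)):
    print('{0} ({1})'.format(texts[x], y))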
# reduce using MDS and allow MDS to calculate Euclidean distances
from sklearn.manifold import MDS
mds = MDS(n_components=2, dissimilarity="euclidean", random_state=1)
pos = mds.fit_transform(dtm_matrix)
# plot!
import matplotlib.pyplot as plt
xs, ys = pos[:, 0], pos[:, 1]
fig = plt.figure(figsize=(15, 10),)
for x, y, text in zip(xs, ys, texts):
    plt.scatter(x, y, c='black', s=10)
    plt.text(x, y, text)
plt.show()
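MDS flattens the high-dimensional document-term space into two dimensions, so some distortion is unavoidable. As a quick sanity check (not part of the original notebook), scikit-learn exposes the final stress of the fit; lower values mean the plot better preserves the pairwise distances:
# lower stress = the 2D layout is more faithful to the pairwise distances
print('MDS stress: {0:.3f}'.format(mds.stress_))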
# display the distance matrix (white = similar, dark blue = different)
sn.heatmap(dist_matrix, cmap='Blues',linewidths=.5)
# cluster the chapters together
sn.clustermap(dist_matrix,cmap='Blues',fmt='g',linewidths=.5)
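By default the heatmap and clustermap label rows and columns with matrix indices (0-10) rather than chapter numbers. Wrapping the distance matrix in a labeled DataFrame, as sketched here, makes both plots easier to read, since seaborn uses the index and column names as tick labels:
# sketch: redraw the clustermap with chapter numbers as axis labels
labels = [text.partition('/')[2] for text in texts]
labeled_dist = pandas.DataFrame(dist_matrix, index=labels, columns=labels)
sn.clustermap(labeled_dist, cmap='Blues', linewidths=.5)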
# recalculate the distance matrix using cosine (dis)similarity
dist_matrix = 1 - cosine_similarity(dtm_matrix)
# display distances from the first chapter, sorted nearest first
for x, y in sorted(enumerate(np.round(dist_matrix[0], 5)), key=itemgetter(1)):
    print('{0} ({1})'.format(texts[x], y))
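Cosine dissimilarity between two chapter vectors a and b is 1 - (a · b)/(||a|| ||b||). As a quick verification of the matrix above (not in the original notebook), the same value can be computed directly for the first two chapters:
# verify one entry of the cosine distance matrix by direct computation
a, b = dtm_matrix[0], dtm_matrix[1]
manual = 1 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(np.round(manual, 5), np.round(dist_matrix[0][1], 5))  # should match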
# reduce using MDS
from sklearn.manifold import MDS
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=1)
pos = mds.fit_transform(dist_matrix)
# plot!
import matplotlib.pyplot as plt
xs, ys = pos[:, 0], pos[:, 1]
fig = plt.figure(figsize=(15, 10),)
for x, y, text in zip(xs, ys, texts):
    plt.scatter(x, y, c='black', s=10)
    plt.text(x, y, text)
plt.show()
# display the distance matrix (white = similar, dark blue = different)
sn.heatmap(dist_matrix, cmap='Blues',linewidths=.5)
# cluster the chapters together
sn.clustermap(dist_matrix,cmap='Blues',fmt='g',linewidths=.5)
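The variance step listed at the top of the notebook does not appear in the cells above. A minimal sketch of one reading of it, assuming "most variant terms" means the vocabulary items whose counts vary most across the chapters (the original may have restricted this to a selected subset):
# sketch: rank vocabulary terms by the variance of their counts across chapters
# uses the min_df=1 vectorizer and dtm_matrix fit above; get_feature_names_out
# requires scikit-learn >= 1.0 (older versions: get_feature_names)
variances = np.var(dtm_matrix, axis=0)
terms = vectorizer.get_feature_names_out()
top = sorted(zip(terms, np.round(variances, 2)), key=itemgetter(1), reverse=True)[:50]
pandas.DataFrame(top, columns=['term', 'variance'])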