This notebook uses cosine similarity on a document-term (word-count) matrix to produce a distance matrix between texts, and then uses per-token variance and standard deviation to determine which words contribute most to those distances.
12/06/2018: James E. Dobson (james.e.dobson@dartmouth.edu): Created initial notebook.
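The cosine distance between two texts is one minus the cosine similarity of their word-count vectors: texts with identical relative word usage sit at distance 0, texts sharing no vocabulary at distance 1. The minimal sketch below (using made-up toy counts rather than the Douglass data) shows the calculation that is applied to every pair of chapters later in the notebook.
import numpy as np

def cosine_distance(a, b):
    # cosine similarity is the dot product divided by the product of the vector lengths
    similarity = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    return 1 - similarity

toy_a = np.array([4, 0, 2, 1])   # hypothetical counts for four vocabulary terms
toy_b = np.array([3, 1, 2, 0])
print(np.round(cosine_distance(toy_a, toy_b), 4))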
import sklearn
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from operator import itemgetter
# Input dataset was created from the ten chapters plus appendix
# of Frederick Douglass's Narrative of the Life of Frederick Douglass (1845)
texts=["Douglass/01",
"Douglass/02",
"Douglass/03",
"Douglass/04",
"Douglass/05",
"Douglass/06",
"Douglass/07",
"Douglass/08",
"Douglass/09",
"Douglass/10",
"Douglass/11"]
# vectorize using Scikit Learn
vectorizer = CountVectorizer(input='filename',
                             stop_words='english',
                             strip_accents='unicode',
                             lowercase=True)
# fit texts to model
dtm_matrix = vectorizer.fit_transform(texts)
# convert from sparse object to numpy array
dtm_matrix = dtm_matrix.toarray()
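# An illustrative aside (toy strings, not the Douglass files): what fit_transform
# produces. Each row of the document-term matrix is one document, each column one
# vocabulary term, and each cell the count of that term in that document.
toy_vectorizer = CountVectorizer(stop_words='english')
toy_docs = ["the slave ship sailed north",
            "the master sailed the ship south"]
toy_dtm = toy_vectorizer.fit_transform(toy_docs).toarray()
print(toy_vectorizer.get_feature_names())   # column labels (the vocabulary)
print(toy_dtm)                              # 2 x vocabulary matrix of counts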
# tell us about the model
documents, vocabulary = dtm_matrix.shape
print("Documents: {0} Vocabulary: {1}".format(documents,vocabulary))
# calculate distance between texts using cosine_similarity
dist_matrix = 1 - cosine_similarity(dtm_matrix)
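# Quick sanity checks (an aside, assuming dist_matrix from the line above): the
# matrix is documents x documents, symmetric, and each text sits at distance zero
# (up to floating-point error) from itself.
print(dist_matrix.shape)                        # (11, 11)
print(np.allclose(dist_matrix, dist_matrix.T))  # True
print(np.allclose(np.diag(dist_matrix), 0))     # True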
# display distance from the first text
for x, y in sorted(enumerate(np.round(dist_matrix[0],4)), key=itemgetter(1)):
    print('{0} ({1})'.format(texts[x],y))
# display the similarity matrix (white=similar, dark blue=different)
import seaborn as sn
%matplotlib inline
sn.clustermap(dist_matrix,cmap='Blues',fmt='g',linewidths=.5)
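# An optional variant (a sketch, not part of the original notebook): a plain
# heatmap with the chapter file names as axis labels and no clustering dendrograms.
sn.heatmap(dist_matrix, cmap='Blues', linewidths=.5,
           xticklabels=texts, yticklabels=texts)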
# sort and display the top fifty most variant words by
# comparing just rows 3 and 7 (chapters 4 and 8) as the far ends of the clustermap above
variance_table=dict()
for t,y in enumerate(np.var(np.array([dtm_matrix[3,...],dtm_matrix[7,...]]), axis=0)):
    variance_table[t] = y
for k,v in sorted(variance_table.items(), key=itemgetter(1), reverse=True)[:50]:
    print(vectorizer.get_feature_names()[k],v)
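# An equivalent vectorized sketch (an alternative, not from the original notebook):
# np.argsort on the variance vector recovers the same fifty columns (tie order may
# differ) without building a dictionary first.
variances = np.var(dtm_matrix[[3, 7], :], axis=0)
for i in np.argsort(variances)[::-1][:50]:
    print(vectorizer.get_feature_names()[i], variances[i])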
# sort and display top fifty most variant words by
# comparing chapter 10 (row 9) to the mean
mean_rows = np.mean(dtm_matrix,axis=0)
variance_table=dict()
for t,y in enumerate(np.var(np.array([mean_rows,dtm_matrix[9,...]]), axis=0)):
    variance_table[t] = y
for k,v in sorted(variance_table.items(), key=itemgetter(1), reverse=True)[:50]:
    print(vectorizer.get_feature_names()[k],v)
# now determine which words contribute to distance via the
# standard deviation from the mean value of each token
mean_rows = np.mean(dtm_matrix,axis=0)
std_dev = np.std([mean_rows,dtm_matrix[9,...]],axis=0)
std_dev_table=dict()
for t,y in enumerate(std_dev):
    std_dev_table[t] = y
# sort and display top fifty most deviant words
for k,v in sorted(std_dev_table.items(), key=itemgetter(1), reverse=True)[:50]:
    print(vectorizer.get_feature_names()[k],v)
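# Because only two rows are being compared, the standard deviation is simply the
# square root of the variance (half the absolute difference between the mean counts
# and the chapter 10 counts), so this listing ranks words identically to the
# variance-based listing above. A quick check, assuming the arrays defined above:
print(np.allclose(std_dev, np.abs(mean_rows - dtm_matrix[9,...]) / 2))   # True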