This notebook uses cosine similarity to produce a distance matrix between texts and then examines the most variant tokens to determine which words contribute most to the distances between documents.
12/06/2018: James E. Dobson (james.e.dobson@dartmouth.edu): Created initial notebook.
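Cosine distance, as used below, is one minus the cosine similarity of two token-count vectors. The following is a minimal added sketch on two made-up count vectors (toy data, not the Douglass chapters) to show the calculation:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
a = np.array([[2, 0, 1, 3]])   # toy token counts for one document
b = np.array([[1, 1, 0, 3]])   # toy token counts for another document
# cosine distance ranges from 0 (same direction) to 1 (no shared tokens)
print(1 - cosine_similarity(a, b))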
import sklearn
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from operator import itemgetter
# Input dataset was created from the eleven chapters 
# of Frederick Douglass's Narrative of the Life of Frederick Douglass (1845)
texts=["Douglass/01",
       "Douglass/02",
       "Douglass/03",
       "Douglass/04",
       "Douglass/05",
       "Douglass/06",
       "Douglass/07",
       "Douglass/08", 
       "Douglass/09", 
       "Douglass/10", 
       "Douglass/11"]
# vectorize using scikit-learn's CountVectorizer
vectorizer = CountVectorizer(input='filename', 
                             stop_words='english',
                             strip_accents='unicode',
                             lowercase=True)
# fit texts to model
dtm_matrix = vectorizer.fit_transform(texts)
# convert from sparse object to numpy array
dtm_matrix = dtm_matrix.toarray()
# tell us about the model
documents, vocabulary = dtm_matrix.shape
print("Documents: {0} Vocabulary: {1}".format(documents,vocabulary))
# calculate distance between texts using cosine_similarity
dist_matrix = 1 - cosine_similarity(dtm_matrix)
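Before reading the matrix, a small added sanity check: each text should sit at distance zero from itself, and the matrix should be symmetric (up to floating-point error).
# diagonal ~0 and symmetry of the cosine distance matrix
print(np.allclose(np.diag(dist_matrix), 0))
print(np.allclose(dist_matrix, dist_matrix.T))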
# display distance from the first text
for x, y in sorted(enumerate(np.round(dist_matrix[0],4)), key=itemgetter(1)):
    print('{0} ({1})'.format(texts[x],y))
# display the distance matrix as a clustered heatmap (white=similar, dark blue=different)
import seaborn as sn
%matplotlib inline
sn.clustermap(dist_matrix,cmap='Blues',fmt='g',linewidths=.5)
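The next cell treats rows 3 and 7 as the far ends of the clustermap; that claim can be checked directly by looking up the largest entry in the distance matrix (an added check, not part of the original analysis):
# locate the pair of texts with the greatest cosine distance
i, j = np.unravel_index(np.argmax(dist_matrix), dist_matrix.shape)
print('most distant pair:', texts[i], texts[j], round(dist_matrix[i, j], 4))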
# sort and display the fifty most variant words by 
# comparing rows 3 and 7 (chapters 4 and 8) as the far ends of the above clustermap
variance_table=dict()
for t,y in enumerate(np.var(np.array([dtm_matrix[3,...],dtm_matrix[7,...]]), axis=0)):
    variance_table[t] = y
    
for k,v in sorted(variance_table.items(), key=itemgetter(1), reverse=True)[:50]:
    print(vectorizer.get_feature_names()[k],v)
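Since only two rows enter the calculation, np.var reduces to ((a - b) / 2) ** 2 per token, so this ranking is the same as ranking by the absolute difference in counts; a quick added check:
abs_diff = np.abs(dtm_matrix[3] - dtm_matrix[7])
pair_variance = np.var(np.array([dtm_matrix[3], dtm_matrix[7]]), axis=0)
print(np.allclose(pair_variance, (abs_diff / 2) ** 2))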
# sort and display the fifty most variant words by 
# comparing chapter 10 (row 9) to the mean count vector of all chapters
mean_rows = np.mean(dtm_matrix,axis=0)
variance_table=dict()
for t,y in enumerate(np.var(np.array([mean_rows,dtm_matrix[9,...]]), axis=0)):
    variance_table[t] = y
    
for k,v in sorted(variance_table.items(), key=itemgetter(1), reverse=True)[:50]:
    print(vectorizer.get_feature_names()[k],v)
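The same ranking pattern appears twice above, so it can be consolidated into a small helper. This is a sketch only; the name top_variant_words is introduced here for illustration and is not part of the original notebook.
# hypothetical helper: rank tokens by the variance of their counts across two rows
def top_variant_words(row_a, row_b, n=50):
    names = vectorizer.get_feature_names()  # get_feature_names_out() on newer scikit-learn
    variances = np.var(np.array([row_a, row_b]), axis=0)
    ranked = sorted(enumerate(variances), key=itemgetter(1), reverse=True)
    return [(names[i], v) for i, v in ranked[:n]]

# for example, chapter 10 (row 9) against the mean of all chapters
top_variant_words(np.mean(dtm_matrix, axis=0), dtm_matrix[9], n=10)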
# now determine which words contribute to the distance by taking the 
# standard deviation between the mean count vector and chapter 10 (row 9) for each token
mean_rows = np.mean(dtm_matrix,axis=0)
std_dev = np.std([mean_rows,dtm_matrix[9,...]],axis=0)
std_dev_table=dict()
for t,y in enumerate(std_dev):
    std_dev_table[t] = y
    
# sort and display top fifty most deviant words
for k,v in sorted(std_dev_table.items(), key=itemgetter(1), reverse=True)[:50]:
    print(vectorizer.get_feature_names()[k],v)
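Because the standard deviation of two values is just the square root of their variance (|a - b| / 2 per token), this ranking matches the variance-based ranking above; a quick added check:
pair_variance = np.var([mean_rows, dtm_matrix[9, ...]], axis=0)
print(np.allclose(std_dev, np.sqrt(pair_variance)))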