Sentiment Analysis

Note: This is one in a series of documents and notebooks that will document and evaluate various machine learning and text mining tools for use in literary studies. These notebooks form the practical and critical archive of my book-in-progress, Digital Humanities and the Search for a Method. I have published a critique of some existing methods (Dobson 2016) that takes up some of these concerns and provides some theoretical background for my account of computational methods as used within the humanities.

Revision Date and Notes:

09/05/2017: Initial version (james.e.dobson@dartmouth.edu)

In [1]:
import csv
import os 
import nltk

from textblob.en import sentiment as pattern_sentiment
from textblob import TextBlob
In [2]:
# This function defines a set of preprocessing steps applied to each input text.
# It does the following:
#  - removes all non-alpha tokens (numbers, stray punctuation, etc)
#  - converts all words to lowercase
#  - removes the 127 NLTK-defined English stopwords
#  - removes an additional set of custom stopwords
# Both stopword steps are skipped when options == "nostop".
def preprocess(text, options):
    """Normalize a sequence of tokens for dictionary-based sentiment lookup.

    Tokens that are not purely alphabetic (per ``str.isalpha``) are dropped
    and the survivors are lowercased. Unless ``options`` is ``"nostop"``,
    NLTK's English stopwords plus a small custom list are then removed.

    Args:
        text: iterable of string tokens.
        options: ``"nostop"`` skips stopword filtering (stopwords are kept);
            any other value enables it.

    Returns:
        A new list of lowercase alphabetic tokens.
    """
    # default: keep alphabetic tokens only and drop to lowercase
    pp_text = [word.lower() for word in text if word.isalpha()]

    # "nostop" means "no stopword filtering": hand back the tokens as they are
    if options == "nostop":
        return pp_text

    # Imported here so the "nostop" path does not need the NLTK corpus; the
    # alias avoids rebinding the corpus reader's name to a plain list.
    from nltk.corpus import stopwords as nltk_stopwords

    custom_stopwords = """like go going gone one would got still really get"""

    # A set gives constant-time membership tests for every token checked.
    stopword_set = set(nltk_stopwords.words('english'))
    stopword_set.update(custom_stopwords.split())

    return [word for word in pp_text if word not in stopword_set]
In [3]:
# Jockers Sentiment
# input: NLTK "Text" object, just lowercase
# returns score
#

def _load_jockers_lexicon(path='affect/jockers-sentiment.csv'):
    """Read the Jockers lexicon CSV into a ``{word: raw score string}`` dict.

    The word is taken from the second column and its score from the third.
    Scores stay as strings here and are converted only when a token matches.
    """
    lexicon = {}
    # newline='' is the csv module's documented way to open files for a reader
    with open(path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for row in reader:
            # blank or truncated rows carry no word/score pair
            if len(row) < 3:
                continue
            lexicon[row[1]] = row[2]
    return lexicon


def _scale_jockers_score(raw_score):
    """Min-max rescale a raw lexicon score to the 0-100 range.

    The formula assumes raw scores lie in [-1, 1].
    """
    old_min, old_max = -1, 1
    new_min, new_max = 0, 100
    return ((float(raw_score) - old_min) / (old_max - old_min)) * (new_max - new_min) + new_min


def get_jockers_sentiment(tokens):
    """Return the mean scaled Jockers score of the tokens found in the lexicon.

    Args:
        tokens: iterable of word strings; only words present in the lexicon
            contribute to the mean.

    Returns:
        The arithmetic mean of the matched words' scaled scores, as a float.

    Raises:
        ZeroDivisionError: if none of the tokens is in the lexicon.
    """
    scored = get_jockers_sentiment_verbose(tokens)
    if not scored:
        # same exception type the bare division produced, with a usable message
        raise ZeroDivisionError("no tokens matched the Jockers sentiment lexicon")
    return sum(score for _, score in scored) / len(scored)


def get_jockers_sentiment_verbose(tokens):
    """Return ``[word, scaled_score]`` for every token found in the lexicon.

    Args:
        tokens: iterable of word strings.

    Returns:
        A list of two-element lists in token order; repeated tokens appear
        once per occurrence. Empty when nothing matches.
    """
    lexicon = _load_jockers_lexicon()
    return [
        [word, _scale_jockers_score(lexicon[word])]
        for word in tokens
        if word in lexicon
    ]
In [4]:
def _load_hedonometer_lexicon(path='affect/word_score.csv'):
    """Read the Hedonometer scores file into a ``{word: raw score string}`` dict.

    Each line is split on commas; the first field is the word and the second
    its score. Scores are converted to float only when a token matches.
    """
    affect_dict = {}
    # the context manager closes the file once the lexicon is built
    with open(path) as scores_file:
        for line in scores_file:
            fields = line.rstrip('\n').split(",")
            # blank lines or lines without a comma carry no word/score pair
            if len(fields) < 2:
                continue
            affect_dict[fields[0]] = fields[1]
    return affect_dict


def get_hedonometer_sentiment(text):
    """Return the mean Hedonometer score of the matched words, times 10.

    Args:
        text: iterable of word strings; only words present in the lexicon
            contribute to the mean.

    Returns:
        The mean of the matched words' scores multiplied by 10, as a float.
        NOTE(review): the factor of 10 presumably brings the result onto the
        same 0-100 range as the other measures -- confirm against the
        lexicon's score range.

    Raises:
        ZeroDivisionError: if none of the words is in the lexicon.
    """
    values = [score for _, score in get_hedonometer_sentiment_verbose(text)]

    # average
    hedonometer_score = (float(sum(values)) / float(len(values))) * 10

    return hedonometer_score


def get_hedonometer_sentiment_verbose(text):
    """Return ``[word, score]`` for every word of ``text`` found in the lexicon.

    Args:
        text: iterable of word strings.

    Returns:
        A list of two-element lists in input order with the unscaled float
        score; repeated words appear once per occurrence. Empty when nothing
        matches.
    """
    affect_dict = _load_hedonometer_lexicon()
    return [[word, float(affect_dict[word])] for word in text if word in affect_dict]
In [5]:
def _read_lexicon_words(path):
    """Return the lines of the file at *path* with trailing newlines removed."""
    # the context manager closes the file as soon as the lines are read
    with open(path) as handle:
        return [line.rstrip('\n') for line in handle]


def get_bing_sentiment(text):
    """Count occurrences of negative and positive lexicon words in ``text``.

    Counting uses ``text.count(word)``: for a list of tokens this counts exact
    token matches, while for a plain string it counts substring occurrences.
    A word listed more than once in a lexicon file is counted only once.

    Args:
        text: any object with a ``count`` method (e.g. a list of tokens).

    Returns:
        A dict ``{'negative': [n], 'positive': [m]}`` where each value is a
        one-element list holding the total number of hits.
    """
    negative = _read_lexicon_words('affect/negative-words.txt')
    positive = _read_lexicon_words('affect/positive-words.txt')

    # dict.fromkeys drops duplicate lexicon entries so no word is counted twice
    negative_total = sum(text.count(word) for word in dict.fromkeys(negative))
    positive_total = sum(text.count(word) for word in dict.fromkeys(positive))

    return {'negative': [negative_total], 'positive': [positive_total]}
In [6]:
# Sample passage scored by the cells that follow (presumably the opening
# paragraph of Harriet Jacobs's narrative, going by the variable name).
# NOTE(review): the literal opens with four double quotes, so the string itself
# begins with a literal '"' character; the recorded token counts include it.
jacobs_first_paragraph=""""I WAS born a slave; but I never knew it till six years of
    happy childhood had passed away. My father was a carpenter, 
    and considered so intelligent and skilful in his trade, 
    that, when buildings out of the common line were to be erected, he was sent for from 
    long distances, to be head workman. On condition of paying his mistress two hundred 
    dollars a year, and supporting himself, he was allowed to work at his trade, and 
    manage his own affairs. His strongest wish was to purchase his children; but, though 
    he several times offered his hard earnings for that purpose, he never succeeded. 
    In complexion my parents were a light shade of brownish yellow, and were termed 
    mulattoes. They lived together in a comfortable home; and, though we were all 
    slaves, I was so fondly shielded that I never dreamed I was a piece of merchandise,
    trusted to them for safe keeping, and liable to be demanded of them at any moment. 
    I had one brother, William, who was two years younger than myself—a bright, 
    affectionate child. I had also a great treasure in my maternal grandmother, who was a 
    remarkable woman in many respects. She was the daughter of a planter in South 
    Carolina, who, at his death, left her mother and his three children free, with money 
    to go to St. Augustine, where they had relatives. It was during the Revolutionary War; 
    and they were captured on their passage, carried back, and sold to different 
    purchasers. Such was the story my grandmother used to tell me; but I do not 
    remember all the particulars. She was a little girl when she was captured and sold 
    to the keeper of a large hotel. I have often heard her tell how hard she fared during
    childhood. But as she grew older she evinced so much intelligence, and was so 
    faithful, that her master and mistress could not help seeing it was for their 
    interest to take care of such a valuable piece of property. She became an 
    indispensable personage in the household, officiating in all capacities, from cook 
    and wet nurse to seamstress. She was much praised for her cooking; and her nice 
    crackers became so famous in the neighborhood that many people were desirous of 
    obtaining them. In consequence of numerous requests of this kind, she asked permission
    of her mistress to bake crackers at night, after all the household work was done; 
    and she obtained leave to do it, provided she would clothe herself and her children 
    from the profits. Upon these terms, after working hard all day for her mistress, 
    she began her midnight bakings, assisted by her two oldest children. The business 
    proved profitable; and each year she laid by a little, which was saved for a fund to 
    purchase her children. Her master died, and the property was divided among his heirs. 
    The widow had her dower in the hotel, which she continued to keep open. My grandmother 
    remained in her service as a slave; but her children were divided among her master's 
    children. As she had five, Benjamin, the youngest one, was sold, in order that each 
    heir might have an equal portion of dollars and cents. There was so little difference 
    in our ages that he seemed more like my brother than my uncle. He was a bright, handsome
    lad, nearly white; for he inherited the complexion my grandmother had derived from 
    Anglo-Saxon ancestors. Though only ten years old, seven hundred and twenty dollars were 
    paid for him. His sale was a terrible blow to my grandmother; but she was naturally 
    hopeful, and she went to work with renewed energy, trusting in time to be able to 
    purchase some of her children. She had laid up three hundred dollars, which her 
    mistress one day begged as a loan, promising to pay her soon. The reader probably 
    knows that no promise or writing given to a slave is legally binding; for, according 
    to Southern laws, a slave, being property, can hold no property. When my grandmother 
    lent her hard earnings to her mistress, she trusted solely to her honor. The honor of 
    a slaveholder to a slave!"""
# Split the sample passage into tokens with NLTK's word tokenizer.
raw_tokens = nltk.word_tokenize(jacobs_first_paragraph)
print("Total words:", len(raw_tokens))

# 'nostop' disables stopword filtering, so the tokens scored in later cells
# still contain stopwords.
preprocessed_tokens = preprocess(raw_tokens, options='nostop')

# Any other option value filters stopwords; that run is used only to report
# how many tokens would remain without them.
print(
    "Preprocessed words (without stopwords): ",
    len(preprocess(raw_tokens, options='none')),
)
Total words: 796
Preprocessed words (without stopwords):  327
In [7]:
# Report all three measures on a common 0-100 scale.
# The Jockers and Hedonometer helpers return already-scaled means.
print("Mean Jockers Sentiment:", get_jockers_sentiment(preprocessed_tokens))
print("Mean Hedonometer Sentiment:", get_hedonometer_sentiment(preprocessed_tokens))

# Pattern returns a (polarity, subjectivity) pair; only the polarity is
# rescaled here, using the same [-1, 1] -> [0, 100] mapping: (x + 1) / 2 * 100.
pattern_score, pattern_subjectivity = pattern_sentiment(preprocessed_tokens)
print("Mean Pattern Sentiment:", (float(pattern_score + 1) / 2) * 100)
Mean Jockers Sentiment: 64.66494845360825
Mean Hedonometer Sentiment: 54.94524180967224
Mean Pattern Sentiment: 58.751594387755105

Display Comparative Sentiment Dictionary Sizes

In [8]:
import xml.etree.ElementTree

import matplotlib.pyplot as plt
import numpy as np


def _count_lines(path):
    """Return the number of lines in the text file at *path*."""
    # the context manager closes the file; nothing is kept but the count
    with open(path) as handle:
        return sum(1 for _ in handle)


# Jockers
# NOTE(review): this counts every line of the file, including a header line
# if the CSV has one -- confirm against the data files.
jockers_dict_length = _count_lines('affect/jockers-sentiment.csv')

# Hedonometer
hedonometer_dict_length = _count_lines('affect/word_score.csv')

# Pattern: the size is the number of direct children of the XML root element
root = xml.etree.ElementTree.parse('affect/en-sentiment.xml').getroot()
pattern_dict_length = len(root)

# Plot one bar per dictionary, in the same order as the tick labels
plt.bar(np.arange(3), [jockers_dict_length, hedonometer_dict_length, pattern_dict_length])
plt.ylabel('Dictionary Size')
plt.xticks(np.arange(3), ['Jockers', 'Hedonometer', 'Pattern'])
plt.show()
In [9]:
# Examine Pattern sentiment for sample sentences
# Total number of matching words
# The result's `.assessments` list has one entry per lexicon match, so its
# length is the match count. Pattern is re-run on the same token list here.
print("Matched",len(pattern_sentiment(preprocessed_tokens).assessments),"words")
Matched 56 words
In [10]:
# List every Pattern match for the passage. As the last expression in the
# cell, its value is what the notebook displays as the cell output.
pattern_sentiment(preprocessed_tokens).assessments
Out[10]:
[(['happy'], 0.8, 1.0, None),
 (['intelligent'], 0.8, 0.9, None),
 (['common'], -0.3, 0.5, None),
 (['long'], -0.05, 0.4, None),
 (['supporting'], 0.25, 0.25, None),
 (['own'], 0.6, 1.0, None),
 (['several'], 0.0, 0.0, None),
 (['hard'], -0.2916666666666667, 0.5416666666666666, None),
 (['light'], 0.4, 0.7, None),
 (['yellow'], 0.0, 0.0, None),
 (['comfortable'], 0.4, 0.8, None),
 (['safe'], 0.5, 0.5, None),
 (['liable'], -0.1, 0.5, None),
 (['younger'], 0.0, 0.0, None),
 (['bright'], 0.7000000000000001, 0.7999999999999999, None),
 (['great'], 0.8, 0.75, None),
 (['remarkable'], 0.75, 0.75, None),
 (['many'], 0.5, 0.5, None),
 (['left'], 0.0, 0.0, None),
 (['free'], 0.4, 0.8, None),
 (['back'], 0.0, 0.0, None),
 (['different'], 0.0, 0.6, None),
 (['such'], 0.0, 0.5, None),
 (['little'], -0.1875, 0.5, None),
 (['large'], 0.21428571428571427, 0.42857142857142855, None),
 (['hard'], -0.2916666666666667, 0.5416666666666666, None),
 (['older'], 0.16666666666666666, 0.3333333333333333, None),
 (['much'], 0.2, 0.2, None),
 (['such'], 0.0, 0.5, None),
 (['indispensable'], 0.4, 0.9, None),
 (['wet'], -0.1, 0.4, None),
 (['much'], 0.2, 0.2, None),
 (['nice'], 0.6, 1.0, None),
 (['famous'], 0.5, 1.0, None),
 (['many'], 0.5, 0.5, None),
 (['numerous'], 0.0, 0.5, None),
 (['kind'], 0.6, 0.9, None),
 (['hard'], -0.2916666666666667, 0.5416666666666666, None),
 (['little'], -0.1875, 0.5, None),
 (['open'], 0.0, 0.5, None),
 (['equal'], 0.0, 0.25, None),
 (['little'], -0.1875, 0.5, None),
 (['more'], 0.5, 0.5, None),
 (['bright'], 0.7000000000000001, 0.7999999999999999, None),
 (['handsome'], 0.5, 1.0, None),
 (['nearly', 'white'], 0.0, 0.0, None),
 (['only'], 0.0, 1.0, None),
 (['ten'], 0.0, 0.0, None),
 (['old'], 0.1, 0.2, None),
 (['terrible'], -1.0, 1.0, None),
 (['naturally'], 0.1, 0.4, None),
 (['able'], 0.5, 0.625, None),
 (['promising'], 0.2, 0.5, None),
 (['legally'], 0.2, 0.2, None),
 (['southern'], 0.0, 0.0, None),
 (['hard'], -0.2916666666666667, 0.5416666666666666, None)]
In [11]:
# Examine Jockers sentiment for sample sentences
# Total number of matching words
# The verbose helper returns one [word, score] entry per matched token, so
# repeated words are counted once per occurrence.
print("Matched",len(get_jockers_sentiment_verbose(preprocessed_tokens)),"words")
Matched 97 words
In [12]:
# List each matched token with its scaled (0-100) Jockers score, in text
# order. As the last expression in the cell, the notebook displays its value.
get_jockers_sentiment_verbose(preprocessed_tokens)
Out[12]:
[['slave', 12.5],
 ['happy', 87.5],
 ['childhood', 80.0],
 ['intelligent', 100.0],
 ['mistress', 25.0],
 ['supporting', 87.5],
 ['work', 62.5],
 ['manage', 70.0],
 ['strongest', 87.5],
 ['wish', 90.0],
 ['purchase', 62.5],
 ['hard', 37.5],
 ['succeeded', 75.0],
 ['comfortable', 75.0],
 ['slaves', 0.0],
 ['fondly', 75.0],
 ['trusted', 75.0],
 ['safe', 87.5],
 ['liable', 37.5],
 ['demanded', 30.0],
 ['brother', 70.0],
 ['younger', 62.5],
 ['bright', 75.0],
 ['affectionate', 75.0],
 ['child', 80.0],
 ['great', 75.0],
 ['treasure', 87.5],
 ['grandmother', 80.0],
 ['remarkable', 87.5],
 ['respects', 90.0],
 ['daughter', 80.0],
 ['death', 12.5],
 ['free', 75.0],
 ['money', 80.0],
 ['revolutionary', 62.5],
 ['war', 25.0],
 ['grandmother', 80.0],
 ['hard', 37.5],
 ['childhood', 80.0],
 ['intelligence', 75.0],
 ['faithful', 87.5],
 ['mistress', 25.0],
 ['interest', 75.0],
 ['care', 100.0],
 ['valuable', 75.0],
 ['household', 80.0],
 ['nurse', 37.5],
 ['praised', 62.5],
 ['nice', 75.0],
 ['famous', 75.0],
 ['desirous', 87.5],
 ['obtaining', 90.0],
 ['kind', 75.0],
 ['permission', 90.0],
 ['mistress', 25.0],
 ['household', 80.0],
 ['work', 62.5],
 ['obtained', 90.0],
 ['leave', 37.5],
 ['clothe', 80.0],
 ['working', 62.5],
 ['hard', 37.5],
 ['mistress', 25.0],
 ['saved', 90.0],
 ['purchase', 62.5],
 ['died', 25.0],
 ['grandmother', 80.0],
 ['slave', 12.5],
 ['like', 75.0],
 ['brother', 70.0],
 ['bright', 75.0],
 ['handsome', 100.0],
 ['white', 70.0],
 ['grandmother', 80.0],
 ['ancestors', 62.5],
 ['terrible', 12.5],
 ['blow', 30.0],
 ['grandmother', 80.0],
 ['hopeful', 87.5],
 ['work', 62.5],
 ['renewed', 62.5],
 ['trusting', 100.0],
 ['purchase', 62.5],
 ['mistress', 25.0],
 ['promising', 87.5],
 ['pay', 45.0],
 ['promise', 87.5],
 ['slave', 12.5],
 ['legally', 90.0],
 ['slave', 12.5],
 ['grandmother', 80.0],
 ['hard', 37.5],
 ['mistress', 25.0],
 ['trusted', 75.0],
 ['honor', 100.0],
 ['honor', 100.0],
 ['slave', 12.5]]
In [13]:
# Examine Hedonometer sentiment for sample sentences
# Total number of matching words
# NOTE(review): preprocessed_tokens was built with options='nostop', i.e. with
# stopwords retained, so any stopwords present in the Hedonometer lexicon are
# included in this count -- confirm that is intended for the comparison.
print("Matched",len(get_hedonometer_sentiment_verbose(preprocessed_tokens)),"words")
Matched 641 words