Note: This is one in a series of documents and notebooks that will document and evaluate various machine learning and text mining tools for use in literary studies. These notebooks form the practical and critical archive of my book-in-progress, Digital Humanities and the Search for a Method. I have published a critique of some existing methods (Dobson 2016) that takes up some of these concerns and provides some theoretical background for my account of computational methods as used within the humanities.
09/05/2017: Initial version (james.e.dobson@dartmouth.edu)
import csv
import os
import nltk
from textblob.en import sentiment as pattern_sentiment
from textblob import TextBlob
# This function defines a set of steps to be applied to each input text.
def preprocess(text, options):
    """Normalize a sequence of tokens before dictionary-based scoring.

    Steps, in order:
      - drop every token that is not purely alphabetic (numbers, stray
        punctuation, etc.)
      - convert the remaining tokens to lowercase
      - unless ``options == "nostop"``, remove the NLTK English stopwords
        plus a small additional list of custom stopwords

    Args:
        text: iterable of string tokens.
        options: pass ``"nostop"`` to preserve stopwords; any other value
            removes them.

    Returns:
        A new list of lowercase, alphabetic tokens.
    """
    # default: remove non alpha tokens and drop to lowercase
    pp_text = [word.lower() for word in text if word.isalpha()]

    # enable an option for preserving stopwords
    if options != "nostop":
        # Imported here so the "nostop" path does not need the NLTK corpus.
        # Aliased so the module name is not rebound to the word list.
        from nltk.corpus import stopwords as nltk_stopwords

        custom_stopwords = """like go going gone one would got still really get"""
        # A set gives constant-time membership tests in the filter below.
        stop_set = set(nltk_stopwords.words('english'))
        stop_set.update(custom_stopwords.split())
        pp_text = [word for word in pp_text if word not in stop_set]

    return pp_text
# Jockers Sentiment
def get_jockers_sentiment(tokens):
    """Return the mean Jockers sentiment of *tokens* on a 0-100 scale.

    The lexicon is read from ``affect/jockers-sentiment.csv`` on every call.
    In each CSV row the second field is used as the word and the third field
    as its score. Each matched score is rescaled linearly so that -1 maps to
    0 and 1 maps to 100.

    Args:
        tokens: iterable of word strings. Matching against the lexicon is
            exact (no case folding is done here).

    Returns:
        The mean of the rescaled scores of all tokens found in the lexicon,
        or ``float("nan")`` when no token matches (previously this case
        raised ZeroDivisionError).
    """
    jockers_sentiment = {}
    with open('affect/jockers-sentiment.csv') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for row in reader:
            jockers_sentiment[row[1]] = row[2]

    # scale values: (score - min) / (max - min) * 100 with min=-1, max=1
    jockers_table = [
        (float(jockers_sentiment[word]) + 1) / 2 * 100
        for word in tokens
        if word in jockers_sentiment
    ]

    # A mean over zero matches is undefined; report NaN instead of crashing.
    if not jockers_table:
        return float("nan")
    return sum(jockers_table) / len(jockers_table)
def get_jockers_sentiment_verbose(tokens):
    """List each token found in the Jockers lexicon with its rescaled score.

    The lexicon is read from ``affect/jockers-sentiment.csv`` on every call;
    the second field of each row is the word and the third field its score.
    Scores are rescaled linearly so that -1 maps to 0 and 1 maps to 100.

    Args:
        tokens: iterable of word strings, matched exactly against the lexicon.

    Returns:
        A list of ``[word, score]`` pairs, one per matching token, in token
        order (repeated tokens appear repeatedly). Empty when nothing matches.
    """
    with open('affect/jockers-sentiment.csv') as lexicon_file:
        lexicon = {
            row[1]: row[2]
            for row in csv.reader(lexicon_file, delimiter=',', quotechar='|')
        }

    matches = []
    for token in tokens:
        raw_score = lexicon.get(token)
        if raw_score is None:
            # token is not in the lexicon
            continue
        # scale values
        scaled_score = (float(raw_score) + 1) / 2 * 100
        matches.append([token, scaled_score])
    return matches
def get_hedonometer_sentiment(text):
    """Return the mean hedonometer score of *text*, multiplied by 10.

    The lexicon is read from ``affect/word_score.csv`` on every call. Each
    line is split on commas; the first field is the word and the second
    field its score.

    Args:
        text: iterable of word strings, matched exactly against the lexicon.

    Returns:
        The mean score of all tokens found in the lexicon, times 10, or
        ``float("nan")`` when no token matches (previously this case raised
        ZeroDivisionError).
    """
    # load scores and produce the dictionary; the file is closed on exit
    affect_dict = {}
    with open('affect/word_score.csv') as score_file:
        for line in score_file:
            fields = line.rstrip('\n').split(",")
            affect_dict[fields[0]] = fields[1]

    # determine sentiment
    values = [float(affect_dict[word]) for word in text if word in affect_dict]

    # A mean over zero matches is undefined; report NaN instead of crashing.
    if not values:
        return float("nan")

    # average
    # NOTE(review): the factor of 10 presumably puts the result on a scale
    # comparable with the other 0-100 scores - confirm.
    return sum(values) / len(values) * 10
def get_hedonometer_sentiment_verbose(text):
    """List each token found in the hedonometer lexicon with its score.

    The lexicon is read from ``affect/word_score.csv`` on every call. Each
    line is split on commas; the first field is the word and the second
    field its score. Scores are returned as stored in the file (converted
    to float), without any rescaling.

    Args:
        text: iterable of word strings, matched exactly against the lexicon.

    Returns:
        A list of ``[word, score]`` pairs, one per matching token, in token
        order. Empty when nothing matches.
    """
    # load scores and produce the dictionary; the file is closed on exit
    affect_dict = {}
    with open('affect/word_score.csv') as score_file:
        for line in score_file:
            fields = line.rstrip('\n').split(",")
            affect_dict[fields[0]] = fields[1]

    # collect the score of every matching token
    return [[word, float(affect_dict[word])] for word in text if word in affect_dict]
def get_bing_sentiment(text):
    """Count how often negative and positive lexicon words occur in *text*.

    The word lists are read from ``affect/negative-words.txt`` and
    ``affect/positive-words.txt`` on every call. Every line of each file,
    with its trailing newline removed, is treated as one lexicon entry.

    Args:
        text: any object with a ``count(word)`` method. With a list of
            tokens this counts whole-token matches; with a ``str`` it would
            count substring occurrences instead.

    Returns:
        A dict ``{'negative': [n], 'positive': [p]}`` where ``n`` and ``p``
        are the total occurrence counts, each wrapped in a one-element list.
    """
    def _load_words(path):
        # Read one entry per line and close the file when done.
        with open(path) as word_file:
            return [line.rstrip('\n') for line in word_file]

    negative = _load_words('affect/negative-words.txt')
    positive = _load_words('affect/positive-words.txt')

    # Each distinct lexicon entry is counted once, even if a file lists it
    # more than once.
    negative_total = sum(text.count(word) for word in set(negative))
    positive_total = sum(text.count(word) for word in set(positive))

    return {'negative': [negative_total], 'positive': [positive_total]}
jacobs_first_paragraph=""""I WAS born a slave; but I never knew it till six years of
happy childhood had passed away. My father was a carpenter,
and considered so intelligent and skilful in his trade,
that, when buildings out of the common line were to be erected, he was sent for from
long distances, to be head workman. On condition of paying his mistress two hundred
dollars a year, and supporting himself, he was allowed to work at his trade, and
manage his own affairs. His strongest wish was to purchase his children; but, though
he several times offered his hard earnings for that purpose, he never succeeded.
In complexion my parents were a light shade of brownish yellow, and were termed
mulattoes. They lived together in a comfortable home; and, though we were all
slaves, I was so fondly shielded that I never dreamed I was a piece of merchandise,
trusted to them for safe keeping, and liable to be demanded of them at any moment.
I had one brother, William, who was two years younger than myself—a bright,
affectionate child. I had also a great treasure in my maternal grandmother, who was a
remarkable woman in many respects. She was the daughter of a planter in South
Carolina, who, at his death, left her mother and his three children free, with money
to go to St. Augustine, where they had relatives. It was during the Revolutionary War;
and they were captured on their passage, carried back, and sold to different
purchasers. Such was the story my grandmother used to tell me; but I do not
remember all the particulars. She was a little girl when she was captured and sold
to the keeper of a large hotel. I have often heard her tell how hard she fared during
childhood. But as she grew older she evinced so much intelligence, and was so
faithful, that her master and mistress could not help seeing it was for their
interest to take care of such a valuable piece of property. She became an
indispensable personage in the household, officiating in all capacities, from cook
and wet nurse to seamstress. She was much praised for her cooking; and her nice
crackers became so famous in the neighborhood that many people were desirous of
obtaining them. In consequence of numerous requests of this kind, she asked permission
of her mistress to bake crackers at night, after all the household work was done;
and she obtained leave to do it, provided she would clothe herself and her children
from the profits. Upon these terms, after working hard all day for her mistress,
she began her midnight bakings, assisted by her two oldest children. The business
proved profitable; and each year she laid by a little, which was saved for a fund to
purchase her children. Her master died, and the property was divided among his heirs.
The widow had her dower in the hotel, which she continued to keep open. My grandmother
remained in her service as a slave; but her children were divided among her master's
children. As she had five, Benjamin, the youngest one, was sold, in order that each
heir might have an equal portion of dollars and cents. There was so little difference
in our ages that he seemed more like my brother than my uncle. He was a bright, handsome
lad, nearly white; for he inherited the complexion my grandmother had derived from
Anglo-Saxon ancestors. Though only ten years old, seven hundred and twenty dollars were
paid for him. His sale was a terrible blow to my grandmother; but she was naturally
hopeful, and she went to work with renewed energy, trusting in time to be able to
purchase some of her children. She had laid up three hundred dollars, which her
mistress one day begged as a loan, promising to pay her soon. The reader probably
knows that no promise or writing given to a slave is legally binding; for, according
to Southern laws, a slave, being property, can hold no property. When my grandmother
lent her hard earnings to her mistress, she trusted solely to her honor. The honor of
a slaveholder to a slave!"""
# Tokenize the sample paragraph and report the number of tokens produced,
# before any filtering is applied.
raw_tokens = nltk.word_tokenize(jacobs_first_paragraph)
print("Total words:", len(raw_tokens))
# Tokens used for every score below: lowercase alphabetic tokens with
# stopwords retained (options='nostop').
preprocessed_tokens=preprocess(raw_tokens,options='nostop')
# NOTE(review): the count printed here comes from a second preprocess() call
# that removes stopwords; it is not the length of preprocessed_tokens, which
# is what gets scored below - confirm this is intended.
print("Preprocessed words (without stopwords): ",len(preprocess(raw_tokens,options='none')))
# obtain scaled values
print("Mean Jockers Sentiment:",get_jockers_sentiment(preprocessed_tokens))
print("Mean Hedonometer Sentiment:",get_hedonometer_sentiment(preprocessed_tokens))
# The Pattern result is unpacked as a pair; the first element is rescaled
# below and the second (subjectivity) is not used in this section.
(pattern_score,pattern_subjectivity)=pattern_sentiment(preprocessed_tokens)
# Rescale linearly so that -1 maps to 0 and 1 maps to 100, matching the
# scaling applied inside get_jockers_sentiment.
print("Mean Pattern Sentiment:",(float((pattern_score) - -1) / (1 - -1)) * (100 - 0) + 0)
# Compare the sizes of the three sentiment dictionaries.

# Jockers: one entry per line of the CSV file (every line is counted, so a
# header line, if the file has one, is included in the total)
with open('affect/jockers-sentiment.csv') as lexicon_file:
    jockers_dict_length = sum(1 for _ in lexicon_file)

# Hedonometer: one entry per line, counted the same way
with open('affect/word_score.csv') as lexicon_file:
    hedonometer_dict_length = sum(1 for _ in lexicon_file)

# Pattern: number of direct child elements of the XML root
import xml.etree.ElementTree
root = xml.etree.ElementTree.parse('affect/en-sentiment.xml').getroot()
pattern_dict_length = len(root)

# Plot the three sizes as a bar chart
import matplotlib.pyplot as plt
import numpy as np
bar_positions = np.arange(3)
plt.bar(bar_positions, [jockers_dict_length, hedonometer_dict_length, pattern_dict_length])
plt.ylabel('Dictionary Size')
plt.xticks(bar_positions, ['Jockers', 'Hedonometer', 'Pattern'])
plt.show()
# Examine Pattern sentiment for sample sentences
# Evaluate once and reuse the result for both the count and the listing.
pattern_matches = pattern_sentiment(preprocessed_tokens).assessments
# Total number of matching words
print("Matched",len(pattern_matches),"words")
pattern_matches
# Examine Jockers sentiment for sample sentences
# Evaluate once so the lexicon CSV is read a single time.
jockers_matches = get_jockers_sentiment_verbose(preprocessed_tokens)
# Total number of matching words
print("Matched",len(jockers_matches),"words")
jockers_matches
# Examine Hedonometer sentiment for sample sentences
hedonometer_matches = get_hedonometer_sentiment_verbose(preprocessed_tokens)
# Total number of matching words
print("Matched",len(hedonometer_matches),"words")