{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## How much does the minimum document frequency or \"sparsity\" influence document similarity? \n", "\n", "This notebook does the following using the ten chapters plus appendix of Frederick Douglass's _Narrative of the Life of Frederick Douglass_ (1845):\n", " - compares similarity scores using different minimum document frequency (df) values\n", " - compare cosine similarity scores with the Euclidean distance metric\n", " - uses MDS to show (dis)similarity between chapters\n", " - visualizes the matrix\n", " - clusters together using Seaborn\n", " - calcuates variance between selected chapters and displays top fifty most variant terms\n", "\n", "Revisions:\n", "
\n", "12/05/2018: James E. Dobson (james.e.dobson@dartmouth.edu): Created initial notebook.\n", "12/06/2018: Added Cosine Similarity Comparison\n", "" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import sklearn\n", "import numpy as np\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "from sklearn.metrics import euclidean_distances\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "\n", "from operator import itemgetter\n", "import nltk\n", "import pandas\n", "\n", "import seaborn as sn\n", "sn.set(style=\"white\")\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "texts=[\"Douglass/01\",\n", " \"Douglass/02\",\n", " \"Douglass/03\",\n", " \"Douglass/04\",\n", " \"Douglass/05\",\n", " \"Douglass/06\",\n", " \"Douglass/07\",\n", " \"Douglass/08\", \n", " \"Douglass/09\", \n", " \"Douglass/10\", \n", " \"Douglass/11\"]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DF: 1 Documents: 11 Vocabulary: 3996\n", "DF: 2 Documents: 11 Vocabulary: 1527\n", "DF: 3 Documents: 11 Vocabulary: 805\n", "DF: 4 Documents: 11 Vocabulary: 463\n", "DF: 5 Documents: 11 Vocabulary: 287\n", "DF: 6 Documents: 11 Vocabulary: 183\n", "DF: 7 Documents: 11 Vocabulary: 114\n", "DF: 8 Documents: 11 Vocabulary: 68\n", "DF: 9 Documents: 11 Vocabulary: 38\n", "DF: 10 Documents: 11 Vocabulary: 20\n", "DF: 11 Documents: 11 Vocabulary: 9\n" ] } ], "source": [ "similarity_chart=list()\n", "\n", "# check minimum for minimum document frequencies from 1-13\n", "for df_factor in range(1,12):\n", " vectorizer = CountVectorizer(input='filename', stop_words='english',\n", " strip_accents='unicode',lowercase=True,\n", " min_df=df_factor)\n", " dtm_matrix = vectorizer.fit_transform(texts).toarray()\n", " \n", " # tell us about the model\n", " documents, vocabulary = dtm_matrix.shape\n", " print(\"DF: {0} Documents: {1} Vocabulary: {2}\".format(df_factor,documents,vocabulary))\n", "\n", " # calculate distances between texts\n", " dist_matrix = euclidean_distances(dtm_matrix)\n", " \n", " row = [df_factor]\n", " \n", " #display distance from the first text\n", " for x,y in sorted(enumerate(np.round(dist_matrix[0],3)), key=itemgetter(1)):\n", " b,d,c = texts[x].partition('/')\n", " v = '{0} ({1:.1f})'.format(c,y)\n", " row.append(v)\n", " similarity_chart.append(row) " ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " | 0 | \n", "1 | \n", "2 | \n", "3 | \n", "4 | \n", "5 | \n", "6 | \n", "7 | \n", "8 | \n", "9 | \n", "10 | \n", "11 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "01 (0.0) | \n", "06 (44.7) | \n", "08 (48.0) | \n", "05 (48.2) | \n", "09 (49.2) | \n", "03 (50.6) | \n", "02 (53.2) | \n", "04 (53.6) | \n", "07 (55.2) | \n", "11 (90.6) | \n", "10 (194.7) | \n", "
1 | \n", "2 | \n", "01 (0.0) | \n", "06 (40.9) | \n", "08 (43.1) | \n", "05 (44.4) | \n", "09 (44.6) | \n", "03 (46.4) | \n", "04 (47.0) | \n", "02 (49.0) | \n", "07 (49.2) | \n", "11 (82.5) | \n", "10 (185.5) | \n", "
2 | \n", "3 | \n", "01 (0.0) | \n", "06 (38.7) | \n", "08 (39.7) | \n", "09 (41.6) | \n", "05 (41.9) | \n", "03 (44.0) | \n", "04 (44.3) | \n", "02 (44.5) | \n", "07 (45.9) | \n", "11 (76.1) | \n", "10 (168.4) | \n", "
3 | \n", "4 | \n", "01 (0.0) | \n", "06 (35.5) | \n", "08 (36.4) | \n", "09 (38.8) | \n", "05 (39.4) | \n", "04 (41.0) | \n", "02 (41.0) | \n", "03 (41.7) | \n", "07 (41.7) | \n", "11 (72.1) | \n", "10 (162.0) | \n", "
4 | \n", "5 | \n", "01 (0.0) | \n", "06 (31.6) | \n", "08 (33.2) | \n", "09 (35.8) | \n", "05 (36.0) | \n", "07 (37.4) | \n", "02 (37.5) | \n", "03 (38.2) | \n", "04 (38.8) | \n", "11 (63.2) | \n", "10 (153.4) | \n", "
5 | \n", "6 | \n", "01 (0.0) | \n", "06 (28.2) | \n", "08 (29.9) | \n", "09 (32.8) | \n", "07 (32.9) | \n", "05 (33.0) | \n", "02 (34.4) | \n", "03 (35.9) | \n", "04 (36.8) | \n", "11 (60.4) | \n", "10 (146.5) | \n", "
6 | \n", "7 | \n", "01 (0.0) | \n", "06 (24.9) | \n", "08 (27.4) | \n", "07 (29.9) | \n", "05 (30.4) | \n", "09 (30.5) | \n", "02 (32.0) | \n", "03 (34.2) | \n", "04 (34.7) | \n", "11 (54.2) | \n", "10 (137.1) | \n", "
7 | \n", "8 | \n", "01 (0.0) | \n", "06 (20.2) | \n", "03 (22.9) | \n", "08 (23.1) | \n", "07 (25.1) | \n", "09 (26.5) | \n", "02 (27.2) | \n", "05 (27.8) | \n", "04 (30.0) | \n", "11 (48.2) | \n", "10 (128.7) | \n", "
8 | \n", "9 | \n", "01 (0.0) | \n", "06 (15.9) | \n", "08 (17.8) | \n", "07 (19.0) | \n", "05 (19.8) | \n", "03 (20.3) | \n", "02 (23.5) | \n", "09 (23.9) | \n", "04 (27.3) | \n", "11 (31.7) | \n", "10 (119.4) | \n", "
9 | \n", "10 | \n", "01 (0.0) | \n", "06 (14.0) | \n", "07 (14.6) | \n", "08 (14.7) | \n", "05 (15.1) | \n", "03 (17.7) | \n", "02 (19.0) | \n", "09 (19.5) | \n", "04 (26.1) | \n", "11 (29.1) | \n", "10 (110.8) | \n", "
10 | \n", "11 | \n", "01 (0.0) | \n", "08 (4.9) | \n", "05 (9.8) | \n", "07 (11.1) | \n", "06 (12.0) | \n", "02 (12.2) | \n", "03 (14.5) | \n", "09 (16.2) | \n", "11 (23.8) | \n", "04 (24.9) | \n", "10 (102.1) | \n", "