{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "

Interpreting Computational Models for the Humanities

\n", "

Examining Contribution to Distance Metrics

\n", "\n", "\n", "This notebook uses cosine similarity to produce a distance matrix, using the maximum number of tokens, in order to determine the contribution of words to the model. \n", "\n", "12/06/2018: James E. Dobson (james.e.dobson@dartmouth.edu): Created initial notebook." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import sklearn\n", "import numpy as np\n", "\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "\n", "from operator import itemgetter" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Input dataset was created from the ten chapters plus appendix \n", "# of Frederick Douglass's Narrative of the Life of Frederick Douglass (1845)\n", "texts=[\"Douglass/01\",\n", " \"Douglass/02\",\n", " \"Douglass/03\",\n", " \"Douglass/04\",\n", " \"Douglass/05\",\n", " \"Douglass/06\",\n", " \"Douglass/07\",\n", " \"Douglass/08\", \n", " \"Douglass/09\", \n", " \"Douglass/10\", \n", " \"Douglass/11\"]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Documents: 11 Vocabulary: 3996\n" ] } ], "source": [ "# vectorize using Scikit Learn\n", "vectorizer = CountVectorizer(input='filename', \n", " stop_words='english',\n", " strip_accents='unicode',\n", " lowercase=True)\n", "# fit texts to model\n", "dtm_matrix = vectorizer.fit_transform(texts)\n", "\n", "# convert from sparse object to numpy array\n", "dtm_matrix = dtm_matrix.toarray()\n", "\n", "# tell us about the model\n", "documents, vocabulary = dtm_matrix.shape\n", "print(\"Documents: {0} Vocabulary: {1}\".format(documents,vocabulary))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Douglass/01 
(0.0)\n", "Douglass/09 (0.5037)\n", "Douglass/08 (0.5212)\n", "Douglass/10 (0.5432)\n", "Douglass/02 (0.5898)\n", "Douglass/07 (0.6201)\n", "Douglass/11 (0.6268)\n", "Douglass/03 (0.6286)\n", "Douglass/06 (0.6296)\n", "Douglass/05 (0.6543)\n", "Douglass/04 (0.6901)\n" ] } ], "source": [ "# calculate distance between texts using cosine_similarity\n", "dist_matrix = 1- cosine_similarity(dtm_matrix)\n", "\n", "# display distance from the first text\n", "for x, y in sorted(enumerate(np.round(dist_matrix[0],4)), key=itemgetter(1)):\n", " print('{0} ({1})'.format(texts[x],y))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlAAAAJCCAYAAAAP/PnVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XuYZWddJ/rvL91AN+JE5DaQoIkYRkEFhIkeGCOIQHLw\nEASFwCO3QZsZQdABMQIDDQJPcEDEkVFqICgwEJBrHwk35XYGFBIUhATBJCgkXEcQxqGDdPidP3pH\ni6J2V7/p3rV2dX0+z7Of3vtda737W727qr691tprV3cHAIDDd9zUAQAAthoFCgBgkAIFADBIgQIA\nGKRAAQAMUqAAAAYpUAAAgxQoAIBBChQAwKCdIyvvvt2jFnLZ8v1/+bu1iHkBABZhqEDluB0LinFo\nVXV6kucl2ZHkhd19zprlz01yl9nD6ya5cXd/x2zZVUk+PFv2ye6+1+akBgCOVWMFase1FhRjvqra\nkeT5Se6W5PIkF1TVvu6++Op1uvtXVq3/S0lut2qK/d19283KCwAc+wYL1NjqR8mpSS7p7suSpKrO\nS3JmkovnrP+AJE/ZpGwAwDa09HugkpyQ5FOrHl+e5EfWW7GqvjvJyUnevmp4V1VdmORAknO6+/WL\nCgoAbA9LUaCqak+SPauGVrp75RpMdVaSV3f3VavGvru7r6iq70ny9qr6cHdfeiR5WR5VdXaSXVPn\ngC3syrXnlQIbGytQOxdzCG9WluYVpiuS3HzV4xNnY+s5K8kj18x9xezPy6rqnTl4fpQCdezY1d17\npw4BW1VV7Z06A2xFS7EHagMXJDmlqk7OweJ0VpIHrl2pqr4vyfWT/Nmqsesn+Wp3f62qbpjkTkl+\nc1NSAwDHrKU/iby7D1TVo5K8JQcvY3Bud19UVU9LcmF375utelaS87p79bWqvj/JC6rqGzl40dBz\nVr97DwDgmtgKe6DS3ecnOX/N2JPXPN67znbvTfKDCw0HAGw7W6JAAQAsk6U/hAcAsGyGGtFxx/ns\nYQCAsQK1Q4ECNpdrfS3cSS5lsFCus3WMUqCAZedaX2xZyumxa6hA7dixY1E5AAC2DHugAAAGKVAA\nAIO2xCG8qjo9yfNy8ErkL1zvhLyqul+SvUk6yYe6+4Gz8YckedJstad39x9uSmgADssx/kaBY/kk\n/W19gvzS74Gqqh1J
np/kbkkuT3JBVe1b/ZEsVXVKkl9Pcqfu/lJV3Xg2/p1JnpLkDjlYrD4w2/ZL\nm/11wFY28S+4qX4BbetfDpvMGwW2oGO4GB6WpS9QSU5Nckl3X5YkVXVekjOTrP5Mu19I8vyri1F3\nf342fo8kb+vuL862fVuS05O8YpOyw7Fi2/2C2+6/HIBDW4pDeFW1J8meVUMr3b0yu39Ckk+tWnZ5\nkh9ZM8UtZ/O8JwcP8+3t7jfP2faEoxgdANiGlmIP1KwsrWy44nw7k5yS5M5JTkzy7qryIcIAwEIM\n7oGa5BDeFUluvurxibOx1S5P8r7u/nqST1TVx3OwUF2Rg6Vq9bbvXFhSAGBbGGpEO3bUQm4buCDJ\nKVV1clVdO8lZSfatWef1mRWlqrphDh7SuyzJW5LcvaquX1XXT3L32RgAwDU2tAdq587N3wPV3Qeq\n6lE5WHx2JDm3uy+qqqclubC79+VfitLFSa5K8qvd/fdJUlW/kYMlLEmedvUJ5QAA19RWOISX7j4/\nyflrxp686n4n+U+z29ptz01y7qIzbmfe4g6w/Bbws3oRP3+3zM/WwQK14eE2tidvcQdYfkv/s3or\n/WzdEnugAACWiT1QAACD7IECABikQLEtbIGTJ7fMiZPLymsMbCaH8Ngulvrkya104uQS8xofAe+m\nhTFj14GyBwrgWLXUBXQRlr3UstyGGtHOHbWQ20aq6vSq+lhVXTL7X9K89e5bVV1Vd5g9Pqmq9lfV\nB2e33x/5egEA1jN2CO+4zd8DVVU7kjw/yd1y8DPvLqiqfd198Zr1vj3JY5K8b80Ul3b3bTclLACw\nLQwewpvkHKhTk1zS3ZclSVWdl+TMJBevWe83kjwrya9ubjwAYLsZPIR33EJuVbWnqi5cdduz6mlP\nSPKpVY8vn439s6r64SQ37+43rhP75Kr6y6p6V1X92MjXCwCwnrE9UMctZg9Ud68kWbkm21bVcUl+\nK8lD11n8mSTf1d1/X1W3T/L6qrp1d3/lGocFALa9wXOgJjmEd0WSm696fOJs7GrfnuQHkryzqpLk\nXyfZV1X36u4Lk3wtSbr7A1V1aZJbJrlwM4IDAMemrXAhzQuSnFJVJ+dgcToryQOvXtjdX05yw6sf\nV9U7kzyuuy+sqhsl+WJ3X1VV35PklCSXbWZ4AODYsxSH8A6luw9U1aOSvCXJjiTndvdFVfW0JBd2\n975DbH5akqdV1deTfCPJf+juLy4+NQBwLNsSF9Ls7vOTnL9m7Mlz1r3zqvuvSfKahYYDALadrXAZ\nAwDweYcslcFDeD7KBYDJLPXHzfhomO1lqEBdyx4oAAAFCgBg1FiBmuY6UAAAS8UeKACAQUt/HSgA\ngGUz9La6a+2ohdw2UlWnV9XHquqS2dtY1y7/D1X14ar6YFX9z6q61aplvz7b7mNVdY+RrxcAYD1L\nfwivqnYkeX6SuyW5PMkFVbWvuy9etdrLu/v3Z+vfKwc/XPj0WZE6K8mtk9wsyZ9U1S27+6pN/SIA\ngGPKVjiJ/NQkl3T3ZUlSVeclOTPJPxeo7v7KqvW/LUnP7p+Z5Lzu/lqST1TVJbP5/mwzggMAx6al\n3wOV5IQkn1r1+PIkP7J2pap6ZJL/lOTaSX5i1bZ/vmbbExYTEwDYLobOgdq5oxZyq6o9VXXhqtue\n0S+ku5/f3bdI8mtJnjS6PQDA4Rp7F14tZg9Ud68kWZmz+IokN1/1+MTZ2DznJfm9a7gtAMCGxvZA\nHVcLuW3ggiSnVNXJVXXtHDwpfN/qFarqlFUP75nkb2b39yU5q6quU1UnJzklyftHvmYAgLWW/sOE\nu/tAVT0qyVuS7EhybndfVFVPS3Jhd+9L8qiq+skkX0/ypSQPmW17UVW9KgdPOD+Q5JHegQcAHKml\nOIS3ke4+P8n5a8aevOr+Yw6x7TOSPGNx6QCA7WaoQO1wJXIAgNE9UJt/CA8AYNn4LD
wAgEEKFADA\noLFzoBzCAwCwBwoAYNSWuIwBAMAyGSpQpUABAIx9lMtxxy3mtpGqOr2qPlZVl1TV2essP62q/qKq\nDlTVz6xZdlVVfXB227d2WwCAUYMnkW/+Hqiq2pHk+UnuluTyJBdU1b7uvnjVap9M8tAkj1tniv3d\nfduFBwUAto2hAnXcNCeRn5rkku6+LEmq6rwkZ+bg59slSbr7b2fLvjFFQABgexk7hFe1kFtV7amq\nC1fd9qx62hOSfGrV48tnY4dr12zOP6+qe498vQAA6xncA7WYEN29kmRlMbPnu7v7iqr6niRvr6oP\nd/elC3ouAGAbWPpzoJJckeTmqx6fOBs7LN19xezPy6rqnUlul0SBAgCuscF34dVCbhu4IMkpVXVy\nVV07yVlJDuvddFV1/aq6zuz+DZPcKavOnQIAuCbGDuFNsAequw9U1aOSvCXJjiTndvdFVfW0JBd2\n976q+rdJXpfk+kn+n6p6anffOsn3J3nB7OTy45Kcs+bdewAAw5biHKiNdPf5Sc5fM/bkVfcvyMFD\ne2u3e2+SH1x4QABgW9kK50ABACyVrXAdKACApbL050ABACybsUN4E50DBQCwTOyBAgAY5BwoAIBB\n3oUHADBo8Erki7ltpKpOr6qPVdUlVXX2OsuvU1WvnC1/X1WdtGrZr8/GP1ZV9xj5egEA1rP0e6Cq\nakeS5ye5W5LLk1xQVfvWXFH84Um+1N3fW1VnJXlWkvtX1a1y8KNfbp3kZkn+pKpu2d1Xbe5XAQAc\nS7bCZ+GdmuSS7r6su/8pyXlJzlyzzplJ/nB2/9VJ7lpVNRs/r7u/1t2fSHLJbD4AgGtsK7wL74Qk\nn1r1+PIkPzJvndln5305yQ1m43++ZtsTFhcVANgOhgrU7mtlIQ2qqvYk2bNqaKW7VxbxXAAAR2qo\nQC3KrCzNK0xXJLn5qscnzsbWW+fyqtqZ5Pgkf3+Y2wIADNkK1xa/IMkpVXVyVV07B08K37dmnX1J\nHjK7/zNJ3t7dPRs/a/YuvZOTnJLk/ZuUGwA4Ri3FHqhDmZ3T9Kgkb0myI8m53X1RVT0tyYXdvS/J\ni5K8tKouSfLFHCxZma33qiQXJzmQ5JHegQcAHKmlL1BJ0t3nJzl/zdiTV92/MsnPztn2GUmesdCA\nAMC2shUO4QEALBUFCgBgkAIFADBIgQIAGKRAAQAMUqAAAAYpUAAAgxQoAIBBChQAwCAFCgBgkAIF\nADBIgQIAGKRAAQAMUqAAAAYpUAAAgxQoAIBBChQAwCAFCgBgUHX31BkAALYUe6AAAAbtnDoAk7Db\nEYDtoBY1sQK1TV15YOoE8+3amdzisW+aOsZclz7njCTJHZ7+jomTrO/CJ90lSbL7jk+YOMl8+9/7\nzOy+z4umjjHX/tc+PEmy+05PnDjJ+va/5xlJkt0//cKJk8y3/3U/n1s94a1Tx5jr4mfePckW+D6+\n98rESebb//o92X27R00dY679f/m7C53fITwAgEEKFADAIAUKAGCQAgUAMEiBAgAYpEABAAxSoAAA\nBilQAACDFCgAgEEKFADAIAUKAGCQAgUAMEiBAgAYpEABAAxSoAAABu2cOgAAwGarqlOTdHdfUFW3\nSnJ6kr/u7vMPZ3sFCgDYVqrqKUnOSLKzqt6W5EeSvCPJ2VV1u+5+xkZzLGWBqqqzk+yaOsexprv3\nTp0BADZLVe1JsmfV0Ep3ryT5mSS3TXKdJJ9NcmJ3f6Wqnp3kfUm2ZoFKsssvewDgSMzK0so6iw50\n91VJvlpVl3b3V2br76+qbxzO3E4iBwC2m3+qquvO7t/+6sGqOj7JYRWoZd0DBQCwKKd199eSpLtX\nF6ZrJXnI4UygQAEA28rV5Wmd8f+V5H8dzhwO4QEADFKgAAAGKVAAAIMUKACAQQoUAMAgBQoAYJAC\nBQAwSIECABikQAEADFKgAAAGKVAAAIMUKACAQQ
oUAMAgBQoAYFB199QZvkVV7e3uvVPnOIYt34sO\nAEdfLWpie6AAAAbtnDoA07jFY980dYS5Ln3OGbnywNQp5ts1+6459ZnvnDLGXO9/wp2TJLd8/Jun\nDXIIH//N05f+32CSHP+Al06cZH1ffsWDkiS7770ycZL59r9+z5Z4jW/y8380cZL1fe6FP5sk+Vdn\nvWTiJPN95bwHZ/dpe6eOMdf+d+9d6Pz2QAEADFKgAAAGKVAAAIMUKACAQQoUAMAgBQoAYJACBQAw\nSIECABh0xBfSrKqzk+w6CllWO+kozwcAcNQcjSuR7zran1tXVUd1PgCAo8khPACAQQoUAMAgBQoA\nYJACBQAwSIECABikQAEADFKgAAAGKVAAAKtU1cM2WudoXEhzUyzoiufbytG+4CkALLOq2pNkz6qh\nle5eOYxNn5rkxYdaYcsUqCzgiucAwLFrVpbWLUxV9VdzNqskN9lo7q1UoAAAjpabJLlHki+tGa8k\n791oYwUKANiO/jjJ9br7g2sXVNU7N9pYgQIAtp3ufvghlj1wo+29Cw8AYJACBQAwSIECABikQAEA\nDFKgAAAGKVAAAIMUKACAQQoUAMAgBQoAYJACBQAwSIECABikQAEADFKgAAAGKVAAAIOqu49sgqq9\n3b336MT55znPTrJrzfBJ3f3Qo/k829iRvegAsDXUoibeuaiJj0R3n7N2rKr2ThDlmHWHp79j6ghz\nXfiku+TUZ75z6hhzvf8Jd06SXHlg2hzz7Jp9V+++x7OnDXII+9/yuNx0z2umjjHXZ1bumyTZfff/\nMnGS9e1/668mSXaf8dyJk8y3/02/ku/6pX1Tx5jrk//1XkmSGz/8VRMnWd/nX3S/JMnuM18wcZL5\n9r/hEdl9pydOHWOu/e95xkLndwgPAGCQAgUAMEiBAgAYpEABAAxSoAAABilQAACDFCgAgEEKFADA\nIAUKAGCQAgUAMEiBAgAYpEABAAxSoAAABilQAACDFCgAgEEKFADAIAUKANh2qur7ququVXW9NeOn\nH872ChQAsK1U1aOTvCHJLyX5SFWduWrxMw9njp2LCLYgV1bV3qlDbGXdvXfqDACwWapqT5I9q4ZW\nunslyS8kuX13/2NVnZTk1VV1Unc/L0kdztxbpkB19zlTZwAAto5ZWVpZZ9Fx3f2Ps3X+tqrunIMl\n6rtzmAXKITwAYLv5XFXd9uoHszL1U0lumOQHD2cCBQoA2G4enOSzqwe6+0B3PzjJaYczwZY5hAcA\ncDR09+WHWPaew5nDHigAgEEKFADAIAUKAGCQAgUAMEiBAgAYpEABAAxSoAAABilQAACDFCgAgEEK\nFADAIAUKAGCQAgUAMEiBAgAYpEABAAyq7j6yCar2dvfeoxOHTXJkLzoAbA21qIl3LmpiltvuOz5h\n6ghz7X/vM3PLx7956hhzffw3T0+S7L7HsydOsr79b3lckuTKAxMHOYRdO5MbPPgVU8eY6+9f8oAk\nyW2e8qcTJ1nfh5561yTJib/4+omTzHf5f7t3jn/gS6eOMdeXX/6gJMnNHvHaiZOs79MvuE+SZPc9\nf2fiJPPtf+Ojlz7fIjmEBwAwSIECABikQAEADFKgAAAGKVAAAIMUKACAQQoUAMAgBQoAYJACBQAw\nSIECABikQAEADFKgAAAGKVAAAIMUKACAQQoUAMAgBQoAYJACBQBse1X1kpH1dy4qCADAMqqqfWuH\nktylqr4jSbr7XhvNMVmBqqqzk+ya6vm3o+7eO3UGANgsVbUnyZ5VQyvdvZLkxCQXJ3lhks7BAnWH\nJM853Lmn3AO1yy90AGBRZmVpZZ1Fd0jymCRPTPKr3f3Bqtrf3e863LkdwgMAtpXu/kaS51bVH83+\n/FwGO5ECBQBsS919eZKfrap7JvnKyLYKFACwrXX3G5O8cWQblzEAABikQAEADFKgAAAGKVAAAIMU\nKACAQQoUAM
AgBQoAYJACBQAwSIECABikQAEADFKgAAAGKVAAAIMUKACAQQoUAMCg6u4jm6Bqb3fv\n3aztOCqO7EUHgK2hFjWxPVAAAIN2Th2Aaey+z4umjjDX/tc+PLd47JumjjHXpc85I0ly0z2vmTjJ\n+j6zct8kyQ0e/IqJk8z39y95QK48MHWK+XbNfjLe8TffPW2QOd77+NOSJDd7xGsnTjLfp19wn/z4\nc98zdYy53vUrd0qS3P433jFxkvV94D/fJcny/pxJDv6sOeE/vm7qGHNd8Xs/vdD57YECABikQAEA\nDFKgAAAGKVAAAIMUKACAQQoUAMAgBQoAYJACBQAwaOEX0qyqs5PsWmfRSYt+bgCARdiMK5HvWu8z\n76rqW8YAALYCh/AAAAYpUAAAgxQoAIBBChQAwCAFCgBg0Ga8Cw8AYKlU1fclOTPJCbOhK5Ls6+6P\nHs729kABANtKVf1akvOSVJL3z26V5BWz61duyB4oAGC7eXiSW3f311cPVtVvJbkoyTkbTbBlCtQh\nrmjOYVrvgqYAcKyqqj1J9qwaWunulSTfSHKzJH+3ZpObzpZtaMsUqMy5ojkAwHpmZWllnUW/nORP\nq+pvknxqNvZdSb43yaMOZ+6tVKAAAI5Yd7+5qm6Z5NR880nkF3T3VYczhwIFAGw73f2NJH9+Tbf3\nLjwAgEEKFADAIAUKAGCQAgUAMEiBAgAYpEABAAxSoAAABilQAACDFCgAgEEKFADAIAUKAGCQAgUA\nMEiBAgAYpEABAAyq7j6yCar2dvfe0eUbbXek63NIR/aiA8DWUIuaeOeiJj4MV1bV3oH1T1pQDgCA\nIZMVqO4+Z2T9wbLFBnbf6YlTR5hr/3uekeMf8NKpY8z15Vc8KEmy++7/ZeIk69v/1l9NktzmKX86\ncZL5PvTUu+aOv/nuqWPM9d7Hn5YkufLAxEHm2DX7yX38A5f4++TlD8rJv/zGqWPM9YnfvmeS5KZ7\nXjNxkvV9ZuW+SZLd93nRxEnm2//ah+dGD3vl1DHm+sKL77/Q+Z0DBQAwSIECABikQAEADFKgAAAG\nKVAAAIMUKACAQQoUAMAgBQoAYJACBQAwSIECABikQAEADFKgAAAGKVAAAIMUKACAQQoUAMAgBQoA\nYJACBQBsO1X16Kq6+TXdXoECALaj30jyvqr6/6rqF6vqRiMb71xQqEW4sqr2Th1iK+vuvVNnAIDN\nUlV7kuxZNbTS3Suz+5cluX2Sn0xy/yRPraoPJHlFktd29/8+1NxbpkB19zlTZwAAto5ZWVqZv7i/\nkeStSd5aVddKckaSByR5dpJD7pHaMgUKAOAoqtUPuvvrSfYl2VdV191oY+dAAQDb0f3nLejur260\nsQIFAGw73f3xI9legQIAGKRAAQAMUqAAAAYpUAAAgxQoAIBBChQAwCAFCgBgkAIFADBIgQIAGKRA\nAQAMUqAAAAYpUAAAgxQoAIBBChQAwKDq7iOboGpvd++9psuZxJG96ACwNdSiJt65qIlZbrt/+oVT\nR5hr/+t+PrvvvTJ1jLn2v35PkmT3Gc+dOMn69r/pV5IkJ/7i6ydOMt/l/+3eudkjXjt1jLk+/YL7\nJEmOf+BLJ06yvi+//EFJkisPTBzkEHbtTL73cW+aOsZclzz7jCTJQ1/xVxMnWd8fPOCHkiQ3etgr\nJ04y3xdefP/c4rHL+xpf+pwzFjq/Q3gAAIMUKACAQQoUAMAgBQoAYJACBQAwSIECABikQAEADFKg\nAAAGKVAAAIMUKACAQQoUAMAgBQoAYJACBQAwSIECABikQAEADFKgAAAG7TwKc1xZVXsPsfyko/Ac\nAABL44gLVHefc6jlG5QrAIAt52jsgdo0VXV2kl1T59iqunvv1BkAYLNU1Z4ke1YNrXT3ymzZ8Ul+\nPcm9k9w4SSf5fJI3JDmnu//hUHNvqQKVZJcSAAAcjllZWpmz+FVJ3p7kzt39
2SSpqn+d5CGzZXc/\n1NxOIgcAtqOTuvtZV5enJOnuz3b3s5J890YbK1AAwHb0d1X1+Kq6ydUDVXWTqvq1JJ/aaGMFCgDY\nju6f5AZJ3lVVX6yqLyZ5Z5LvTPKzG2281c6BAgA4Yt39pSS/Nrt9k6p6WJIXH2p7e6AAAL7ZUzda\nwR4oAGDbqaq/mrcoyU3mLPtnChQAsB3dJMk9knxpzXglee9GGytQAMB29MdJrtfdH1y7oKreudHG\nChQAsO1098MPseyBG23vJHIAgEEKFADAIAUKAGCQAgUAMEiBAgAYpEABAAxSoAAABilQAACDqrsX\n+wRVe7t777LNtc0t9kUHgOVQi5rYlci3qVs94a1TR5jr4mfePbd47JumjjHXpc85I0nyXb+0b+Ik\n6/vkf71XkuT4B7504iTzffnlD8qPP/c9U8eY612/cqckycm//MaJk6zvE799zyTJ9z5ueb9PLnn2\nGbnywNQp5ts1++33cy/70LRB5njZz90mSZb+Z+GN//2rpo4x1+fPvd9C53cIDwBgkAIFADBIgQIA\nGKRAAQAMUqAAAAYpUAAAgxQoAIBBChQAwCAFCgBg0CRXIq+qs5PsugabnnSUowAADJvqo1x2XZPP\ntKuq4W0AAI42h/AAAAYpUAAAgxQoAIBBChQAwCAFCgBg0FTvwgMAmERVXTvJWUk+3d1/UlUPTHLH\nJB9NstLdX99oDgUKANhuXpyDHei6VfWQJNdL8tokd01yapKHbDTBMV2gjuCCnceka3LtLQDYqqpq\nT5I9q4ZWunslyQ929w9V1c4kVyS5WXdfVVUvS/Khw5n7mC5QuYYX7AQAtr5ZWVpZZ9Fxs8N435bk\nukmOT/LFJNdJcq3DmftYL1AAAGu9KMlfJ9mR5IlJ/qiqLkvyo0nOO5wJFCgAYFvp7udW1Stn9z9d\nVS9J8pNJ/nt3v/9w5lCgAIBtp7s/ver+PyR59cj2rgMFADBIgQIAGKRAAQAMUqAAAAYpUAAAgxQo\nAIBBChQAwCAFCgBgkAIFADBIgQIAGKRAAQAMUqAAAAYpUAAAgxQoAIBB1d2LfYKqvd29d6OxazrX\n0Vx/G1nsiw4Ay6EWNfHORU28IFdW1d6B9U9aUA4AYBvbUgWqu88ZWX+wbG0rd3j6O6aOMNeFT7pL\nbvLzfzR1jLk+98KfTZLc+OGvmjjJ+j7/ovslSW72iNdOnGS+T7/gPrn9byzvv8EP/Oe7JEluuuc1\nEydZ32dW7pskeegr/mriJPP9wQN+KD/3sg9NHWOul/3cbZIkVx6YOMgcu2a/nX/gSW+bNsghfOTp\nd1v63yWL5BwoAIBBChQAwCAFCgBgkAIFADBIgQIAGKRAAQAMUqAAAAYpUAAAgxQoAIBBChQAwCAF\nCgBgkAIFADBIgQIAGKRAAQAMUqAAAAbtnDoAAMBmq6rvSXKfJDdPclWSjyd5eXd/5XC2twcKANhW\nqurRSX4/ya4k/zbJdXKwSP15Vd35cOawBwoA2G5+Icltu/uqqvqtJOd3952r6gVJ3pDkdhtNcKwX\nqCurau/UIZZFd++dOgMAbJaq2pNkz6qhle5emd3fmYOH7q6T5HpJ0t2frKprHc7cx3SB6u5zps4A\nAExjVpZW1ln0wiQXVNX7kvxYkmclSVXdKMkXD2fuY7pAAQCs1d3Pq6o/SfL9SZ7T3X89G/9CktMO\nZw4FCgDYdrr7oiQXXdPtvQsPAGDQZuyBWu9E7pM24XkBABZi4QVqvRO5vTMOANjKHMIDABikQAEA\nDFKgAAAGKVAAAIMUKACAQQoUAMAgBQoAYJACBQAwSIECABikQAEADFKgAAAGKVAAAIOquzf/Sav2\ndvfeTX9irrb5LzoAbL5a1MQ7FzUxy233vVemjjDX/tfvyb866yVTx5jrK+c9OEmy+8wXTJxkffvf\n8Igkye57/s7ESebb/8ZH56Z7XjN1jLk+
s3LfJMnu+7xo4iTr2//ahydJbvSwV06cZL4vvPj+ucVj\n3zR1jLkufc4ZSZIfeNLbJk6yvo88/W5JkisPTBzkEHbtTG79xLdOHWOui55x94XO7xAeAMAgBQoA\nYJACBQAUpepUAAAI9klEQVQwSIECABikQAEADFKgAAAGKVAAAIMUKACAQQoUAMAgBQoAYJACBQAw\nSIECABikQAEADFKgAAAGKVAAAIMUKACAQTunDgAAMLWq+ndJTk3yke5+60br2wMFAGw7VfX+Vfd/\nIcnvJvn2JE+pqrM32n4p90DNgu+aOsexprv3Tp0BADZLVe1JsmfV0Ep3r8zuX2vV+J4kd+vuL1TV\ns5P8eZJzDjX3UhaoJLv8sgcAjsSsLK3MWXxcVV0/B4/GVXd/YbbN/6mqAxvNvawFCgBgkY5P8oEk\nlaSr6qbd/Zmqut5s7JAUKABg2+nuk+Ys+kaSn95oewUKAGCmu7+a5BMbreddeAAAgxQoAIBBChQA\nwCAFCgBgkAIFADBIgQIAGKRAAQAMUqAAAAYpUAAAgxQoAIBBChQAwCAFCgBgkAIFADBIgQIAGFTd\nvflPWrW3u/de0+Ucsc1/0QFg89WiJt65qIk3cGVV7T3E8pM2Kce2tft2j5o6wlz7//J3s/u0vVPH\nmGv/u/cmSXbf6YnTBplj/3uekSTZfc/fmTjJfPvf+Oic8B9fN3WMua74vZ9OktzoYa+cOMn6vvDi\n+ydJbvHYN02cZL5Ln3NGbvzvXzV1jLk+f+79kiR3ePo7Jk6yvgufdJckya2f+NaJk8x30TPunisP\nTJ1ivl0LbjiTFKjuPudQyzcoVwAAk3IOFADAIAUKAGCQAgUAMEiBAgAYpEABAAxSoAAABilQAACD\nFCgAgEEKFADAIAUKAGCQAgUAMEiBAgAYpEABAAxSoAAABilQAACDFCgAYFupqh+pqn81u7+7qp5a\nVf9vVT2rqo4/nDkUKABguzk3yVdn95+X5Pgkz5qNvfhwJti5mFxH7Mqq2jt1iGNNd++dOgMAbJaq\n2pNkz6qhle5eSXJcdx+Yjd2hu394dv9/VtUHD2fupSxQ3X3O1BkAgK1tVpZW1ln0kap6WHe/OMmH\nquoO3X1hVd0yydcPZ26H8ACA7ebnk/x4VV2a5FZJ/qyqLkvy32fLNrSUe6AAABalu7+c5KGzE8lP\nzsE+dHl3f+5w51CgAIBtqbu/kuRD12Rbh/AAAAYpUAAAgxQoAIBBChQAwCAFCgBgkAIFADBIgQIA\nGKRAAQAMUqAAAAYpUAAAgxQoAIBBChQAwCAFCgBgkAIFADCounvqDGw+LzoA20EtamJ7oLanOpq3\nqnrE0Z5zu2WU79jPKN+xn1G+pcy4MAoUR8OeqQMchmXPKN+RW/aM8h25Zc8o35HbChmTKFAAAMMU\nKACAQQoUR8PK1AEOw7JnlO/ILXtG+Y7csmeU78hthYxJ4l14AACj7IECABikQHGNVdWuqnp/VX2o\nqi6qqqdOnWmtqjq3qj5fVR+ZOst6qurmVfWOqrp49nf4mKkzraeqdlTVX1bVH0+dZa2qOr2qPlZV\nl1TV2VPnWauq/k1VfXDV7StV9csTZ/qW74uq+s6qeltV/c3sz+tPmXG1qvqV2ffHR6rqFVW1a+pM\nq1XVY2bZLpr6tZ2nqv62qj48+zd44dR51qqq76iqV1fVX1fVR6vq/5o600YUKI7E15L8RHffJslt\nk5xeVT86caa1/iDJ6VOHOIQDSR7b3bdK8qNJHllVt5o403oek+SjU4dYq6p2JHl+kjOS3CrJA5bt\n76+7P9bdt+3u2ya5fZKvJnndxLH+IN/6fXF2kj/t7lOS/Ons8eSq6oQkj05yh+7+gSQ7kpw1bap/\nUVU/kOQXkpya5DZJfqqqvnfaVHPdZfZv8Q5TB1nH85K8ubu/Lwf/Hpfu581aChTXWB/0j7OH15rd\nluqk
uu5+d5IvTp1jnu7+THf/xez+/87BHxonTJvqm1XViUnumeSFU2dZx6lJLunuy7r7n5Kcl+TM\niTMdyl2TXNrdfzdliDnfF2cm+cPZ/T9Mcu9NDXVoO5PsrqqdSa6b5NMT51nt+5O8r7u/2t0Hkrwr\nyX0mzrSlVNXxSU5L8qIk6e5/6u5/mDbVxhQojsjs0M4Hk3w+ydu6+31TZ9qqquqkJLdLsmx/h7+d\n5PFJvjF1kHWckORTqx5fniUroGucleQVU4eY4ybd/ZnZ/c8mucmUYa7W3VckeXaSTyb5TJIvd/db\np031TT6S5Meq6gZVdd0k/3eSm0+caT2d5K1V9YGqWraLVZ6c5AtJXjw7VeCFVfVtU4faiALFEenu\nq2aHJk5McupsdzaDqup6SV6T5Je7+ytT57laVf1Uks939wemzrLVVdW1k9wryR9NnWUjffDt2Uux\nN3l2LtaZOfhL9mZJvq2qfm7aVP+iuz+a5FlJ3prkzUk+mOSqSUOt79919w/n4OHuR1bVaVMHWmVn\nkh9O8nvdfbsk/ydLcgj5UBQojorZ7tZ3ZLnPN1pKVXWtHCxP/6O7Xzt1njXulOReVfW3OXh47Ceq\n6mXTRvomV+Sb/7d/4mxsGZ2R5C+6+3NTB5njc1V10ySZ/fn5ifNc7SeTfKK7v9DdX0/y2iR3nDjT\nN+nuF3X37bv7tCRfSvLxqTOtNduTl+7+fA6eg3fqtIm+yeVJLl91BOPVOViolpoCxTVWVTeqqu+Y\n3d+d5G5J/nraVFtLVVUOHvf/aHf/1tR51uruX+/uE7v7pBw8/PT27l6a//0nuSDJKVV18mwPz1lJ\n9k2caZ4HZHkP3yUH/94eMrv/kCRvmDDLap9M8qNVdd3Z98tds2QnGFfVjWd/flcOnv/08mkTfbOq\n+raq+var7ye5ew4eelwK3f3ZJJ+qqn8zG7prkosnjHRYdk4dgC3tpkn+cPZOqOOSvKq7l+pt7lX1\niiR3TnLDqro8yVO6+0XTpvomd0ryoCQfnp1LliRP6O7zJ8y0ZXT3gap6VJK35OC7s87t7osmjvUt\nZr+07pbkEVNnSdb/vkhyTpJXVdXDk/xdkvtNl/BfdPf7qurVSf4iB9+1+pdZvqtVv6aqbpDk60ke\nuYQnQN8kyesO9s/sTPLy7n7ztJG+xS8l+R+z/whdluRhE+fZkCuRAwAMcggPAGCQAgUAMEiBAgAY\npEABAAxSoAAABilQAACDFCgAgEEKFADAoP8f02Can0LGRs4AAAAASUVORK5CYII=\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# display the similarity matrix (white=similar, dark blue=different)\n", "import seaborn as sn\n", "%matplotlib inline\n", "sn.clustermap(dist_matrix,cmap='Blues',fmt='g',linewidths=.5)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mr 110.25\n", "gore 56.25\n", "master 56.25\n", "old 49.0\n", "children 30.25\n", "baltimore 16.0\n", "colonel 16.0\n", "lloyd 16.0\n", "demby 12.25\n", "gone 12.25\n", "man 12.25\n", "overseer 12.25\n", "slaves 12.25\n", "hands 9.0\n", 
"little 9.0\n", "age 6.25\n", "andrew 6.25\n", "death 6.25\n", "died 6.25\n", "girl 6.25\n", "grandmother 6.25\n", "left 6.25\n", "lucretia 6.25\n", "property 6.25\n", "grandchildren 4.0\n", "known 4.0\n", "leaving 4.0\n", "men 4.0\n", "punishment 4.0\n", "return 4.0\n", "slavery 4.0\n", "time 4.0\n", "took 4.0\n", "valuation 4.0\n", "words 4.0\n", "years 4.0\n", "accused 2.25\n", "bondly 2.25\n", "brother 2.25\n", "captain 2.25\n", "change 2.25\n", "come 2.25\n", "community 2.25\n", "condition 2.25\n", "convicted 2.25\n", "crime 2.25\n", "daughter 2.25\n", "did 2.25\n", "division 2.25\n", "felt 2.25\n" ] } ], "source": [ "# sort and display top fifty most variant words by \n", "# comparing just rows 3 and 7 (chapters 4 & 8, i.e. texts[3] and texts[7])\n", "# as the far ends of the above \n", "\n", "# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement\n", "# when present, and fetch the vocabulary once instead of once per printed word\n", "get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names\n", "feature_names = get_names()\n", "\n", "# per-token variance across the two rows; for a pair of counts this equals\n", "# (difference / 2) ** 2, so large values mark words used very unevenly\n", "pair_variance = np.var(dtm_matrix[[3, 7]], axis=0)\n", "\n", "# map vocabulary column index -> variance\n", "variance_table = dict(enumerate(pair_variance))\n", "\n", "for k,v in sorted(variance_table.items(), key=itemgetter(1), reverse=True)[:50]:\n", "    print(feature_names[k],v)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mr 1282.94214876\n", "covey 833.109504132\n", "time 250.214876033\n", "slave 216.892561983\n", "did 155.115702479\n", "master 126.051652893\n", "said 113.132231405\n", "man 109.297520661\n", "henry 100.0\n", "gave 94.6198347107\n", "work 94.6198347107\n", "went 93.7376033058\n", "hands 85.1425619835\n", "come 73.8037190083\n", "came 73.0247933884\n", "year 69.1921487603\n", "free 60.4152892562\n", "white 60.4152892562\n", "just 59.7107438017\n", "told 57.6219008264\n", "freeland 52.8925619835\n", "long 52.2334710744\n", "hand 48.3657024793\n", "day 47.1095041322\n", "freedom 45.8698347107\n", "way 44.6466942149\n", "woods 44.6466942149\n", "getting 41.6611570248\n", "morning 41.6611570248\n", "think 41.076446281\n", "slaves 39.9194214876\n", "large 39.347107438\n", "home 
37.6549586777\n", "like 36.5475206612\n", "slavery 34.9173553719\n", "blood 31.7685950413\n", "death 31.7685950413\n", "half 30.7520661157\n", "got 29.2582644628\n", "night 28.7685950413\n", "succeeded 25.9173553719\n", "whipped 25.0\n", "gardner 24.5475206612\n", "john 24.5475206612\n", "left 22.347107438\n", "make 22.347107438\n", "eat 21.076446281\n", "head 21.076446281\n", "fred 20.6611570248\n", "michael 20.6611570248\n" ] } ], "source": [ "# sort and display top fifty most variant words by \n", "# comparing chapter 10 (row 9) to the mean \n", "\n", "# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement\n", "# when present, and fetch the vocabulary once instead of once per printed word\n", "get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names\n", "feature_names = get_names()\n", "\n", "# mean count of every token across all rows of dtm_matrix (row 9 included)\n", "mean_rows = np.mean(dtm_matrix,axis=0)\n", "\n", "# per-token variance of the pair (mean count, row 9 count),\n", "# stored as vocabulary column index -> variance\n", "variance_table = dict(enumerate(np.var(np.array([mean_rows,dtm_matrix[9,...]]), axis=0)))\n", "\n", "for k,v in sorted(variance_table.items(), key=itemgetter(1), reverse=True)[:50]:\n", "    print(feature_names[k],v)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# now calculate, for each token, the standard deviation of the pair\n", "# (mean count across all rows, count in row 9 / chapter 10)\n", "mean_rows = np.mean(dtm_matrix,axis=0)\n", "std_dev = np.std([mean_rows,dtm_matrix[9,...]],axis=0)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mr 35.8181818182\n", "covey 28.8636363636\n", "time 15.8181818182\n", "slave 14.7272727273\n", "did 12.4545454545\n", "master 11.2272727273\n", "said 10.6363636364\n", "man 10.4545454545\n", "henry 10.0\n", "gave 9.72727272727\n", "work 9.72727272727\n", "went 9.68181818182\n", "hands 9.22727272727\n", "come 8.59090909091\n", "came 8.54545454545\n", "year 8.31818181818\n", "free 7.77272727273\n", "white 7.77272727273\n", "just 7.72727272727\n", "told 7.59090909091\n", "freeland 7.27272727273\n", "long 7.22727272727\n", "hand 6.95454545455\n", "day 6.86363636364\n", "freedom 6.77272727273\n", "way 6.68181818182\n", "woods 6.68181818182\n", "getting 
6.45454545455\n", "morning 6.45454545455\n", "think 6.40909090909\n", "slaves 6.31818181818\n", "large 6.27272727273\n", "home 6.13636363636\n", "like 6.04545454545\n", "slavery 5.90909090909\n", "blood 5.63636363636\n", "death 5.63636363636\n", "half 5.54545454545\n", "got 5.40909090909\n", "night 5.36363636364\n", "succeeded 5.09090909091\n", "whipped 5.0\n", "gardner 4.95454545455\n", "john 4.95454545455\n", "left 4.72727272727\n", "make 4.72727272727\n", "eat 4.59090909091\n", "head 4.59090909091\n", "fred 4.54545454545\n", "michael 4.54545454545\n" ] } ], "source": [ "# now determine which words contribute to distance \n", "# via standard deviation from the mean value of each token\n", "\n", "# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement\n", "# when present, and fetch the vocabulary once instead of once per printed word\n", "get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names\n", "feature_names = get_names()\n", "\n", "# mean count of every token across all rows of dtm_matrix\n", "mean_rows = np.mean(dtm_matrix,axis=0)\n", "# per-token standard deviation of the pair (mean count, row 9 count)\n", "std_dev = np.std([mean_rows,dtm_matrix[9,...]],axis=0)\n", "\n", "# reuse std_dev rather than recomputing the same np.std call;\n", "# maps vocabulary column index -> standard deviation\n", "std_dev_table = dict(enumerate(std_dev))\n", "\n", "# sort and display top fifty most deviant words\n", "for k,v in sorted(std_dev_table.items(), key=itemgetter(1), reverse=True)[:50]:\n", "    print(feature_names[k],v)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.5" } }, "nbformat": 4, "nbformat_minor": 2 }