{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Document retrieval from Wikipedia data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Fire up GraphLab Create" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import graphlab" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load some text data - from Wikipedia, pages on people" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[INFO] This non-commercial license of GraphLab Create is assigned to danielbenitezr@yahoo.com and will expire on September 27, 2016. For commercial licensing options, visit https://dato.com/buy/.\n", "\n", "[INFO] Start server at: ipc:///tmp/graphlab_server-14503 - Server binary: /home/balrog/anaconda/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1450640289.log\n", "[INFO] GraphLab Server Version: 1.6.1\n" ] } ], "source": [ "people = graphlab.SFrame('people_wiki.gl/')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Data contains: link to Wikipedia article, name of person, text of article." ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URInametext
<http://dbpedia.org/resou
rce/Digby_Morrell> ...
Digby Morrelldigby morrell born 10
october 1979 is a former ...
<http://dbpedia.org/resou
rce/Alfred_J._Lewy> ...
Alfred J. Lewyalfred j lewy aka sandy
lewy graduated from ...
<http://dbpedia.org/resou
rce/Harpdog_Brown> ...
Harpdog Brownharpdog brown is a singer
and harmonica player who ...
<http://dbpedia.org/resou
rce/Franz_Rottensteiner> ...
Franz Rottensteinerfranz rottensteiner born
in waidmannsfeld lower ...
<http://dbpedia.org/resou
rce/G-Enka> ...
G-Enkahenry krvits born 30
december 1974 in tallinn ...
<http://dbpedia.org/resou
rce/Sam_Henderson> ...
Sam Hendersonsam henderson born
october 18 1969 is an ...
<http://dbpedia.org/resou
rce/Aaron_LaCrate> ...
Aaron LaCrateaaron lacrate is an
american music producer ...
<http://dbpedia.org/resou
rce/Trevor_Ferguson> ...
Trevor Fergusontrevor ferguson aka john
farrow born 11 november ...
<http://dbpedia.org/resou
rce/Grant_Nelson> ...
Grant Nelsongrant nelson born 27
april 1971 in london ...
<http://dbpedia.org/resou
rce/Cathy_Caruth> ...
Cathy Caruthcathy caruth born 1955 is
frank h t rhodes ...
\n", "[10 rows x 3 columns]
\n", "
" ], "text/plain": [ "Columns:\n", "\tURI\tstr\n", "\tname\tstr\n", "\ttext\tstr\n", "\n", "Rows: 10\n", "\n", "Data:\n", "+-------------------------------+---------------------+\n", "| URI | name |\n", "+-------------------------------+---------------------+\n", "| \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URInametext
<http://dbpedia.org/resou
rce/Barack_Obama> ...
Barack Obamabarack hussein obama ii
brk husen bm born august ...
\n", "[? rows x 3 columns]
Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.
You can use len(sf) to force materialization.\n", "" ], "text/plain": [ "Columns:\n", "\tURI\tstr\n", "\tname\tstr\n", "\ttext\tstr\n", "\n", "Rows: Unknown\n", "\n", "Data:\n", "+-------------------------------+--------------+-------------------------------+\n", "| URI | name | text |\n", "+-------------------------------+--------------+-------------------------------+\n", "| \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordcount
normalize1
sought1
combat1
continued1
unconstitutional1
81
californias1
19961
marriage1
defense1
\n", "[10 rows x 2 columns]
\n", "" ], "text/plain": [ "Columns:\n", "\tword\tstr\n", "\tcount\tint\n", "\n", "Rows: 10\n", "\n", "Data:\n", "+------------------+-------+\n", "| word | count |\n", "+------------------+-------+\n", "| normalize | 1 |\n", "| sought | 1 |\n", "| combat | 1 |\n", "| continued | 1 |\n", "| unconstitutional | 1 |\n", "| 8 | 1 |\n", "| californias | 1 |\n", "| 1996 | 1 |\n", "| marriage | 1 |\n", "| defense | 1 |\n", "+------------------+-------+\n", "[10 rows x 2 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obama_word_count_table.head()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordcount
the40
in30
and21
of18
to14
his11
obama9
act8
a7
he7
\n", "[273 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.\n", "
" ], "text/plain": [ "Columns:\n", "\tword\tstr\n", "\tcount\tint\n", "\n", "Rows: 273\n", "\n", "Data:\n", "+-------+-------+\n", "| word | count |\n", "+-------+-------+\n", "| the | 40 |\n", "| in | 30 |\n", "| and | 21 |\n", "| of | 18 |\n", "| to | 14 |\n", "| his | 11 |\n", "| obama | 9 |\n", "| act | 8 |\n", "| a | 7 |\n", "| he | 7 |\n", "+-------+-------+\n", "[273 rows x 2 columns]\n", "Note: Only the head of the SFrame is printed.\n", "You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns." ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obama_word_count_table.sort('count',ascending=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Most common words include uninformative words like \"the\", \"in\", \"and\",..." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Compute TF-IDF for the corpus \n", "\n", "To give more weight to informative words, we weigh them by their TF-IDF scores." ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URInametextword_count
<http://dbpedia.org/resou
rce/Digby_Morrell> ...
Digby Morrelldigby morrell born 10
october 1979 is a former ...
{'since': 1, 'carltons':
1, 'being': 1, '2005' ...
<http://dbpedia.org/resou
rce/Alfred_J._Lewy> ...
Alfred J. Lewyalfred j lewy aka sandy
lewy graduated from ...
{'precise': 1, 'thomas':
1, 'closely': 1, ...
<http://dbpedia.org/resou
rce/Harpdog_Brown> ...
Harpdog Brownharpdog brown is a singer
and harmonica player who ...
{'just': 1, 'issued': 1,
'mainly': 1, 'nominat ...
<http://dbpedia.org/resou
rce/Franz_Rottensteiner> ...
Franz Rottensteinerfranz rottensteiner born
in waidmannsfeld lower ...
{'all': 1,
'bauforschung': 1, ...
<http://dbpedia.org/resou
rce/G-Enka> ...
G-Enkahenry krvits born 30
december 1974 in tallinn ...
{'legendary': 1,
'gangstergenka': 1, ...
<http://dbpedia.org/resou
rce/Sam_Henderson> ...
Sam Hendersonsam henderson born
october 18 1969 is an ...
{'now': 1, 'currently':
1, 'less': 1, 'being' ...
<http://dbpedia.org/resou
rce/Aaron_LaCrate> ...
Aaron LaCrateaaron lacrate is an
american music producer ...
{'exclusive': 2,
'producer': 1, 'tribe': ...
<http://dbpedia.org/resou
rce/Trevor_Ferguson> ...
Trevor Fergusontrevor ferguson aka john
farrow born 11 november ...
{'taxi': 1, 'salon': 1,
'gangs': 1, 'being': 1, ...
<http://dbpedia.org/resou
rce/Grant_Nelson> ...
Grant Nelsongrant nelson born 27
april 1971 in london ...
{'houston': 1, 'frankie':
1, 'labels': 1, ...
<http://dbpedia.org/resou
rce/Cathy_Caruth> ...
Cathy Caruthcathy caruth born 1955 is
frank h t rhodes ...
{'phenomenon': 1,
'deborash': 1, ...
\n", "[10 rows x 4 columns]
\n", "
" ], "text/plain": [ "Columns:\n", "\tURI\tstr\n", "\tname\tstr\n", "\ttext\tstr\n", "\tword_count\tdict\n", "\n", "Rows: 10\n", "\n", "Data:\n", "+-------------------------------+---------------------+\n", "| URI | name |\n", "+-------------------------------+---------------------+\n", "| \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordtfidf
obama43.2956530721
act27.678222623
iraq17.747378588
control14.8870608452
law14.7229357618
ordered14.5333739509
military13.1159327785
involvement12.7843852412
response12.7843852412
democratic12.4106886973
\n", "[273 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.\n", "" ], "text/plain": [ "Columns:\n", "\tword\tstr\n", "\ttfidf\tfloat\n", "\n", "Rows: 273\n", "\n", "Data:\n", "+-------------+---------------+\n", "| word | tfidf |\n", "+-------------+---------------+\n", "| obama | 43.2956530721 |\n", "| act | 27.678222623 |\n", "| iraq | 17.747378588 |\n", "| control | 14.8870608452 |\n", "| law | 14.7229357618 |\n", "| ordered | 14.5333739509 |\n", "| military | 13.1159327785 |\n", "| involvement | 12.7843852412 |\n", "| response | 12.7843852412 |\n", "| democratic | 12.4106886973 |\n", "+-------------+---------------+\n", "[273 rows x 2 columns]\n", "Note: Only the head of the SFrame is printed.\n", "You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns." ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obama[['tfidf']].stack('tfidf',new_column_name=['word','tfidf']).sort('tfidf',ascending=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Words with the highest TF-IDF scores are much more informative." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Manually compute distances between a few people\n", "\n", "Let's manually compare the distances between the articles for a few famous people. 
" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": true }, "outputs": [], "source": [ "clinton = people[people['name'] == 'Bill Clinton']" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": true }, "outputs": [], "source": [ "beckham = people[people['name'] == 'David Beckham']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Is Obama closer to Clinton than to Beckham?\n", "\n", "We will use cosine distance, which is given by\n", "\n", "(1-cosine_similarity) \n", "\n", "and find that the article about President Obama is closer to the one about former President Clinton than to the one about footballer David Beckham." ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.8339854936884276" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "graphlab.distances.cosine(obama['tfidf'][0],clinton['tfidf'][0])" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.9791305844747478" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "graphlab.distances.cosine(obama['tfidf'][0],beckham['tfidf'][0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Build a nearest neighbor model for document retrieval\n", "\n", "We now create a nearest-neighbors model and apply it to document retrieval. Note that the model below is created without an explicit `distance` argument, so the distances it reports need not equal the cosine distances computed manually above (compare the two values for Bill Clinton). 
" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PROGRESS: Starting brute force nearest neighbors model training.\n" ] } ], "source": [ "knn_model = graphlab.nearest_neighbors.create(people,features=['tfidf'],label='name')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Applying the nearest-neighbors model for retrieval" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Who is closest to Obama?" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PROGRESS: Starting pairwise querying.\n", "PROGRESS: +--------------+---------+-------------+--------------+\n", "PROGRESS: | Query points | # Pairs | % Complete. | Elapsed Time |\n", "PROGRESS: +--------------+---------+-------------+--------------+\n", "PROGRESS: | 0 | 1 | 0.00169288 | 18.181ms |\n", "PROGRESS: | Done | | 100 | 767.505ms |\n", "PROGRESS: +--------------+---------+-------------+--------------+\n" ] }, { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_labelreference_labeldistancerank
0Barack Obama0.01
0Joe Biden0.7941176470592
0Joe Lieberman0.7946859903383
0Kelly Ayotte0.8119891008174
0Bill Clinton0.8138528138535
\n", "[5 rows x 4 columns]
\n", "
" ], "text/plain": [ "Columns:\n", "\tquery_label\tint\n", "\treference_label\tstr\n", "\tdistance\tfloat\n", "\trank\tint\n", "\n", "Rows: 5\n", "\n", "Data:\n", "+-------------+-----------------+----------------+------+\n", "| query_label | reference_label | distance | rank |\n", "+-------------+-----------------+----------------+------+\n", "| 0 | Barack Obama | 0.0 | 1 |\n", "| 0 | Joe Biden | 0.794117647059 | 2 |\n", "| 0 | Joe Lieberman | 0.794685990338 | 3 |\n", "| 0 | Kelly Ayotte | 0.811989100817 | 4 |\n", "| 0 | Bill Clinton | 0.813852813853 | 5 |\n", "+-------------+-----------------+----------------+------+\n", "[5 rows x 4 columns]" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "knn_model.query(obama)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As we can see, president Obama's article is closest to the one about his vice-president Biden, and those of other politicians. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Other examples of document retrieval" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": true }, "outputs": [], "source": [ "swift = people[people['name'] == 'Taylor Swift']" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PROGRESS: Starting pairwise querying.\n", "PROGRESS: +--------------+---------+-------------+--------------+\n", "PROGRESS: | Query points | # Pairs | % Complete. | Elapsed Time |\n", "PROGRESS: +--------------+---------+-------------+--------------+\n", "PROGRESS: | 0 | 1 | 0.00169288 | 19.485ms |\n", "PROGRESS: | Done | | 100 | 654.766ms |\n", "PROGRESS: +--------------+---------+-------------+--------------+\n" ] }, { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_labelreference_labeldistancerank
0Taylor Swift0.01
0Carrie Underwood0.762318840582
0Alicia Keys0.7647058823533
0Jordin Sparks0.7696335078534
0Leona Lewis0.7761194029855
\n", "[5 rows x 4 columns]
\n", "
" ], "text/plain": [ "Columns:\n", "\tquery_label\tint\n", "\treference_label\tstr\n", "\tdistance\tfloat\n", "\trank\tint\n", "\n", "Rows: 5\n", "\n", "Data:\n", "+-------------+------------------+----------------+------+\n", "| query_label | reference_label | distance | rank |\n", "+-------------+------------------+----------------+------+\n", "| 0 | Taylor Swift | 0.0 | 1 |\n", "| 0 | Carrie Underwood | 0.76231884058 | 2 |\n", "| 0 | Alicia Keys | 0.764705882353 | 3 |\n", "| 0 | Jordin Sparks | 0.769633507853 | 4 |\n", "| 0 | Leona Lewis | 0.776119402985 | 5 |\n", "+-------------+------------------+----------------+------+\n", "[5 rows x 4 columns]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "knn_model.query(swift)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": true }, "outputs": [], "source": [ "jolie = people[people['name'] == 'Angelina Jolie']" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PROGRESS: Starting pairwise querying.\n", "PROGRESS: +--------------+---------+-------------+--------------+\n", "PROGRESS: | Query points | # Pairs | % Complete. | Elapsed Time |\n", "PROGRESS: +--------------+---------+-------------+--------------+\n", "PROGRESS: | 0 | 1 | 0.00169288 | 28.843ms |\n", "PROGRESS: | 0 | 32576 | 55.1472 | 1.02s |\n", "PROGRESS: | Done | | 100 | 1.83s |\n", "PROGRESS: +--------------+---------+-------------+--------------+\n" ] }, { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_labelreference_labeldistancerank
0Angelina Jolie0.01
0Brad Pitt0.7840236686392
0Julianne Moore0.7958579881663
0Billy Bob Thornton0.8030690537084
0George Clooney0.80468755
\n", "[5 rows x 4 columns]
\n", "
" ], "text/plain": [ "Columns:\n", "\tquery_label\tint\n", "\treference_label\tstr\n", "\tdistance\tfloat\n", "\trank\tint\n", "\n", "Rows: 5\n", "\n", "Data:\n", "+-------------+--------------------+----------------+------+\n", "| query_label | reference_label | distance | rank |\n", "+-------------+--------------------+----------------+------+\n", "| 0 | Angelina Jolie | 0.0 | 1 |\n", "| 0 | Brad Pitt | 0.784023668639 | 2 |\n", "| 0 | Julianne Moore | 0.795857988166 | 3 |\n", "| 0 | Billy Bob Thornton | 0.803069053708 | 4 |\n", "| 0 | George Clooney | 0.8046875 | 5 |\n", "+-------------+--------------------+----------------+------+\n", "[5 rows x 4 columns]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "knn_model.query(jolie)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": true }, "outputs": [], "source": [ "arnold = people[people['name'] == 'Arnold Schwarzenegger']" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PROGRESS: Starting pairwise querying.\n", "PROGRESS: +--------------+---------+-------------+--------------+\n", "PROGRESS: | Query points | # Pairs | % Complete. | Elapsed Time |\n", "PROGRESS: +--------------+---------+-------------+--------------+\n", "PROGRESS: | 0 | 1 | 0.00169288 | 57.904ms |\n", "PROGRESS: | 0 | 30515 | 51.6582 | 1.05s |\n", "PROGRESS: | 0 | 55092 | 93.264 | 2.06s |\n", "PROGRESS: | Done | | 100 | 2.29s |\n", "PROGRESS: +--------------+---------+-------------+--------------+\n" ] }, { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_labelreference_labeldistancerank
0Arnold Schwarzenegger0.01
0Jesse Ventura0.8189189189192
0John Kitzhaber0.8246153846153
0Lincoln Chafee0.8338762214984
0Anthony Foxx0.8339100346025
\n", "[5 rows x 4 columns]
\n", "
" ], "text/plain": [ "Columns:\n", "\tquery_label\tint\n", "\treference_label\tstr\n", "\tdistance\tfloat\n", "\trank\tint\n", "\n", "Rows: 5\n", "\n", "Data:\n", "+-------------+-----------------------+----------------+------+\n", "| query_label | reference_label | distance | rank |\n", "+-------------+-----------------------+----------------+------+\n", "| 0 | Arnold Schwarzenegger | 0.0 | 1 |\n", "| 0 | Jesse Ventura | 0.818918918919 | 2 |\n", "| 0 | John Kitzhaber | 0.824615384615 | 3 |\n", "| 0 | Lincoln Chafee | 0.833876221498 | 4 |\n", "| 0 | Anthony Foxx | 0.833910034602 | 5 |\n", "+-------------+-----------------------+----------------+------+\n", "[5 rows x 4 columns]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "knn_model.query(arnold)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Programming assignment\n", "## 1. Compare top words according to word counts to TF-IDF." ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------------------------------+------------+-------------------------------+\n", "| URI | name | text |\n", "+-------------------------------+------------+-------------------------------+\n", "| \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordcount
the27
in18
and15
of13
a10
has9
he7
john7
on6
since5
\n", "[10 rows x 2 columns]
\n", "" ], "text/plain": [ "Columns:\n", "\tword\tstr\n", "\tcount\tint\n", "\n", "Rows: 10\n", "\n", "Data:\n", "+-------+-------+\n", "| word | count |\n", "+-------+-------+\n", "| the | 27 |\n", "| in | 18 |\n", "| and | 15 |\n", "| of | 13 |\n", "| a | 10 |\n", "| has | 9 |\n", "| he | 7 |\n", "| john | 7 |\n", "| on | 6 |\n", "| since | 5 |\n", "+-------+-------+\n", "[10 rows x 2 columns]" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "elton_word_count = elton[['word_count']].stack('word_count',new_column_name=['word','count']).sort('count',ascending=False)\n", "elton_word_count.head()" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordtfidf
furnish18.38947184
elton17.48232027
billboard17.3036809575
john13.9393127924
songwriters11.250406447
overallelton10.9864953892
tonightcandle10.9864953892
1970200010.2933482087
fivedecade10.2933482087
aids10.262846934
\n", "[10 rows x 2 columns]
\n", "
" ], "text/plain": [ "Columns:\n", "\tword\tstr\n", "\ttfidf\tfloat\n", "\n", "Rows: 10\n", "\n", "Data:\n", "+---------------+---------------+\n", "| word | tfidf |\n", "+---------------+---------------+\n", "| furnish | 18.38947184 |\n", "| elton | 17.48232027 |\n", "| billboard | 17.3036809575 |\n", "| john | 13.9393127924 |\n", "| songwriters | 11.250406447 |\n", "| overallelton | 10.9864953892 |\n", "| tonightcandle | 10.9864953892 |\n", "| 19702000 | 10.2933482087 |\n", "| fivedecade | 10.2933482087 |\n", "| aids | 10.262846934 |\n", "+---------------+---------------+\n", "[10 rows x 2 columns]" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "elton_tfidf = elton[['tfidf']].stack('tfidf',new_column_name=['word','tfidf']).sort('tfidf',ascending=False)\n", "elton_tfidf.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Measuring distance." ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URInametextword_count
<http://dbpedia.org/resou
rce/Victoria_Beckham> ...
Victoria Beckhamvictoria caroline beckham
ne adams born 17 april ...
{'millionin': 1,
'saying': 1, 'cameo': 1, ...
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "
tfidf
{'millionin':
7.728398851203712, ...
\n", "[? rows x 5 columns]
Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.
You can use len(sf) to force materialization.\n", "
" ], "text/plain": [ "Columns:\n", "\tURI\tstr\n", "\tname\tstr\n", "\ttext\tstr\n", "\tword_count\tdict\n", "\ttfidf\tdict\n", "\n", "Rows: Unknown\n", "\n", "Data:\n", "+-------------------------------+------------------+\n", "| URI | name |\n", "+-------------------------------+------------------+\n", "| \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URInametextword_count
<http://dbpedia.org/resou
rce/Paul_McCartney> ...
Paul McCartneysir james paul mccartney
mbe born 18 june 1942 is ...
{'all': 1, 'gold': 1,
'over': 1, 'kintyre': 1, ...
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "
tfidf
{'all':
1.6431112434912472, ...
\n", "[? rows x 5 columns]
Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.
You can use len(sf) to force materialization.\n", "" ], "text/plain": [ "Columns:\n", "\tURI\tstr\n", "\tname\tstr\n", "\ttext\tstr\n", "\tword_count\tdict\n", "\ttfidf\tdict\n", "\n", "Rows: Unknown\n", "\n", "Data:\n", "+-------------------------------+----------------+\n", "| URI | name |\n", "+-------------------------------+----------------+\n", "| \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_labelreference_labeldistancerank
0Elton John2.22044604925e-161
0Cliff Richard0.161424152592
0Sandro Petrone0.168225427513
0Rod Stewart0.1683271655874
0Malachi O'Doherty0.1773155459795
\n", "[5 rows x 4 columns]
\n", "" ], "text/plain": [ "Columns:\n", "\tquery_label\tint\n", "\treference_label\tstr\n", "\tdistance\tfloat\n", "\trank\tint\n", "\n", "Rows: 5\n", "\n", "Data:\n", "+-------------+-------------------+-------------------+------+\n", "| query_label | reference_label | distance | rank |\n", "+-------------+-------------------+-------------------+------+\n", "| 0 | Elton John | 2.22044604925e-16 | 1 |\n", "| 0 | Cliff Richard | 0.16142415259 | 2 |\n", "| 0 | Sandro Petrone | 0.16822542751 | 3 |\n", "| 0 | Rod Stewart | 0.168327165587 | 4 |\n", "| 0 | Malachi O'Doherty | 0.177315545979 | 5 |\n", "+-------------+-------------------+-------------------+------+\n", "[5 rows x 4 columns]" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "knn_model_word_count.query(elton)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PROGRESS: Starting pairwise querying.\n", "PROGRESS: +--------------+---------+-------------+--------------+\n", "PROGRESS: | Query points | # Pairs | % Complete. | Elapsed Time |\n", "PROGRESS: +--------------+---------+-------------+--------------+\n", "PROGRESS: | 0 | 1 | 0.00169288 | 21.798ms |\n", "PROGRESS: | Done | | 100 | 840.998ms |\n", "PROGRESS: +--------------+---------+-------------+--------------+\n" ] }, { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_labelreference_labeldistancerank
0Elton John-2.22044604925e-161
0Rod Stewart0.7172196678932
0George Michael0.7476009989693
0Sting (musician)0.7476719544314
0Phil Collins0.751193248795
\n", "[5 rows x 4 columns]
\n", "
" ], "text/plain": [ "Columns:\n", "\tquery_label\tint\n", "\treference_label\tstr\n", "\tdistance\tfloat\n", "\trank\tint\n", "\n", "Rows: 5\n", "\n", "Data:\n", "+-------------+------------------+--------------------+------+\n", "| query_label | reference_label | distance | rank |\n", "+-------------+------------------+--------------------+------+\n", "| 0 | Elton John | -2.22044604925e-16 | 1 |\n", "| 0 | Rod Stewart | 0.717219667893 | 2 |\n", "| 0 | George Michael | 0.747600998969 | 3 |\n", "| 0 | Sting (musician) | 0.747671954431 | 4 |\n", "| 0 | Phil Collins | 0.75119324879 | 5 |\n", "+-------------+------------------+--------------------+------+\n", "[5 rows x 4 columns]" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "knn_model_tfidf.query(elton)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PROGRESS: Starting pairwise querying.\n", "PROGRESS: +--------------+---------+-------------+--------------+\n", "PROGRESS: | Query points | # Pairs | % Complete. | Elapsed Time |\n", "PROGRESS: +--------------+---------+-------------+--------------+\n", "PROGRESS: | 0 | 1 | 0.00169288 | 12.335ms |\n", "PROGRESS: | Done | | 100 | 698.639ms |\n", "PROGRESS: +--------------+---------+-------------+--------------+\n" ] }, { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_labelreference_labeldistancerank
0Victoria Beckham-2.22044604925e-161
0Mary Fitzgerald (artist)0.2073070361152
0Adrienne Corri0.2145097827883
0Beverly Jane Fry0.2174664687414
0Raman Mundair0.2176954749925
\n", "[5 rows x 4 columns]
\n", "
" ], "text/plain": [ "Columns:\n", "\tquery_label\tint\n", "\treference_label\tstr\n", "\tdistance\tfloat\n", "\trank\tint\n", "\n", "Rows: 5\n", "\n", "Data:\n", "+-------------+--------------------------+--------------------+------+\n", "| query_label | reference_label | distance | rank |\n", "+-------------+--------------------------+--------------------+------+\n", "| 0 | Victoria Beckham | -2.22044604925e-16 | 1 |\n", "| 0 | Mary Fitzgerald (artist) | 0.207307036115 | 2 |\n", "| 0 | Adrienne Corri | 0.214509782788 | 3 |\n", "| 0 | Beverly Jane Fry | 0.217466468741 | 4 |\n", "| 0 | Raman Mundair | 0.217695474992 | 5 |\n", "+-------------+--------------------------+--------------------+------+\n", "[5 rows x 4 columns]" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "knn_model_word_count.query(victoria)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PROGRESS: Starting pairwise querying.\n", "PROGRESS: +--------------+---------+-------------+--------------+\n", "PROGRESS: | Query points | # Pairs | % Complete. | Elapsed Time |\n", "PROGRESS: +--------------+---------+-------------+--------------+\n", "PROGRESS: | 0 | 1 | 0.00169288 | 13.599ms |\n", "PROGRESS: | Done | | 100 | 882.039ms |\n", "PROGRESS: +--------------+---------+-------------+--------------+\n" ] }, { "data": { "text/html": [ "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
query_labelreference_labeldistancerank
0Victoria Beckham1.11022302463e-161
0David Beckham0.5481696102632
0Stephen Dow Beckham0.7849867068283
0Mel B0.8095855234094
0Caroline Rush0.8198264229195
\n", "[5 rows x 4 columns]
\n", "
" ], "text/plain": [ "Columns:\n", "\tquery_label\tint\n", "\treference_label\tstr\n", "\tdistance\tfloat\n", "\trank\tint\n", "\n", "Rows: 5\n", "\n", "Data:\n", "+-------------+---------------------+-------------------+------+\n", "| query_label | reference_label | distance | rank |\n", "+-------------+---------------------+-------------------+------+\n", "| 0 | Victoria Beckham | 1.11022302463e-16 | 1 |\n", "| 0 | David Beckham | 0.548169610263 | 2 |\n", "| 0 | Stephen Dow Beckham | 0.784986706828 | 3 |\n", "| 0 | Mel B | 0.809585523409 | 4 |\n", "| 0 | Caroline Rush | 0.819826422919 | 5 |\n", "+-------------+---------------------+-------------------+------+\n", "[5 rows x 4 columns]" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "knn_model_tfidf.query(victoria)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }