By Toby Segaran
Book Price: $39.99 USD
£24.99 GBP
PDF Price: $31.99
Cover | Table of Contents | Colophon
# A dictionary of movie critics and their ratings of a small
# set of movies
critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5,
'The Night Listener': 3.0},
'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5,
'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0,
'You, Me and Dupree': 3.5},
'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
'Superman Returns': 3.5, 'The Night Listener': 4.0},
'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
'The Night Listener': 4.5, 'Superman Returns': 4.0,
'You, Me and Dupree': 2.5},
'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,
'You, Me and Dupree': 2.0},
'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},
'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}}
pow(n,2) function to square a number and take the square root with the sqrt function:>>from math import sqrt >> sqrt(pow(5−4,2)+pow(4−1,2)) 3.1622776601683795
>>1/(1+sqrt(pow(5−4,2)+pow(4−1,2)))
0.2402530733520421from math import sqrt
# Returns a distance-based similarity score for person1 and person2
def sim_distance(prefs,person1,person2):
# Get the list of shared_items
si={}
for item in prefs[person1]:
if item in prefs[person2]:
si[item]=1
# if they have no ratings in common, return 0
if len(si)==0: return 0
# Add up the squares of all the differences
sum_of_squares=sum([pow(prefs[person1][item]-prefs[person2][item],2)
for item in prefs[person1] if item in prefs[person2]])
return 1/(1+sum_of_squares)topMatches.Critic | Similarity | Night | S.xNight | Lady | S.xLady | Luck | S.xLuck |
|---|---|---|---|---|---|---|---|
Rose | 0.99 | 3.0 | 2.97 | 2.5 | 2.48 | 3.0 | 2.97 |
Seymour | 0.38 | 3.0 | 1.14 | 3.0 | 1.14 | 1.5 | 0.57 |
Puig | 0.89 | 4.5 | 4.02 | 3.0 | 2.68 | ||
LaSalle | 0.92 | 3.0 | 2.77 | 3.0 | 2.77 | 2.0 | 1.85 |
Matthews | 0.66 | 3.0 | 1.99 | 3.0 | 1.99 | ||
Total | 12.89 | 8.38 | 8.07 | ||||
Sim.Sum | 3.84 | 2.95 | 3.18 | ||||
Total/Sim. Sum | 3.35 | 2.83 | 2.53 |
{'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5},
'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5}}{'Lady in the Water':{'Lisa Rose':2.5,'Gene Seymour':3.0},
'Snakes on a Plane':{'Lisa Rose':3.5,'Gene Seymour':3.5}} etc..def transformPrefs(prefs):
result={}
for person in prefs:
for item in prefs[person]:
result.setdefault(item,{})
# Flip item and person
result[item][person]=prefs[person][item]
return resulttopMatches function used earlier to find the set of movies most similar to Superman Returns:>>reload(recommendations) >> movies=recommendations.transformPrefs(recommendations.critics) >> recommendations.topMatches(movies,'Superman Returns') [(0.657, 'You, Me and Dupree'), (0.487, 'Lady in the Water'), (0.111, 'Snakes on a Plane'), (−0.179, 'The Night Listener'), (−0.422, 'Just My Luck')]
get_popular call:>>import pydelicious >> pydelicious.get_popular(tag='programming') [{'count': '', 'extended': '', 'hash': '', 'description': u'How To Write Unmaintainable Code', 'tags': '', 'href': u'http://thc.segfault.net/root/phun/ unmaintain.html', 'user': u'dorsia', 'dt': u'2006-08-19T09:48:56Z'}, {'count': '', 'extended': '', 'hash': '', 'description': u'Threading in C#', 'tags': '', 'href': u'http://www.albahari.com/threading/', 'user': u'mmihale', 'dt': u'2006-05-17T18: 09:24Z'}, ...etc...
def calculateSimilarItems(prefs,n=10):
# Create a dictionary of items showing which other items they
# are most similar to.
result={}
# Invert the preference matrix to be item-centric
itemPrefs=transformPrefs(prefs)
c=0
for item in itemPrefs:
# Status updates for large datasets
c+=1
if c%100==0: print "%d / %d" % (c,len(itemPrefs))
# Find the most similar items to this one
scores=topMatches(itemPrefs,item,n=n,similarity=sim_distance)
result[item]=scores
return result196 242 3 881250949 186 302 3 891717742 22 377 1 878887116 244 51 2 880606923 166 346 1 886397596 298 474 4 884182806
loadMovieLens in recommendations.py to load this dataset:def loadMovieLens(path='/data/movielens'):
# Get movie titles
movies={}
for line in open(path+'/u.item'):
(id,title)=line.split('|')[0:2]
movies[id]=title
# Load data
prefs={}
for line in open(path+'/u.data'):
(user,movieid,rating,ts)=line.split('\t')
prefs.setdefault(user,{})
prefs[user][movies[movieid]]=float(rating)
return prefs>>>reload(recommendations) >>> prefs=recommendations.loadMovieLens( ) >>> prefs['87'] {'Birdcage, The (1996)': 4.0, 'E.T. the Extra-Terrestrial (1982)': 3.0, 'Bananas (1971)': 5.0, 'Sting, The (1973)': 5.0, 'Bad Boys (1995)': 4.0, 'In the Line of Fire (1993)': 5.0, 'Star Trek: The Wrath of Khan (1982)': 5.0, 'Speechless (1994)': 4.0, etc...
>>>recommendations.getRecommendations(prefs,'87')[0:30]
[(5.0, 'They Made Me a Criminal (1939)'), (5.0, 'Star Kid (1997)'),
(5.0, 'Santa with Muscles (1996)'), (5.0, 'Saint of Fort Washington (1993)'),
etc...]"china" | "kids" | "music" | "yahoo" | |
|---|---|---|---|---|
Gothamist | 0 | 3 | 3 | 0 |
GigaOM | 6 | 0 | 0 | 2 |
Quick Online Tips | 0 | 2 | 2 | 22 |
import feedparser
import re
# Returns title and dictionary of word counts for an RSS feed
def getwordcounts(url):
# Parse the feed
d=feedparser.parse(url)
wc={}
# Loop over all the entries
for e in d.entries:
if 'summary' in e: summary=e.summary
else: summary=e.description
# Extract a list of words
words=getwords(e.title+' '+summary)
for word in words:
wc.setdefault(word,0)
wc[word]+=1
return d.feed.title,wc
def readfile(filename):
lines=[line for line in file(filename)]
# First line is the column titles
colnames=lines[0].strip( ).split('\t')[1:]
rownames=[]
data=[]
for line in lines[1:]:
p=line.strip( ).split('\t')
# First column in each row is the rowname
rownames.append(p[0])
# The data for this row is the remainder of the row
data.append([float(x) for x in p[1:]])
return rownames,colnames,dataimport statement to the beginning of clusters.py:from PIL import Image,ImageDraw
def getheight(clust): # Is this an endpoint? Then the height is just 1 if clust.left==None and clust.right==None: return 1 # Otherwise the height is the same of the heights of # each branch return getheight(clust.left)+getheight(clust.right)
def getdepth(clust): # The distance of an endpoint is 0.0 if clust.left==None and clust.right==None: return 0 # The distance of a branch is the greater of its two sides # plus its own distance return max(getdepth(clust.left),getdepth(clust.right))+clust.distance
drawdendrogram function creates a new image allowing 20 pixels in height and a fixed width for each final cluster. The scaling factor is determined by dividing the fixed width by the total depth. The function creates a draw object for this image and then calls def rotatematrix(data):
newdata=[]
for i in range(len(data[0])):
newrow=[data[j][i] for j in range(len(data))]
newdata.append(newrow)
return newdata>>reload(clusters) >> rdata=clusters.rotatematrix(data) >> wordclust=clusters.hcluster(rdata) >> clusters.drawdendrogram(wordclust,labels=words,jpeg='wordclust.jpg')