-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathkNN.py
83 lines (60 loc) · 2.61 KB
/
kNN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
class kNN:
    """
    Item-based k-nearest-neighbours movie recommender.

    Movies are represented by their vector of user ratings; recommendations
    for a movie are the movies whose rating vectors are closest to it.
    """

    def __init__(self, path, mov_file, ratings_file,
                 min_user_ratings=100, min_movie_ratings=100):
        """
        Initialize an instance of the kNN recommender.

        Pass in the general path, the name of the movies file, the name of the
        ratings file as three strings. The file names are appended to the path
        as-is, so the path must end with a path separator.

        min_user_ratings / min_movie_ratings are the minimum number of ratings
        a user / a movie needs in order to be kept by prep_data.
        """
        self.mov_path = path + mov_file
        self.ratings_path = path + ratings_file
        self.min_user_ratings = min_user_ratings
        self.min_movie_ratings = min_movie_ratings

    def prep_data(self):
        """
        Clean the ratings data by filtering out unpopular movies and inactive
        users.

        Save the title: index / index: title mappings constructed upon the
        filtered set of movies in self.mappings, where an index is the row
        position of the movie in the returned matrix.

        Return a sparse csr matrix (rows: movies, columns: users) constructed
        upon the filtered ratings data; missing ratings are stored as 0.
        """
        # Only these columns are used, so read just them; this also tolerates
        # files that lack the optional 'genres' / 'timestamp' columns.
        movies = pd.read_csv(self.mov_path, usecols=['movieId', 'title'])
        ratings = pd.read_csv(self.ratings_path,
                              usecols=['userId', 'movieId', 'rating'])

        # Users are filtered first, movies second, so a movie's count only
        # includes ratings given by the users that were kept.
        ratings_active_usrs = self.handle_sparcity(
            ratings, 'userId', self.min_user_ratings)
        ratings_popular_movs = self.handle_sparcity(
            ratings_active_usrs, 'movieId', self.min_movie_ratings)

        mov_usr_table = ratings_popular_movs.pivot(
            index='movieId', columns='userId', values='rating').fillna(0)
        mov_usr_csr = csr_matrix(mov_usr_table.to_numpy())

        # Titles in the same order as the rows of the matrix.
        titles = list(
            movies.set_index('movieId').loc[mov_usr_table.index]['title'])

        # Build both directions straight from the row order. Inverting the
        # title -> index dict would lose rows whose title is shared by
        # another movie. For a duplicated title, the last row wins in the
        # title -> index direction.
        title_ind_map = {title: i for i, title in enumerate(titles)}
        ind_title_map = dict(enumerate(titles))
        self.mappings = [title_ind_map, ind_title_map]
        return mov_usr_csr

    def handle_sparcity(self, df, group, threshold):
        """
        Helper function to filter out rows of a dataframe given a column
        to group by and a minimum count threshold to be met.
        """
        return df.groupby(group).filter(lambda x: len(x) >= threshold)

    def recommend(self, base_movie, k=10, metric='cosine'):
        """
        Return a list of up to k movie titles most similar to base_movie.

        The data files are re-read and the model is re-fitted on every call.
        base_movie must be the title of a movie that survives the filtering
        done in prep_data, otherwise a KeyError is raised. metric is passed
        through to sklearn's NearestNeighbors.
        """
        mov_usr_csr = self.prep_data()
        # Look the title up before fitting so an unknown title fails fast.
        base_ind = self.title_to_id(base_movie)

        # One extra neighbour because the query movie is part of the fitted
        # data; never ask for more neighbours than there are movies.
        n_neighbors = min(k + 1, mov_usr_csr.shape[0])
        model = NearestNeighbors(n_neighbors=n_neighbors, algorithm='brute',
                                 metric=metric, n_jobs=-1)
        model.fit(mov_usr_csr)
        neighbour_inds = model.kneighbors(
            X=mov_usr_csr[base_ind], return_distance=False)[0]
        return self._neighbour_titles(neighbour_inds, base_ind, k)

    def _neighbour_titles(self, neighbour_inds, base_ind, k):
        """
        Turn neighbour row indices into at most k titles, leaving out the
        base movie wherever it appears (with tied distances it is not
        necessarily the first neighbour).
        """
        picked = [int(ind) for ind in neighbour_inds if ind != base_ind][:k]
        return [self.id_to_title(ind) for ind in picked]

    def title_to_id(self, title):
        """
        Return the id (matrix row index) of a movie given its title.
        """
        return self.mappings[0][title]

    def id_to_title(self, id):
        """
        Return the title of a movie given its id (matrix row index).
        """
        return self.mappings[1][id]