-
Notifications
You must be signed in to change notification settings - Fork 0
/
similarmovies.py
44 lines (30 loc) · 1.16 KB
/
similarmovies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 12 23:33:52 2018
@author: blaze
"""
import numpy as np  # not referenced below; import retained for any code outside this section
import pandas as pd

# Item-based similarity on a ratings file: correlate every movie's per-user
# ratings with those of 'Star Wars (1977)', then attach the correlation to
# per-title rating statistics for titles with at least 100 ratings.

# Load the first three tab-separated columns of u.data as (user, movie, rating).
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols, usecols=range(3),
                      encoding="ISO-8859-1")

# Load the first two pipe-separated columns of u.item as (movie, title).
m_cols = ['movie_id', 'title']
movies = pd.read_csv('u.item', sep='|', names=m_cols, usecols=range(2),
                     encoding="ISO-8859-1")

# 'movie_id' is the only column the two frames share; name the key explicitly
# rather than relying on pandas inferring it.
ratings = pd.merge(movies, ratings, on='movie_id')

# One row per user, one column per title; cells are ratings (NaN where the
# user did not rate the title; pivot_table averages duplicate entries).
movieRatings = ratings.pivot_table(index=['user_id'], columns=['title'],
                                   values='rating')

starWarsRatings = movieRatings['Star Wars (1977)']

# Pairwise correlation of every title column with the Star Wars column.
# dropna() and sort_values() return new objects, so the results must be
# re-assigned for them to have any effect.
similarMovies = movieRatings.corrwith(starWarsRatings)
similarMovies = similarMovies.dropna()
similarMovies = similarMovies.sort_values(ascending=False)

# Per-title rating count and mean. String aggregation names are used instead
# of np.size / np.mean callables, which newer pandas versions warn about.
# The resulting columns are the 2-level labels ('rating', 'size') and
# ('rating', 'mean').
movieStats = ratings.groupby('title').agg({'rating': ['size', 'mean']})

# Boolean mask (indexed by title) selecting titles with at least 100 ratings.
popularMovies = movieStats['rating']['size'] >= 100

# The 15 popular titles with the highest mean rating.
topRatedMovies = movieStats[popularMovies].sort_values(
    [('rating', 'mean')], ascending=False)[:15]

# Similarity scores as a single-column frame, ready to join on the title index.
r = similarMovies.to_frame(name='similarity')

# Join on the flat 'rating' sub-frame (columns 'size' and 'mean'): joining a
# frame with 2-level columns to one with 1-level columns is rejected by
# recent pandas releases. Titles without a computed similarity get NaN.
df = movieStats['rating'][popularMovies].join(r)
df.head()