-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathprocess-10m.py
89 lines (74 loc) · 3.01 KB
/
process-10m.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Benchmark based on Greg Reda's previous work:
# http://www.gregreda.com/2013/10/26/using-pandas-on-the-movielens-dataset/
# The original MovieLens datasets are over here:
# http://www.grouplens.org/datasets/movielens
from time import time
import os.path
import numpy as np
import bcolz
import pandas as pd
# Print the version report provided by bcolz, so that the timings below can
# be related to the environment they were taken in.
bcolz.print_versions()

# Input files; all three are looked up inside the dataset directory.
dset = 'ml-10m'
ftags, fdata, fitem = (
    os.path.join(dset, fname)
    for fname in ('tags.dat', 'ratings.dat.gz', 'movies.dat')
)

# Global settings for bcolz and pandas.
# Default compression parameters used by bcolz for every new container.
bcolz.defaults.cparams['clevel'] = 1
bcolz.defaults.cparams['cname'] = 'blosclz'

# Optional knobs, left disabled.  Uncomment to choose the evaluation engine
# or to restrict the number of threads:
# bcolz.defaults.eval_vm = "numexpr"
# bcolz.blosc_set_nthreads(1)
# bcolz.numexpr.set_num_threads(1)
# from pandas.computation import expressions as expr
# expr.set_use_numexpr(True)
# expr.set_numexpr_threads(1)
# --- Phase 1: parse the three input files into pandas DataFrames ----------
t0 = time()

# None of the files carries a header row, so column names are passed in.
t_cols = ['user_id', 'movie_id', 'tag', 'unix_timestamp']
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
m_cols = ['movie_id', 'title', 'genres']

# NOTE(review): the files are read with ';' as the delimiter and the ratings
# come from a gzip archive, so they are presumably pre-processed copies of
# the GroupLens downloads -- confirm before running on the raw dataset.
tags = pd.read_csv(ftags, names=t_cols, sep=';')
# print("Info for tags:", tags.info())
ratings = pd.read_csv(fdata, names=r_cols, sep=';', compression='gzip')
movies = pd.read_csv(
    fitem,
    names=m_cols,
    sep=';',
    # Request fixed-width 100-byte strings for the two text columns.
    dtype={'title': "S100", 'genres': "S100"},
)

elapsed = time() - t0
print("Time for parsing the data: %.2f" % (elapsed,))
# --- Phase 2: build the single DataFrame that every query runs against ----
t0 = time()

# Join movies with ratings.  No key is given, so pandas joins on the columns
# that both frames have in common.
movie_ratings = movies.merge(ratings)

# Joining the tags in as well is disabled: you probably need more than
# 8 GB of RAM for that merge.
# lens = pd.merge(movie_ratings, tags, on='user_id')
lens = movie_ratings

elapsed = time() - t0
print("Time for dataframe merges: %.2f" % (elapsed,))

# Optional inspection helpers:
# print("Info for movie_ratings:", movie_ratings.info())
# print("Info for lens:", lens.info())
# most_rated = lens.groupby('title').size().sort_values(ascending=False)[:25]
# print(most_rated)
# --- Phase 3: reference queries with pandas -------------------------------

# Simple query: every row for one title.
# Boolean-mask equivalent:
#   result = lens[lens['title'] == 'Tom and Huck (1995)']
t0 = time()
result = lens.query("title == 'Tom and Huck (1995)'")
elapsed = time() - t0
print("time (and length) for simple query with pandas: %.2f (%d)" %
      (elapsed, len(result)))
# print(repr(result))

# Complex query: the user ids that gave that title a rating of 5.
# Boolean-mask equivalent:
#   result = lens[(lens['title'] == 'Tom and Huck (1995)') &
#                 (lens['rating'] == 5)]['user_id']
t0 = time()
result = lens.query(
    "(title == 'Tom and Huck (1995)') & (rating == 5)")['user_id']
elapsed = time() - t0
print("time (and length) for complex query with pandas: %.2f (%d)" %
      (elapsed, len(result)))
# print(repr(result))
# --- Phase 4: convert the merged DataFrame into a bcolz ctable ------------
t0 = time()
zlens = bcolz.ctable.fromdataframe(lens)
elapsed = time() - t0

# Compression ratio: uncompressed size over compressed size.
ratio = zlens.nbytes / float(zlens.cbytes)
print("time (and compress ratio) for ctable conversion: %.2f (%.1fx)" %
      (elapsed, ratio))
# print(repr(zlens))
# --- Phase 5: the same two queries, this time against the bcolz ctable ----

# Simple query: index the ctable directly with the boolean expression.
simple_expr = "title == 'Tom and Huck (1995)'"
t0 = time()
result = zlens[simple_expr]
elapsed = time() - t0
print("time (and length) for simple query with bcolz: %.2f (%d)" %
      (elapsed, len(result)))
# print(repr(result))

# Complex query: iterate over the matching rows, asking only for the
# 'user_id' column, and collect the ids into a list.
# Indexing equivalent:
#   result = zlens["(title == 'Tom and Huck (1995)') & (rating == 5)"]['user_id']
complex_expr = "(title == 'Tom and Huck (1995)') & (rating == 5)"
t0 = time()
result = [row.user_id
          for row in zlens.where(complex_expr, outcols=['user_id'])]
elapsed = time() - t0
print("time (and length) for complex query with bcolz: %.2f (%d)" %
      (elapsed, len(result)))
# print(repr(result))