831 строка
22 KiB
Plaintext
831 строка
22 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"<i>Copyright (c) Microsoft Corporation. All rights reserved.</i>\n",
|
|
"\n",
|
|
"<i>Licensed under the MIT License.</i>"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## FastAI Recommender\n",
|
|
"\n",
|
|
"This notebook shows how to use the [FastAI](https://fast.ai) recommender, which uses [PyTorch](https://pytorch.org/) under the hood."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"System version: 3.6.7 | packaged by conda-forge | (default, Nov 21 2018, 03:09:43) \n",
|
|
"[GCC 7.3.0]\n",
|
|
"Pandas version: 0.23.4\n",
|
|
"Fast AI version: 1.0.46\n",
|
|
"Torch version: 1.0.0\n",
|
|
"Cuda Available: True\n",
|
|
"CuDNN Enabled: True\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# set the environment path to find Recommenders\n",
|
|
"import sys\n",
|
|
"sys.path.append(\"notebooks\")\n",
|
|
"import time\n",
|
|
"import os\n",
|
|
"import itertools\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import papermill as pm\n",
|
|
"import torch, fastai\n",
|
|
"from fastai.collab import EmbeddingDotBias, collab_learner, CollabDataBunch, load_learner\n",
|
|
"\n",
|
|
"from reco_utils.dataset import movielens\n",
|
|
"from reco_utils.dataset.python_splitters import python_random_split\n",
|
|
"from reco_utils.recommender.fastai.fastai_utils import cartesian_product, score\n",
|
|
"from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",
|
|
"from reco_utils.evaluation.python_evaluation import rmse, mae, rsquared, exp_var\n",
|
|
"\n",
|
|
"print(\"System version: {}\".format(sys.version))\n",
|
|
"print(\"Pandas version: {}\".format(pd.__version__))\n",
|
|
"print(\"Fast AI version: {}\".format(fastai.__version__))\n",
|
|
"print(\"Torch version: {}\".format(torch.__version__))\n",
|
|
"print(\"Cuda Available: {}\".format(torch.cuda.is_available()))\n",
|
|
"print(\"CuDNN Enabled: {}\".format(torch.backends.cudnn.enabled))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Defining some constants to refer to the different columns of our dataset."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"USER, ITEM, RATING, TIMESTAMP, PREDICTION, TITLE = 'UserId', 'MovieId', 'Rating', 'Timestamp', 'Prediction', 'Title'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {
|
|
"tags": [
|
|
"parameters"
|
|
]
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# top k items to recommend\n",
|
|
"TOP_K = 10\n",
|
|
"\n",
|
|
"# Select Movielens data size: 100k, 1m, 10m, or 20m\n",
|
|
"MOVIELENS_DATA_SIZE = '100k'\n",
|
|
"\n",
|
|
"# Model parameters\n",
|
|
"N_FACTORS = 40\n",
|
|
"EPOCHS = 5"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>UserId</th>\n",
|
|
" <th>MovieId</th>\n",
|
|
" <th>Rating</th>\n",
|
|
" <th>Timestamp</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>196</td>\n",
|
|
" <td>242</td>\n",
|
|
" <td>3.0</td>\n",
|
|
" <td>881250949</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>186</td>\n",
|
|
" <td>302</td>\n",
|
|
" <td>3.0</td>\n",
|
|
" <td>891717742</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>22</td>\n",
|
|
" <td>377</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>878887116</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>244</td>\n",
|
|
" <td>51</td>\n",
|
|
" <td>2.0</td>\n",
|
|
" <td>880606923</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>166</td>\n",
|
|
" <td>346</td>\n",
|
|
" <td>1.0</td>\n",
|
|
" <td>886397596</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" UserId MovieId Rating Timestamp\n",
|
|
"0 196 242 3.0 881250949\n",
|
|
"1 186 302 3.0 891717742\n",
|
|
"2 22 377 1.0 878887116\n",
|
|
"3 244 51 2.0 880606923\n",
|
|
"4 166 346 1.0 886397596"
|
|
]
|
|
},
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"ratings_df = movielens.load_pandas_df(\n",
|
|
" size=MOVIELENS_DATA_SIZE,\n",
|
|
" header=[USER,ITEM,RATING,TIMESTAMP]\n",
|
|
")\n",
|
|
"\n",
|
|
"# make sure the IDs are loaded as strings to better prevent confusion with embedding ids\n",
|
|
"ratings_df[USER] = ratings_df[USER].astype('str')\n",
|
|
"ratings_df[ITEM] = ratings_df[ITEM].astype('str')\n",
|
|
"\n",
|
|
"ratings_df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"train_valid_df, test_df = python_random_split(ratings_df, ratio=[0.75, 0.25])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Training"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# fix random seeds to make sure our runs are reproducible\n",
|
|
"np.random.seed(101)\n",
|
|
"torch.manual_seed(101)\n",
|
|
"torch.cuda.manual_seed_all(101)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"start_time = time.time()\n",
|
|
"\n",
|
|
"data = CollabDataBunch.from_df(train_valid_df, user_name=USER, item_name=ITEM, rating_name=RATING)\n",
|
|
"\n",
|
|
"preprocess_time = time.time() - start_time"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th>UserId</th>\n",
|
|
" <th>MovieId</th>\n",
|
|
" <th>target</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <td>94</td>\n",
|
|
" <td>732</td>\n",
|
|
" <td>3.0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <td>524</td>\n",
|
|
" <td>97</td>\n",
|
|
" <td>5.0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <td>385</td>\n",
|
|
" <td>603</td>\n",
|
|
" <td>5.0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <td>184</td>\n",
|
|
" <td>642</td>\n",
|
|
" <td>4.0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <td>314</td>\n",
|
|
" <td>1520</td>\n",
|
|
" <td>3.0</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"data.show_batch()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Now we will create a `collab_learner` for the data, which by default uses the [EmbeddingDotBias](https://docs.fast.ai/collab.html#EmbeddingDotBias) model. We will be using 40 latent factors. This will create an embedding for the users and the items that will map each of these to 40 floats as can be seen below. Note that the embedding parameters are not predefined, but are learned by the model.\n",
|
|
"\n",
|
|
"Although ratings only range from 1 to 5, we set the range of possible predictions to 0 to 5.5. The model squashes its output into this range with a sigmoid, which only approaches the end points asymptotically, so the wider range lets the model actually predict values close to 1 and 5, which improves accuracy. Lastly, we set a value for weight decay for regularization."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"EmbeddingDotBias(\n",
|
|
" (u_weight): Embedding(944, 40)\n",
|
|
" (i_weight): Embedding(1607, 40)\n",
|
|
" (u_bias): Embedding(944, 1)\n",
|
|
" (i_bias): Embedding(1607, 1)\n",
|
|
")"
|
|
]
|
|
},
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"learn = collab_learner(data, n_factors=N_FACTORS, y_range=[0,5.5], wd=1e-1)\n",
|
|
"learn.model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Now train the model for `EPOCHS` epochs (5 by default) using the one-cycle policy, passing the maximal learning rate. The learner first warms the learning rate up to this maximum and then decreases it again using cosine annealing."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"Total time: 00:29 <p><table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: left;\">\n",
|
|
" <th>epoch</th>\n",
|
|
" <th>train_loss</th>\n",
|
|
" <th>valid_loss</th>\n",
|
|
" <th>time</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1.003403</td>\n",
|
|
" <td>0.962295</td>\n",
|
|
" <td>00:05</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <td>2</td>\n",
|
|
" <td>0.860715</td>\n",
|
|
" <td>0.886302</td>\n",
|
|
" <td>00:05</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <td>3</td>\n",
|
|
" <td>0.761810</td>\n",
|
|
" <td>0.845166</td>\n",
|
|
" <td>00:06</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <td>4</td>\n",
|
|
" <td>0.617062</td>\n",
|
|
" <td>0.833720</td>\n",
|
|
" <td>00:05</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <td>5</td>\n",
|
|
" <td>0.524949</td>\n",
|
|
" <td>0.833384</td>\n",
|
|
" <td>00:06</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>"
|
|
],
|
|
"text/plain": [
|
|
"<IPython.core.display.HTML object>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Took 29.59385323524475 seconds for training.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"start_time = time.time()\n",
|
|
"\n",
|
|
"learn.fit_one_cycle(EPOCHS, max_lr=5e-3)\n",
|
|
"\n",
|
|
"train_time = time.time() - start_time + preprocess_time\n",
|
|
"print(\"Took {} seconds for training.\".format(train_time))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Save the learner so it can be loaded back later for inferencing / generating recommendations"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"learn.export('movielens_model.pkl')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Generating Recommendations\n",
|
|
"\n",
|
|
"Load the learner from disk."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"learner = load_learner(path=\".\", fname='movielens_model.pkl')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Get all users and items that the model knows"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"total_users, total_items = learner.data.train_ds.x.classes.values()\n",
|
|
"total_items = total_items[1:]\n",
|
|
"total_users = total_users[1:]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Get all users from the test set and keep only those that are known to the model, i.e. remove any users that were not seen in the training set"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"test_users = test_df[USER].unique()\n",
|
|
"test_users = np.intersect1d(test_users, total_users)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Build the cartesian product of test set users and all items known to the model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"users_items = cartesian_product(np.array(test_users),np.array(total_items))\n",
|
|
"users_items = pd.DataFrame(users_items, columns=[USER,ITEM])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"scrolled": false
|
|
},
|
|
"source": [
|
|
"\n",
|
|
"Lastly, remove the user/item combinations that are in the training set -- we don't want to propose a movie that the user has already watched."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"metadata": {
|
|
"scrolled": false
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Anti-join: keep only the candidate (user, item) pairs that do not occur in the training data.\n",
"# A left merge with indicator=True marks each candidate as 'both' (already rated in training) or\n",
"# 'left_only' (unseen). Unlike concat + drop_duplicates(keep=False), this can never add training\n",
"# pairs to the candidates when their user or item is missing from users_items.\n",
"seen_pairs = train_valid_df[[USER, ITEM]].drop_duplicates()\n",
"candidate_flags = users_items.merge(seen_pairs, on=[USER, ITEM], how='left', indicator=True)\n",
"training_removed = candidate_flags.loc[candidate_flags['_merge'] == 'left_only', [USER, ITEM]]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Score the model to find the top K recommendations"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Took 1.967883825302124 seconds for 1439504 predictions.\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"start_time = time.time()\n",
|
|
"\n",
|
|
"top_k_scores = score(learner, \n",
|
|
" test_df=training_removed,\n",
|
|
" user_col=USER, \n",
|
|
" item_col=ITEM, \n",
|
|
" prediction_col=PREDICTION, top_k=TOP_K)\n",
|
|
"\n",
|
|
"test_time = time.time() - start_time\n",
|
|
"print(\"Took {} seconds for {} predictions.\".format(test_time, len(training_removed)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Calculate some metrics for our model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"eval_map = map_at_k(test_df, top_k_scores, col_user=USER, col_item=ITEM, \n",
|
|
" col_rating=RATING, col_prediction=PREDICTION, \n",
|
|
" relevancy_method=\"top_k\", k=TOP_K)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"eval_ndcg = ndcg_at_k(test_df, top_k_scores, col_user=USER, col_item=ITEM, \n",
|
|
" col_rating=RATING, col_prediction=PREDICTION, \n",
|
|
" relevancy_method=\"top_k\", k=TOP_K)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"eval_precision = precision_at_k(test_df, top_k_scores, col_user=USER, col_item=ITEM, \n",
|
|
" col_rating=RATING, col_prediction=PREDICTION, \n",
|
|
" relevancy_method=\"top_k\", k=TOP_K)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"eval_recall = recall_at_k(test_df, top_k_scores, col_user=USER, col_item=ITEM, \n",
|
|
" col_rating=RATING, col_prediction=PREDICTION, \n",
|
|
" relevancy_method=\"top_k\", k=TOP_K)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Model:\tCollabLearner\n",
|
|
"Top K:\t10\n",
|
|
"MAP:\t0.021576\n",
|
|
"NDCG:\t0.136680\n",
|
|
"Precision@K:\t0.127147\n",
|
|
"Recall@K:\t0.050106\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(\"Model:\\t\" + learn.__class__.__name__,\n",
|
|
" \"Top K:\\t%d\" % TOP_K,\n",
|
|
" \"MAP:\\t%f\" % eval_map,\n",
|
|
" \"NDCG:\\t%f\" % eval_ndcg,\n",
|
|
" \"Precision@K:\\t%f\" % eval_precision,\n",
|
|
" \"Recall@K:\\t%f\" % eval_recall, sep='\\n')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"The above numbers are lower than those of [SAR](../sar_single_node_movielens.ipynb), which is expected, since the model is explicitly trying to generalize the users and items to the latent factors. Next, we look at how well the model predicts how a user would rate a movie. For this we need to score `test_df`, but this time without asking for the top k."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"scores = score(learner, test_df=test_df, \n",
|
|
" user_col=USER, item_col=ITEM, prediction_col=PREDICTION)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Now calculate some regression metrics"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 25,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Model:\tCollabLearner\n",
|
|
"RMSE:\t0.921269\n",
|
|
"MAE:\t0.729055\n",
|
|
"Explained variance:\t0.348939\n",
|
|
"R squared:\t0.348134\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"eval_r2 = rsquared(test_df, scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)\n",
|
|
"eval_rmse = rmse(test_df, scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)\n",
|
|
"eval_mae = mae(test_df, scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)\n",
|
|
"eval_exp_var = exp_var(test_df, scores, col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)\n",
|
|
"\n",
|
|
"print(\"Model:\\t\" + learn.__class__.__name__,\n",
|
|
" \"RMSE:\\t%f\" % eval_rmse,\n",
|
|
" \"MAE:\\t%f\" % eval_mae,\n",
|
|
" \"Explained variance:\\t%f\" % eval_exp_var,\n",
|
|
" \"R squared:\\t%f\" % eval_r2, sep='\\n')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"That RMSE is actually quite good when compared to these benchmarks: https://www.librec.net/release/v1.3/example.html"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 26,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"application/papermill.record+json": {
|
|
"map": 0.021576468861591765
|
|
}
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"application/papermill.record+json": {
|
|
"ndcg": 0.13668004147814297
|
|
}
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"application/papermill.record+json": {
|
|
"precision": 0.1271474019088017
|
|
}
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"application/papermill.record+json": {
|
|
"recall": 0.050105810550793446
|
|
}
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"application/papermill.record+json": {
|
|
"rmse": 0.9212688307302583
|
|
}
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"application/papermill.record+json": {
|
|
"mae": 0.7290550522309626
|
|
}
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"application/papermill.record+json": {
|
|
"exp_var": 0.34893854021632575
|
|
}
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"application/papermill.record+json": {
|
|
"rsquared": 0.3481341641867869
|
|
}
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"application/papermill.record+json": {
|
|
"train_time": 29.59385323524475
|
|
}
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"data": {
|
|
"application/papermill.record+json": {
|
|
"test_time": 1.967883825302124
|
|
}
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Record results with papermill for tests\n",
|
|
"pm.record(\"map\", eval_map)\n",
|
|
"pm.record(\"ndcg\", eval_ndcg)\n",
|
|
"pm.record(\"precision\", eval_precision)\n",
|
|
"pm.record(\"recall\", eval_recall)\n",
|
|
"pm.record(\"rmse\", eval_rmse)\n",
|
|
"pm.record(\"mae\", eval_mae)\n",
|
|
"pm.record(\"exp_var\", eval_exp_var)\n",
|
|
"pm.record(\"rsquared\", eval_r2)\n",
|
|
"pm.record(\"train_time\", train_time)\n",
|
|
"pm.record(\"test_time\", test_time)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"celltoolbar": "Tags",
|
|
"kernelspec": {
|
|
"display_name": "Python (reco_gpu)",
|
|
"language": "python",
|
|
"name": "reco_gpu"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.6.7"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|