rename common to utils
Parent: 3fb90955c0
Commit: 1eb2a93e4a
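
In practice the rename only changes the package path; the utilities keep their names and signatures. A minimal before/after sketch of what downstream imports look like (module names taken from the hunks below):

    # before this commit
    from reco_utils.common.timer import Timer
    from reco_utils.common.constants import SEED

    # after this commit
    from reco_utils.utils.timer import Timer
    from reco_utils.utils.constants import SEED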
|
@ -7,64 +7,64 @@ Common utilities module
|
|||
General utilities
|
||||
===============================
|
||||
|
||||
.. automodule:: reco_utils.common.general_utils
|
||||
.. automodule:: reco_utils.utils.general_utils
|
||||
:members:
|
||||
|
||||
|
||||
GPU utilities
|
||||
===============================
|
||||
|
||||
.. automodule:: reco_utils.common.gpu_utils
|
||||
.. automodule:: reco_utils.utils.gpu_utils
|
||||
:members:
|
||||
|
||||
|
||||
Kubernetes utilities
|
||||
===============================
|
||||
|
||||
.. automodule:: reco_utils.common.k8s_utils
|
||||
.. automodule:: reco_utils.utils.k8s_utils
|
||||
:members:
|
||||
|
||||
|
||||
Notebook utilities
|
||||
===============================
|
||||
|
||||
.. automodule:: reco_utils.common.notebook_utils
|
||||
.. automodule:: reco_utils.utils.notebook_utils
|
||||
:members:
|
||||
|
||||
.. automodule:: reco_utils.common.notebook_memory_management
|
||||
.. automodule:: reco_utils.utils.notebook_memory_management
|
||||
:members:
|
||||
|
||||
|
||||
Python utilities
|
||||
===============================
|
||||
|
||||
.. automodule:: reco_utils.common.python_utils
|
||||
.. automodule:: reco_utils.utils.python_utils
|
||||
:members:
|
||||
|
||||
|
||||
Spark utilities
|
||||
===============================
|
||||
|
||||
.. automodule:: reco_utils.common.spark_utils
|
||||
.. automodule:: reco_utils.utils.spark_utils
|
||||
:members:
|
||||
|
||||
|
||||
Tensorflow utilities
|
||||
===============================
|
||||
|
||||
.. automodule:: reco_utils.common.tf_utils
|
||||
.. automodule:: reco_utils.utils.tf_utils
|
||||
:members:
|
||||
|
||||
|
||||
Timer
|
||||
===============================
|
||||
|
||||
.. automodule:: reco_utils.common.timer
|
||||
.. automodule:: reco_utils.utils.timer
|
||||
:members:
|
||||
|
||||
|
||||
Plot utilities
|
||||
===============================
|
||||
|
||||
.. automodule:: reco_utils.common.plot
|
||||
.. automodule:: reco_utils.utils.plot
|
||||
:members:
|
|
@ -11,7 +11,7 @@ evaluating recommender systems.
|
|||
:maxdepth: 1
|
||||
:caption: Contents:
|
||||
|
||||
Common <common>
|
||||
Utils <utils>
|
||||
Dataset <dataset>
|
||||
Evaluation <evaluation>
|
||||
Recommender algorithms <recommender>
|
||||
|
|
|
@ -52,12 +52,12 @@
|
|||
"from pyspark.sql.types import StructType, StructField\n",
|
||||
"from pyspark.sql.types import StringType, FloatType, IntegerType, LongType\n",
|
||||
"\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.dataset import movielens\n",
|
||||
"from reco_utils.common.notebook_utils import is_jupyter\n",
|
||||
"from reco_utils.utils.notebook_utils import is_jupyter\n",
|
||||
"from reco_utils.dataset.spark_splitters import spark_random_split\n",
|
||||
"from reco_utils.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation\n",
|
||||
"from reco_utils.common.spark_utils import start_or_get_spark\n",
|
||||
"from reco_utils.utils.spark_utils import start_or_get_spark\n",
|
||||
"\n",
|
||||
"print(\"System version: {}\".format(sys.version))\n",
|
||||
"print(\"Spark version: {}\".format(pyspark.__version__))\n"
|
||||
|
|
|
@ -49,7 +49,7 @@
|
|||
"import torch, fastai\n",
|
||||
"from fastai.collab import EmbeddingDotBias, collab_learner, CollabDataBunch, load_learner\n",
|
||||
"\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.dataset import movielens\n",
|
||||
"from reco_utils.dataset.python_splitters import python_stratified_split\n",
|
||||
"from reco_utils.recommender.fastai.fastai_utils import cartesian_product, score\n",
|
||||
|
|
|
@ -52,11 +52,11 @@
|
|||
"import tensorflow as tf\n",
|
||||
"tf.get_logger().setLevel('ERROR') # only show error messages\n",
|
||||
"\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.recommender.ncf.ncf_singlenode import NCF\n",
|
||||
"from reco_utils.recommender.ncf.dataset import Dataset as NCFDataset\n",
|
||||
"from reco_utils.dataset import movielens\n",
|
||||
"from reco_utils.common.notebook_utils import is_jupyter\n",
|
||||
"from reco_utils.utils.notebook_utils import is_jupyter\n",
|
||||
"from reco_utils.dataset.python_splitters import python_chrono_split\n",
|
||||
"from reco_utils.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, \n",
|
||||
" recall_at_k, get_top_k_items)\n",
|
||||
|
|
|
@ -64,8 +64,8 @@
|
|||
"import scrapbook as sb\n",
|
||||
"from sklearn.preprocessing import minmax_scale\n",
|
||||
"\n",
|
||||
"from reco_utils.common.python_utils import binarize\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.utils.python_utils import binarize\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.dataset import movielens\n",
|
||||
"from reco_utils.dataset.python_splitters import python_stratified_split\n",
|
||||
"from reco_utils.evaluation.python_evaluation import (\n",
|
||||
|
@ -497,7 +497,7 @@
|
|||
"source": [
|
||||
"### 2.3. Evaluate how well SAR performs\n",
|
||||
"\n",
|
||||
"We evaluate how well SAR performs for a few common ranking metrics provided in the `python_evaluation` module in reco_utils. We will consider the Mean Average Precision (MAP), Normalized Discounted Cumalative Gain (NDCG), Precision, and Recall for the top-k items per user we computed with SAR. User, item and rating column names are specified in each evaluation method."
|
||||
"We evaluate how well SAR performs for a few utils ranking metrics provided in the `python_evaluation` module in reco_utils. We will consider the Mean Average Precision (MAP), Normalized Discounted Cumalative Gain (NDCG), Precision, and Recall for the top-k items per user we computed with SAR. User, item and rating column names are specified in each evaluation method."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
|
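For context, the ranking metrics named in the cell above are plain functions in `reco_utils.evaluation.python_evaluation`, which this commit does not move. A hedged sketch of scoring SAR's top-k output with them — the DataFrame names, column names, and `k=10` are illustrative, not copied from this diff:

    from reco_utils.evaluation.python_evaluation import (
        map_at_k, ndcg_at_k, precision_at_k, recall_at_k
    )

    # `test` holds the held-out ratings, `top_k` the SAR top-k recommendations
    args = dict(col_user="userID", col_item="itemID",
                col_rating="rating", col_prediction="prediction", k=10)
    eval_map = map_at_k(test, top_k, **args)
    eval_ndcg = ndcg_at_k(test, top_k, **args)
    eval_precision = precision_at_k(test, top_k, **args)
    eval_recall = recall_at_k(test, top_k, **args)
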
@ -337,7 +337,7 @@
|
|||
"from azureml.core import Run\n",
|
||||
"from sklearn.externals import joblib\n",
|
||||
"\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.dataset import movielens\n",
|
||||
"from reco_utils.dataset.python_splitters import python_stratified_split\n",
|
||||
"from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",
|
||||
|
|
|
@ -63,8 +63,8 @@
|
|||
"import tensorflow as tf\n",
|
||||
"tf.get_logger().setLevel('ERROR') # only show error messages\n",
|
||||
"\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.common.constants import SEED\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.utils.constants import SEED\n",
|
||||
"from reco_utils.recommender.deeprec.deeprec_utils import (\n",
|
||||
" prepare_hparams\n",
|
||||
")\n",
|
||||
|
|
|
@ -65,14 +65,14 @@
|
|||
"import tensorflow as tf\n",
|
||||
"tf.get_logger().setLevel('ERROR') # only show error messages\n",
|
||||
"\n",
|
||||
"from reco_utils.common.constants import (\n",
|
||||
"from reco_utils.utils.constants import (\n",
|
||||
" DEFAULT_USER_COL as USER_COL,\n",
|
||||
" DEFAULT_ITEM_COL as ITEM_COL,\n",
|
||||
" DEFAULT_RATING_COL as RATING_COL,\n",
|
||||
" DEFAULT_PREDICTION_COL as PREDICT_COL,\n",
|
||||
" SEED\n",
|
||||
")\n",
|
||||
"from reco_utils.common import tf_utils, gpu_utils, plot\n",
|
||||
"from reco_utils.utils import tf_utils, gpu_utils, plot\n",
|
||||
"from reco_utils.dataset import movielens\n",
|
||||
"from reco_utils.dataset.pandas_df_utils import user_item_pairs\n",
|
||||
"from reco_utils.dataset.python_splitters import python_random_split\n",
|
||||
|
|
|
@ -53,7 +53,7 @@
|
|||
"import tensorflow as tf\n",
|
||||
"tf.get_logger().setLevel('ERROR') # only show error messages\n",
|
||||
"\n",
|
||||
"from reco_utils.common.constants import SEED\n",
|
||||
"from reco_utils.utils.constants import SEED\n",
|
||||
"from reco_utils.recommender.deeprec.deeprec_utils import (\n",
|
||||
" download_deeprec_resources, prepare_hparams\n",
|
||||
")\n",
|
||||
|
|
|
@ -56,7 +56,7 @@
|
|||
"import numpy as np\n",
|
||||
"from datetime import datetime, timedelta\n",
|
||||
"\n",
|
||||
"from reco_utils.common.spark_utils import start_or_get_spark\n",
|
||||
"from reco_utils.utils.spark_utils import start_or_get_spark\n",
|
||||
"from reco_utils.dataset.download_utils import maybe_download\n",
|
||||
"from reco_utils.dataset.python_splitters import (\n",
|
||||
" python_random_split, \n",
|
||||
|
|
|
@ -70,7 +70,7 @@
|
|||
"import itertools\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"from reco_utils.common.notebook_utils import is_jupyter\n",
|
||||
"from reco_utils.utils.notebook_utils import is_jupyter\n",
|
||||
"from reco_utils.dataset import movielens\n",
|
||||
"from reco_utils.dataset.python_splitters import python_random_split\n",
|
||||
"from reco_utils.dataset.pandas_df_utils import filter_by\n",
|
||||
|
|
|
@ -55,8 +55,8 @@
|
|||
"from reco_utils.dataset.python_splitters import python_random_split\n",
|
||||
"from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",
|
||||
"from reco_utils.recommender.cornac.cornac_utils import predict_ranking\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.common.constants import SEED\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.utils.constants import SEED\n",
|
||||
"\n",
|
||||
"print(\"System version: {}\".format(sys.version))\n",
|
||||
"print(\"PyTorch version: {}\".format(torch.__version__))\n",
|
||||
|
|
|
@ -53,8 +53,8 @@
|
|||
"from reco_utils.dataset.python_splitters import python_random_split\n",
|
||||
"from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",
|
||||
"from reco_utils.recommender.cornac.cornac_utils import predict_ranking\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.common.constants import SEED\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.utils.constants import SEED\n",
|
||||
"\n",
|
||||
"print(\"System version: {}\".format(sys.version))\n",
|
||||
"print(\"Cornac version: {}\".format(cornac.__version__))"
|
||||
|
@ -89,7 +89,7 @@
|
|||
"\n",
|
||||
"### 1.1 Personalized Ranking from Implicit Feedback\n",
|
||||
"\n",
|
||||
"The task of personalized ranking aims at providing each user a ranked list of items (recommendations). This is very common in scenarios where recommender systems are based on implicit user behavior (e.g. purchases, clicks). The available observations are only positive feedback where the non-observed ones are a mixture of real negative feedback and missing values.\n",
|
||||
"The task of personalized ranking aims at providing each user a ranked list of items (recommendations). This is very utils in scenarios where recommender systems are based on implicit user behavior (e.g. purchases, clicks). The available observations are only positive feedback where the non-observed ones are a mixture of real negative feedback and missing values.\n",
|
||||
"\n",
|
||||
"One usual approach for item recommendation is directly predicting a preference score $\\hat{x}_{u,i}$ given to item $i$ by user $u$. BPR uses a different approach by using item pairs $(i, j)$ and optimizing for the correct ranking given preference of user $u$, thus, there are notions of *positive* and *negative* items. The training data $D_S : U \\times I \\times I$ is defined as:\n",
|
||||
"\n",
|
||||
|
@ -118,7 +118,7 @@
|
|||
"\n",
|
||||
"The preference scoring function $\\hat{x}_{uij}(\\Theta)$ could be an arbitrary real-valued function of the model parameter $\\Theta$. Thus, it makes BPR a general framework for modeling the relationship between triplets $(u, i, j)$ where different model classes like matrix factorization could be used for estimating $\\hat{x}_{uij}(\\Theta)$.\n",
|
||||
"\n",
|
||||
"For the prior, one of the common pratices is to choose $p(\\Theta)$ following a normal distribution, which results in a nice form of L2 regularization in the final log-form of the objective function.\n",
|
||||
"For the prior, one of the utils pratices is to choose $p(\\Theta)$ following a normal distribution, which results in a nice form of L2 regularization in the final log-form of the objective function.\n",
|
||||
"\n",
|
||||
"$$ p(\\Theta) \\sim N(0, \\Sigma_{\\Theta}) $$\n",
|
||||
"\n",
|
||||
|
|
|
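For readers skimming the two BPR cells above: with the normal prior $p(\Theta) \sim N(0, \Sigma_\Theta)$ mentioned there, the maximum-posterior criterion reduces to the standard BPR-OPT objective of Rendle et al.; a sketch of that log-form, where the regularization constant $\lambda_\Theta$ absorbs $\Sigma_\Theta$:

$$ \text{BPR-OPT} = \sum_{(u,i,j) \in D_S} \ln \sigma(\hat{x}_{uij}) - \lambda_\Theta \lVert \Theta \rVert^2 $$
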
@ -52,13 +52,13 @@
|
|||
"import tensorflow as tf\n",
|
||||
"tf.get_logger().setLevel('ERROR') # only show error messages\n",
|
||||
"\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.recommender.deeprec.models.graphrec.lightgcn import LightGCN\n",
|
||||
"from reco_utils.recommender.deeprec.DataModel.ImplicitCF import ImplicitCF\n",
|
||||
"from reco_utils.dataset import movielens\n",
|
||||
"from reco_utils.dataset.python_splitters import python_stratified_split\n",
|
||||
"from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",
|
||||
"from reco_utils.common.constants import SEED as DEFAULT_SEED\n",
|
||||
"from reco_utils.utils.constants import SEED as DEFAULT_SEED\n",
|
||||
"from reco_utils.recommender.deeprec.deeprec_utils import prepare_hparams\n",
|
||||
"\n",
|
||||
"print(\"System version: {}\".format(sys.version))\n",
|
||||
|
@ -728,7 +728,7 @@
|
|||
"\n",
|
||||
"Here there are the performances of LightGCN compared to [SAR](../00_quick_start/sar_movielens.ipynb) and [NCF](../00_quick_start/ncf_movielens.ipynb) on MovieLens dataset of 100k and 1m. The method of data loading and splitting is the same as that described above and the GPU used was a GeForce GTX 1080Ti.\n",
|
||||
"\n",
|
||||
"Settings common to the three models: `epochs=15, seed=42`.\n",
|
||||
"Settings utils to the three models: `epochs=15, seed=42`.\n",
|
||||
"\n",
|
||||
"Settings for LightGCN: `embed_size=64, n_layers=3, batch_size=1024, decay=0.0001, learning_rate=0.015 `.\n",
|
||||
"\n",
|
||||
|
|
|
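The LightGCN settings quoted in the cell above are normally passed through `prepare_hparams` (imported elsewhere in this diff). A hedged sketch, where the `lightgcn.yaml` path is a placeholder and the keyword overrides mirror the quoted settings:

    from reco_utils.recommender.deeprec.deeprec_utils import prepare_hparams

    # yaml_file path is illustrative; kwargs override the values loaded from it
    hparams = prepare_hparams(
        "lightgcn.yaml",
        epochs=15,
        embed_size=64,
        n_layers=3,
        batch_size=1024,
        decay=0.0001,
        learning_rate=0.015,
    )
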
@ -96,14 +96,14 @@
|
|||
"import tensorflow as tf\n",
|
||||
"import keras\n",
|
||||
"\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.dataset import movielens\n",
|
||||
"from reco_utils.dataset.split_utils import min_rating_filter_pandas\n",
|
||||
"from reco_utils.dataset.python_splitters import numpy_stratified_split\n",
|
||||
"from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",
|
||||
"\n",
|
||||
"from reco_utils.dataset.sparse import AffinityMatrix\n",
|
||||
"from reco_utils.common.python_utils import binarize\n",
|
||||
"from reco_utils.utils.python_utils import binarize\n",
|
||||
"from reco_utils.recommender.vae.multinomial_vae import Mult_VAE\n",
|
||||
"\n",
|
||||
"from tempfile import TemporaryDirectory\n",
|
||||
|
|
|
@ -96,15 +96,15 @@
|
|||
"import tensorflow as tf\n",
|
||||
"import keras\n",
|
||||
"\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.dataset import movielens\n",
|
||||
"from reco_utils.dataset.split_utils import min_rating_filter_pandas\n",
|
||||
"from reco_utils.dataset.python_splitters import numpy_stratified_split\n",
|
||||
"from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",
|
||||
"from reco_utils.common.constants import SEED as DEFAULT_SEED\n",
|
||||
"from reco_utils.utils.constants import SEED as DEFAULT_SEED\n",
|
||||
"\n",
|
||||
"from reco_utils.dataset.sparse import AffinityMatrix\n",
|
||||
"from reco_utils.common.python_utils import binarize\n",
|
||||
"from reco_utils.utils.python_utils import binarize\n",
|
||||
"from reco_utils.recommender.vae.standard_vae import StandardVAE\n",
|
||||
"\n",
|
||||
"print(\"System version: {}\".format(sys.version))\n",
|
||||
|
|
|
@ -105,7 +105,7 @@
|
|||
"import scrapbook as sb\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.dataset import movielens\n",
|
||||
"from reco_utils.dataset.python_splitters import python_random_split\n",
|
||||
"from reco_utils.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, \n",
|
||||
|
|
|
@ -71,8 +71,8 @@
|
|||
"import papermill as pm\n",
|
||||
"import scrapbook as sb\n",
|
||||
"\n",
|
||||
"from reco_utils.common.spark_utils import start_or_get_spark\n",
|
||||
"from reco_utils.common.notebook_utils import is_databricks\n",
|
||||
"from reco_utils.utils.spark_utils import start_or_get_spark\n",
|
||||
"from reco_utils.utils.notebook_utils import is_databricks\n",
|
||||
"from reco_utils.dataset.criteo import load_spark_df\n",
|
||||
"from reco_utils.dataset.spark_splitters import spark_random_split\n",
|
||||
"\n",
|
||||
|
|
|
@ -86,7 +86,7 @@
|
|||
"import papermill as pm\n",
|
||||
"import scrapbook as sb\n",
|
||||
"\n",
|
||||
"from reco_utils.common.notebook_utils import is_jupyter\n",
|
||||
"from reco_utils.utils.notebook_utils import is_jupyter\n",
|
||||
"from reco_utils.dataset.movielens import load_pandas_df\n",
|
||||
"from reco_utils.dataset.python_splitters import python_random_split\n",
|
||||
"from reco_utils.evaluation.python_evaluation import (rmse, mae, exp_var, rsquared, get_top_k_items,\n",
|
||||
|
|
|
@ -251,8 +251,8 @@
|
|||
"%matplotlib notebook\n",
|
||||
"from matplotlib import pyplot as plt\n",
|
||||
"\n",
|
||||
"from reco_utils.common.constants import SEED\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.utils.constants import SEED\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.dataset.download_utils import maybe_download, unzip_file\n",
|
||||
"from reco_utils.tuning.parameter_sweep import generate_param_grid\n",
|
||||
"from reco_utils.dataset.pandas_df_utils import LibffmConverter\n",
|
||||
|
|
|
@ -53,14 +53,14 @@
|
|||
"import tensorflow as tf\n",
|
||||
"tf.get_logger().setLevel('ERROR') # only show error messages\n",
|
||||
"\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.recommender.ncf.ncf_singlenode import NCF\n",
|
||||
"from reco_utils.recommender.ncf.dataset import Dataset as NCFDataset\n",
|
||||
"from reco_utils.dataset import movielens\n",
|
||||
"from reco_utils.dataset.python_splitters import python_chrono_split\n",
|
||||
"from reco_utils.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, \n",
|
||||
" recall_at_k, get_top_k_items)\n",
|
||||
"from reco_utils.common.constants import SEED as DEFAULT_SEED\n",
|
||||
"from reco_utils.utils.constants import SEED as DEFAULT_SEED\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"print(\"System version: {}\".format(sys.version))\n",
|
||||
|
@ -196,7 +196,7 @@
|
|||
"\n",
|
||||
"To evaluate the performance of item recommendation, we adopted the leave-one-out evaluation.\n",
|
||||
"\n",
|
||||
"For each user, we held out his/her latest interaction as the test set and utilized the remaining data for training. We use `python_chrono_split` to achieve this. And since it is too time-consuming to rank all items for every user during evaluation, we followed the common strategy that randomly samples 100 items that are not interacted by the user, ranking the test item among the 100 items. Our test samples will be constructed by `NCFDataset`."
|
||||
"For each user, we held out his/her latest interaction as the test set and utilized the remaining data for training. We use `python_chrono_split` to achieve this. And since it is too time-consuming to rank all items for every user during evaluation, we followed the utils strategy that randomly samples 100 items that are not interacted by the user, ranking the test item among the 100 items. Our test samples will be constructed by `NCFDataset`."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
|
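A hedged sketch of the chronological split described above (`df` and the 0.75 ratio are illustrative; the 100-item negative sampling is handled later by `NCFDataset`, not by the splitter):

    from reco_utils.dataset.python_splitters import python_chrono_split

    # per-user split ordered by timestamp: the most recent interactions go to `test`
    train, test = python_chrono_split(df, ratio=0.75)
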
@ -68,12 +68,12 @@
|
|||
"from pyspark.sql.types import StructType, StructField\n",
|
||||
"from pyspark.sql.types import StringType, FloatType, IntegerType, LongType\n",
|
||||
"\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.dataset import movielens\n",
|
||||
"from reco_utils.common.notebook_utils import is_jupyter\n",
|
||||
"from reco_utils.utils.notebook_utils import is_jupyter\n",
|
||||
"from reco_utils.dataset.spark_splitters import spark_random_split\n",
|
||||
"from reco_utils.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation\n",
|
||||
"from reco_utils.common.spark_utils import start_or_get_spark\n",
|
||||
"from reco_utils.utils.spark_utils import start_or_get_spark\n",
|
||||
"\n",
|
||||
"from reco_utils.evaluation.spark_diversity_evaluation import DiversityEvaluation\n",
|
||||
"from pyspark.sql.window import Window\n",
|
||||
|
|
|
@ -63,7 +63,7 @@
|
|||
"import pyspark\n",
|
||||
"from sklearn.preprocessing import minmax_scale\n",
|
||||
"\n",
|
||||
"from reco_utils.common.spark_utils import start_or_get_spark\n",
|
||||
"from reco_utils.utils.spark_utils import start_or_get_spark\n",
|
||||
"from reco_utils.evaluation.spark_evaluation import SparkRankingEvaluation, SparkRatingEvaluation\n",
|
||||
"from reco_utils.evaluation.python_evaluation import auc, logloss\n",
|
||||
"\n",
|
||||
|
|
|
@ -85,9 +85,9 @@
|
|||
"import azureml.widgets as widgets\n",
|
||||
"import azureml.train.hyperdrive as hd\n",
|
||||
"\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.common.constants import SEED\n",
|
||||
"from reco_utils.common.tf_utils import pandas_input_fn_for_saved_model\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.utils.constants import SEED\n",
|
||||
"from reco_utils.utils.tf_utils import pandas_input_fn_for_saved_model\n",
|
||||
"from reco_utils.dataset import movielens\n",
|
||||
"from reco_utils.dataset.pandas_df_utils import user_item_pairs\n",
|
||||
"from reco_utils.dataset.python_splitters import python_random_split\n",
|
||||
|
|
|
@ -67,7 +67,7 @@
|
|||
"tf.get_logger().setLevel('ERROR') # only show error messages\n",
|
||||
"\n",
|
||||
"import reco_utils\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.dataset import movielens\n",
|
||||
"from reco_utils.dataset.python_splitters import python_chrono_split\n",
|
||||
"from reco_utils.evaluation.python_evaluation import rmse, precision_at_k, ndcg_at_k\n",
|
||||
|
|
|
@ -61,7 +61,7 @@
|
|||
"from tempfile import TemporaryDirectory\n",
|
||||
"\n",
|
||||
"import reco_utils\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.dataset import movielens\n",
|
||||
"from reco_utils.dataset.python_splitters import python_random_split\n",
|
||||
"from reco_utils.evaluation.python_evaluation import rmse, precision_at_k, ndcg_at_k\n",
|
||||
|
|
|
@ -19,7 +19,7 @@ try:
|
|||
except ImportError:
|
||||
run = None
|
||||
|
||||
from reco_utils.common.constants import (
|
||||
from reco_utils.utils.constants import (
|
||||
DEFAULT_USER_COL,
|
||||
DEFAULT_ITEM_COL,
|
||||
DEFAULT_RATING_COL,
|
||||
|
|
|
@ -81,8 +81,8 @@
|
|||
"from hyperopt.pyll.base import scope\n",
|
||||
"from hyperopt.pyll.stochastic import sample\n",
|
||||
"\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.common.spark_utils import start_or_get_spark\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.utils.spark_utils import start_or_get_spark\n",
|
||||
"from reco_utils.evaluation.spark_evaluation import SparkRankingEvaluation, SparkRatingEvaluation\n",
|
||||
"from reco_utils.dataset.movielens import load_spark_df\n",
|
||||
"from reco_utils.dataset.spark_splitters import spark_random_split\n",
|
||||
|
|
|
@ -78,7 +78,7 @@
|
|||
|
||||
"import urllib\n",
|
||||
"\n",
|
||||
"from azure.common.client_factory import get_client_from_cli_profile\n",
|
||||
"from azure.utils.client_factory import get_client_from_cli_profile\n",
|
||||
"import azure.mgmt.cosmosdb\n",
|
||||
"import azureml.core\n",
|
||||
"from azureml.core import Workspace\n",
|
||||
|
@ -96,14 +96,14 @@
|
|||
"from pyspark.sql.types import StructType, StructField\n",
|
||||
"from pyspark.sql.types import FloatType, IntegerType, LongType\n",
|
||||
"\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.common.spark_utils import start_or_get_spark\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.utils.spark_utils import start_or_get_spark\n",
|
||||
"from reco_utils.dataset import movielens\n",
|
||||
"from reco_utils.dataset.cosmos_cli import find_collection, read_collection, read_database, find_database\n",
|
||||
"from reco_utils.dataset.download_utils import maybe_download\n",
|
||||
"from reco_utils.dataset.spark_splitters import spark_random_split\n",
|
||||
"from reco_utils.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation\n",
|
||||
"from reco_utils.common.notebook_utils import is_databricks\n",
|
||||
"from reco_utils.utils.notebook_utils import is_databricks\n",
|
||||
"\n",
|
||||
"print(\"Azure SDK version:\", azureml.core.VERSION)"
|
||||
]
|
||||
|
@ -165,7 +165,7 @@
|
|||
"1. [Azure ML Service](https://azure.microsoft.com/en-us/services/machine-learning-service/)\n",
|
||||
" 1. [Azure ML Workspace](https://docs.microsoft.com/en-us/azure/machine-learning/concept-workspace)\n",
|
||||
" 1. [Azure Application Insights](https://azure.microsoft.com/en-us/services/monitor/)\n",
|
||||
" 1. [Azure Storage](https://docs.microsoft.com/en-us/azure/storage/common/storage-account-overview)\n",
|
||||
" 1. [Azure Storage](https://docs.microsoft.com/en-us/azure/storage/utils/storage-account-overview)\n",
|
||||
" 1. [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) \n",
|
||||
"\n",
|
||||
"1. [Azure Cosmos DB](https://azure.microsoft.com/en-us/services/cosmos-db/)\n",
|
||||
|
|
|
@ -95,7 +95,7 @@
|
|||
"import shutil\n",
|
||||
"\n",
|
||||
"from reco_utils.dataset.criteo import get_spark_schema, load_spark_df\n",
|
||||
"from reco_utils.common.k8s_utils import qps_to_replicas, replicas_to_qps, nodes_to_replicas\n",
|
||||
"from reco_utils.utils.k8s_utils import qps_to_replicas, replicas_to_qps, nodes_to_replicas\n",
|
||||
"\n",
|
||||
"from azureml.core import Workspace\n",
|
||||
"from azureml.core import VERSION as azureml_version\n",
|
||||
|
|
|
@ -7,7 +7,7 @@ from fastai.collab import collab_learner, CollabDataBunch
|
|||
import surprise
|
||||
import cornac
|
||||
|
||||
from reco_utils.common.constants import (
|
||||
from reco_utils.utils.constants import (
|
||||
COL_DICT,
|
||||
DEFAULT_K,
|
||||
DEFAULT_USER_COL,
|
||||
|
@ -17,8 +17,8 @@ from reco_utils.common.constants import (
|
|||
DEFAULT_TIMESTAMP_COL,
|
||||
SEED,
|
||||
)
|
||||
from reco_utils.common.timer import Timer
|
||||
from reco_utils.common.spark_utils import start_or_get_spark
|
||||
from reco_utils.utils.timer import Timer
|
||||
from reco_utils.utils.spark_utils import start_or_get_spark
|
||||
from reco_utils.recommender.sar.sar_singlenode import SARSingleNode
|
||||
from reco_utils.recommender.ncf.ncf_singlenode import NCF
|
||||
from reco_utils.recommender.ncf.dataset import Dataset as NCFDataset
|
||||
|
@ -44,12 +44,7 @@ from reco_utils.evaluation.python_evaluation import (
|
|||
precision_at_k,
|
||||
recall_at_k,
|
||||
)
|
||||
from reco_utils.evaluation.python_evaluation import (
|
||||
rmse,
|
||||
mae,
|
||||
rsquared,
|
||||
exp_var
|
||||
)
|
||||
from reco_utils.evaluation.python_evaluation import rmse, mae, rsquared, exp_var
|
||||
|
||||
|
||||
def prepare_training_als(train, test):
|
||||
|
@ -82,8 +77,7 @@ def prepare_metrics_als(train, test):
|
|||
)
|
||||
)
|
||||
spark = start_or_get_spark()
|
||||
return spark.createDataFrame(train, schema), spark.createDataFrame(test,
|
||||
schema)
|
||||
return spark.createDataFrame(train, schema), spark.createDataFrame(test, schema)
|
||||
|
||||
|
||||
def predict_als(model, test):
|
||||
|
@ -172,10 +166,7 @@ def prepare_training_fastai(train, test):
|
|||
|
||||
def train_fastai(params, data):
|
||||
model = collab_learner(
|
||||
data,
|
||||
n_factors=params["n_factors"],
|
||||
y_range=params["y_range"],
|
||||
wd=params["wd"]
|
||||
data, n_factors=params["n_factors"], y_range=params["y_range"], wd=params["wd"]
|
||||
)
|
||||
with Timer() as t:
|
||||
model.fit_one_cycle(cyc_len=params["epochs"], max_lr=params["max_lr"])
|
||||
|
@ -267,10 +258,7 @@ def recommend_k_ncf(model, test, train, top_k=DEFAULT_K, remove_seen=True):
|
|||
}
|
||||
)
|
||||
merged = pd.merge(
|
||||
train,
|
||||
topk_scores,
|
||||
on=[DEFAULT_USER_COL, DEFAULT_ITEM_COL],
|
||||
how="outer"
|
||||
train, topk_scores, on=[DEFAULT_USER_COL, DEFAULT_ITEM_COL], how="outer"
|
||||
)
|
||||
topk_scores = merged[merged[DEFAULT_RATING_COL].isnull()].drop(
|
||||
DEFAULT_RATING_COL, axis=1
|
||||
|
@ -280,8 +268,7 @@ def recommend_k_ncf(model, test, train, top_k=DEFAULT_K, remove_seen=True):
|
|||
|
||||
def prepare_training_cornac(train, test):
|
||||
return cornac.data.Dataset.from_uir(
|
||||
train.drop(DEFAULT_TIMESTAMP_COL, axis=1).itertuples(index=False),
|
||||
seed=SEED
|
||||
train.drop(DEFAULT_TIMESTAMP_COL, axis=1).itertuples(index=False), seed=SEED
|
||||
)
|
||||
|
||||
|
||||
|
@ -344,11 +331,7 @@ def train_lightgcn(params, data):
|
|||
return model, t
|
||||
|
||||
|
||||
def recommend_k_lightgcn(model,
|
||||
test,
|
||||
train,
|
||||
top_k=DEFAULT_K,
|
||||
remove_seen=True):
|
||||
def recommend_k_lightgcn(model, test, train, top_k=DEFAULT_K, remove_seen=True):
|
||||
with Timer() as t:
|
||||
topk_scores = model.recommend_k_items(
|
||||
test, top_k=top_k, remove_seen=remove_seen
|
||||
|
|
|
@ -106,8 +106,8 @@
|
|||
"tf.get_logger().setLevel('ERROR') # only show error messages\n",
|
||||
"import surprise\n",
|
||||
"\n",
|
||||
"from reco_utils.common.general_utils import get_number_processors\n",
|
||||
"from reco_utils.common.gpu_utils import get_cuda_version, get_cudnn_version\n",
|
||||
"from reco_utils.utils.general_utils import get_number_processors\n",
|
||||
"from reco_utils.utils.gpu_utils import get_cuda_version, get_cudnn_version\n",
|
||||
"from reco_utils.dataset import movielens\n",
|
||||
"from reco_utils.dataset.python_splitters import python_stratified_split\n",
|
||||
"\n",
|
||||
|
|
|
@ -36,13 +36,13 @@
|
|||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import tensorflow as tf\n",
|
||||
"from reco_utils.common.timer import Timer\n",
|
||||
"from reco_utils.utils.timer import Timer\n",
|
||||
"from reco_utils.recommender.deeprec.models.graphrec.lightgcn import LightGCN\n",
|
||||
"from reco_utils.recommender.deeprec.DataModel.ImplicitCF import ImplicitCF\n",
|
||||
"from reco_utils.dataset import movielens\n",
|
||||
"from reco_utils.dataset.python_splitters import python_stratified_split\n",
|
||||
"from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",
|
||||
"from reco_utils.common.constants import SEED as DEFAULT_SEED\n",
|
||||
"from reco_utils.utils.constants import SEED as DEFAULT_SEED\n",
|
||||
"from reco_utils.recommender.deeprec.deeprec_utils import prepare_hparams\n",
|
||||
"from reco_utils.recommender.deeprec.deeprec_utils import cal_metric\n",
|
||||
"from utils.general import *\n",
|
||||
|
|
|
@ -84,7 +84,7 @@ It is also possible to install directly from GitHub. Or from a specific branch a
|
|||
|
||||
# Contents
|
||||
|
||||
## [Common](common)
|
||||
## [Utils](utils)
|
||||
|
||||
This submodule contains high-level utilities for defining constants used in most algorithms as well as helper functions for managing aspects of different frameworks: GPU, Spark, Jupyter notebook.
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ import random
|
|||
import logging
|
||||
import _pickle as cPickle
|
||||
|
||||
from reco_utils.common.constants import SEED
|
||||
from reco_utils.utils.constants import SEED
|
||||
from reco_utils.dataset.download_utils import maybe_download, download_path
|
||||
|
||||
|
||||
|
|
|
@ -12,7 +12,7 @@ except ImportError:
|
|||
pass # so the environment without spark doesn't break
|
||||
|
||||
from reco_utils.dataset.download_utils import maybe_download, download_path
|
||||
from reco_utils.common.notebook_utils import is_databricks
|
||||
from reco_utils.utils.notebook_utils import is_databricks
|
||||
|
||||
|
||||
CRITEO_URL = {
|
||||
|
@ -40,11 +40,11 @@ def load_pandas_df(size="sample", local_cache_path=None, header=DEFAULT_HEADER):
|
|||
The schema is:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
||||
<label> <integer feature 1> ... <integer feature 13> <categorical feature 1> ... <categorical feature 26>
|
||||
|
||||
More details (need to accept user terms to see the information):
|
||||
http://labs.criteo.com/2013/12/download-terabyte-click-logs/
|
||||
More details (need to accept user terms to see the information):
|
||||
http://labs.criteo.com/2013/12/download-terabyte-click-logs/
|
||||
|
||||
Args:
|
||||
size (str): Dataset size. It can be "sample" or "full".
|
||||
|
@ -80,13 +80,13 @@ def load_spark_df(
|
|||
onto 32 bits for anonymization purposes.
|
||||
|
||||
The schema is:
|
||||
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
||||
<label> <integer feature 1> ... <integer feature 13> <categorical feature 1> ... <categorical feature 26>
|
||||
|
||||
More details (need to accept user terms to see the information):
|
||||
http://labs.criteo.com/2013/12/download-terabyte-click-logs/
|
||||
More details (need to accept user terms to see the information):
|
||||
http://labs.criteo.com/2013/12/download-terabyte-click-logs/
|
||||
|
||||
Args:
|
||||
spark (pySpark.SparkSession): Spark session.
|
||||
|
@ -95,7 +95,7 @@ def load_spark_df(
|
|||
header (list): Dataset header names.
|
||||
dbfs_datapath (str): Where to store the extracted files on Databricks.
|
||||
dbutils (Databricks.dbutils): Databricks utility object.
|
||||
|
||||
|
||||
Returns:
|
||||
pyspark.sql.DataFrame: Criteo DAC training dataset.
|
||||
"""
|
||||
|
@ -145,10 +145,10 @@ def extract_criteo(size, compressed_file, path=None):
|
|||
size (str): Size of Criteo dataset. It can be "full" or "sample".
|
||||
compressed_file (str): Path to compressed file.
|
||||
path (str): Path to extract the file.
|
||||
|
||||
|
||||
Returns:
|
||||
str: Path to the extracted file.
|
||||
|
||||
|
||||
"""
|
||||
if path is None:
|
||||
folder = os.path.dirname(compressed_file)
|
||||
|
|
|
@ -8,8 +8,8 @@ import warnings
|
|||
import pandas as pd
|
||||
from zipfile import ZipFile
|
||||
from reco_utils.dataset.download_utils import maybe_download, download_path
|
||||
from reco_utils.common.notebook_utils import is_databricks
|
||||
from reco_utils.common.constants import (
|
||||
from reco_utils.utils.notebook_utils import is_databricks
|
||||
from reco_utils.utils.constants import (
|
||||
DEFAULT_USER_COL,
|
||||
DEFAULT_ITEM_COL,
|
||||
DEFAULT_RATING_COL,
|
||||
|
@ -151,7 +151,7 @@ def load_pandas_df(
|
|||
"""Loads the MovieLens dataset as pd.DataFrame.
|
||||
|
||||
Download the dataset from http://files.grouplens.org/datasets/movielens, unzip, and load.
|
||||
To load movie information only, you can use load_item_df function.
|
||||
To load movie information only, you can use load_item_df function.
|
||||
|
||||
Args:
|
||||
size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
|
||||
|
@ -165,12 +165,12 @@ def load_pandas_df(
|
|||
|
||||
Returns:
|
||||
pandas.DataFrame: Movie rating dataset.
|
||||
|
||||
|
||||
|
||||
**Examples**
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
||||
# To load just user-id, item-id, and ratings from MovieLens-1M dataset,
|
||||
df = load_pandas_df('1m', ('UserId', 'ItemId', 'Rating'))
|
||||
|
||||
|
@ -345,14 +345,14 @@ def load_spark_df(
|
|||
|
||||
Download the dataset from http://files.grouplens.org/datasets/movielens, unzip, and load as `pyspark.sql.DataFrame`.
|
||||
|
||||
To load movie information only, you can use `load_item_df` function.
|
||||
To load movie information only, you can use `load_item_df` function.
|
||||
|
||||
Args:
|
||||
spark (pyspark.SparkSession): Spark session.
|
||||
size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
|
||||
header (list or tuple): Rating dataset header.
|
||||
If schema is provided, this argument is ignored.
|
||||
schema (pyspark.StructType): Dataset schema.
|
||||
schema (pyspark.StructType): Dataset schema.
|
||||
local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
|
||||
If None, all the intermediate files will be stored in a temporary directory and removed after use.
|
||||
dbutils (Databricks.dbutils): Databricks utility object
|
||||
|
@ -363,11 +363,11 @@ def load_spark_df(
|
|||
|
||||
Returns:
|
||||
pyspark.sql.DataFrame: Movie rating dataset.
|
||||
|
||||
|
||||
**Examples**
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
|
||||
# To load just user-id, item-id, and ratings from MovieLens-1M dataset:
|
||||
spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating'))
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ import pandas as pd
|
|||
import numpy as np
|
||||
from functools import lru_cache, wraps
|
||||
|
||||
from reco_utils.common.constants import (
|
||||
from reco_utils.utils.constants import (
|
||||
DEFAULT_USER_COL,
|
||||
DEFAULT_ITEM_COL,
|
||||
DEFAULT_RATING_COL,
|
||||
|
|
|
@ -4,7 +4,7 @@ import numpy as np
|
|||
import pandas as pd
|
||||
from sklearn.model_selection import train_test_split as sk_split
|
||||
|
||||
from reco_utils.common.constants import (
|
||||
from reco_utils.utils.constants import (
|
||||
DEFAULT_ITEM_COL,
|
||||
DEFAULT_USER_COL,
|
||||
DEFAULT_TIMESTAMP_COL,
|
||||
|
@ -245,8 +245,8 @@ def numpy_stratified_split(X, ratio=0.75, seed=42):
|
|||
seed (int): random seed
|
||||
|
||||
Returns:
|
||||
numpy.ndarray, numpy.ndarray:
|
||||
- Xtr: The train set user/item affinity matrix.
|
||||
numpy.ndarray, numpy.ndarray:
|
||||
- Xtr: The train set user/item affinity matrix.
|
||||
- Xtst: The test set user/item affinity matrix.
|
||||
"""
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@ try:
|
|||
except ImportError:
|
||||
pass # skip this import if we are in pure python environment
|
||||
|
||||
from reco_utils.common.constants import (
|
||||
from reco_utils.utils.constants import (
|
||||
DEFAULT_ITEM_COL,
|
||||
DEFAULT_USER_COL,
|
||||
DEFAULT_TIMESTAMP_COL,
|
||||
|
@ -19,15 +19,15 @@ from reco_utils.dataset.split_utils import process_split_ratio, min_rating_filte
|
|||
|
||||
def spark_random_split(data, ratio=0.75, seed=42):
|
||||
"""Spark random splitter.
|
||||
|
||||
|
||||
Randomly split the data into several splits.
|
||||
|
||||
Args:
|
||||
data (pyspark.sql.DataFrame): Spark DataFrame to be split.
|
||||
ratio (float or list): Ratio for splitting data. If it is a single float number
|
||||
it splits data into two halves and the ratio argument indicates the ratio of
|
||||
training data set; if it is a list of float numbers, the splitter splits
|
||||
data into several portions corresponding to the split ratios. If a list
|
||||
it splits data into two halves and the ratio argument indicates the ratio of
|
||||
training data set; if it is a list of float numbers, the splitter splits
|
||||
data into several portions corresponding to the split ratios. If a list
|
||||
is provided and the ratios are not summed to 1, they will be normalized.
|
||||
seed (int): Seed.
|
||||
|
||||
|
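A hedged usage sketch of the ratio semantics documented above (the DataFrame name is illustrative):

    from reco_utils.dataset.spark_splitters import spark_random_split

    # a single float yields a two-way split with that share in the training set
    train, test = spark_random_split(spark_df, ratio=0.75, seed=42)

    # a list of ratios (normalized if they do not sum to 1) yields several splits
    train, validate, test = spark_random_split(spark_df, ratio=[0.6, 0.2, 0.2], seed=42)
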
@ -56,30 +56,30 @@ def _do_stratification_spark(
|
|||
):
|
||||
"""Helper function to perform stratified splits.
|
||||
|
||||
This function splits data in a stratified manner. That is, the same values for the
|
||||
filter_by column are retained in each split, but the corresponding set of entries
|
||||
are divided according to the ratio provided.
|
||||
This function splits data in a stratified manner. That is, the same values for the
|
||||
filter_by column are retained in each split, but the corresponding set of entries
|
||||
are divided according to the ratio provided.
|
||||
|
||||
Args:
|
||||
data (pyspark.sql.DataFrame): Spark DataFrame to be split.
|
||||
ratio (float or list): Ratio for splitting data. If it is a single float number
|
||||
it splits data into two sets and the ratio argument indicates the ratio of
|
||||
training data set; if it is a list of float numbers, the splitter splits
|
||||
data into several portions corresponding to the split ratios. If a list is
|
||||
provided and the ratios are not summed to 1, they will be normalized.
|
||||
min_rating (int): minimum number of ratings for user or item.
|
||||
filter_by (str): either "user" or "item", depending on which of the two is to filter
|
||||
with min_rating.
|
||||
is_partitioned (bool): flag to partition data by filter_by column
|
||||
is_random (bool): flag to make split randomly or use timestamp column
|
||||
seed (int): Seed.
|
||||
col_user (str): column name of user IDs.
|
||||
col_item (str): column name of item IDs.
|
||||
col_timestamp (str): column name of timestamps.
|
||||
Args:
|
||||
data (pyspark.sql.DataFrame): Spark DataFrame to be split.
|
||||
ratio (float or list): Ratio for splitting data. If it is a single float number
|
||||
it splits data into two sets and the ratio argument indicates the ratio of
|
||||
training data set; if it is a list of float numbers, the splitter splits
|
||||
data into several portions corresponding to the split ratios. If a list is
|
||||
provided and the ratios are not summed to 1, they will be normalized.
|
||||
min_rating (int): minimum number of ratings for user or item.
|
||||
filter_by (str): either "user" or "item", depending on which of the two is to filter
|
||||
with min_rating.
|
||||
is_partitioned (bool): flag to partition data by filter_by column
|
||||
is_random (bool): flag to make split randomly or use timestamp column
|
||||
seed (int): Seed.
|
||||
col_user (str): column name of user IDs.
|
||||
col_item (str): column name of item IDs.
|
||||
col_timestamp (str): column name of timestamps.
|
||||
|
||||
Args:
|
||||
Args:
|
||||
|
||||
Returns:
|
||||
Returns:
|
||||
"""
|
||||
# A few preliminary checks.
|
||||
if filter_by not in ["user", "item"]:
|
||||
|
@ -115,17 +115,16 @@ def _do_stratification_spark(
|
|||
window_spec = Window.partitionBy(partition_by).orderBy(order_by)
|
||||
|
||||
data = (
|
||||
data
|
||||
.withColumn("_count", F.count(split_by).over(window_count))
|
||||
.withColumn("_rank", F.row_number().over(window_spec) / F.col("_count"))
|
||||
.drop("_count")
|
||||
data.withColumn("_count", F.count(split_by).over(window_count))
|
||||
.withColumn("_rank", F.row_number().over(window_spec) / F.col("_count"))
|
||||
.drop("_count")
|
||||
)
|
||||
|
||||
multi_split, ratio = process_split_ratio(ratio)
|
||||
ratio = ratio if multi_split else [ratio, 1 - ratio]
|
||||
|
||||
splits = []
|
||||
prev_split = None
|
||||
prev_split = None
|
||||
for split in np.cumsum(ratio):
|
||||
condition = F.col("_rank") <= split
|
||||
if prev_split is not None:
|
||||
|
@ -156,8 +155,8 @@ def spark_chrono_split(
|
|||
data (pyspark.sql.DataFrame): Spark DataFrame to be split.
|
||||
ratio (float or list): Ratio for splitting data. If it is a single float number
|
||||
it splits data into two sets and the ratio argument indicates the ratio of
|
||||
training data set; if it is a list of float numbers, the splitter splits
|
||||
data into several portions corresponding to the split ratios. If a list is
|
||||
training data set; if it is a list of float numbers, the splitter splits
|
||||
data into several portions corresponding to the split ratios. If a list is
|
||||
provided and the ratios are not summed to 1, they will be normalized.
|
||||
seed (int): Seed.
|
||||
min_rating (int): minimum number of ratings for user or item.
|
||||
|
@ -183,6 +182,7 @@ def spark_chrono_split(
|
|||
col_timestamp=col_timestamp,
|
||||
)
|
||||
|
||||
|
||||
def spark_stratified_split(
|
||||
data,
|
||||
ratio=0.75,
|
||||
|
@ -228,6 +228,7 @@ def spark_stratified_split(
|
|||
col_item=col_item,
|
||||
)
|
||||
|
||||
|
||||
def spark_timestamp_split(
|
||||
data,
|
||||
ratio=0.75,
|
||||
|
|
|
@ -9,7 +9,7 @@ from scipy.sparse import coo_matrix
|
|||
import logging
|
||||
|
||||
# import default parameters
|
||||
from reco_utils.common.constants import (
|
||||
from reco_utils.utils.constants import (
|
||||
DEFAULT_USER_COL,
|
||||
DEFAULT_ITEM_COL,
|
||||
DEFAULT_RATING_COL,
|
||||
|
@ -62,7 +62,7 @@ class AffinityMatrix:
|
|||
map_users, map_items: dictionaries mapping the original user/item index to matrix indices
|
||||
map_back_users, map_back_items: dictionaries to map back the matrix elements to the original
|
||||
dataframe indices
|
||||
|
||||
|
||||
Basic mechanics:
|
||||
As a first step we retrieve the unique elements in the dataset. In this way we can take care
|
||||
of either completely missing rows (a user with no ratings) or completely missing columns
|
||||
|
@ -108,7 +108,7 @@ class AffinityMatrix:
|
|||
np.save(self.save_path + "/item_back_dict", self.map_back_items)
|
||||
|
||||
def gen_affinity_matrix(self):
|
||||
"""Generate the user/item affinity matrix.
|
||||
"""Generate the user/item affinity matrix.
|
||||
|
||||
As a first step, two new columns are added to the input DF, containing the index maps
|
||||
generated by the gen_index() method. The new indices, together with the ratings, are
|
||||
|
|
|
@ -5,7 +5,7 @@ import pandas as pd
|
|||
import numpy as np
|
||||
import math
|
||||
|
||||
from reco_utils.common.constants import DEFAULT_ITEM_COL, DEFAULT_USER_COL
|
||||
from reco_utils.utils.constants import DEFAULT_ITEM_COL, DEFAULT_USER_COL
|
||||
|
||||
try:
|
||||
from pyspark.sql import functions as F, Window
|
||||
|
@ -62,10 +62,10 @@ def min_rating_filter_pandas(
|
|||
|
||||
Args:
|
||||
data (pandas.DataFrame): DataFrame of user-item tuples. Columns of user and item
|
||||
should be present in the DataFrame while other columns like rating,
|
||||
should be present in the DataFrame while other columns like rating,
|
||||
timestamp, etc. can be optional.
|
||||
min_rating (int): minimum number of ratings for user or item.
|
||||
filter_by (str): either "user" or "item", depending on which of the two is to
|
||||
filter_by (str): either "user" or "item", depending on which of the two is to
|
||||
filter with min_rating.
|
||||
col_user (str): column name of user ID.
|
||||
col_item (str): column name of item ID.
|
||||
|
@ -73,19 +73,12 @@ def min_rating_filter_pandas(
|
|||
Returns:
|
||||
pandas.DataFrame: DataFrame with at least columns of user and item that has been filtered by the given specifications.
|
||||
"""
|
||||
split_by_column = _get_column_name(
|
||||
filter_by, col_user, col_item
|
||||
)
|
||||
split_by_column = _get_column_name(filter_by, col_user, col_item)
|
||||
|
||||
if min_rating < 1:
|
||||
raise ValueError("min_rating should be integer and larger than or equal to 1.")
|
||||
|
||||
return (
|
||||
data
|
||||
.groupby(split_by_column)
|
||||
.filter(lambda x: len(x) >= min_rating)
|
||||
)
|
||||
|
||||
return data.groupby(split_by_column).filter(lambda x: len(x) >= min_rating)
|
||||
|
||||
|
||||
def min_rating_filter_spark(
|
||||
|
@ -103,10 +96,10 @@ def min_rating_filter_spark(
|
|||
|
||||
Args:
|
||||
data (pyspark.sql.DataFrame): DataFrame of user-item tuples. Columns of user and item
|
||||
should be present in the DataFrame while other columns like rating,
|
||||
should be present in the DataFrame while other columns like rating,
|
||||
timestamp, etc. can be optional.
|
||||
min_rating (int): minimum number of ratings for user or item.
|
||||
filter_by (str): either "user" or "item", depending on which of the two is to
|
||||
filter_by (str): either "user" or "item", depending on which of the two is to
|
||||
filter with min_rating.
|
||||
col_user (str): column name of user ID.
|
||||
col_item (str): column name of item ID.
|
||||
|
@ -115,9 +108,7 @@ def min_rating_filter_spark(
|
|||
pyspark.sql.DataFrame: DataFrame with at least columns of user and item that has been filtered by the given specifications.
|
||||
"""
|
||||
|
||||
split_by_column = _get_column_name(
|
||||
filter_by, col_user, col_item
|
||||
)
|
||||
split_by_column = _get_column_name(filter_by, col_user, col_item)
|
||||
|
||||
if min_rating < 1:
|
||||
raise ValueError("min_rating should be integer and larger than or equal to 1.")
|
||||
|
@ -125,12 +116,11 @@ def min_rating_filter_spark(
|
|||
if min_rating > 1:
|
||||
window = Window.partitionBy(split_by_column)
|
||||
data = (
|
||||
data
|
||||
.withColumn("_count", F.count(split_by_column).over(window))
|
||||
data.withColumn("_count", F.count(split_by_column).over(window))
|
||||
.where(F.col("_count") >= min_rating)
|
||||
.drop("_count")
|
||||
)
|
||||
|
||||
|
||||
return data
|
||||
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@ from sklearn.metrics import (
|
|||
log_loss,
|
||||
)
|
||||
|
||||
from reco_utils.common.constants import (
|
||||
from reco_utils.utils.constants import (
|
||||
DEFAULT_USER_COL,
|
||||
DEFAULT_ITEM_COL,
|
||||
DEFAULT_RATING_COL,
|
||||
|
@ -31,14 +31,14 @@ from reco_utils.dataset.pandas_df_utils import (
|
|||
def check_column_dtypes(func):
|
||||
"""Checks columns of DataFrame inputs
|
||||
|
||||
This includes the checks on:
|
||||
This includes the checks on:
|
||||
|
||||
* whether the input columns exist in the input DataFrames
|
||||
* whether the data types of col_user as well as col_item are matched in the two input DataFrames.
|
||||
|
||||
Args:
|
||||
func (function): function that will be wrapped
|
||||
|
||||
|
||||
Returns:
|
||||
function: Wrapper function for checking dtypes.
|
||||
"""
|
||||
|
@ -100,7 +100,7 @@ def merge_rating_true_pred(
|
|||
):
|
||||
"""Join truth and prediction data frames on userID and itemID and return the true
|
||||
and predicted rated with the correct index.
|
||||
|
||||
|
||||
Args:
|
||||
rating_true (pandas.DataFrame): True data
|
||||
rating_pred (pandas.DataFrame): Predicted data
|
||||
|
@ -210,7 +210,7 @@ def rsquared(
|
|||
col_item (str): column name for item
|
||||
col_rating (str): column name for rating
|
||||
col_prediction (str): column name for prediction
|
||||
|
||||
|
||||
Returns:
|
||||
float: R squared (min=0, max=1).
|
||||
"""
|
||||
|
@ -352,7 +352,7 @@ def merge_ranking_true_pred(
|
|||
k=DEFAULT_K,
|
||||
threshold=DEFAULT_THRESHOLD,
|
||||
):
|
||||
"""Filter truth and prediction data frames on common users
|
||||
"""Filter truth and prediction data frames on utils users
|
||||
|
||||
Args:
|
||||
rating_true (pandas.DataFrame): True DataFrame
|
||||
|
@ -361,7 +361,7 @@ def merge_ranking_true_pred(
|
|||
col_item (str): column name for item
|
||||
col_rating (str): column name for rating
|
||||
col_prediction (str): column name for prediction
|
||||
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
|
||||
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
|
||||
top k items are directly provided, so there is no need to compute the relevancy operation.
|
||||
k (int): number of top k items per user (optional)
|
||||
threshold (float): threshold of top items per user (optional)
|
||||
|
@ -438,7 +438,7 @@ def precision_at_k(
|
|||
col_item (str): column name for item
|
||||
col_rating (str): column name for rating
|
||||
col_prediction (str): column name for prediction
|
||||
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
|
||||
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
|
||||
top k items are directly provided, so there is no need to compute the relevancy operation.
|
||||
k (int): number of top k items per user
|
||||
threshold (float): threshold of top items per user (optional)
|
||||
|
@ -485,13 +485,13 @@ def recall_at_k(
|
|||
col_item (str): column name for item
|
||||
col_rating (str): column name for rating
|
||||
col_prediction (str): column name for prediction
|
||||
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
|
||||
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
|
||||
top k items are directly provided, so there is no need to compute the relevancy operation.
|
||||
k (int): number of top k items per user
|
||||
threshold (float): threshold of top items per user (optional)
|
||||
|
||||
Returns:
|
||||
float: recall at k (min=0, max=1). The maximum value is 1 even when fewer than
|
||||
float: recall at k (min=0, max=1). The maximum value is 1 even when fewer than
|
||||
k items exist for a user in rating_true.
|
||||
"""
|
||||
|
||||
|
@ -525,9 +525,9 @@ def ndcg_at_k(
|
|||
threshold=DEFAULT_THRESHOLD,
|
||||
):
|
||||
"""Normalized Discounted Cumulative Gain (nDCG).
|
||||
|
||||
|
||||
Info: https://en.wikipedia.org/wiki/Discounted_cumulative_gain
|
||||
|
||||
|
||||
Args:
|
||||
rating_true (pandas.DataFrame): True DataFrame
|
||||
rating_pred (pandas.DataFrame): Predicted DataFrame
|
||||
|
@ -535,7 +535,7 @@ def ndcg_at_k(
|
|||
col_item (str): column name for item
|
||||
col_rating (str): column name for rating
|
||||
col_prediction (str): column name for prediction
|
||||
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
|
||||
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
|
||||
top k items are directly provided, so there is no need to compute the relevancy operation.
|
||||
k (int): number of top k items per user
|
||||
threshold (float): threshold of top items per user (optional)
|
||||
|
@ -587,7 +587,7 @@ def map_at_k(
|
|||
threshold=DEFAULT_THRESHOLD,
|
||||
):
|
||||
"""Mean Average Precision at k
|
||||
|
||||
|
||||
The implementation of MAP is referenced from Spark MLlib evaluation metrics.
|
||||
https://spark.apache.org/docs/2.3.0/mllib-evaluation-metrics.html#ranking-systems
|
||||
|
||||
|
@ -597,7 +597,7 @@ def map_at_k(
|
|||
Note:
|
||||
1. The evaluation function is named as 'MAP is at k' because the evaluation class takes top k items for
|
||||
the prediction items. The naming is different from Spark.
|
||||
|
||||
|
||||
2. The MAP is meant to calculate Avg. Precision for the relevant items, so it is normalized by the number of
|
||||
relevant items in the ground truth data, instead of k.
|
||||
|
||||
|
@ -608,7 +608,7 @@ def map_at_k(
|
|||
col_item (str): column name for item
|
||||
col_rating (str): column name for rating
|
||||
col_prediction (str): column name for prediction
|
||||
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
|
||||
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
|
||||
top k items are directly provided, so there is no need to compute the relevancy operation.
|
||||
k (int): number of top k items per user
|
||||
threshold (float): threshold of top items per user (optional)
|
||||
|
@ -649,7 +649,7 @@ def get_top_k_items(
|
|||
"""Get the input customer-item-rating tuple in the format of Pandas
|
||||
DataFrame, output a Pandas DataFrame in the dense format of top k items
|
||||
for each user.
|
||||
|
||||
|
||||
Note:
|
||||
If it is implicit rating, just append a column of constants to be
|
||||
ratings.
|
||||
|
|
|
@ -4,7 +4,7 @@
from pyspark.sql.types import *
from pyspark.sql import functions as F
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
)
@ -21,7 +21,7 @@ class DiversityEvaluation:
col_item=DEFAULT_ITEM_COL,
col_relevance=None,
):
"""Initializer.
"""Initializer.
This is the Spark version of diversity metrics evaluator.
The methods of this class calculate following diversity metrics:
@ -163,7 +163,7 @@ class DiversityEvaluation:
def user_diversity(self):
"""Calculate average diversity for recommendations for each user.
The metric definition is based on formula (3) in the following reference:
- Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist: introducing serendipity into music recommendation, WSDM 2012
@ -201,7 +201,7 @@ class DiversityEvaluation:
The metric definition is based on following reference:
- P. Castells, S. Vargas, and J. Wang, Novelty and diversity metrics for recommender systems: choice, discovery and relevance, ECIR 2011
- Eugene Yan, Serendipity: Accuracy’s unpopular best friend in Recommender Systems, eugeneyan.com, April 2020
Returns:
pyspark.sql.dataframe.DataFrame: A dataframe with following columns: col_item, item_novelty.
"""
@ -266,7 +266,7 @@ class DiversityEvaluation:
The metric definition is based on following reference:
- Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist: introducing serendipity into music recommendation, WSDM 2012
- Eugene Yan, Serendipity: Accuracy’s unpopular best friend in Recommender Systems, eugeneyan.com, April 2020
Returns:
pyspark.sql.dataframe.DataFrame: A dataframe with following columns: col_user, col_item, user_item_serendipity.
"""
@ -380,8 +380,6 @@ class DiversityEvaluation:
"p(i)", F.col("count") / count_row_reco
).withColumn("entropy(i)", F.col("p(i)") * F.log2(F.col("p(i)")))
# distributional coverage
d_coverage = -df_entropy.agg(
F.sum("entropy(i)")
).collect()[0][0]
d_coverage = -df_entropy.agg(F.sum("entropy(i)")).collect()[0][0]
return d_coverage
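The distributional coverage computed above is the entropy of the recommendation distribution over items. A plain numpy restatement of the same formula, illustrative only and not part of this commit:

    import numpy as np

    recommended_items = np.array([1, 1, 2, 3, 3, 3])        # one entry per recommendation
    _, counts = np.unique(recommended_items, return_counts=True)
    p = counts / counts.sum()                               # p(i) = count(i) / total recommendations
    d_coverage = -np.sum(p * np.log2(p))                    # same aggregation as the PySpark code above
    print(d_coverage)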
@ -10,7 +10,7 @@ try:
except ImportError:
pass # skip this import if we are in pure python environment
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_PREDICTION_COL,
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
@ -117,7 +117,7 @@ class SparkRatingEvaluation:
def rmse(self):
"""Calculate Root Mean Squared Error.
Returns:
float: Root mean squared error.
"""
@ -125,7 +125,7 @@ class SparkRatingEvaluation:
def mae(self):
"""Calculate Mean Absolute Error.
Returns:
float: Mean Absolute Error.
"""
@ -187,7 +187,7 @@ class SparkRankingEvaluation:
col_rating (str): column name for rating.
col_prediction (str): column name for prediction.
k (int): number of items to recommend to each user.
relevancy_method (str): method for determining relevant items. Possible
relevancy_method (str): method for determining relevant items. Possible
values are "top_k", "by_time_stamp", and "by_threshold".
threshold (float): threshold for determining the relevant recommended items.
This is used for the case that predicted ratings follow a known
@ -305,7 +305,7 @@ class SparkRankingEvaluation:
def recall_at_k(self):
"""Get recall@K.
NOTE:
NOTE:
More details can be found `here <http://spark.apache.org/docs/2.1.1/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics.meanAveragePrecision>`_.
Return:
@ -320,7 +320,7 @@ class SparkRankingEvaluation:
def ndcg_at_k(self):
"""Get Normalized Discounted Cumulative Gain (NDCG)
NOTE:
NOTE:
More details can be found `here <http://spark.apache.org/docs/2.1.1/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics.ndcgAt>`_.
Return:
@ -333,7 +333,7 @@ class SparkRankingEvaluation:
def map_at_k(self):
"""Get mean average precision at k.
NOTE:
NOTE:
More details can be found `here <http://spark.apache.org/docs/2.1.1/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics.meanAveragePrecision>`_.
Return:
@ -356,7 +356,7 @@ def _get_top_k_items(
DataFrame, output a Spark DataFrame in the dense format of top k items
for each user.
NOTE:
NOTE:
if it is implicit rating, just append a column of constants to be ratings.
Args:

@ -4,7 +4,7 @@
import pandas as pd
import numpy as np
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_PREDICTION_COL,

@ -6,7 +6,7 @@ import numpy as np
import pandas as pd
import scipy.sparse as sp
import time
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_ITEM_COL,
DEFAULT_USER_COL,
DEFAULT_RATING_COL,
@ -194,7 +194,7 @@ class ImplicitCF(object):
batch_size (int): Batch size of users.
Returns:
numpy.ndarray, numpy.ndarray, numpy.ndarray:
numpy.ndarray, numpy.ndarray, numpy.ndarray:
- Sampled users.
- Sampled positive items.
- Sampled negative items.
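The sampler documented above returns aligned arrays of sampled users, positive items and negative items. A toy sketch of that contract follows; it is illustrative only, and the interaction dictionary and sampling loop are invented for the example rather than taken from the module's implementation.

    import numpy as np

    rng = np.random.default_rng(42)
    interactions = {0: {1, 3}, 1: {2}, 2: {0, 4}}   # user -> items with positive feedback
    n_items = 5

    users, pos_items, neg_items = [], [], []
    for u in rng.choice(list(interactions), size=4):
        users.append(u)
        pos_items.append(rng.choice(list(interactions[u])))
        neg = rng.integers(n_items)
        while neg in interactions[u]:                # resample until the item is unseen by this user
            neg = rng.integers(n_items)
        neg_items.append(neg)

    print(np.array(users), np.array(pos_items), np.array(neg_items))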
@ -13,14 +13,14 @@ from reco_utils.evaluation.python_evaluation import (
precision_at_k,
recall_at_k,
)
from reco_utils.common.python_utils import get_top_k_scored_items
from reco_utils.utils.python_utils import get_top_k_scored_items
class LightGCN(object):
"""LightGCN model
:Citation:
He, Xiangnan, Kuan Deng, Xiang Wang, Yan Li, Yongdong Zhang, and Meng Wang.
"LightGCN: Simplifying and Powering Graph Convolution Network for Recommendation." arXiv
preprint arXiv:2002.02126, 2020.

@ -8,7 +8,7 @@ import fastai
import fastprogress
from fastprogress.fastprogress import force_console_behavior
from reco_utils.common import constants as cc
from reco_utils.utils import constants as cc
def cartesian_product(*arrays):
@ -38,7 +38,7 @@ def score(
top_k=None,
):
"""Score all users+items provided and reduce to top_k items per user if top_k>0
Args:
learner (object): Model.
test_df (pandas.DataFrame): Test dataframe.
@ -48,7 +48,7 @@ def score(
top_k (int): Number of top items to recommend.
Returns:
pandas.DataFrame: Result of recommendation
pandas.DataFrame: Result of recommendation
"""
# replace values not known to the model with NaN
total_users, total_items = learner.data.train_ds.x.classes.values()
@ -12,14 +12,15 @@ from sklearn import datasets
from sklearn.preprocessing import normalize
from numba import jit, prange
from reco_utils.common.python_utils import binarize
from reco_utils.utils.python_utils import binarize
from .geoimc_utils import length_normalize, reduce_dims
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("geoimc")
class DataPtr():
class DataPtr:
"""
Holds data and its respective indices
"""
@ -39,7 +40,6 @@ class DataPtr():
self.data_indices = None
self.entity_indices = [None, None]
def get_data(self):
"""
Returns:
@ -49,33 +49,26 @@ class DataPtr():
return self.data
return self.data[self.data_indices]
def get_entity(self, of="row"):
""" Get entity
"""Get entity
Args:
of (str): The entity, either 'row' or 'col'
Returns:
numpy.ndarray: Entity matrix (based on the entity_indices filter)
"""
idx = 0 if of=="row" else 1
idx = 0 if of == "row" else 1
if self.entity_indices[idx] is None:
return self.entities[idx]
return self.entities[idx][self.entity_indices[idx]]
class Dataset():
class Dataset:
"""
Base class that holds necessary (minimal) information needed
"""
def __init__(
self,
name,
features_dim=0,
normalize=False,
target_transform=''
):
def __init__(self, name, features_dim=0, normalize=False, target_transform=""):
"""Initialize parameters
Args:
@ -97,11 +90,8 @@ class Dataset():
self.feat_normalize = normalize
self.target_transform = target_transform
def normalize(self):
"""Normalizes the entity features
"""
"""Normalizes the entity features"""
if self.feat_normalize:
for i in range(len(self.entities)):
if isspmatrix_csr(self.entities[i]):
@ -110,7 +100,6 @@ class Dataset():
else:
self.entities[i] = length_normalize(self.entities[i])
def generate_train_test_data(self, data, test_ratio=0.3):
"""Generate train, test split. The split is performed on the row
entities. So, this essentially becomes a cold start row entity test.
@ -127,16 +116,13 @@ class Dataset():
np.array(range(0, data.shape[0])),
test_size=test_ratio,
shuffle=True,
random_state=0
random_state=0,
)
self.training_data.entity_indices[0] = self.training_data.data_indices
self.test_data.entity_indices[0] = self.test_data.data_indices
def reduce_dims(self):
"""Reduces the dimensionality of entity features.
"""
"""Reduces the dimensionality of entity features."""
if self.features_dim != 0:
self.entities[0] = reduce_dims(self.entities[0], self.features_dim)
self.entities[1] = reduce_dims(self.entities[1], self.features_dim)
@ -153,7 +139,6 @@ class ML_100K(Dataset):
self.min_rating = 1
self.max_rating = 5
def df2coo(self, df):
"""Convert the input dataframe into a coo matrix
@ -161,46 +146,57 @@ class ML_100K(Dataset):
df (pandas.DataFrame): DataFrame containing the target matrix information.
"""
data = []
row = list(df['user id']-1)
col = list(df['item id']-1)
row = list(df["user id"] - 1)
col = list(df["item id"] - 1)
for idx in range(0, len(df)):
val = df['rating'].iloc[idx]
val = df["rating"].iloc[idx]
data += [val]
if self.target_transform == 'normalize':
data = data/np.sqrt(np.sum(np.arange(self.min_rating, self.max_rating+1)**2))
elif self.target_transform == 'binarize':
if self.target_transform == "normalize":
data = data / np.sqrt(
np.sum(np.arange(self.min_rating, self.max_rating + 1) ** 2)
)
elif self.target_transform == "binarize":
data = binarize(np.array(data), 3)
# TODO: Get this from `u.info`
return coo_matrix((data, (row, col)), shape=(943, 1682))
def _read_from_file(self, path):
"""Read the traget matrix from file at path.
Args:
path (str): Path to the target matrix
"""
df = pd.read_csv(path, delimiter='\t', names=['user id','item id','rating','timestamp'], encoding="ISO-8859-1")
df.drop(['timestamp'], axis=1, inplace=True)
df = pd.read_csv(
path,
delimiter="\t",
names=["user id", "item id", "rating", "timestamp"],
encoding="ISO-8859-1",
)
df.drop(["timestamp"], axis=1, inplace=True)
return self.df2coo(df)
def load_data(self, path):
""" Load dataset
"""Load dataset
Args:
path (str): Path to the directory containing ML100K dataset
e1_path (str): Path to the file containing row (user) features of ML100K dataset
e2_path (str): Path to the file containing col (movie) features of ML100K dataset
"""
self.entities = [self._load_user_features(f"{path}/u.user"), self._load_item_features(f"{path}/u.item")]
self.entities = [
self._load_user_features(f"{path}/u.user"),
self._load_item_features(f"{path}/u.item"),
]
self.normalize()
self.reduce_dims()
self.training_data = DataPtr(self._read_from_file(f"{path}/u1.base").tocsr(), self.entities)
self.test_data = DataPtr(self._read_from_file(f"{path}/u1.test").tocsr(), self.entities)
self.training_data = DataPtr(
self._read_from_file(f"{path}/u1.base").tocsr(), self.entities
)
self.test_data = DataPtr(
self._read_from_file(f"{path}/u1.test").tocsr(), self.entities
)
def _load_user_features(self, path):
"""Load user features
@ -209,21 +205,26 @@ class ML_100K(Dataset):
path (str): Path to the file containing user features information
"""
data = pd.read_csv(path, delimiter='|', names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])
features_df = pd.concat(
[data['user_id'],
pd.get_dummies(data['user_id']),
pd.get_dummies(data['age']),
pd.get_dummies(data['gender']),
pd.get_dummies(data['occupation']),
pd.get_dummies(data['zip_code'])],
axis=1
data = pd.read_csv(
path,
delimiter="|",
names=["user_id", "age", "gender", "occupation", "zip_code"],
)
features_df.drop(['user_id'], axis=1, inplace=True)
features_df = pd.concat(
[
data["user_id"],
pd.get_dummies(data["user_id"]),
pd.get_dummies(data["age"]),
pd.get_dummies(data["gender"]),
pd.get_dummies(data["occupation"]),
pd.get_dummies(data["zip_code"]),
],
axis=1,
)
features_df.drop(["user_id"], axis=1, inplace=True)
user_features = np.nan_to_num(features_df.to_numpy())
return user_features
def _load_item_features(self, path):
"""Load item features
@ -231,38 +232,43 @@ class ML_100K(Dataset):
path (str): Path to the file containing item features information
"""
header =[
'movie_id',
'movie_title',
'release_date',
'video_release_date',
'IMDb_URL',
'unknown',
'Action',
'Adventure',
'Animation',
'Childrens',
'Comedy',
'Crime',
'Documentary',
'Drama',
'Fantasy',
'Film-Noir',
'Horror',
'Musical',
'Mystery',
'Romance',
'Sci-Fi',
'Thriller',
'War',
'Western']
data = pd.read_csv(path, delimiter='|', names=header, encoding="ISO-8859-1")
header = [
"movie_id",
"movie_title",
"release_date",
"video_release_date",
"IMDb_URL",
"unknown",
"Action",
"Adventure",
"Animation",
"Childrens",
"Comedy",
"Crime",
"Documentary",
"Drama",
"Fantasy",
"Film-Noir",
"Horror",
"Musical",
"Mystery",
"Romance",
"Sci-Fi",
"Thriller",
"War",
"Western",
]
data = pd.read_csv(path, delimiter="|", names=header, encoding="ISO-8859-1")
features_df = pd.concat([
pd.get_dummies(data['movie_title']),
pd.get_dummies(data['release_date']),
pd.get_dummies('video_release_date'),
pd.get_dummies('IMDb_URL'),
data[header[5:]]], axis=1)
features_df = pd.concat(
[
pd.get_dummies(data["movie_title"]),
pd.get_dummies(data["release_date"]),
pd.get_dummies("video_release_date"),
pd.get_dummies("IMDb_URL"),
data[header[5:]],
],
axis=1,
)
item_features = np.nan_to_num(features_df.to_numpy())
return item_features
@ -6,7 +6,8 @@ from scipy.linalg import sqrtm
from numba import njit, jit, prange
from .geoimc_utils import length_normalize
from reco_utils.common.python_utils import binarize as conv_binary
from reco_utils.utils.python_utils import binarize as conv_binary
class PlainScalarProduct(object):
"""
@ -14,12 +15,7 @@ class PlainScalarProduct(object):
as the retrieval criterion
"""
def __init__(
self,
X,
Y,
**kwargs
):
def __init__(self, X, Y, **kwargs):
"""
Args:
X: numpy matrix of shape (users, features)
@ -28,25 +24,18 @@ class PlainScalarProduct(object):
self.X = X
self.Y = Y
def sim(self, **kwargs):
"""Calculate the similarity score
"""
"""Calculate the similarity score"""
sim = self.X.dot(self.Y.T)
return sim
class Inferer():
class Inferer:
"""
Holds necessary (minimal) information needed for inference
"""
def __init__(
self,
method='dot',
k=10,
transformation=''
):
def __init__(self, method="dot", k=10, transformation=""):
"""Initialize parameters
Args:
@ -64,7 +53,6 @@ class Inferer():
self.k = k
self.transformation = transformation
def _get_method(self, k):
"""Get the inferer method
@ -74,13 +62,12 @@ class Inferer():
Returns:
class: A class object implementing the inferer 'k'
"""
if k == 'dot':
if k == "dot":
method = PlainScalarProduct
else:
raise ValueError(f"{k} is unknown.")
return method
def infer(self, dataPtr, W, **kwargs):
"""Main inference method
@ -96,18 +83,15 @@ class Inferer():
a = dataPtr.get_entity("row").dot(W[0]).dot(sqrtm(W[1]))
b = dataPtr.get_entity("col").dot(W[2]).dot(sqrtm(W[1]))
sim_score = self.method(
a,
b
).sim(**kwargs)
sim_score = self.method(a, b).sim(**kwargs)
if self.transformation == 'mean':
if self.transformation == "mean":
prediction = conv_binary(sim_score, sim_score.mean())
elif self.transformation == 'topk':
elif self.transformation == "topk":
masked_sim_score = sim_score.copy()
for i in range(sim_score.shape[0]):
topKidx = np.argpartition(masked_sim_score[i], -self.k)[-self.k:]
topKidx = np.argpartition(masked_sim_score[i], -self.k)[-self.k :]
mask = np.ones(sim_score[i].size, dtype=bool)
mask[topKidx] = False
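np.argpartition, used in the "topk" branch above, selects the k largest scores per row without a full sort. A self-contained illustration (not part of this commit):

    import numpy as np

    scores = np.array([[0.1, 0.9, 0.4, 0.7],
                       [0.8, 0.2, 0.5, 0.3]])
    k = 2
    topk_idx = np.argpartition(scores, -k, axis=1)[:, -k:]  # column indices of the k best items per row
    print(topk_idx)  # the k indices per row are correct, but their internal order is arbitrary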
@ -3,7 +3,7 @@ import numpy as np
import pandas as pd
import warnings
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_ITEM_COL,
DEFAULT_USER_COL,
DEFAULT_RATING_COL,
@ -27,21 +27,21 @@ class Dataset(object):
binary=True,
seed=None,
):
"""Constructor
"""Constructor
Args:
train (pandas.DataFrame): Training data with at least columns (col_user, col_item, col_rating).
test (pandas.DataFrame): Test data with at least columns (col_user, col_item, col_rating). test can be None,
test (pandas.DataFrame): Test data with at least columns (col_user, col_item, col_rating). test can be None,
if so, we only process the training data.
n_neg (int): Number of negative samples for training set.
n_neg_test (int): Number of negative samples for test set.
col_user (str): User column name.
col_item (str): Item column name.
col_rating (str): Rating column name.
col_rating (str): Rating column name.
col_timestamp (str): Timestamp column name.
binary (bool): If true, set rating > 0 to rating = 1.
binary (bool): If true, set rating > 0 to rating = 1.
seed (int): Seed.
"""
# initialize user and item index
self.user_idx = None
@ -66,14 +66,14 @@ class Dataset(object):
"""Process the dataset to reindex userID and itemID, also set rating as binary feedback
Args:
train (pandas.DataFrame): Training data with at least columns (col_user, col_item, col_rating).
train (pandas.DataFrame): Training data with at least columns (col_user, col_item, col_rating).
test (pandas.DataFrame): Test data with at least columns (col_user, col_item, col_rating)
test can be None, if so, we only process the training data.
binary (bool): If true, set rating>0 to rating = 1.
Returns:
list: train and test pandas.DataFrame Dataset, which have been reindexed.
"""
# If testing dataset is None
df = train if test is None else train.append(test)
@ -109,12 +109,12 @@ class Dataset(object):
"""Process dataset to reindex userID and itemID, also set rating as binary feedback
Args:
df (pandas.DataFrame): dataframe with at least columns (col_user, col_item, col_rating)
binary (bool): if true, set rating>0 to rating = 1
df (pandas.DataFrame): dataframe with at least columns (col_user, col_item, col_rating)
binary (bool): if true, set rating>0 to rating = 1
Returns:
list: train and test pandas.DataFrame Dataset, which have been reindexed.
"""
# If testing dataset is None
@ -140,7 +140,7 @@ class Dataset(object):
def _init_train_data(self):
"""Return all negative items (in train dataset) and store them in self.interact_status[self.col_item + '_negative']
store train dataset in self.users, self.items and self.ratings
"""
self.item_pool = set(self.train[self.col_item].unique())
@ -277,11 +277,11 @@ class Dataset(object):
def train_loader(self, batch_size, shuffle=True):
"""Feed train data every batch.
Args:
batch_size (int): Batch size.
shuffle (bool): Ff true, train data will be shuffled.
Yields:
list: A list of userID list, itemID list, and rating list. Public data loader returns the userID, itemID consistent with raw data.
"""
@ -305,7 +305,7 @@ class Dataset(object):
def test_loader(self):
"""Feed leave-one-out data every user
Generate test batch by every positive test instance,
(eg. \[1, 2, 1\] is a positive user & item pair in test set
(\[userID, itemID, rating\] for this tuple). This function
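test_loader above feeds leave-one-out batches: one held-out positive per user plus sampled negatives. The split idea can be sketched in plain pandas; this is an illustrative analogy, not the module's actual implementation:

    import pandas as pd

    ratings = pd.DataFrame(
        {"userID": [1, 1, 1, 2, 2],
         "itemID": [10, 11, 12, 10, 13],
         "timestamp": [1, 2, 3, 1, 2]}
    )
    is_last = ratings.groupby("userID")["timestamp"].transform("max") == ratings["timestamp"]
    test = ratings[is_last]    # each user's latest interaction is held out as the test positive
    train = ratings[~is_last]  # everything else is available for training
    print(train, test, sep="\n")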
@ -5,7 +5,7 @@ import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_ITEM_COL,
DEFAULT_USER_COL,
DEFAULT_RATING_COL,

@ -7,14 +7,14 @@ import pandas as pd
import logging
from scipy import sparse
from reco_utils.common.python_utils import (
from reco_utils.utils.python_utils import (
jaccard,
lift,
exponential_decay,
get_top_k_scored_items,
rescale,
)
from reco_utils.common import constants
from reco_utils.utils import constants
COOCCUR = "cooccurrence"
@ -111,7 +111,7 @@ class SARSingleNode:
self.index2item = None
def compute_affinity_matrix(self, df, rating_col):
""" Affinity matrix.
"""Affinity matrix.
The user-affinity matrix can be constructed by treating the users and items as
indices in a sparse matrix, and the events as the data. Here, we're treating
@ -157,7 +157,7 @@ class SARSingleNode:
return df.groupby([self.col_user, self.col_item]).sum().reset_index()
def compute_coocurrence_matrix(self, df):
""" Co-occurrence matrix.
"""Co-occurrence matrix.
The co-occurrence matrix is defined as :math:`C = U^T * U`
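The two matrices described in the docstrings above can be reproduced on a toy interaction set with scipy. Illustrative only, not part of this commit:

    import numpy as np
    from scipy.sparse import csr_matrix

    users = np.array([0, 0, 1, 2, 2])                 # user indices
    items = np.array([0, 1, 1, 0, 2])                 # item indices
    events = np.array([5.0, 3.0, 4.0, 2.0, 1.0])      # ratings / event weights

    # user-affinity matrix: users and items as indices in a sparse matrix, events as the data
    affinity = csr_matrix((events, (users, items)), shape=(3, 3))

    # co-occurrence matrix C = U^T * U, computed on binarized interactions
    u = affinity.copy()
    u.data = np.ones_like(u.data)
    cooccurrence = u.T.dot(u)
    print(cooccurrence.toarray())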
@ -5,12 +5,12 @@ import pandas as pd
import numpy as np
import pandas as pd
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_PREDICTION_COL,
)
from reco_utils.common.general_utils import invert_dictionary
from reco_utils.utils.general_utils import invert_dictionary
def surprise_trainset_to_df(
@ -25,7 +25,7 @@ def surprise_trainset_to_df(
col_user (str): User column name.
col_item (str): Item column name.
col_rating (str): Rating column name.
Returns:
pandas.DataFrame: A dataframe with user column (str), item column (str), and rating column (float).
"""
@ -53,13 +53,13 @@ def predict(
predcol=DEFAULT_PREDICTION_COL,
):
"""Computes predictions of an algorithm from Surprise on the data. Can be used for computing rating metrics like RMSE.
Args:
algo (surprise.prediction_algorithms.algo_base.AlgoBase): an algorithm from Surprise
data (pandas.DataFrame): the data on which to predict
usercol (str): name of the user column
itemcol (str): name of the item column
Returns:
pandas.DataFrame: Dataframe with usercol, itemcol, predcol
"""
@ -84,14 +84,14 @@ def compute_ranking_predictions(
):
"""Computes predictions of an algorithm from Surprise on all users and items in data. It can be used for computing
ranking metrics like NDCG.
Args:
algo (surprise.prediction_algorithms.algo_base.AlgoBase): an algorithm from Surprise
data (pandas.DataFrame): the data from which to get the users and items
usercol (str): name of the user column
itemcol (str): name of the item column
remove_seen (bool): flag to remove (user, item) pairs seen in the training data
Returns:
pandas.DataFrame: Dataframe with usercol, itemcol, predcol
"""
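A minimal end-to-end sketch of the helpers documented above, illustrative only; it assumes the predict signature shown in the hunk and the surprise_utils module path implied by this repo's Surprise wrapper.

    import pandas as pd
    import surprise
    from reco_utils.recommender.surprise.surprise_utils import predict

    ratings = pd.DataFrame(
        {"userID": [1, 1, 2, 2], "itemID": [10, 11, 10, 12], "rating": [5, 3, 4, 2]}
    )
    reader = surprise.Reader(rating_scale=(1, 5))
    trainset = surprise.Dataset.load_from_df(ratings, reader).build_full_trainset()

    algo = surprise.SVD(random_state=0)
    algo.fit(trainset)

    # rating predictions for the (user, item) pairs in the dataframe
    preds = predict(algo, ratings, usercol="userID", itemcol="itemID")
    print(preds.head())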
@ -12,7 +12,7 @@ from subprocess import run
from tempfile import TemporaryDirectory
import pandas as pd
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,
@ -67,7 +67,7 @@ class VW:
Args:
params (dict): key = parameter, value = value (use True if parameter is just a flag)
Returns:
list[str]: vw command line parameters as list of strings
"""
@ -88,10 +88,10 @@ class VW:
def parse_train_params(self, params):
"""Parse input hyper-parameters to build vw train commands
Args:
params (dict): key = parameter, value = value (use True if parameter is just a flag)
Returns:
list[str]: vw command line parameters as list of strings
"""
@ -127,10 +127,10 @@ class VW:
def parse_test_params(self, params):
"""Parse input hyper-parameters to build vw test commands
Args:
params (dict): key = parameter, value = value (use True if parameter is just a flag)
Returns:
list[str]: vw command line parameters as list of strings
"""
@ -188,7 +188,7 @@ class VW:
def to_vw_file(self, df, train=True):
"""Convert Pandas DataFrame to vw input format file
Args:
df (pandas.DataFrame): input DataFrame
train (bool): flag for train mode (or test mode if False)
@ -228,7 +228,7 @@ class VW:
def fit(self, df):
"""Train model
Args:
df (pandas.DataFrame): input training data
"""
@ -241,7 +241,7 @@ class VW:
def predict(self, df):
"""Predict results
Args:
df (pandas.DataFrame): input test data
"""
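For context on to_vw_file and the parameter parsing above: Vowpal Wabbit consumes plain-text lines of the form "label |namespace features" and is driven from the command line. The layout below is illustrative only; the exact namespaces and flags emitted by this wrapper may differ.

    import pandas as pd

    df = pd.DataFrame({"userID": [1, 2], "itemID": [10, 11], "rating": [5, 3]})

    # one "label |namespace features" line per rating
    lines = [f"{r.rating} |user {r.userID} |item {r.itemID}" for r in df.itertuples()]
    print("\n".join(lines))

    # typical train and test command lines, expressed as lists of strings
    train_cmd = ["vw", "-d", "train.vw", "-f", "model.vw", "--loss_function", "squared"]
    test_cmd = ["vw", "-t", "-i", "model.vw", "-d", "test.vw", "-p", "predictions.txt"]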
@ -3,8 +3,8 @@
import tensorflow as tf
from reco_utils.common.constants import DEFAULT_USER_COL, DEFAULT_ITEM_COL
from reco_utils.common.tf_utils import MODEL_DIR
from reco_utils.utils.constants import DEFAULT_USER_COL, DEFAULT_ITEM_COL
from reco_utils.utils.tf_utils import MODEL_DIR
def build_feature_columns(
@ -37,9 +37,9 @@ def build_feature_columns(
'wide_deep' for a combination of linear model and neural networks.
Returns:
list, list:
- The wide feature columns
- The deep feature columns. If only the wide model is selected, the deep column list is empty and viceversa.
list, list:
- The wide feature columns
- The deep feature columns. If only the wide model is selected, the deep column list is empty and viceversa.
"""
if model_type not in ["wide", "deep", "wide_deep"]:
raise ValueError("Model type should be either 'wide', 'deep', or 'wide_deep'")
@ -102,7 +102,7 @@ def _build_deep_columns(
item_dim (int): Item embedding dimension.
item_feat_col (str): Item feature column name.
item_feat_shape (int or an iterable of integers): Item feature array shape.
Returns:
list: Deep feature columns.
"""
@ -140,7 +140,7 @@ def build_model(
seed=None,
):
"""Build wide-deep model.
To generate wide model, pass wide_columns only.
To generate deep model, pass deep_columns only.
To generate wide_deep model, pass both wide_columns and deep_columns.
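Hand-built equivalents of the wide and deep columns returned by build_feature_columns, using the plain tf.feature_column API of the TensorFlow 1.x estimator stack. Illustrative only; the vocabularies, embedding dimensions and hidden units are made up for the example.

    import tensorflow as tf

    user_col = tf.feature_column.categorical_column_with_vocabulary_list("userID", [1, 2, 3])
    item_col = tf.feature_column.categorical_column_with_vocabulary_list("itemID", [10, 11, 12])

    # wide part: sparse id columns plus their cross, fed to the linear model
    wide_columns = [
        user_col,
        item_col,
        tf.feature_column.crossed_column(["userID", "itemID"], hash_bucket_size=100),
    ]
    # deep part: dense embeddings of the same ids, fed to the DNN
    deep_columns = [
        tf.feature_column.embedding_column(user_col, dimension=8),
        tf.feature_column.embedding_column(item_col, dimension=8),
    ]

    model = tf.estimator.DNNLinearCombinedRegressor(
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=[16, 8],
    )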
@ -12,7 +12,7 @@ import nni
import reco_utils.evaluation.python_evaluation as evaluation
from reco_utils.recommender.ncf.ncf_singlenode import NCF
from reco_utils.recommender.ncf.dataset import Dataset as NCFDataset
from reco_utils.common.constants import SEED as DEFAULT_SEED
from reco_utils.utils.constants import SEED as DEFAULT_SEED
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("ncf")

@ -19,7 +19,7 @@ import pandas as pd
import pytest
from sklearn.model_selection import train_test_split
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,
@ -27,7 +27,7 @@ from reco_utils.common.constants import (
)
from reco_utils.dataset.python_splitters import numpy_stratified_split
from reco_utils.dataset.python_splitters import python_chrono_split
from reco_utils.common.spark_utils import start_or_get_spark
from reco_utils.utils.spark_utils import start_or_get_spark
@pytest.fixture(scope="session")

@ -11,7 +11,7 @@ except ImportError:
pass # disable error while collecting tests for non-notebook environments
from reco_utils.common.gpu_utils import get_number_gpus
from reco_utils.utils.gpu_utils import get_number_gpus
TOL = 0.5

@ -3,13 +3,14 @@
import pytest
try:
import papermill as pm
import scrapbook as sb
except ImportError:
pass # disable error while collecting tests for non-notebook environments
from reco_utils.common.gpu_utils import get_number_gpus
from reco_utils.utils.gpu_utils import get_number_gpus
TOL = 0.5

@ -3,12 +3,13 @@
import os
import pytest
try:
import papermill as pm
except ImportError:
pass # disable error while collecting tests for non-notebook environments
from reco_utils.common.gpu_utils import get_number_gpus
from reco_utils.utils.gpu_utils import get_number_gpus
@pytest.mark.notebooks

@ -2,7 +2,7 @@
# Licensed under the MIT License.
import pytest
from reco_utils.common.general_utils import invert_dictionary, get_number_processors
from reco_utils.utils.general_utils import invert_dictionary, get_number_processors
def test_invert_dictionary():

@ -11,7 +11,7 @@ except ImportError:
pass # skip this import if we are in cpu environment
from reco_utils.common.gpu_utils import (
from reco_utils.utils.gpu_utils import (
get_cuda_version,
get_cudnn_version,
get_gpu_info,

@ -1,7 +1,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from reco_utils.common.k8s_utils import (
from reco_utils.utils.k8s_utils import (
qps_to_replicas,
replicas_to_qps,
nodes_to_replicas,

@ -6,7 +6,7 @@
"collapsed": true
},
"source": [
"# This is a test notebook for reco_utils.common.notebook_utils module"
"# This is a test notebook for reco_utils.utils.notebook_utils module"
]
},
{
@ -20,7 +20,7 @@
"\n",
"import scrapbook as sb\n",
"from reco_utils.common.notebook_utils import is_jupyter, is_databricks"
"from reco_utils.utils.notebook_utils import is_jupyter, is_databricks"
]
},
{

@ -3,12 +3,13 @@
from pathlib import Path
import pytest
try:
import papermill as pm
import scrapbook as sb
except ImportError:
pass # disable error while collecting tests for non-notebook environments
from reco_utils.common.notebook_utils import is_jupyter, is_databricks
from reco_utils.utils.notebook_utils import is_jupyter, is_databricks
@pytest.mark.notebooks

@ -6,7 +6,7 @@ import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import pytest
from reco_utils.common.plot import line_graph
from reco_utils.utils.plot import line_graph
def test_line_graph():

@ -5,7 +5,7 @@
import numpy as np
import pytest
from reco_utils.common.python_utils import (
from reco_utils.utils.python_utils import (
exponential_decay,
jaccard,
lift,

@ -6,7 +6,7 @@ import os
import numpy as np
import pandas as pd
import pytest
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,
@ -15,7 +15,7 @@ from reco_utils.common.constants import (
from reco_utils.evaluation.python_evaluation import rmse
try:
from reco_utils.common.tf_utils import (
from reco_utils.utils.tf_utils import (
build_optimizer,
evaluation_log_hook,
export_model,

@ -4,7 +4,7 @@
import pytest
import time
from reco_utils.common.timer import Timer
from reco_utils.utils.timer import Timer
TOL = 0.03

@ -17,7 +17,7 @@ from reco_utils.dataset.python_splitters import (
numpy_stratified_split,
)
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,

@ -4,7 +4,7 @@
import numpy as np
import pandas as pd
import pytest
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,

@ -6,7 +6,7 @@ import numpy as np
import pytest
from reco_utils.dataset.sparse import AffinityMatrix
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,

@ -6,7 +6,7 @@ import pandas as pd
import pytest
from unittest.mock import Mock
from sklearn.preprocessing import minmax_scale
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,
@ -65,6 +65,7 @@ def rating_nohit():
)
# fmt: on
@pytest.fixture
def rating_true_binary(rating_true):
# Convert true ratings to binary

@ -6,7 +6,7 @@ import pandas as pd
import pytest
import cornac
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,

@ -9,7 +9,7 @@ import pandas as pd
from scipy.sparse import csr_matrix
from pandas.testing import assert_frame_equal
from reco_utils.common.python_utils import binarize
from reco_utils.utils.python_utils import binarize
from reco_utils.recommender.geoimc.geoimc_data import DataPtr
from reco_utils.recommender.geoimc.geoimc_predict import PlainScalarProduct, Inferer
from reco_utils.recommender.geoimc.geoimc_algorithm import IMCProblem

@ -2,7 +2,7 @@
# Licensed under the MIT License.
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
SEED,

@ -9,7 +9,7 @@ import pytest
try:
from reco_utils.recommender.ncf.ncf_singlenode import NCF
from reco_utils.recommender.ncf.dataset import Dataset
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
SEED,

@ -10,7 +10,7 @@ import pandas as pd
from pandas.testing import assert_frame_equal
import urllib
from reco_utils.common.constants import DEFAULT_PREDICTION_COL
from reco_utils.utils.constants import DEFAULT_PREDICTION_COL
from reco_utils.recommender.sar.sar_singlenode import SARSingleNode

@ -6,7 +6,7 @@ import pandas as pd
import pytest
import surprise
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,

@ -4,14 +4,14 @@
import os
import pytest
import pandas as pd
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,
)
try:
from reco_utils.common.tf_utils import pandas_input_fn, MODEL_DIR
from reco_utils.utils.tf_utils import pandas_input_fn, MODEL_DIR
from reco_utils.recommender.wide_deep.wide_deep_utils import (
build_model,
build_feature_columns,