This commit is contained in:
miguelgfierro 2021-07-15 16:25:05 +00:00
Parent 3fb90955c0
Commit 1eb2a93e4a
83 changed files: 371 additions and 405 deletions

View file

@ -7,64 +7,64 @@ Common utilities module
General utilities
===============================
.. automodule:: reco_utils.common.general_utils
.. automodule:: reco_utils.utils.general_utils
:members:
GPU utilities
===============================
.. automodule:: reco_utils.common.gpu_utils
.. automodule:: reco_utils.utils.gpu_utils
:members:
Kubernetes utilities
===============================
.. automodule:: reco_utils.common.k8s_utils
.. automodule:: reco_utils.utils.k8s_utils
:members:
Notebook utilities
===============================
.. automodule:: reco_utils.common.notebook_utils
.. automodule:: reco_utils.utils.notebook_utils
:members:
.. automodule:: reco_utils.common.notebook_memory_management
.. automodule:: reco_utils.utils.notebook_memory_management
:members:
Python utilities
===============================
.. automodule:: reco_utils.common.python_utils
.. automodule:: reco_utils.utils.python_utils
:members:
Spark utilities
===============================
.. automodule:: reco_utils.common.spark_utils
.. automodule:: reco_utils.utils.spark_utils
:members:
Tensorflow utilities
===============================
.. automodule:: reco_utils.common.tf_utils
.. automodule:: reco_utils.utils.tf_utils
:members:
Timer
===============================
.. automodule:: reco_utils.common.timer
.. automodule:: reco_utils.utils.timer
:members:
Plot utilities
===============================
.. automodule:: reco_utils.common.plot
.. automodule:: reco_utils.utils.plot
:members:
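The rename from `reco_utils.common` to `reco_utils.utils` means downstream imports change in lockstep. A minimal before/after sketch using the `Timer` helper documented above (the `.interval` attribute is an assumption based on how the helper is used elsewhere in the repo):

```python
# Before this commit, the helpers lived under reco_utils.common:
#   from reco_utils.common.timer import Timer
# After this commit, the same helpers are imported from reco_utils.utils:
from reco_utils.utils.timer import Timer

with Timer() as t:
    total = sum(i * i for i in range(1_000_000))  # any block of code to time

# `interval` is assumed to hold the elapsed time in seconds
print("Elapsed: {:.3f}s".format(t.interval))
```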

View file

@ -11,7 +11,7 @@ evaluating recommender systems.
:maxdepth: 1
:caption: Contents:
Common <common>
Utils <utils>
Dataset <dataset>
Evaluation <evaluation>
Recommender algorithms <recommender>

View file

@ -52,12 +52,12 @@
"from pyspark.sql.types import StructType, StructField\n",
"from pyspark.sql.types import StringType, FloatType, IntegerType, LongType\n",
"\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.common.notebook_utils import is_jupyter\n",
"from reco_utils.utils.notebook_utils import is_jupyter\n",
"from reco_utils.dataset.spark_splitters import spark_random_split\n",
"from reco_utils.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation\n",
"from reco_utils.common.spark_utils import start_or_get_spark\n",
"from reco_utils.utils.spark_utils import start_or_get_spark\n",
"\n",
"print(\"System version: {}\".format(sys.version))\n",
"print(\"Spark version: {}\".format(pyspark.__version__))\n"

View file

@ -49,7 +49,7 @@
"import torch, fastai\n",
"from fastai.collab import EmbeddingDotBias, collab_learner, CollabDataBunch, load_learner\n",
"\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.python_splitters import python_stratified_split\n",
"from reco_utils.recommender.fastai.fastai_utils import cartesian_product, score\n",

View file

@ -52,11 +52,11 @@
"import tensorflow as tf\n",
"tf.get_logger().setLevel('ERROR') # only show error messages\n",
"\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.recommender.ncf.ncf_singlenode import NCF\n",
"from reco_utils.recommender.ncf.dataset import Dataset as NCFDataset\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.common.notebook_utils import is_jupyter\n",
"from reco_utils.utils.notebook_utils import is_jupyter\n",
"from reco_utils.dataset.python_splitters import python_chrono_split\n",
"from reco_utils.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, \n",
" recall_at_k, get_top_k_items)\n",

View file

@ -64,8 +64,8 @@
"import scrapbook as sb\n",
"from sklearn.preprocessing import minmax_scale\n",
"\n",
"from reco_utils.common.python_utils import binarize\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.utils.python_utils import binarize\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.python_splitters import python_stratified_split\n",
"from reco_utils.evaluation.python_evaluation import (\n",
@ -497,7 +497,7 @@
"source": [
"### 2.3. Evaluate how well SAR performs\n",
"\n",
"We evaluate how well SAR performs for a few common ranking metrics provided in the `python_evaluation` module in reco_utils. We will consider the Mean Average Precision (MAP), Normalized Discounted Cumalative Gain (NDCG), Precision, and Recall for the top-k items per user we computed with SAR. User, item and rating column names are specified in each evaluation method."
"We evaluate how well SAR performs for a few utils ranking metrics provided in the `python_evaluation` module in reco_utils. We will consider the Mean Average Precision (MAP), Normalized Discounted Cumalative Gain (NDCG), Precision, and Recall for the top-k items per user we computed with SAR. User, item and rating column names are specified in each evaluation method."
]
},
{
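A minimal sketch of how the ranking metrics mentioned above are typically invoked, assuming `test` holds the held-out ratings, `top_k` holds SAR's top-k recommendations, and the default reco_utils column names apply (the exact column names here are placeholders):

```python
from reco_utils.evaluation.python_evaluation import (
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
)

# `test` and `top_k` are pandas DataFrames produced earlier in the notebook
kwargs = dict(
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    col_prediction="prediction",
    k=10,
)
print("MAP@10:      ", map_at_k(test, top_k, **kwargs))
print("NDCG@10:     ", ndcg_at_k(test, top_k, **kwargs))
print("Precision@10:", precision_at_k(test, top_k, **kwargs))
print("Recall@10:   ", recall_at_k(test, top_k, **kwargs))
```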

View file

@ -337,7 +337,7 @@
"from azureml.core import Run\n",
"from sklearn.externals import joblib\n",
"\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.python_splitters import python_stratified_split\n",
"from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",

View file

@ -63,8 +63,8 @@
"import tensorflow as tf\n",
"tf.get_logger().setLevel('ERROR') # only show error messages\n",
"\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.common.constants import SEED\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.utils.constants import SEED\n",
"from reco_utils.recommender.deeprec.deeprec_utils import (\n",
" prepare_hparams\n",
")\n",

View file

@ -65,14 +65,14 @@
"import tensorflow as tf\n",
"tf.get_logger().setLevel('ERROR') # only show error messages\n",
"\n",
"from reco_utils.common.constants import (\n",
"from reco_utils.utils.constants import (\n",
" DEFAULT_USER_COL as USER_COL,\n",
" DEFAULT_ITEM_COL as ITEM_COL,\n",
" DEFAULT_RATING_COL as RATING_COL,\n",
" DEFAULT_PREDICTION_COL as PREDICT_COL,\n",
" SEED\n",
")\n",
"from reco_utils.common import tf_utils, gpu_utils, plot\n",
"from reco_utils.utils import tf_utils, gpu_utils, plot\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.pandas_df_utils import user_item_pairs\n",
"from reco_utils.dataset.python_splitters import python_random_split\n",

View file

@ -53,7 +53,7 @@
"import tensorflow as tf\n",
"tf.get_logger().setLevel('ERROR') # only show error messages\n",
"\n",
"from reco_utils.common.constants import SEED\n",
"from reco_utils.utils.constants import SEED\n",
"from reco_utils.recommender.deeprec.deeprec_utils import (\n",
" download_deeprec_resources, prepare_hparams\n",
")\n",

View file

@ -56,7 +56,7 @@
"import numpy as np\n",
"from datetime import datetime, timedelta\n",
"\n",
"from reco_utils.common.spark_utils import start_or_get_spark\n",
"from reco_utils.utils.spark_utils import start_or_get_spark\n",
"from reco_utils.dataset.download_utils import maybe_download\n",
"from reco_utils.dataset.python_splitters import (\n",
" python_random_split, \n",

View file

@ -70,7 +70,7 @@
"import itertools\n",
"import pandas as pd\n",
"\n",
"from reco_utils.common.notebook_utils import is_jupyter\n",
"from reco_utils.utils.notebook_utils import is_jupyter\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.python_splitters import python_random_split\n",
"from reco_utils.dataset.pandas_df_utils import filter_by\n",

View file

@ -55,8 +55,8 @@
"from reco_utils.dataset.python_splitters import python_random_split\n",
"from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",
"from reco_utils.recommender.cornac.cornac_utils import predict_ranking\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.common.constants import SEED\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.utils.constants import SEED\n",
"\n",
"print(\"System version: {}\".format(sys.version))\n",
"print(\"PyTorch version: {}\".format(torch.__version__))\n",

View file

@ -53,8 +53,8 @@
"from reco_utils.dataset.python_splitters import python_random_split\n",
"from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",
"from reco_utils.recommender.cornac.cornac_utils import predict_ranking\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.common.constants import SEED\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.utils.constants import SEED\n",
"\n",
"print(\"System version: {}\".format(sys.version))\n",
"print(\"Cornac version: {}\".format(cornac.__version__))"
@ -89,7 +89,7 @@
"\n",
"### 1.1 Personalized Ranking from Implicit Feedback\n",
"\n",
"The task of personalized ranking aims at providing each user a ranked list of items (recommendations). This is very common in scenarios where recommender systems are based on implicit user behavior (e.g. purchases, clicks). The available observations are only positive feedback where the non-observed ones are a mixture of real negative feedback and missing values.\n",
"The task of personalized ranking aims at providing each user a ranked list of items (recommendations). This is very utils in scenarios where recommender systems are based on implicit user behavior (e.g. purchases, clicks). The available observations are only positive feedback where the non-observed ones are a mixture of real negative feedback and missing values.\n",
"\n",
"One usual approach for item recommendation is directly predicting a preference score $\\hat{x}_{u,i}$ given to item $i$ by user $u$. BPR uses a different approach by using item pairs $(i, j)$ and optimizing for the correct ranking given preference of user $u$, thus, there are notions of *positive* and *negative* items. The training data $D_S : U \\times I \\times I$ is defined as:\n",
"\n",
@ -118,7 +118,7 @@
"\n",
"The preference scoring function $\\hat{x}_{uij}(\\Theta)$ could be an arbitrary real-valued function of the model parameter $\\Theta$. Thus, it makes BPR a general framework for modeling the relationship between triplets $(u, i, j)$ where different model classes like matrix factorization could be used for estimating $\\hat{x}_{uij}(\\Theta)$.\n",
"\n",
"For the prior, one of the common pratices is to choose $p(\\Theta)$ following a normal distribution, which results in a nice form of L2 regularization in the final log-form of the objective function.\n",
"For the prior, one of the utils pratices is to choose $p(\\Theta)$ following a normal distribution, which results in a nice form of L2 regularization in the final log-form of the objective function.\n",
"\n",
"$$ p(\\Theta) \\sim N(0, \\Sigma_{\\Theta}) $$\n",
"\n",

View file

@ -52,13 +52,13 @@
"import tensorflow as tf\n",
"tf.get_logger().setLevel('ERROR') # only show error messages\n",
"\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.recommender.deeprec.models.graphrec.lightgcn import LightGCN\n",
"from reco_utils.recommender.deeprec.DataModel.ImplicitCF import ImplicitCF\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.python_splitters import python_stratified_split\n",
"from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",
"from reco_utils.common.constants import SEED as DEFAULT_SEED\n",
"from reco_utils.utils.constants import SEED as DEFAULT_SEED\n",
"from reco_utils.recommender.deeprec.deeprec_utils import prepare_hparams\n",
"\n",
"print(\"System version: {}\".format(sys.version))\n",
@ -728,7 +728,7 @@
"\n",
"Here there are the performances of LightGCN compared to [SAR](../00_quick_start/sar_movielens.ipynb) and [NCF](../00_quick_start/ncf_movielens.ipynb) on MovieLens dataset of 100k and 1m. The method of data loading and splitting is the same as that described above and the GPU used was a GeForce GTX 1080Ti.\n",
"\n",
"Settings common to the three models: `epochs=15, seed=42`.\n",
"Settings utils to the three models: `epochs=15, seed=42`.\n",
"\n",
"Settings for LightGCN: `embed_size=64, n_layers=3, batch_size=1024, decay=0.0001, learning_rate=0.015 `.\n",
"\n",

View file

@ -96,14 +96,14 @@
"import tensorflow as tf\n",
"import keras\n",
"\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.split_utils import min_rating_filter_pandas\n",
"from reco_utils.dataset.python_splitters import numpy_stratified_split\n",
"from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",
"\n",
"from reco_utils.dataset.sparse import AffinityMatrix\n",
"from reco_utils.common.python_utils import binarize\n",
"from reco_utils.utils.python_utils import binarize\n",
"from reco_utils.recommender.vae.multinomial_vae import Mult_VAE\n",
"\n",
"from tempfile import TemporaryDirectory\n",

View file

@ -96,15 +96,15 @@
"import tensorflow as tf\n",
"import keras\n",
"\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.split_utils import min_rating_filter_pandas\n",
"from reco_utils.dataset.python_splitters import numpy_stratified_split\n",
"from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",
"from reco_utils.common.constants import SEED as DEFAULT_SEED\n",
"from reco_utils.utils.constants import SEED as DEFAULT_SEED\n",
"\n",
"from reco_utils.dataset.sparse import AffinityMatrix\n",
"from reco_utils.common.python_utils import binarize\n",
"from reco_utils.utils.python_utils import binarize\n",
"from reco_utils.recommender.vae.standard_vae import StandardVAE\n",
"\n",
"print(\"System version: {}\".format(sys.version))\n",

View file

@ -105,7 +105,7 @@
"import scrapbook as sb\n",
"import pandas as pd\n",
"\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.python_splitters import python_random_split\n",
"from reco_utils.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, \n",

View file

@ -71,8 +71,8 @@
"import papermill as pm\n",
"import scrapbook as sb\n",
"\n",
"from reco_utils.common.spark_utils import start_or_get_spark\n",
"from reco_utils.common.notebook_utils import is_databricks\n",
"from reco_utils.utils.spark_utils import start_or_get_spark\n",
"from reco_utils.utils.notebook_utils import is_databricks\n",
"from reco_utils.dataset.criteo import load_spark_df\n",
"from reco_utils.dataset.spark_splitters import spark_random_split\n",
"\n",

View file

@ -86,7 +86,7 @@
"import papermill as pm\n",
"import scrapbook as sb\n",
"\n",
"from reco_utils.common.notebook_utils import is_jupyter\n",
"from reco_utils.utils.notebook_utils import is_jupyter\n",
"from reco_utils.dataset.movielens import load_pandas_df\n",
"from reco_utils.dataset.python_splitters import python_random_split\n",
"from reco_utils.evaluation.python_evaluation import (rmse, mae, exp_var, rsquared, get_top_k_items,\n",

View file

@ -251,8 +251,8 @@
"%matplotlib notebook\n",
"from matplotlib import pyplot as plt\n",
"\n",
"from reco_utils.common.constants import SEED\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.utils.constants import SEED\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.dataset.download_utils import maybe_download, unzip_file\n",
"from reco_utils.tuning.parameter_sweep import generate_param_grid\n",
"from reco_utils.dataset.pandas_df_utils import LibffmConverter\n",

View file

@ -53,14 +53,14 @@
"import tensorflow as tf\n",
"tf.get_logger().setLevel('ERROR') # only show error messages\n",
"\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.recommender.ncf.ncf_singlenode import NCF\n",
"from reco_utils.recommender.ncf.dataset import Dataset as NCFDataset\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.python_splitters import python_chrono_split\n",
"from reco_utils.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, \n",
" recall_at_k, get_top_k_items)\n",
"from reco_utils.common.constants import SEED as DEFAULT_SEED\n",
"from reco_utils.utils.constants import SEED as DEFAULT_SEED\n",
"\n",
"\n",
"print(\"System version: {}\".format(sys.version))\n",
@ -196,7 +196,7 @@
"\n",
"To evaluate the performance of item recommendation, we adopted the leave-one-out evaluation.\n",
"\n",
"For each user, we held out his/her latest interaction as the test set and utilized the remaining data for training. We use `python_chrono_split` to achieve this. And since it is too time-consuming to rank all items for every user during evaluation, we followed the common strategy that randomly samples 100 items that are not interacted by the user, ranking the test item among the 100 items. Our test samples will be constructed by `NCFDataset`."
"For each user, we held out his/her latest interaction as the test set and utilized the remaining data for training. We use `python_chrono_split` to achieve this. And since it is too time-consuming to rank all items for every user during evaluation, we followed the utils strategy that randomly samples 100 items that are not interacted by the user, ranking the test item among the 100 items. Our test samples will be constructed by `NCFDataset`."
]
},
{
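A small sketch of the chronological split described above, assuming `df` is the MovieLens ratings DataFrame with the default column names:

```python
from reco_utils.dataset.python_splitters import python_chrono_split

# Split each user's history chronologically: the oldest 75% of interactions
# go to the training set, the most recent 25% to the test set.
train, test = python_chrono_split(df, ratio=0.75)
```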

View file

@ -68,12 +68,12 @@
"from pyspark.sql.types import StructType, StructField\n",
"from pyspark.sql.types import StringType, FloatType, IntegerType, LongType\n",
"\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.common.notebook_utils import is_jupyter\n",
"from reco_utils.utils.notebook_utils import is_jupyter\n",
"from reco_utils.dataset.spark_splitters import spark_random_split\n",
"from reco_utils.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation\n",
"from reco_utils.common.spark_utils import start_or_get_spark\n",
"from reco_utils.utils.spark_utils import start_or_get_spark\n",
"\n",
"from reco_utils.evaluation.spark_diversity_evaluation import DiversityEvaluation\n",
"from pyspark.sql.window import Window\n",

View file

@ -63,7 +63,7 @@
"import pyspark\n",
"from sklearn.preprocessing import minmax_scale\n",
"\n",
"from reco_utils.common.spark_utils import start_or_get_spark\n",
"from reco_utils.utils.spark_utils import start_or_get_spark\n",
"from reco_utils.evaluation.spark_evaluation import SparkRankingEvaluation, SparkRatingEvaluation\n",
"from reco_utils.evaluation.python_evaluation import auc, logloss\n",
"\n",

View file

@ -85,9 +85,9 @@
"import azureml.widgets as widgets\n",
"import azureml.train.hyperdrive as hd\n",
"\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.common.constants import SEED\n",
"from reco_utils.common.tf_utils import pandas_input_fn_for_saved_model\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.utils.constants import SEED\n",
"from reco_utils.utils.tf_utils import pandas_input_fn_for_saved_model\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.pandas_df_utils import user_item_pairs\n",
"from reco_utils.dataset.python_splitters import python_random_split\n",

View file

@ -67,7 +67,7 @@
"tf.get_logger().setLevel('ERROR') # only show error messages\n",
"\n",
"import reco_utils\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.python_splitters import python_chrono_split\n",
"from reco_utils.evaluation.python_evaluation import rmse, precision_at_k, ndcg_at_k\n",

View file

@ -61,7 +61,7 @@
"from tempfile import TemporaryDirectory\n",
"\n",
"import reco_utils\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.python_splitters import python_random_split\n",
"from reco_utils.evaluation.python_evaluation import rmse, precision_at_k, ndcg_at_k\n",

View file

@ -19,7 +19,7 @@ try:
except ImportError:
run = None
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,

View file

@ -81,8 +81,8 @@
"from hyperopt.pyll.base import scope\n",
"from hyperopt.pyll.stochastic import sample\n",
"\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.common.spark_utils import start_or_get_spark\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.utils.spark_utils import start_or_get_spark\n",
"from reco_utils.evaluation.spark_evaluation import SparkRankingEvaluation, SparkRatingEvaluation\n",
"from reco_utils.dataset.movielens import load_spark_df\n",
"from reco_utils.dataset.spark_splitters import spark_random_split\n",

View file

@ -78,7 +78,7 @@
"import urllib\n",
"\n",
"from azure.common.client_factory import get_client_from_cli_profile\n",
"from azure.utils.client_factory import get_client_from_cli_profile\n",
"import azure.mgmt.cosmosdb\n",
"import azureml.core\n",
"from azureml.core import Workspace\n",
@ -96,14 +96,14 @@
"from pyspark.sql.types import StructType, StructField\n",
"from pyspark.sql.types import FloatType, IntegerType, LongType\n",
"\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.common.spark_utils import start_or_get_spark\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.utils.spark_utils import start_or_get_spark\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.cosmos_cli import find_collection, read_collection, read_database, find_database\n",
"from reco_utils.dataset.download_utils import maybe_download\n",
"from reco_utils.dataset.spark_splitters import spark_random_split\n",
"from reco_utils.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation\n",
"from reco_utils.common.notebook_utils import is_databricks\n",
"from reco_utils.utils.notebook_utils import is_databricks\n",
"\n",
"print(\"Azure SDK version:\", azureml.core.VERSION)"
]
@ -165,7 +165,7 @@
"1. [Azure ML Service](https://azure.microsoft.com/en-us/services/machine-learning-service/)\n",
" 1. [Azure ML Workspace](https://docs.microsoft.com/en-us/azure/machine-learning/concept-workspace)\n",
" 1. [Azure Application Insights](https://azure.microsoft.com/en-us/services/monitor/)\n",
" 1. [Azure Storage](https://docs.microsoft.com/en-us/azure/storage/common/storage-account-overview)\n",
" 1. [Azure Storage](https://docs.microsoft.com/en-us/azure/storage/utils/storage-account-overview)\n",
" 1. [Azure Key Vault](https://azure.microsoft.com/en-us/services/key-vault/) \n",
"\n",
"1. [Azure Cosmos DB](https://azure.microsoft.com/en-us/services/cosmos-db/)\n",

View file

@ -95,7 +95,7 @@
"import shutil\n",
"\n",
"from reco_utils.dataset.criteo import get_spark_schema, load_spark_df\n",
"from reco_utils.common.k8s_utils import qps_to_replicas, replicas_to_qps, nodes_to_replicas\n",
"from reco_utils.utils.k8s_utils import qps_to_replicas, replicas_to_qps, nodes_to_replicas\n",
"\n",
"from azureml.core import Workspace\n",
"from azureml.core import VERSION as azureml_version\n",

View file

@ -7,7 +7,7 @@ from fastai.collab import collab_learner, CollabDataBunch
import surprise
import cornac
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
COL_DICT,
DEFAULT_K,
DEFAULT_USER_COL,
@ -17,8 +17,8 @@ from reco_utils.common.constants import (
DEFAULT_TIMESTAMP_COL,
SEED,
)
from reco_utils.common.timer import Timer
from reco_utils.common.spark_utils import start_or_get_spark
from reco_utils.utils.timer import Timer
from reco_utils.utils.spark_utils import start_or_get_spark
from reco_utils.recommender.sar.sar_singlenode import SARSingleNode
from reco_utils.recommender.ncf.ncf_singlenode import NCF
from reco_utils.recommender.ncf.dataset import Dataset as NCFDataset
@ -44,12 +44,7 @@ from reco_utils.evaluation.python_evaluation import (
precision_at_k,
recall_at_k,
)
from reco_utils.evaluation.python_evaluation import (
rmse,
mae,
rsquared,
exp_var
)
from reco_utils.evaluation.python_evaluation import rmse, mae, rsquared, exp_var
def prepare_training_als(train, test):
@ -82,8 +77,7 @@ def prepare_metrics_als(train, test):
)
)
spark = start_or_get_spark()
return spark.createDataFrame(train, schema), spark.createDataFrame(test,
schema)
return spark.createDataFrame(train, schema), spark.createDataFrame(test, schema)
def predict_als(model, test):
@ -172,10 +166,7 @@ def prepare_training_fastai(train, test):
def train_fastai(params, data):
model = collab_learner(
data,
n_factors=params["n_factors"],
y_range=params["y_range"],
wd=params["wd"]
data, n_factors=params["n_factors"], y_range=params["y_range"], wd=params["wd"]
)
with Timer() as t:
model.fit_one_cycle(cyc_len=params["epochs"], max_lr=params["max_lr"])
@ -267,10 +258,7 @@ def recommend_k_ncf(model, test, train, top_k=DEFAULT_K, remove_seen=True):
}
)
merged = pd.merge(
train,
topk_scores,
on=[DEFAULT_USER_COL, DEFAULT_ITEM_COL],
how="outer"
train, topk_scores, on=[DEFAULT_USER_COL, DEFAULT_ITEM_COL], how="outer"
)
topk_scores = merged[merged[DEFAULT_RATING_COL].isnull()].drop(
DEFAULT_RATING_COL, axis=1
@ -280,8 +268,7 @@ def recommend_k_ncf(model, test, train, top_k=DEFAULT_K, remove_seen=True):
def prepare_training_cornac(train, test):
return cornac.data.Dataset.from_uir(
train.drop(DEFAULT_TIMESTAMP_COL, axis=1).itertuples(index=False),
seed=SEED
train.drop(DEFAULT_TIMESTAMP_COL, axis=1).itertuples(index=False), seed=SEED
)
@ -344,11 +331,7 @@ def train_lightgcn(params, data):
return model, t
def recommend_k_lightgcn(model,
test,
train,
top_k=DEFAULT_K,
remove_seen=True):
def recommend_k_lightgcn(model, test, train, top_k=DEFAULT_K, remove_seen=True):
with Timer() as t:
topk_scores = model.recommend_k_items(
test, top_k=top_k, remove_seen=remove_seen

View file

@ -106,8 +106,8 @@
"tf.get_logger().setLevel('ERROR') # only show error messages\n",
"import surprise\n",
"\n",
"from reco_utils.common.general_utils import get_number_processors\n",
"from reco_utils.common.gpu_utils import get_cuda_version, get_cudnn_version\n",
"from reco_utils.utils.general_utils import get_number_processors\n",
"from reco_utils.utils.gpu_utils import get_cuda_version, get_cudnn_version\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.python_splitters import python_stratified_split\n",
"\n",

View file

@ -36,13 +36,13 @@
"import pandas as pd\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"from reco_utils.common.timer import Timer\n",
"from reco_utils.utils.timer import Timer\n",
"from reco_utils.recommender.deeprec.models.graphrec.lightgcn import LightGCN\n",
"from reco_utils.recommender.deeprec.DataModel.ImplicitCF import ImplicitCF\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.python_splitters import python_stratified_split\n",
"from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",
"from reco_utils.common.constants import SEED as DEFAULT_SEED\n",
"from reco_utils.utils.constants import SEED as DEFAULT_SEED\n",
"from reco_utils.recommender.deeprec.deeprec_utils import prepare_hparams\n",
"from reco_utils.recommender.deeprec.deeprec_utils import cal_metric\n",
"from utils.general import *\n",

View file

@ -84,7 +84,7 @@ It is also possible to install directly from GitHub. Or from a specific branch a
# Contents
## [Common](common)
## [Utils](utils)
This submodule contains high-level utilities for defining constants used in most algorithms as well as helper functions for managing aspects of different frameworks: GPU, Spark, Jupyter notebook.
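For illustration, a few of the helpers this submodule exposes (names taken from the modules referenced throughout this commit; treat the snippet as a sketch rather than a complete API tour):

```python
from reco_utils.utils.notebook_utils import is_jupyter, is_databricks
from reco_utils.utils.gpu_utils import get_cuda_version, get_cudnn_version
from reco_utils.utils.spark_utils import start_or_get_spark

print("Running inside Jupyter:", is_jupyter())
print("Running on Databricks:", is_databricks())
print("CUDA version:", get_cuda_version())
print("cuDNN version:", get_cudnn_version())

# Create (or reuse) a local SparkSession
spark = start_or_get_spark("example-app")
```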

View file

@ -11,7 +11,7 @@ import random
import logging
import _pickle as cPickle
from reco_utils.common.constants import SEED
from reco_utils.utils.constants import SEED
from reco_utils.dataset.download_utils import maybe_download, download_path

View file

@ -12,7 +12,7 @@ except ImportError:
pass # so the environment without spark doesn't break
from reco_utils.dataset.download_utils import maybe_download, download_path
from reco_utils.common.notebook_utils import is_databricks
from reco_utils.utils.notebook_utils import is_databricks
CRITEO_URL = {
@ -40,11 +40,11 @@ def load_pandas_df(size="sample", local_cache_path=None, header=DEFAULT_HEADER):
The schema is:
.. code-block:: python
<label> <integer feature 1> ... <integer feature 13> <categorical feature 1> ... <categorical feature 26>
More details (need to accept user terms to see the information):
http://labs.criteo.com/2013/12/download-terabyte-click-logs/
More details (need to accept user terms to see the information):
http://labs.criteo.com/2013/12/download-terabyte-click-logs/
Args:
size (str): Dataset size. It can be "sample" or "full".
@ -80,13 +80,13 @@ def load_spark_df(
onto 32 bits for anonymization purposes.
The schema is:
.. code-block:: python
<label> <integer feature 1> ... <integer feature 13> <categorical feature 1> ... <categorical feature 26>
More details (need to accept user terms to see the information):
http://labs.criteo.com/2013/12/download-terabyte-click-logs/
More details (need to accept user terms to see the information):
http://labs.criteo.com/2013/12/download-terabyte-click-logs/
Args:
spark (pySpark.SparkSession): Spark session.
@ -95,7 +95,7 @@ def load_spark_df(
header (list): Dataset header names.
dbfs_datapath (str): Where to store the extracted files on Databricks.
dbutils (Databricks.dbutils): Databricks utility object.
Returns:
pyspark.sql.DataFrame: Criteo DAC training dataset.
"""
@ -145,10 +145,10 @@ def extract_criteo(size, compressed_file, path=None):
size (str): Size of Criteo dataset. It can be "full" or "sample".
compressed_file (str): Path to compressed file.
path (str): Path to extract the file.
Returns:
str: Path to the extracted file.
"""
if path is None:
folder = os.path.dirname(compressed_file)
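A minimal usage sketch of the `load_pandas_df` loader documented above; `size="sample"` downloads the small demo file, and the column names come from the default header:

```python
from reco_utils.dataset.criteo import load_pandas_df

# Download and load the small sample of the Criteo DAC dataset
df = load_pandas_df(size="sample")
print(df.shape)
print(df.head())
```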

View file

@ -8,8 +8,8 @@ import warnings
import pandas as pd
from zipfile import ZipFile
from reco_utils.dataset.download_utils import maybe_download, download_path
from reco_utils.common.notebook_utils import is_databricks
from reco_utils.common.constants import (
from reco_utils.utils.notebook_utils import is_databricks
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,
@ -151,7 +151,7 @@ def load_pandas_df(
"""Loads the MovieLens dataset as pd.DataFrame.
Download the dataset from http://files.grouplens.org/datasets/movielens, unzip, and load.
To load movie information only, you can use load_item_df function.
To load movie information only, you can use load_item_df function.
Args:
size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
@ -165,12 +165,12 @@ def load_pandas_df(
Returns:
pandas.DataFrame: Movie rating dataset.
**Examples**
.. code-block:: python
# To load just user-id, item-id, and ratings from MovieLens-1M dataset,
df = load_pandas_df('1m', ('UserId', 'ItemId', 'Rating'))
@ -345,14 +345,14 @@ def load_spark_df(
Download the dataset from http://files.grouplens.org/datasets/movielens, unzip, and load as `pyspark.sql.DataFrame`.
To load movie information only, you can use `load_item_df` function.
To load movie information only, you can use `load_item_df` function.
Args:
spark (pyspark.SparkSession): Spark session.
size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
header (list or tuple): Rating dataset header.
If schema is provided, this argument is ignored.
schema (pyspark.StructType): Dataset schema.
schema (pyspark.StructType): Dataset schema.
local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
If None, all the intermediate files will be stored in a temporary directory and removed after use.
dbutils (Databricks.dbutils): Databricks utility object
@ -363,11 +363,11 @@ def load_spark_df(
Returns:
pyspark.sql.DataFrame: Movie rating dataset.
**Examples**
.. code-block:: python
# To load just user-id, item-id, and ratings from MovieLens-1M dataset:
spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating'))

View file

@ -6,7 +6,7 @@ import pandas as pd
import numpy as np
from functools import lru_cache, wraps
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,

View file

@ -4,7 +4,7 @@ import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as sk_split
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_ITEM_COL,
DEFAULT_USER_COL,
DEFAULT_TIMESTAMP_COL,
@ -245,8 +245,8 @@ def numpy_stratified_split(X, ratio=0.75, seed=42):
seed (int): random seed
Returns:
numpy.ndarray, numpy.ndarray:
- Xtr: The train set user/item affinity matrix.
numpy.ndarray, numpy.ndarray:
- Xtr: The train set user/item affinity matrix.
- Xtst: The test set user/item affinity matrix.
"""

View file

@ -8,7 +8,7 @@ try:
except ImportError:
pass # skip this import if we are in pure python environment
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_ITEM_COL,
DEFAULT_USER_COL,
DEFAULT_TIMESTAMP_COL,
@ -19,15 +19,15 @@ from reco_utils.dataset.split_utils import process_split_ratio, min_rating_filte
def spark_random_split(data, ratio=0.75, seed=42):
"""Spark random splitter.
Randomly split the data into several splits.
Args:
data (pyspark.sql.DataFrame): Spark DataFrame to be split.
ratio (float or list): Ratio for splitting data. If it is a single float number
it splits data into two halves and the ratio argument indicates the ratio of
training data set; if it is a list of float numbers, the splitter splits
data into several portions corresponding to the split ratios. If a list
it splits data into two halves and the ratio argument indicates the ratio of
training data set; if it is a list of float numbers, the splitter splits
data into several portions corresponding to the split ratios. If a list
is provided and the ratios are not summed to 1, they will be normalized.
seed (int): Seed.
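A brief sketch of the behaviour the docstring describes, for both the single-ratio and multi-ratio cases (`df` is any Spark DataFrame of ratings):

```python
from reco_utils.dataset.spark_splitters import spark_random_split

# A single float ratio produces two splits: 75% train / 25% test
train, test = spark_random_split(df, ratio=0.75, seed=42)

# A list of ratios produces several splits; ratios that do not sum to 1 are normalized
train, valid, test = spark_random_split(df, ratio=[0.7, 0.2, 0.1], seed=42)
```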
@ -56,30 +56,30 @@ def _do_stratification_spark(
):
"""Helper function to perform stratified splits.
This function splits data in a stratified manner. That is, the same values for the
filter_by column are retained in each split, but the corresponding set of entries
are divided according to the ratio provided.
This function splits data in a stratified manner. That is, the same values for the
filter_by column are retained in each split, but the corresponding set of entries
are divided according to the ratio provided.
Args:
data (pyspark.sql.DataFrame): Spark DataFrame to be split.
ratio (float or list): Ratio for splitting data. If it is a single float number
it splits data into two sets and the ratio argument indicates the ratio of
training data set; if it is a list of float numbers, the splitter splits
data into several portions corresponding to the split ratios. If a list is
provided and the ratios are not summed to 1, they will be normalized.
min_rating (int): minimum number of ratings for user or item.
filter_by (str): either "user" or "item", depending on which of the two is to filter
with min_rating.
is_partitioned (bool): flag to partition data by filter_by column
is_random (bool): flag to make split randomly or use timestamp column
seed (int): Seed.
col_user (str): column name of user IDs.
col_item (str): column name of item IDs.
col_timestamp (str): column name of timestamps.
Args:
data (pyspark.sql.DataFrame): Spark DataFrame to be split.
ratio (float or list): Ratio for splitting data. If it is a single float number
it splits data into two sets and the ratio argument indicates the ratio of
training data set; if it is a list of float numbers, the splitter splits
data into several portions corresponding to the split ratios. If a list is
provided and the ratios are not summed to 1, they will be normalized.
min_rating (int): minimum number of ratings for user or item.
filter_by (str): either "user" or "item", depending on which of the two is to filter
with min_rating.
is_partitioned (bool): flag to partition data by filter_by column
is_random (bool): flag to make split randomly or use timestamp column
seed (int): Seed.
col_user (str): column name of user IDs.
col_item (str): column name of item IDs.
col_timestamp (str): column name of timestamps.
Args:
Args:
Returns:
Returns:
"""
# A few preliminary checks.
if filter_by not in ["user", "item"]:
@ -115,17 +115,16 @@ def _do_stratification_spark(
window_spec = Window.partitionBy(partition_by).orderBy(order_by)
data = (
data
.withColumn("_count", F.count(split_by).over(window_count))
.withColumn("_rank", F.row_number().over(window_spec) / F.col("_count"))
.drop("_count")
data.withColumn("_count", F.count(split_by).over(window_count))
.withColumn("_rank", F.row_number().over(window_spec) / F.col("_count"))
.drop("_count")
)
multi_split, ratio = process_split_ratio(ratio)
ratio = ratio if multi_split else [ratio, 1 - ratio]
splits = []
prev_split = None
prev_split = None
for split in np.cumsum(ratio):
condition = F.col("_rank") <= split
if prev_split is not None:
@ -156,8 +155,8 @@ def spark_chrono_split(
data (pyspark.sql.DataFrame): Spark DataFrame to be split.
ratio (float or list): Ratio for splitting data. If it is a single float number
it splits data into two sets and the ratio argument indicates the ratio of
training data set; if it is a list of float numbers, the splitter splits
data into several portions corresponding to the split ratios. If a list is
training data set; if it is a list of float numbers, the splitter splits
data into several portions corresponding to the split ratios. If a list is
provided and the ratios are not summed to 1, they will be normalized.
seed (int): Seed.
min_rating (int): minimum number of ratings for user or item.
@ -183,6 +182,7 @@ def spark_chrono_split(
col_timestamp=col_timestamp,
)
def spark_stratified_split(
data,
ratio=0.75,
@ -228,6 +228,7 @@ def spark_stratified_split(
col_item=col_item,
)
def spark_timestamp_split(
data,
ratio=0.75,

View file

@ -9,7 +9,7 @@ from scipy.sparse import coo_matrix
import logging
# import default parameters
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,
@ -62,7 +62,7 @@ class AffinityMatrix:
map_users, map_items: dictionaries mapping the original user/item index to matrix indices
map_back_users, map_back_items: dictionaries to map back the matrix elements to the original
dataframe indices
Basic mechanics:
As a first step we retrieve the unique elements in the dataset. In this way we can take care
of either completely missing rows (a user with no ratings) or completely missing columns
@ -108,7 +108,7 @@ class AffinityMatrix:
np.save(self.save_path + "/item_back_dict", self.map_back_items)
def gen_affinity_matrix(self):
"""Generate the user/item affinity matrix.
"""Generate the user/item affinity matrix.
As a first step, two new columns are added to the input DF, containing the index maps
generated by the gen_index() method. The new indices, together with the ratings, are

View file

@ -5,7 +5,7 @@ import pandas as pd
import numpy as np
import math
from reco_utils.common.constants import DEFAULT_ITEM_COL, DEFAULT_USER_COL
from reco_utils.utils.constants import DEFAULT_ITEM_COL, DEFAULT_USER_COL
try:
from pyspark.sql import functions as F, Window
@ -62,10 +62,10 @@ def min_rating_filter_pandas(
Args:
data (pandas.DataFrame): DataFrame of user-item tuples. Columns of user and item
should be present in the DataFrame while other columns like rating,
should be present in the DataFrame while other columns like rating,
timestamp, etc. can be optional.
min_rating (int): minimum number of ratings for user or item.
filter_by (str): either "user" or "item", depending on which of the two is to
filter_by (str): either "user" or "item", depending on which of the two is to
filter with min_rating.
col_user (str): column name of user ID.
col_item (str): column name of item ID.
@ -73,19 +73,12 @@ def min_rating_filter_pandas(
Returns:
pandas.DataFrame: DataFrame with at least columns of user and item that has been filtered by the given specifications.
"""
split_by_column = _get_column_name(
filter_by, col_user, col_item
)
split_by_column = _get_column_name(filter_by, col_user, col_item)
if min_rating < 1:
raise ValueError("min_rating should be integer and larger than or equal to 1.")
return (
data
.groupby(split_by_column)
.filter(lambda x: len(x) >= min_rating)
)
return data.groupby(split_by_column).filter(lambda x: len(x) >= min_rating)
def min_rating_filter_spark(
@ -103,10 +96,10 @@ def min_rating_filter_spark(
Args:
data (pyspark.sql.DataFrame): DataFrame of user-item tuples. Columns of user and item
should be present in the DataFrame while other columns like rating,
should be present in the DataFrame while other columns like rating,
timestamp, etc. can be optional.
min_rating (int): minimum number of ratings for user or item.
filter_by (str): either "user" or "item", depending on which of the two is to
filter_by (str): either "user" or "item", depending on which of the two is to
filter with min_rating.
col_user (str): column name of user ID.
col_item (str): column name of item ID.
@ -115,9 +108,7 @@ def min_rating_filter_spark(
pyspark.sql.DataFrame: DataFrame with at least columns of user and item that has been filtered by the given specifications.
"""
split_by_column = _get_column_name(
filter_by, col_user, col_item
)
split_by_column = _get_column_name(filter_by, col_user, col_item)
if min_rating < 1:
raise ValueError("min_rating should be integer and larger than or equal to 1.")
@ -125,12 +116,11 @@ def min_rating_filter_spark(
if min_rating > 1:
window = Window.partitionBy(split_by_column)
data = (
data
.withColumn("_count", F.count(split_by_column).over(window))
data.withColumn("_count", F.count(split_by_column).over(window))
.where(F.col("_count") >= min_rating)
.drop("_count")
)
return data
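For illustration, how the two filters above are typically called; `ratings_pdf` and `ratings_sdf` are placeholder pandas and Spark DataFrames of user-item ratings:

```python
from reco_utils.dataset.split_utils import (
    min_rating_filter_pandas,
    min_rating_filter_spark,
)

# Keep only users with at least 5 ratings (pandas)
dense_pdf = min_rating_filter_pandas(ratings_pdf, min_rating=5, filter_by="user")

# Keep only items with at least 10 ratings (Spark)
dense_sdf = min_rating_filter_spark(ratings_sdf, min_rating=10, filter_by="item")
```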

View file

@ -13,7 +13,7 @@ from sklearn.metrics import (
log_loss,
)
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,
@ -31,14 +31,14 @@ from reco_utils.dataset.pandas_df_utils import (
def check_column_dtypes(func):
"""Checks columns of DataFrame inputs
This includes the checks on:
This includes the checks on:
* whether the input columns exist in the input DataFrames
* whether the data types of col_user as well as col_item are matched in the two input DataFrames.
Args:
func (function): function that will be wrapped
Returns:
function: Wrapper function for checking dtypes.
"""
@ -100,7 +100,7 @@ def merge_rating_true_pred(
):
"""Join truth and prediction data frames on userID and itemID and return the true
and predicted ratings with the correct index.
Args:
rating_true (pandas.DataFrame): True data
rating_pred (pandas.DataFrame): Predicted data
@ -210,7 +210,7 @@ def rsquared(
col_item (str): column name for item
col_rating (str): column name for rating
col_prediction (str): column name for prediction
Returns:
float: R squared (min=0, max=1).
"""
@ -352,7 +352,7 @@ def merge_ranking_true_pred(
k=DEFAULT_K,
threshold=DEFAULT_THRESHOLD,
):
"""Filter truth and prediction data frames on common users
"""Filter truth and prediction data frames on utils users
Args:
rating_true (pandas.DataFrame): True DataFrame
@ -361,7 +361,7 @@ def merge_ranking_true_pred(
col_item (str): column name for item
col_rating (str): column name for rating
col_prediction (str): column name for prediction
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
top k items are directly provided, so there is no need to compute the relevancy operation.
k (int): number of top k items per user (optional)
threshold (float): threshold of top items per user (optional)
@ -438,7 +438,7 @@ def precision_at_k(
col_item (str): column name for item
col_rating (str): column name for rating
col_prediction (str): column name for prediction
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
top k items are directly provided, so there is no need to compute the relevancy operation.
k (int): number of top k items per user
threshold (float): threshold of top items per user (optional)
@ -485,13 +485,13 @@ def recall_at_k(
col_item (str): column name for item
col_rating (str): column name for rating
col_prediction (str): column name for prediction
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
top k items are directly provided, so there is no need to compute the relevancy operation.
k (int): number of top k items per user
threshold (float): threshold of top items per user (optional)
Returns:
float: recall at k (min=0, max=1). The maximum value is 1 even when fewer than
float: recall at k (min=0, max=1). The maximum value is 1 even when fewer than
k items exist for a user in rating_true.
"""
@ -525,9 +525,9 @@ def ndcg_at_k(
threshold=DEFAULT_THRESHOLD,
):
"""Normalized Discounted Cumulative Gain (nDCG).
Info: https://en.wikipedia.org/wiki/Discounted_cumulative_gain
Args:
rating_true (pandas.DataFrame): True DataFrame
rating_pred (pandas.DataFrame): Predicted DataFrame
@ -535,7 +535,7 @@ def ndcg_at_k(
col_item (str): column name for item
col_rating (str): column name for rating
col_prediction (str): column name for prediction
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
top k items are directly provided, so there is no need to compute the relevancy operation.
k (int): number of top k items per user
threshold (float): threshold of top items per user (optional)
@ -587,7 +587,7 @@ def map_at_k(
threshold=DEFAULT_THRESHOLD,
):
"""Mean Average Precision at k
The implementation of MAP is referenced from Spark MLlib evaluation metrics.
https://spark.apache.org/docs/2.3.0/mllib-evaluation-metrics.html#ranking-systems
@ -597,7 +597,7 @@ def map_at_k(
Note:
1. The evaluation function is named as 'MAP is at k' because the evaluation class takes top k items for
the prediction items. The naming is different from Spark.
2. The MAP is meant to calculate Avg. Precision for the relevant items, so it is normalized by the number of
relevant items in the ground truth data, instead of k.
@ -608,7 +608,7 @@ def map_at_k(
col_item (str): column name for item
col_rating (str): column name for rating
col_prediction (str): column name for prediction
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
top k items are directly provided, so there is no need to compute the relevancy operation.
k (int): number of top k items per user
threshold (float): threshold of top items per user (optional)
@ -649,7 +649,7 @@ def get_top_k_items(
"""Get the input customer-item-rating tuple in the format of Pandas
DataFrame, output a Pandas DataFrame in the dense format of top k items
for each user.
Note:
If it is implicit rating, just append a column of constants to be
ratings.
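For reference, the textbook definitions behind these ranking metrics; the docstrings above indicate the module follows these conventions, with MAP normalized by the number of relevant items rather than by k. Here $\mathcal{R}_u$ is the set of relevant (ground-truth) items for user $u$, $\mathcal{T}_u^k$ the top-k recommended items, and $\mathrm{rel}_i$ the relevance of the item at rank $i$:

```latex
\mathrm{Precision@}k = \frac{|\mathcal{R}_u \cap \mathcal{T}_u^k|}{k},
\qquad
\mathrm{Recall@}k = \frac{|\mathcal{R}_u \cap \mathcal{T}_u^k|}{|\mathcal{R}_u|},
\qquad
\mathrm{NDCG@}k = \frac{\sum_{i=1}^{k} \mathrm{rel}_i / \log_2(i+1)}{\mathrm{IDCG@}k},
\qquad
\mathrm{MAP@}k = \frac{1}{|U|} \sum_{u \in U} \frac{1}{|\mathcal{R}_u|}
                 \sum_{i=1}^{k} \mathrm{Precision@}i \cdot \mathrm{rel}_i
```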

View file

@ -4,7 +4,7 @@
from pyspark.sql.types import *
from pyspark.sql import functions as F
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
)
@ -21,7 +21,7 @@ class DiversityEvaluation:
col_item=DEFAULT_ITEM_COL,
col_relevance=None,
):
"""Initializer.
"""Initializer.
This is the Spark version of diversity metrics evaluator.
The methods of this class calculate the following diversity metrics:
@ -163,7 +163,7 @@ class DiversityEvaluation:
def user_diversity(self):
"""Calculate average diversity for recommendations for each user.
The metric definition is based on formula (3) in the following reference:
- Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist: introducing serendipity into music recommendation, WSDM 2012
@ -201,7 +201,7 @@ class DiversityEvaluation:
The metric definition is based on following reference:
- P. Castells, S. Vargas, and J. Wang, Novelty and diversity metrics for recommender systems: choice, discovery and relevance, ECIR 2011
- Eugene Yan, Serendipity: Accuracy's unpopular best friend in Recommender Systems, eugeneyan.com, April 2020
Returns:
pyspark.sql.dataframe.DataFrame: A dataframe with following columns: col_item, item_novelty.
"""
@ -266,7 +266,7 @@ class DiversityEvaluation:
The metric definition is based on following reference:
- Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist: introducing serendipity into music recommendation, WSDM 2012
- Eugene Yan, Serendipity: Accuracy's unpopular best friend in Recommender Systems, eugeneyan.com, April 2020
Returns:
pyspark.sql.dataframe.DataFrame: A dataframe with following columns: col_user, col_item, user_item_serendipity.
"""
@ -380,8 +380,6 @@ class DiversityEvaluation:
"p(i)", F.col("count") / count_row_reco
).withColumn("entropy(i)", F.col("p(i)") * F.log2(F.col("p(i)")))
# distributional coverage
d_coverage = -df_entropy.agg(
F.sum("entropy(i)")
).collect()[0][0]
d_coverage = -df_entropy.agg(F.sum("entropy(i)")).collect()[0][0]
return d_coverage
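In formula form, the computation above is the Shannon entropy of the recommendation distribution: with $p(i)$ the fraction of all recommendations that go to item $i$,

```latex
\mathrm{distributional\ coverage} = -\sum_{i} p(i)\,\log_2 p(i),
\qquad
p(i) = \frac{\mathrm{count}(i)}{\text{total number of recommendations}}
```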

View file

@ -10,7 +10,7 @@ try:
except ImportError:
pass # skip this import if we are in pure python environment
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_PREDICTION_COL,
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
@ -117,7 +117,7 @@ class SparkRatingEvaluation:
def rmse(self):
"""Calculate Root Mean Squared Error.
Returns:
float: Root mean squared error.
"""
@ -125,7 +125,7 @@ class SparkRatingEvaluation:
def mae(self):
"""Calculate Mean Absolute Error.
Returns:
float: Mean Absolute Error.
"""
@ -187,7 +187,7 @@ class SparkRankingEvaluation:
col_rating (str): column name for rating.
col_prediction (str): column name for prediction.
k (int): number of items to recommend to each user.
relevancy_method (str): method for determining relevant items. Possible
relevancy_method (str): method for determining relevant items. Possible
values are "top_k", "by_time_stamp", and "by_threshold".
threshold (float): threshold for determining the relevant recommended items.
This is used for the case that predicted ratings follow a known
@ -305,7 +305,7 @@ class SparkRankingEvaluation:
def recall_at_k(self):
"""Get recall@K.
NOTE:
NOTE:
More details can be found `here <http://spark.apache.org/docs/2.1.1/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics.meanAveragePrecision>`_.
Return:
@ -320,7 +320,7 @@ class SparkRankingEvaluation:
def ndcg_at_k(self):
"""Get Normalized Discounted Cumulative Gain (NDCG)
NOTE:
NOTE:
More details can be found `here <http://spark.apache.org/docs/2.1.1/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics.ndcgAt>`_.
Return:
@ -333,7 +333,7 @@ class SparkRankingEvaluation:
def map_at_k(self):
"""Get mean average precision at k.
NOTE:
NOTE:
More details can be found `here <http://spark.apache.org/docs/2.1.1/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics.meanAveragePrecision>`_.
Return:
@ -356,7 +356,7 @@ def _get_top_k_items(
DataFrame, output a Spark DataFrame in the dense format of top k items
for each user.
NOTE:
NOTE:
if it is implicit rating, just append a column of constants to be ratings.
Args:

View file

@ -4,7 +4,7 @@
import pandas as pd
import numpy as np
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_PREDICTION_COL,

View file

@ -6,7 +6,7 @@ import numpy as np
import pandas as pd
import scipy.sparse as sp
import time
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_ITEM_COL,
DEFAULT_USER_COL,
DEFAULT_RATING_COL,
@ -194,7 +194,7 @@ class ImplicitCF(object):
batch_size (int): Batch size of users.
Returns:
numpy.ndarray, numpy.ndarray, numpy.ndarray:
numpy.ndarray, numpy.ndarray, numpy.ndarray:
- Sampled users.
- Sampled positive items.
- Sampled negative items.

View file

@ -13,14 +13,14 @@ from reco_utils.evaluation.python_evaluation import (
precision_at_k,
recall_at_k,
)
from reco_utils.common.python_utils import get_top_k_scored_items
from reco_utils.utils.python_utils import get_top_k_scored_items
class LightGCN(object):
"""LightGCN model
:Citation:
He, Xiangnan, Kuan Deng, Xiang Wang, Yan Li, Yongdong Zhang, and Meng Wang.
"LightGCN: Simplifying and Powering Graph Convolution Network for Recommendation." arXiv
preprint arXiv:2002.02126, 2020.

View file

@ -8,7 +8,7 @@ import fastai
import fastprogress
from fastprogress.fastprogress import force_console_behavior
from reco_utils.common import constants as cc
from reco_utils.utils import constants as cc
def cartesian_product(*arrays):
@ -38,7 +38,7 @@ def score(
top_k=None,
):
"""Score all users+items provided and reduce to top_k items per user if top_k>0
Args:
learner (object): Model.
test_df (pandas.DataFrame): Test dataframe.
@ -48,7 +48,7 @@ def score(
top_k (int): Number of top items to recommend.
Returns:
pandas.DataFrame: Result of recommendation
pandas.DataFrame: Result of recommendation
"""
# replace values not known to the model with NaN
total_users, total_items = learner.data.train_ds.x.classes.values()

View file

@ -12,14 +12,15 @@ from sklearn import datasets
from sklearn.preprocessing import normalize
from numba import jit, prange
from reco_utils.common.python_utils import binarize
from reco_utils.utils.python_utils import binarize
from .geoimc_utils import length_normalize, reduce_dims
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("geoimc")
class DataPtr():
class DataPtr:
"""
Holds data and its respective indices
"""
@ -39,7 +40,6 @@ class DataPtr():
self.data_indices = None
self.entity_indices = [None, None]
def get_data(self):
"""
Returns:
@ -49,33 +49,26 @@ class DataPtr():
return self.data
return self.data[self.data_indices]
def get_entity(self, of="row"):
""" Get entity
"""Get entity
Args:
of (str): The entity, either 'row' or 'col'
Returns:
numpy.ndarray: Entity matrix (based on the entity_indices filter)
"""
idx = 0 if of=="row" else 1
idx = 0 if of == "row" else 1
if self.entity_indices[idx] is None:
return self.entities[idx]
return self.entities[idx][self.entity_indices[idx]]
class Dataset():
class Dataset:
"""
Base class that holds necessary (minimal) information needed
"""
def __init__(
self,
name,
features_dim=0,
normalize=False,
target_transform=''
):
def __init__(self, name, features_dim=0, normalize=False, target_transform=""):
"""Initialize parameters
Args:
@ -97,11 +90,8 @@ class Dataset():
self.feat_normalize = normalize
self.target_transform = target_transform
def normalize(self):
"""Normalizes the entity features
"""
"""Normalizes the entity features"""
if self.feat_normalize:
for i in range(len(self.entities)):
if isspmatrix_csr(self.entities[i]):
@ -110,7 +100,6 @@ class Dataset():
else:
self.entities[i] = length_normalize(self.entities[i])
def generate_train_test_data(self, data, test_ratio=0.3):
"""Generate train, test split. The split is performed on the row
entities. So, this essentially becomes a cold start row entity test.
@ -127,16 +116,13 @@ class Dataset():
np.array(range(0, data.shape[0])),
test_size=test_ratio,
shuffle=True,
random_state=0
random_state=0,
)
self.training_data.entity_indices[0] = self.training_data.data_indices
self.test_data.entity_indices[0] = self.test_data.data_indices
def reduce_dims(self):
"""Reduces the dimensionality of entity features.
"""
"""Reduces the dimensionality of entity features."""
if self.features_dim != 0:
self.entities[0] = reduce_dims(self.entities[0], self.features_dim)
self.entities[1] = reduce_dims(self.entities[1], self.features_dim)
@ -153,7 +139,6 @@ class ML_100K(Dataset):
self.min_rating = 1
self.max_rating = 5
def df2coo(self, df):
"""Convert the input dataframe into a coo matrix
@ -161,46 +146,57 @@ class ML_100K(Dataset):
df (pandas.DataFrame): DataFrame containing the target matrix information.
"""
data = []
row = list(df['user id']-1)
col = list(df['item id']-1)
row = list(df["user id"] - 1)
col = list(df["item id"] - 1)
for idx in range(0, len(df)):
val = df['rating'].iloc[idx]
val = df["rating"].iloc[idx]
data += [val]
if self.target_transform == 'normalize':
data = data/np.sqrt(np.sum(np.arange(self.min_rating, self.max_rating+1)**2))
elif self.target_transform == 'binarize':
if self.target_transform == "normalize":
data = data / np.sqrt(
np.sum(np.arange(self.min_rating, self.max_rating + 1) ** 2)
)
elif self.target_transform == "binarize":
data = binarize(np.array(data), 3)
# TODO: Get this from `u.info`
return coo_matrix((data, (row, col)), shape=(943, 1682))
def _read_from_file(self, path):
"""Read the traget matrix from file at path.
Args:
path (str): Path to the target matrix
"""
df = pd.read_csv(path, delimiter='\t', names=['user id','item id','rating','timestamp'], encoding="ISO-8859-1")
df.drop(['timestamp'], axis=1, inplace=True)
df = pd.read_csv(
path,
delimiter="\t",
names=["user id", "item id", "rating", "timestamp"],
encoding="ISO-8859-1",
)
df.drop(["timestamp"], axis=1, inplace=True)
return self.df2coo(df)
def load_data(self, path):
""" Load dataset
"""Load dataset
Args:
path (str): Path to the directory containing ML100K dataset
e1_path (str): Path to the file containing row (user) features of ML100K dataset
e2_path (str): Path to the file containing col (movie) features of ML100K dataset
"""
self.entities = [self._load_user_features(f"{path}/u.user"), self._load_item_features(f"{path}/u.item")]
self.entities = [
self._load_user_features(f"{path}/u.user"),
self._load_item_features(f"{path}/u.item"),
]
self.normalize()
self.reduce_dims()
self.training_data = DataPtr(self._read_from_file(f"{path}/u1.base").tocsr(), self.entities)
self.test_data = DataPtr(self._read_from_file(f"{path}/u1.test").tocsr(), self.entities)
self.training_data = DataPtr(
self._read_from_file(f"{path}/u1.base").tocsr(), self.entities
)
self.test_data = DataPtr(
self._read_from_file(f"{path}/u1.test").tocsr(), self.entities
)
def _load_user_features(self, path):
"""Load user features
@ -209,21 +205,26 @@ class ML_100K(Dataset):
path (str): Path to the file containing user features information
"""
data = pd.read_csv(path, delimiter='|', names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])
features_df = pd.concat(
[data['user_id'],
pd.get_dummies(data['user_id']),
pd.get_dummies(data['age']),
pd.get_dummies(data['gender']),
pd.get_dummies(data['occupation']),
pd.get_dummies(data['zip_code'])],
axis=1
data = pd.read_csv(
path,
delimiter="|",
names=["user_id", "age", "gender", "occupation", "zip_code"],
)
features_df.drop(['user_id'], axis=1, inplace=True)
features_df = pd.concat(
[
data["user_id"],
pd.get_dummies(data["user_id"]),
pd.get_dummies(data["age"]),
pd.get_dummies(data["gender"]),
pd.get_dummies(data["occupation"]),
pd.get_dummies(data["zip_code"]),
],
axis=1,
)
features_df.drop(["user_id"], axis=1, inplace=True)
user_features = np.nan_to_num(features_df.to_numpy())
return user_features
def _load_item_features(self, path):
"""Load item features
@ -231,38 +232,43 @@ class ML_100K(Dataset):
path (str): Path to the file containing item features information
"""
header =[
'movie_id',
'movie_title',
'release_date',
'video_release_date',
'IMDb_URL',
'unknown',
'Action',
'Adventure',
'Animation',
'Childrens',
'Comedy',
'Crime',
'Documentary',
'Drama',
'Fantasy',
'Film-Noir',
'Horror',
'Musical',
'Mystery',
'Romance',
'Sci-Fi',
'Thriller',
'War',
'Western']
data = pd.read_csv(path, delimiter='|', names=header, encoding="ISO-8859-1")
header = [
"movie_id",
"movie_title",
"release_date",
"video_release_date",
"IMDb_URL",
"unknown",
"Action",
"Adventure",
"Animation",
"Childrens",
"Comedy",
"Crime",
"Documentary",
"Drama",
"Fantasy",
"Film-Noir",
"Horror",
"Musical",
"Mystery",
"Romance",
"Sci-Fi",
"Thriller",
"War",
"Western",
]
data = pd.read_csv(path, delimiter="|", names=header, encoding="ISO-8859-1")
features_df = pd.concat([
pd.get_dummies(data['movie_title']),
pd.get_dummies(data['release_date']),
pd.get_dummies('video_release_date'),
pd.get_dummies('IMDb_URL'),
data[header[5:]]], axis=1)
features_df = pd.concat(
[
pd.get_dummies(data["movie_title"]),
pd.get_dummies(data["release_date"]),
pd.get_dummies("video_release_date"),
pd.get_dummies("IMDb_URL"),
data[header[5:]],
],
axis=1,
)
item_features = np.nan_to_num(features_df.to_numpy())
return item_features
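
In the df2coo hunk above, target_transform='normalize' divides every rating by sqrt(1^2 + 2^2 + ... + 5^2) = sqrt(55) ≈ 7.42 for the 1-5 MovieLens scale, and 'binarize' thresholds at 3 (assuming binarize keeps values strictly above the threshold). A quick check of that arithmetic:

    import numpy as np

    ratings = np.array([1, 2, 3, 4, 5])
    denom = np.sqrt(np.sum(np.arange(1, 6) ** 2))  # sqrt(1 + 4 + 9 + 16 + 25) = sqrt(55) ≈ 7.416
    print(ratings / denom)            # ≈ [0.135 0.270 0.405 0.539 0.674]
    print((ratings > 3).astype(int))  # binarize at threshold 3 -> [0 0 0 1 1]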

View File

@ -6,7 +6,8 @@ from scipy.linalg import sqrtm
from numba import njit, jit, prange
from .geoimc_utils import length_normalize
from reco_utils.common.python_utils import binarize as conv_binary
from reco_utils.utils.python_utils import binarize as conv_binary
class PlainScalarProduct(object):
"""
@ -14,12 +15,7 @@ class PlainScalarProduct(object):
as the retrieval criterion
"""
def __init__(
self,
X,
Y,
**kwargs
):
def __init__(self, X, Y, **kwargs):
"""
Args:
X: numpy matrix of shape (users, features)
@ -28,25 +24,18 @@ class PlainScalarProduct(object):
self.X = X
self.Y = Y
def sim(self, **kwargs):
"""Calculate the similarity score
"""
"""Calculate the similarity score"""
sim = self.X.dot(self.Y.T)
return sim
class Inferer():
class Inferer:
"""
Holds necessary (minimal) information needed for inference
"""
def __init__(
self,
method='dot',
k=10,
transformation=''
):
def __init__(self, method="dot", k=10, transformation=""):
"""Initialize parameters
Args:
@ -64,7 +53,6 @@ class Inferer():
self.k = k
self.transformation = transformation
def _get_method(self, k):
"""Get the inferer method
@ -74,13 +62,12 @@ class Inferer():
Returns:
class: A class object implementing the inferer 'k'
"""
if k == 'dot':
if k == "dot":
method = PlainScalarProduct
else:
raise ValueError(f"{k} is unknown.")
return method
def infer(self, dataPtr, W, **kwargs):
"""Main inference method
@ -96,18 +83,15 @@ class Inferer():
a = dataPtr.get_entity("row").dot(W[0]).dot(sqrtm(W[1]))
b = dataPtr.get_entity("col").dot(W[2]).dot(sqrtm(W[1]))
sim_score = self.method(
a,
b
).sim(**kwargs)
sim_score = self.method(a, b).sim(**kwargs)
if self.transformation == 'mean':
if self.transformation == "mean":
prediction = conv_binary(sim_score, sim_score.mean())
elif self.transformation == 'topk':
elif self.transformation == "topk":
masked_sim_score = sim_score.copy()
for i in range(sim_score.shape[0]):
topKidx = np.argpartition(masked_sim_score[i], -self.k)[-self.k:]
topKidx = np.argpartition(masked_sim_score[i], -self.k)[-self.k :]
mask = np.ones(sim_score[i].size, dtype=bool)
mask[topKidx] = False
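
For transformation='topk', the infer hunk above keeps only the k largest similarity scores in each row via np.argpartition. A standalone check of that masking trick on toy scores:

    import numpy as np

    sim = np.array([[0.1, 0.9, 0.4, 0.7],
                    [0.8, 0.2, 0.6, 0.3]])
    k = 2
    top = np.zeros_like(sim)
    for i in range(sim.shape[0]):
        idx = np.argpartition(sim[i], -k)[-k:]  # indices of the k largest scores in row i
        top[i, idx] = 1                         # keep the top-k entries, zero out the rest
    print(top)  # [[0. 1. 0. 1.]
                #  [1. 0. 1. 0.]]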

View File

@ -3,7 +3,7 @@ import numpy as np
import pandas as pd
import warnings
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_ITEM_COL,
DEFAULT_USER_COL,
DEFAULT_RATING_COL,
@ -27,21 +27,21 @@ class Dataset(object):
binary=True,
seed=None,
):
"""Constructor
"""Constructor
Args:
train (pandas.DataFrame): Training data with at least columns (col_user, col_item, col_rating).
test (pandas.DataFrame): Test data with at least columns (col_user, col_item, col_rating). test can be None,
test (pandas.DataFrame): Test data with at least columns (col_user, col_item, col_rating). test can be None,
if so, we only process the training data.
n_neg (int): Number of negative samples for training set.
n_neg_test (int): Number of negative samples for test set.
col_user (str): User column name.
col_item (str): Item column name.
col_rating (str): Rating column name.
col_rating (str): Rating column name.
col_timestamp (str): Timestamp column name.
binary (bool): If true, set rating > 0 to rating = 1.
binary (bool): If true, set rating > 0 to rating = 1.
seed (int): Seed.
"""
# initialize user and item index
self.user_idx = None
@ -66,14 +66,14 @@ class Dataset(object):
"""Process the dataset to reindex userID and itemID, also set rating as binary feedback
Args:
train (pandas.DataFrame): Training data with at least columns (col_user, col_item, col_rating).
train (pandas.DataFrame): Training data with at least columns (col_user, col_item, col_rating).
test (pandas.DataFrame): Test data with at least columns (col_user, col_item, col_rating)
test can be None, if so, we only process the training data.
binary (bool): If true, set rating>0 to rating = 1.
Returns:
list: train and test pandas.DataFrame Dataset, which have been reindexed.
"""
# If testing dataset is None
df = train if test is None else train.append(test)
@ -109,12 +109,12 @@ class Dataset(object):
"""Process dataset to reindex userID and itemID, also set rating as binary feedback
Args:
df (pandas.DataFrame): dataframe with at least columns (col_user, col_item, col_rating)
binary (bool): if true, set rating>0 to rating = 1
df (pandas.DataFrame): dataframe with at least columns (col_user, col_item, col_rating)
binary (bool): if true, set rating>0 to rating = 1
Returns:
list: train and test pandas.DataFrame Dataset, which have been reindexed.
"""
# If testing dataset is None
@ -140,7 +140,7 @@ class Dataset(object):
def _init_train_data(self):
"""Return all negative items (in train dataset) and store them in self.interact_status[self.col_item + '_negative']
        and store the train dataset in self.users, self.items and self.ratings
"""
self.item_pool = set(self.train[self.col_item].unique())
@ -277,11 +277,11 @@ class Dataset(object):
def train_loader(self, batch_size, shuffle=True):
"""Feed train data every batch.
Args:
batch_size (int): Batch size.
            shuffle (bool): If true, train data will be shuffled.
Yields:
list: A list of userID list, itemID list, and rating list. Public data loader returns the userID, itemID consistent with raw data.
"""
@ -305,7 +305,7 @@ class Dataset(object):
def test_loader(self):
"""Feed leave-one-out data every user
Generate test batch by every positive test instance,
(eg. \[1, 2, 1\] is a positive user & item pair in test set
(\[userID, itemID, rating\] for this tuple). This function
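
The Dataset hunks above reindex raw user and item IDs to contiguous integers and, when binary=True, turn any positive rating into implicit feedback of 1. A minimal sketch of that preprocessing on a toy frame with the default column names:

    import pandas as pd

    train = pd.DataFrame({"userID": [10, 10, 42],
                          "itemID": [7, 9, 7],
                          "rating": [4.0, 2.0, 5.0]})

    # map arbitrary IDs to 0..n-1, as the reindexing step does
    user_idx = {u: i for i, u in enumerate(train["userID"].unique())}
    item_idx = {m: i for i, m in enumerate(train["itemID"].unique())}
    train["userID"] = train["userID"].map(user_idx)
    train["itemID"] = train["itemID"].map(item_idx)

    # binary implicit feedback: any rating > 0 becomes 1
    train["rating"] = (train["rating"] > 0).astype(float)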

View File

@ -5,7 +5,7 @@ import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_ITEM_COL,
DEFAULT_USER_COL,
DEFAULT_RATING_COL,

View File

@ -7,14 +7,14 @@ import pandas as pd
import logging
from scipy import sparse
from reco_utils.common.python_utils import (
from reco_utils.utils.python_utils import (
jaccard,
lift,
exponential_decay,
get_top_k_scored_items,
rescale,
)
from reco_utils.common import constants
from reco_utils.utils import constants
COOCCUR = "cooccurrence"
@ -111,7 +111,7 @@ class SARSingleNode:
self.index2item = None
def compute_affinity_matrix(self, df, rating_col):
""" Affinity matrix.
"""Affinity matrix.
The user-affinity matrix can be constructed by treating the users and items as
indices in a sparse matrix, and the events as the data. Here, we're treating
@ -157,7 +157,7 @@ class SARSingleNode:
return df.groupby([self.col_user, self.col_item]).sum().reset_index()
def compute_coocurrence_matrix(self, df):
""" Co-occurrence matrix.
"""Co-occurrence matrix.
The co-occurrence matrix is defined as :math:`C = U^T * U`
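
The two SAR docstrings above describe the affinity matrix (users and items as indices into a sparse matrix, events as the data) and the item co-occurrence matrix C = U^T * U. A compact scipy illustration with toy indices; the real class first maps raw IDs through its index dictionaries:

    import numpy as np
    from scipy import sparse

    users  = np.array([0, 0, 1, 2, 2])            # row indices
    items  = np.array([0, 1, 1, 2, 3])            # column indices
    events = np.array([1.0, 1.0, 1.0, 1.0, 1.0])  # event weights (e.g. time-decayed ratings)

    U = sparse.csr_matrix((events, (users, items)), shape=(3, 4))  # user-affinity matrix
    C = U.transpose().dot(U)                                       # item co-occurrence, C = U^T * U
    print(C.toarray())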

View File

@ -5,12 +5,12 @@ import pandas as pd
import numpy as np
import pandas as pd
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_PREDICTION_COL,
)
from reco_utils.common.general_utils import invert_dictionary
from reco_utils.utils.general_utils import invert_dictionary
def surprise_trainset_to_df(
@ -25,7 +25,7 @@ def surprise_trainset_to_df(
col_user (str): User column name.
col_item (str): Item column name.
col_rating (str): Rating column name.
Returns:
pandas.DataFrame: A dataframe with user column (str), item column (str), and rating column (float).
"""
@ -53,13 +53,13 @@ def predict(
predcol=DEFAULT_PREDICTION_COL,
):
"""Computes predictions of an algorithm from Surprise on the data. Can be used for computing rating metrics like RMSE.
Args:
algo (surprise.prediction_algorithms.algo_base.AlgoBase): an algorithm from Surprise
data (pandas.DataFrame): the data on which to predict
usercol (str): name of the user column
itemcol (str): name of the item column
Returns:
pandas.DataFrame: Dataframe with usercol, itemcol, predcol
"""
@ -84,14 +84,14 @@ def compute_ranking_predictions(
):
"""Computes predictions of an algorithm from Surprise on all users and items in data. It can be used for computing
ranking metrics like NDCG.
Args:
algo (surprise.prediction_algorithms.algo_base.AlgoBase): an algorithm from Surprise
data (pandas.DataFrame): the data from which to get the users and items
usercol (str): name of the user column
itemcol (str): name of the item column
remove_seen (bool): flag to remove (user, item) pairs seen in the training data
Returns:
pandas.DataFrame: Dataframe with usercol, itemcol, predcol
"""

View File

@ -12,7 +12,7 @@ from subprocess import run
from tempfile import TemporaryDirectory
import pandas as pd
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,
@ -67,7 +67,7 @@ class VW:
Args:
params (dict): key = parameter, value = value (use True if parameter is just a flag)
Returns:
list[str]: vw command line parameters as list of strings
"""
@ -88,10 +88,10 @@ class VW:
def parse_train_params(self, params):
"""Parse input hyper-parameters to build vw train commands
Args:
params (dict): key = parameter, value = value (use True if parameter is just a flag)
Returns:
list[str]: vw command line parameters as list of strings
"""
@ -127,10 +127,10 @@ class VW:
def parse_test_params(self, params):
"""Parse input hyper-parameters to build vw test commands
Args:
params (dict): key = parameter, value = value (use True if parameter is just a flag)
Returns:
list[str]: vw command line parameters as list of strings
"""
@ -188,7 +188,7 @@ class VW:
def to_vw_file(self, df, train=True):
"""Convert Pandas DataFrame to vw input format file
Args:
df (pandas.DataFrame): input DataFrame
train (bool): flag for train mode (or test mode if False)
@ -228,7 +228,7 @@ class VW:
def fit(self, df):
"""Train model
Args:
df (pandas.DataFrame): input training data
"""
@ -241,7 +241,7 @@ class VW:
def predict(self, df):
"""Predict results
Args:
df (pandas.DataFrame): input test data
"""

View File

@ -3,8 +3,8 @@
import tensorflow as tf
from reco_utils.common.constants import DEFAULT_USER_COL, DEFAULT_ITEM_COL
from reco_utils.common.tf_utils import MODEL_DIR
from reco_utils.utils.constants import DEFAULT_USER_COL, DEFAULT_ITEM_COL
from reco_utils.utils.tf_utils import MODEL_DIR
def build_feature_columns(
@ -37,9 +37,9 @@ def build_feature_columns(
'wide_deep' for a combination of linear model and neural networks.
Returns:
list, list:
- The wide feature columns
            - The deep feature columns. If only the wide model is selected, the deep column list is empty and vice versa.
list, list:
- The wide feature columns
            - The deep feature columns. If only the wide model is selected, the deep column list is empty and vice versa.
"""
if model_type not in ["wide", "deep", "wide_deep"]:
raise ValueError("Model type should be either 'wide', 'deep', or 'wide_deep'")
@ -102,7 +102,7 @@ def _build_deep_columns(
item_dim (int): Item embedding dimension.
item_feat_col (str): Item feature column name.
item_feat_shape (int or an iterable of integers): Item feature array shape.
Returns:
list: Deep feature columns.
"""
@ -140,7 +140,7 @@ def build_model(
seed=None,
):
"""Build wide-deep model.
To generate wide model, pass wide_columns only.
To generate deep model, pass deep_columns only.
To generate wide_deep model, pass both wide_columns and deep_columns.
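
build_feature_columns, shown above, returns a list of wide (linear) columns and a list of deep (embedding) columns depending on model_type. A pared-down sketch of the idea with the tf.feature_column API; bucket sizes and dimensions are made up:

    import tensorflow as tf

    n_users, n_items, user_dim, item_dim = 1000, 500, 16, 16

    user_ids = tf.feature_column.categorical_column_with_identity("userID", num_buckets=n_users)
    item_ids = tf.feature_column.categorical_column_with_identity("itemID", num_buckets=n_items)

    # wide part: memorize user-item co-occurrences through a crossed column
    wide_columns = [tf.feature_column.crossed_column([user_ids, item_ids], hash_bucket_size=1000)]

    # deep part: generalize through dense user and item embeddings
    deep_columns = [
        tf.feature_column.embedding_column(user_ids, dimension=user_dim),
        tf.feature_column.embedding_column(item_ids, dimension=item_dim),
    ]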

View File

@ -12,7 +12,7 @@ import nni
import reco_utils.evaluation.python_evaluation as evaluation
from reco_utils.recommender.ncf.ncf_singlenode import NCF
from reco_utils.recommender.ncf.dataset import Dataset as NCFDataset
from reco_utils.common.constants import SEED as DEFAULT_SEED
from reco_utils.utils.constants import SEED as DEFAULT_SEED
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("ncf")

View File

@ -19,7 +19,7 @@ import pandas as pd
import pytest
from sklearn.model_selection import train_test_split
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,
@ -27,7 +27,7 @@ from reco_utils.common.constants import (
)
from reco_utils.dataset.python_splitters import numpy_stratified_split
from reco_utils.dataset.python_splitters import python_chrono_split
from reco_utils.common.spark_utils import start_or_get_spark
from reco_utils.utils.spark_utils import start_or_get_spark
@pytest.fixture(scope="session")

View File

@ -11,7 +11,7 @@ except ImportError:
pass # disable error while collecting tests for non-notebook environments
from reco_utils.common.gpu_utils import get_number_gpus
from reco_utils.utils.gpu_utils import get_number_gpus
TOL = 0.5

View File

@ -3,13 +3,14 @@
import pytest
try:
import papermill as pm
import scrapbook as sb
except ImportError:
pass # disable error while collecting tests for non-notebook environments
from reco_utils.common.gpu_utils import get_number_gpus
from reco_utils.utils.gpu_utils import get_number_gpus
TOL = 0.5

View File

@ -3,12 +3,13 @@
import os
import pytest
try:
import papermill as pm
except ImportError:
pass # disable error while collecting tests for non-notebook environments
from reco_utils.common.gpu_utils import get_number_gpus
from reco_utils.utils.gpu_utils import get_number_gpus
@pytest.mark.notebooks

View File

@ -2,7 +2,7 @@
# Licensed under the MIT License.
import pytest
from reco_utils.common.general_utils import invert_dictionary, get_number_processors
from reco_utils.utils.general_utils import invert_dictionary, get_number_processors
def test_invert_dictionary():

View File

@ -11,7 +11,7 @@ except ImportError:
pass # skip this import if we are in cpu environment
from reco_utils.common.gpu_utils import (
from reco_utils.utils.gpu_utils import (
get_cuda_version,
get_cudnn_version,
get_gpu_info,

View File

@ -1,7 +1,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from reco_utils.common.k8s_utils import (
from reco_utils.utils.k8s_utils import (
qps_to_replicas,
replicas_to_qps,
nodes_to_replicas,

View File

@ -6,7 +6,7 @@
"collapsed": true
},
"source": [
"# This is a test notebook for reco_utils.common.notebook_utils module"
"# This is a test notebook for reco_utils.utils.notebook_utils module"
]
},
{
@ -20,7 +20,7 @@
"\n",
"import scrapbook as sb\n",
"from reco_utils.common.notebook_utils import is_jupyter, is_databricks"
"from reco_utils.utils.notebook_utils import is_jupyter, is_databricks"
]
},
{

View File

@ -3,12 +3,13 @@
from pathlib import Path
import pytest
try:
import papermill as pm
import scrapbook as sb
except ImportError:
pass # disable error while collecting tests for non-notebook environments
from reco_utils.common.notebook_utils import is_jupyter, is_databricks
from reco_utils.utils.notebook_utils import is_jupyter, is_databricks
@pytest.mark.notebooks

View File

@ -6,7 +6,7 @@ import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import pytest
from reco_utils.common.plot import line_graph
from reco_utils.utils.plot import line_graph
def test_line_graph():

View File

@ -5,7 +5,7 @@
import numpy as np
import pytest
from reco_utils.common.python_utils import (
from reco_utils.utils.python_utils import (
exponential_decay,
jaccard,
lift,

View File

@ -6,7 +6,7 @@ import os
import numpy as np
import pandas as pd
import pytest
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,
@ -15,7 +15,7 @@ from reco_utils.common.constants import (
from reco_utils.evaluation.python_evaluation import rmse
try:
from reco_utils.common.tf_utils import (
from reco_utils.utils.tf_utils import (
build_optimizer,
evaluation_log_hook,
export_model,

View File

@ -4,7 +4,7 @@
import pytest
import time
from reco_utils.common.timer import Timer
from reco_utils.utils.timer import Timer
TOL = 0.03

View File

@ -17,7 +17,7 @@ from reco_utils.dataset.python_splitters import (
numpy_stratified_split,
)
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,

View File

@ -4,7 +4,7 @@
import numpy as np
import pandas as pd
import pytest
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,

View File

@ -6,7 +6,7 @@ import numpy as np
import pytest
from reco_utils.dataset.sparse import AffinityMatrix
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,

View File

@ -6,7 +6,7 @@ import pandas as pd
import pytest
from unittest.mock import Mock
from sklearn.preprocessing import minmax_scale
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,
@ -65,6 +65,7 @@ def rating_nohit():
)
# fmt: on
@pytest.fixture
def rating_true_binary(rating_true):
# Convert true ratings to binary

View File

@ -6,7 +6,7 @@ import pandas as pd
import pytest
import cornac
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,

View File

@ -9,7 +9,7 @@ import pandas as pd
from scipy.sparse import csr_matrix
from pandas.testing import assert_frame_equal
from reco_utils.common.python_utils import binarize
from reco_utils.utils.python_utils import binarize
from reco_utils.recommender.geoimc.geoimc_data import DataPtr
from reco_utils.recommender.geoimc.geoimc_predict import PlainScalarProduct, Inferer
from reco_utils.recommender.geoimc.geoimc_algorithm import IMCProblem

View File

@ -2,7 +2,7 @@
# Licensed under the MIT License.
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
SEED,

View File

@ -9,7 +9,7 @@ import pytest
try:
from reco_utils.recommender.ncf.ncf_singlenode import NCF
from reco_utils.recommender.ncf.dataset import Dataset
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
SEED,

View File

@ -10,7 +10,7 @@ import pandas as pd
from pandas.testing import assert_frame_equal
import urllib
from reco_utils.common.constants import DEFAULT_PREDICTION_COL
from reco_utils.utils.constants import DEFAULT_PREDICTION_COL
from reco_utils.recommender.sar.sar_singlenode import SARSingleNode

View File

@ -6,7 +6,7 @@ import pandas as pd
import pytest
import surprise
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,

View File

@ -4,14 +4,14 @@
import os
import pytest
import pandas as pd
from reco_utils.common.constants import (
from reco_utils.utils.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL,
)
try:
from reco_utils.common.tf_utils import pandas_input_fn, MODEL_DIR
from reco_utils.utils.tf_utils import pandas_input_fn, MODEL_DIR
from reco_utils.recommender.wide_deep.wide_deep_utils import (
build_model,
build_feature_columns,