This commit is contained in:
Parent: 96ce535ae3
Commit: cec5efadcc
@@ -226,7 +226,7 @@
 " regParam=0.05,\n",
 " coldStartStrategy='drop',\n",
 " nonnegative=False,\n",
-" seed=0,\n",
+" seed=42,\n",
 " **header\n",
 ")"
 ]
@@ -134,7 +134,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"data = NCFDataset(train=train, test=test, seed=123)"
+"data = NCFDataset(train=train, test=test, seed=42)"
 ]
 },
 {
@@ -398,7 +398,7 @@
 " train, test = python_random_split(\n",
 " data.drop('Genres_string', axis=1), # We don't need Genres original string column\n",
 " ratio=0.75,\n",
-" seed=123 \n",
+" seed=42 \n",
 " )\n",
 " data_loaded = True\n",
 "\n",
@@ -276,7 +276,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"dfs_train, dfs_test = spark_random_split(dfs, ratio=0.75, seed=123)"
+"dfs_train, dfs_test = spark_random_split(dfs, ratio=0.75, seed=42)"
 ]
 },
 {
@@ -204,7 +204,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"train, test = python_random_split(data, ratio=0.75, seed=123)"
+"train, test = python_random_split(data, ratio=0.75, seed=42)"
 ]
 },
 {
@@ -317,7 +317,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"data = NCFDataset(train=train, test=test, seed=123)"
+"data = NCFDataset(train=train, test=test, seed=42)"
 ]
 },
 {
@@ -381,7 +381,7 @@
 ],
 "source": [
 "# Use the same seed to make the train and test sets identical across other notebooks in the repo.\n",
-"train, test = python_random_split(data, ratio=0.75, seed=123)\n",
+"train, test = python_random_split(data, ratio=0.75, seed=42)\n",
 "# Further split the train set into train and validation set.\n",
 "train, valid = python_random_split(train)\n",
 "\n",
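The notebook hunks above standardize the split seed to 42 so every notebook produces the same train/test partition. As a minimal pandas/numpy sketch of why a fixed seed gives identical splits across runs (an illustration only, not the repository's python_random_split implementation):

import numpy as np
import pandas as pd

def seeded_split(data, ratio=0.75, seed=42):
    # Shuffle row positions with a dedicated RNG so the cut is reproducible.
    rng = np.random.RandomState(seed)
    order = rng.permutation(len(data))
    cut = int(ratio * len(data))
    return data.iloc[order[:cut]], data.iloc[order[cut:]]

ratings = pd.DataFrame({"userID": [1, 1, 2, 2, 3],
                        "itemID": [10, 11, 10, 12, 11],
                        "rating": [4, 5, 3, 2, 5]})
train_a, test_a = seeded_split(ratings, seed=42)
train_b, test_b = seeded_split(ratings, seed=42)
assert train_a.equals(train_b) and test_a.equals(test_b)  # same seed, same split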
@@ -889,7 +889,7 @@
 " regParam=reg,\n",
 " coldStartStrategy='drop',\n",
 " nonnegative=False,\n",
-" seed=0,\n",
+" seed=42,\n",
 " **HEADER_ALS\n",
 " )\n",
 " \n",
@@ -1083,7 +1083,7 @@
 " alpha=0.1,\n",
 " coldStartStrategy='drop',\n",
 " nonnegative=False,\n",
-" seed=0,\n",
+" seed=42,\n",
 " **HEADER_ALS\n",
 ")\n",
 " \n",
@@ -1130,7 +1130,7 @@
 " regParam=reg,\n",
 " coldStartStrategy='drop',\n",
 " nonnegative=False,\n",
-" seed=0,\n",
+" seed=42,\n",
 " **header\n",
 " )\n",
 " \n",
@@ -1310,7 +1310,7 @@
 " alpha=0.1,\n",
 " coldStartStrategy='drop',\n",
 " nonnegative=False,\n",
-" seed=0,\n",
+" seed=42,\n",
 " **HEADER_ALS\n",
 ")\n",
 " \n",
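These ALS hunks feed the shared seed into Spark's ALS estimator. A rough sketch of the call they wrap, assuming pyspark is installed and that the notebooks' **header / **HEADER_ALS dictionaries carry the column names; the rank and maxIter values below are placeholders, not taken from the hunks:

from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Assumed column mapping; the notebooks supply it via **header / **HEADER_ALS.
header = {"userCol": "userID", "itemCol": "itemID", "ratingCol": "rating"}

als = ALS(
    rank=10,                    # placeholder hyperparameter
    maxIter=15,                 # placeholder hyperparameter
    regParam=0.05,
    coldStartStrategy="drop",
    nonnegative=False,
    seed=42,
    **header,
)
# model = als.fit(dfs_train)   # dfs_train: a Spark DataFrame with the three columns above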
@@ -368,7 +368,7 @@
 },
 "outputs": [],
 "source": [
-"train, test = spark_random_split(data, ratio=0.75, seed=123)\n",
+"train, test = spark_random_split(data, ratio=0.75, seed=42)\n",
 "print (\"N train\", train.cache().count())\n",
 "print (\"N test\", test.cache().count())"
 ]
@@ -16,7 +16,7 @@ from reco_utils.dataset.split_utils import (
 )


-def python_random_split(data, ratio=0.75, seed=123):
+def python_random_split(data, ratio=0.75, seed=42):
 """Pandas random splitter
 The splitter randomly splits the input data.

@@ -135,7 +135,7 @@ def python_stratified_split(
 filter_by="user",
 col_user=DEFAULT_USER_COL,
 col_item=DEFAULT_ITEM_COL,
-seed=1234,
+seed=42,
 ):
 """Pandas stratified splitter
 For each user / item, the split function takes proportions of ratings which is
@@ -211,7 +211,7 @@ def python_stratified_split(
 return splits_list


-def numpy_stratified_split(X, ratio=0.75, seed=123):
+def numpy_stratified_split(X, ratio=0.75, seed=42):

 """
 Split the user/item affinity matrix (sparse matrix) into train and test set matrices while maintaining
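Besides python_random_split, the same default now applies to python_stratified_split, which keeps a fixed fraction of each user's (or item's) ratings in the train set. A rough pandas sketch of that idea under the new default seed (again an illustration, not the library code):

import numpy as np
import pandas as pd

def stratified_split(data, ratio=0.75, col_user="userID", seed=42):
    # Shuffle once with a seeded RNG, then take the leading fraction of each user's rows.
    shuffled = data.sample(frac=1.0, random_state=np.random.RandomState(seed))
    grouped = shuffled.groupby(col_user, group_keys=False)
    train = grouped.apply(lambda g: g.head(int(np.ceil(ratio * len(g)))))
    test = shuffled.drop(train.index)
    return train, test

ratings = pd.DataFrame({"userID": [1, 1, 1, 1, 2, 2, 2, 2],
                        "itemID": [10, 11, 12, 13, 10, 11, 12, 14],
                        "rating": [4, 3, 5, 2, 1, 5, 4, 3]})
train, test = stratified_split(ratings)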
@@ -15,7 +15,7 @@ from reco_utils.common.constants import (
 from reco_utils.dataset.split_utils import process_split_ratio, min_rating_filter_spark


-def spark_random_split(data, ratio=0.75, seed=123):
+def spark_random_split(data, ratio=0.75, seed=42):
 """Spark random splitter
 Randomly split the data into several splits.

@@ -128,7 +128,7 @@ def spark_stratified_split(
 col_user=DEFAULT_USER_COL,
 col_item=DEFAULT_ITEM_COL,
 col_rating=DEFAULT_RATING_COL,
-seed=1234,
+seed=42,
 ):
 """Spark stratified splitter
 For each user / item, the split function takes proportions of ratings which is
@@ -207,7 +207,7 @@ def spark_timestamp_split(
 ratio=0.75,
 col_user=DEFAULT_USER_COL,
 col_item=DEFAULT_ITEM_COL,
-col_timestamp=DEFAULT_TIMESTAMP_COL
+col_timestamp=DEFAULT_TIMESTAMP_COL,
 ):
 """Spark timestamp based splitter
 The splitter splits the data into sets by timestamps without stratification on either
@@ -241,14 +241,14 @@ def spark_timestamp_split(
 rating = data.withColumn("rank", row_number().over(window_spec))

 data_count = rating.count()
-rating_rank = rating.withColumn(
-"rank", row_number().over(window_spec) / data_count
-)
+rating_rank = rating.withColumn("rank", row_number().over(window_spec) / data_count)

 splits = []
 for i, _ in enumerate(ratio_index):
 if i == 0:
-rating_split = rating_rank.filter(col("rank") <= ratio_index[i]).drop("rank")
+rating_split = rating_rank.filter(col("rank") <= ratio_index[i]).drop(
+"rank"
+)
 else:
 rating_split = rating_rank.filter(
 (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1])
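The last hunk above reworks spark_timestamp_split, which ranks rows by timestamp and cuts at cumulative-fraction boundaries instead of sampling, so it needs no seed at all. Stripped of the Spark window machinery, the idea is a chronological split; a self-contained pandas sketch:

import pandas as pd

def chronological_split(data, ratio=0.75, col_timestamp="timestamp"):
    # Oldest interactions go to train, newest to test -- deterministic, so no seed involved.
    ordered = data.sort_values(col_timestamp)
    cut = int(ratio * len(ordered))
    return ordered.iloc[:cut], ordered.iloc[cut:]

events = pd.DataFrame({"userID": [1, 2, 1, 3, 2, 1],
                       "itemID": [10, 10, 11, 12, 13, 14],
                       "timestamp": [100, 101, 105, 107, 110, 120]})
train, test = chronological_split(events)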
@@ -135,7 +135,7 @@ def _check_min_rating_filter(filter_by, min_rating, col_user, col_item):
 return split_by_column, split_with_column


-def split_pandas_data_with_ratios(data, ratios, seed=1234, shuffle=False):
+def split_pandas_data_with_ratios(data, ratios, seed=42, shuffle=False):
 """Helper function to split pandas DataFrame with given ratios

 Note:
@@ -14,7 +14,7 @@ __all__ = ["BaseModel"]


 class BaseModel(object):
-def __init__(self, hparams, iterator_creator, graph=None, seed=123):
+def __init__(self, hparams, iterator_creator, graph=None, seed=42):
 """Initializing the model. Create common logics which are needed by all deeprec models, such as loss function,
 parameter set.

@@ -12,25 +12,25 @@ from reco_utils.common.constants import (


 class Dataset(object):
-'''
+"""
 classdocs
-'''
+"""

 def __init__(
-self,
-train,
-test=None,
-n_neg=4,
-n_neg_test=100,
-col_user=DEFAULT_USER_COL,
-col_item=DEFAULT_ITEM_COL,
-col_rating=DEFAULT_RATING_COL,
-col_timestamp=DEFAULT_TIMESTAMP_COL,
-seed=1234,
+self,
+train,
+test=None,
+n_neg=4,
+n_neg_test=100,
+col_user=DEFAULT_USER_COL,
+col_item=DEFAULT_ITEM_COL,
+col_rating=DEFAULT_RATING_COL,
+col_timestamp=DEFAULT_TIMESTAMP_COL,
+seed=42,
 ):
-'''
+"""
 Constructor
-'''
+"""

 # initialize user and item index
 self.user_idx = None
@@ -40,7 +40,7 @@ class Dataset(object):
 self.n_neg_test = n_neg_test
 # get col name of user, item and rating
 self.col_user = col_user
-self.col_item = col_item
+self.col_item = col_item
 self.col_rating = col_rating
 self.col_timestamp = col_timestamp
 # data preprocessing for training and test data
@@ -75,8 +75,10 @@ class Dataset(object):
 self.n_users = len(user_idx)
 self.user_idx = user_idx

-self.user2id = dict(zip(user_idx[self.col_user], user_idx[self.col_user+"_idx"]))
-self.id2user = {self.user2id[k]:k for k in self.user2id}
+self.user2id = dict(
+zip(user_idx[self.col_user], user_idx[self.col_user + "_idx"])
+)
+self.id2user = {self.user2id[k]: k for k in self.user2id}

 if self.item_idx is None:
 # Map item id
@@ -85,8 +87,10 @@ class Dataset(object):
 self.n_items = len(item_idx)
 self.item_idx = item_idx

-self.item2id = dict(zip(item_idx[self.col_item], item_idx[self.col_item+"_idx"]))
-self.id2item = {self.item2id[k]:k for k in self.item2id}
+self.item2id = dict(
+zip(item_idx[self.col_item], item_idx[self.col_item + "_idx"])
+)
+self.id2item = {self.item2id[k]: k for k in self.item2id}

 return self._reindex(train, implicit), self._reindex(test, implicit)

@@ -107,15 +111,17 @@ class Dataset(object):
 return None

 # Map user_idx and item_idx
-df = pd.merge(df, self.user_idx, on=self.col_user, how='left')
-df = pd.merge(df, self.item_idx, on=self.col_item, how='left')
+df = pd.merge(df, self.user_idx, on=self.col_user, how="left")
+df = pd.merge(df, self.item_idx, on=self.col_item, how="left")

 # If implicit feedback, set rating as 1.0 or 0.0
 if implicit:
 df[self.col_rating] = df[self.col_rating].apply(lambda x: float(x > 0))

 # Select relevant columns
-df_reindex = df[[self.col_user + "_idx", self.col_item + "_idx", self.col_rating]]
+df_reindex = df[
+[self.col_user + "_idx", self.col_item + "_idx", self.col_rating]
+]
 df_reindex.columns = [self.col_user, self.col_item, self.col_rating]

 return df_reindex
@@ -127,13 +133,15 @@ class Dataset(object):
 """

 self.item_pool = set(self.train[self.col_item].unique())
-self.interact_status = self.train\
-.groupby(self.col_user)[self.col_item]\
-.apply(set)\
-.reset_index()\
-.rename(columns={self.col_item: self.col_item + '_interacted'})
-self.interact_status[self.col_item + '_negative'] = self.interact_status[self.col_item + '_interacted']\
-.apply(lambda x: self.item_pool - x)
+self.interact_status = (
+self.train.groupby(self.col_user)[self.col_item]
+.apply(set)
+.reset_index()
+.rename(columns={self.col_item: self.col_item + "_interacted"})
+)
+self.interact_status[self.col_item + "_negative"] = self.interact_status[
+self.col_item + "_interacted"
+].apply(lambda x: self.item_pool - x)

 self.users, self.items, self.ratings = [], [], []

@@ -146,7 +154,6 @@ class Dataset(object):
 self.users = np.array(self.users)
 self.items = np.array(self.items)
 self.ratings = np.array(self.ratings)
-

 def _init_test_data(self):
 """ initialize self.test using 'leave-one-out' evaluation protocol in
@@ -154,41 +161,46 @@ class Dataset(object):
 """
 if self.test is not None:
 # get test positive set for every user
-test_interact_status = self.test\
-.groupby(self.col_user)[self.col_item]\
-.apply(set)\
-.reset_index().rename(columns={self.col_item: self.col_item + '_interacted_test'})
+test_interact_status = (
+self.test.groupby(self.col_user)[self.col_item]
+.apply(set)
+.reset_index()
+.rename(columns={self.col_item: self.col_item + "_interacted_test"})
+)

 # get negative pools for every user based on training and test interactions
 test_interact_status = pd.merge(
-test_interact_status,
-self.interact_status,
-on=self.col_user,
-how="left"
+test_interact_status, self.interact_status, on=self.col_user, how="left"
 )
-test_interact_status[self.col_item + '_negative'] = test_interact_status.apply(
-lambda row: row[self.col_item + '_negative'] - row[self.col_item + '_interacted_test'],
-axis=1,
+test_interact_status[
+self.col_item + "_negative"
+] = test_interact_status.apply(
+lambda row: row[self.col_item + "_negative"]
+- row[self.col_item + "_interacted_test"],
+axis=1,
 )
 test_ratings = pd.merge(
 self.test,
-test_interact_status[[self.col_user, self.col_item + '_negative']],
+test_interact_status[[self.col_user, self.col_item + "_negative"]],
 on=self.col_user,
-how="left"
+how="left",
 )

 # sample n_neg_test negative samples for testing
 try:
-test_ratings[self.col_item + '_negative'] = test_ratings[self.col_item + '_negative'].apply(
-lambda x: random.sample(x, self.n_neg_test)
-)
+test_ratings[self.col_item + "_negative"] = test_ratings[
+self.col_item + "_negative"
+].apply(lambda x: random.sample(x, self.n_neg_test))

 except:
-min_num = min(map(len, list(test_ratings[self.col_item + '_negative'])))
-warnings.warn("n_neg_test is larger than negative items set size! We will set n_neg as the smallest size: %d" % min_num)
-test_ratings[self.col_item + '_negative'] = test_ratings[self.col_item + '_negative'].apply(
-lambda x: random.sample(x, min_num)
+min_num = min(map(len, list(test_ratings[self.col_item + "_negative"])))
+warnings.warn(
+"n_neg_test is larger than negative items set size! We will set n_neg as the smallest size: %d"
+% min_num
 )
+test_ratings[self.col_item + "_negative"] = test_ratings[
+self.col_item + "_negative"
+].apply(lambda x: random.sample(x, min_num))

 self.test_data = []

@@ -200,15 +212,16 @@ class Dataset(object):
 self.test_items.append(int(getattr(row, self.col_item)))
 self.test_ratings.append(float(getattr(row, self.col_rating)))

-for i in getattr(row, self.col_item + '_negative'):
+for i in getattr(row, self.col_item + "_negative"):
 self.test_users.append(int(getattr(row, self.col_user)))
 self.test_items.append(int(i))
 self.test_ratings.append(float(0))

-self.test_data.append ( [
+self.test_data.append(
+[
 [self.id2user[x] for x in self.test_users],
 [self.id2item[x] for x in self.test_items],
-self.test_ratings
+self.test_ratings,
 ]
 )

@@ -219,25 +232,32 @@ class Dataset(object):
 self.users, self.items, self.ratings = [], [], []

 # sample n_neg negative samples for training
-train_ratings = pd.merge(self.train, self.interact_status[[self.col_user, self.col_item + '_negative']],
-on=self.col_user)
+train_ratings = pd.merge(
+self.train,
+self.interact_status[[self.col_user, self.col_item + "_negative"]],
+on=self.col_user,
+)

 try:
-train_ratings[self.col_item + '_negative'] = train_ratings[self.col_item + '_negative'].apply(
-lambda x: random.sample(x, self.n_neg))
+train_ratings[self.col_item + "_negative"] = train_ratings[
+self.col_item + "_negative"
+].apply(lambda x: random.sample(x, self.n_neg))
 except:
-min_num = min(map(len, list(train_ratings[self.col_item + '_negative'])))
-warnings.warn("n_neg is larger than negative items set size! We will set n_neg as the smallest size: %d" % min_num)
-train_ratings[self.col_item + '_negative'] = train_ratings[self.col_item + '_negative'].apply(
-lambda x: random.sample(x, min_num)
+min_num = min(map(len, list(train_ratings[self.col_item + "_negative"])))
+warnings.warn(
+"n_neg is larger than negative items set size! We will set n_neg as the smallest size: %d"
+% min_num
 )
+train_ratings[self.col_item + "_negative"] = train_ratings[
+self.col_item + "_negative"
+].apply(lambda x: random.sample(x, min_num))

 # generate training data
 for row in train_ratings.itertuples():
 self.users.append(int(getattr(row, self.col_user)))
 self.items.append(int(getattr(row, self.col_item)))
 self.ratings.append(float(getattr(row, self.col_rating)))
-for i in getattr(row, self.col_item + '_negative'):
+for i in getattr(row, self.col_item + "_negative"):
 self.users.append(int(getattr(row, self.col_user)))
 self.items.append(int(i))
 self.ratings.append(float(0))
@@ -265,12 +285,12 @@ class Dataset(object):
 for i in range(len(indices) // batch_size):
 begin_idx = i * batch_size
 end_idx = (i + 1) * batch_size
-batch_indices = indices[begin_idx: end_idx]
+batch_indices = indices[begin_idx:end_idx]

-# train_loader() could be called and used by our users in other situations,
+# train_loader() could be called and used by our users in other situations,
 # who expect the not re-indexed data. So we convert id --> orignal user and item
 # when returning batch
-
+
 yield [
 [self.id2user[x] for x in self.users[batch_indices]],
 [self.id2item[x] for x in self.items[batch_indices]],
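Most of the Dataset hunks only re-wrap the negative-sampling code for the formatter; the logic itself pairs every observed (user, item) interaction with n_neg items the user has never interacted with, drawn from the global item pool. A small self-contained sketch of that sampling step (not the class itself), seeded the same way:

import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
n_neg = 2

train = pd.DataFrame({"userID": [1, 1, 2, 2, 3], "itemID": [10, 11, 10, 12, 13]})
item_pool = set(train["itemID"].unique())

# items each user has interacted with, and the complement they can be sampled from
interacted = train.groupby("userID")["itemID"].apply(set)
negatives = interacted.apply(lambda items: sorted(item_pool - items))

rows = []
for user, candidates in negatives.items():
    k = min(n_neg, len(candidates))  # mirrors the warning path when the pool is too small
    for item in rng.choice(candidates, size=k, replace=False):
        rows.append((user, int(item), 0.0))  # negatives get an implicit rating of 0

neg_samples = pd.DataFrame(rows, columns=["userID", "itemID", "rating"])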
@@ -13,20 +13,20 @@ class NCF:
 """NCF implementation"""

 def __init__(
-self,
-n_users,
-n_items,
-model_type="NeuMF",
-random_state=0,
-n_factors=8,
-layer_sizes=[16,8,4],
-n_epochs=50,
-batch_size=64,
-learning_rate=5e-3,
-verbose=1,
-save=False,
-pretrain=False,
-seed=123
+self,
+n_users,
+n_items,
+model_type="NeuMF",
+random_state=0,
+n_factors=8,
+layer_sizes=[16, 8, 4],
+n_epochs=50,
+batch_size=64,
+learning_rate=5e-3,
+verbose=1,
+save=False,
+pretrain=False,
+seed=42,
 ):
 # number of users in dataset
 self.n_users = n_users
@@ -37,7 +37,11 @@ class NCF:
 # check model type
 model_options = ["gmf", "mlp", "neumf"]
 if self.model_type not in model_options:
-raise ValueError("Wrong model type, please select one of this list: {}".format(model_options))
+raise ValueError(
+"Wrong model type, please select one of this list: {}".format(
+model_options
+)
+)
 # seed
 tf.set_random_seed(seed)
 np.random.seed(seed)
@@ -64,10 +68,7 @@ class NCF:
 # parameters initialization
 self.sess.run(tf.global_variables_initializer())

-
-def _create_model(
-self,
-):
+def _create_model(self,):
 # reset graph
 tf.reset_default_graph()

@@ -82,27 +83,51 @@ class NCF:

 # set embedding table
 self.embedding_gmf_P = tf.Variable(
-tf.truncated_normal(shape=[self.n_users, self.n_factors], mean=0.0, stddev=0.01),
-name='embedding_gmf_P', dtype=tf.float32)
+tf.truncated_normal(
+shape=[self.n_users, self.n_factors], mean=0.0, stddev=0.01
+),
+name="embedding_gmf_P",
+dtype=tf.float32,
+)

 self.embedding_gmf_Q = tf.Variable(
-tf.truncated_normal(shape=[self.n_items, self.n_factors], mean=0.0, stddev=0.01),
-name='embedding_gmf_Q', dtype=tf.float32)
+tf.truncated_normal(
+shape=[self.n_items, self.n_factors], mean=0.0, stddev=0.01
+),
+name="embedding_gmf_Q",
+dtype=tf.float32,
+)

 # set embedding table
 self.embedding_mlp_P = tf.Variable(
-tf.truncated_normal(shape=[self.n_users, int(self.layer_sizes[0]/2)], mean=0.0, stddev=0.01),
-name='embedding_mlp_P', dtype=tf.float32)
+tf.truncated_normal(
+shape=[self.n_users, int(self.layer_sizes[0] / 2)],
+mean=0.0,
+stddev=0.01,
+),
+name="embedding_mlp_P",
+dtype=tf.float32,
+)

 self.embedding_mlp_Q = tf.Variable(
-tf.truncated_normal(shape=[self.n_items, int(self.layer_sizes[0]/2)], mean=0.0, stddev=0.01),
-name='embedding_mlp_Q', dtype=tf.float32)
+tf.truncated_normal(
+shape=[self.n_items, int(self.layer_sizes[0] / 2)],
+mean=0.0,
+stddev=0.01,
+),
+name="embedding_mlp_Q",
+dtype=tf.float32,
+)

 with tf.variable_scope("gmf", reuse=tf.AUTO_REUSE):

 # get user embedding p and item embedding q
-self.gmf_p = tf.reduce_sum(tf.nn.embedding_lookup(self.embedding_gmf_P, self.user_input), 1)
-self.gmf_q = tf.reduce_sum(tf.nn.embedding_lookup(self.embedding_gmf_Q, self.item_input), 1)
+self.gmf_p = tf.reduce_sum(
+tf.nn.embedding_lookup(self.embedding_gmf_P, self.user_input), 1
+)
+self.gmf_q = tf.reduce_sum(
+tf.nn.embedding_lookup(self.embedding_gmf_Q, self.item_input), 1
+)

 # get gmf vector
 self.gmf_vector = self.gmf_p * self.gmf_q
@@ -110,15 +135,21 @@ class NCF:
 with tf.variable_scope("mlp", reuse=tf.AUTO_REUSE):

 # get user embedding p and item embedding q
-self.mlp_p = tf.reduce_sum(tf.nn.embedding_lookup(self.embedding_mlp_P, self.user_input), 1)
-self.mlp_q = tf.reduce_sum(tf.nn.embedding_lookup(self.embedding_mlp_Q, self.item_input), 1)
+self.mlp_p = tf.reduce_sum(
+tf.nn.embedding_lookup(self.embedding_mlp_P, self.user_input), 1
+)
+self.mlp_q = tf.reduce_sum(
+tf.nn.embedding_lookup(self.embedding_mlp_Q, self.item_input), 1
+)

 # concatenate user and item vector
 output = tf.concat([self.mlp_p, self.mlp_q], 1)

 # MLP Layers
 for layer_size in self.layer_sizes[1:]:
-output = tf.contrib.layers.fully_connected(output, num_outputs=layer_size, activation_fn=tf.nn.relu)
+output = tf.contrib.layers.fully_connected(
+output, num_outputs=layer_size, activation_fn=tf.nn.relu
+)
 self.mlp_vector = output

 # self.output = tf.sigmoid(tf.reduce_sum(self.mlp_vector, axis=1, keepdims=True))
@@ -127,22 +158,34 @@ class NCF:

 if self.model_type == "gmf":
 # GMF only
-output = tf.contrib.layers.fully_connected(self.gmf_vector, num_outputs=1,
-activation_fn=None, biases_initializer=None)
+output = tf.contrib.layers.fully_connected(
+self.gmf_vector,
+num_outputs=1,
+activation_fn=None,
+biases_initializer=None,
+)
 self.output = tf.sigmoid(output)

 elif self.model_type == "mlp":
 # MLP only
-output = tf.contrib.layers.fully_connected(self.mlp_vector, num_outputs=1,
-activation_fn=None, biases_initializer=None)
+output = tf.contrib.layers.fully_connected(
+self.mlp_vector,
+num_outputs=1,
+activation_fn=None,
+biases_initializer=None,
+)
 self.output = tf.sigmoid(output)

 elif self.model_type == "neumf":
 # concatenate GMF and MLP vector
 self.ncf_vector = tf.concat([self.gmf_vector, self.mlp_vector], 1)
 # get predicted rating score
-output = tf.contrib.layers.fully_connected(self.ncf_vector, num_outputs=1,
-activation_fn=None, biases_initializer=None)
+output = tf.contrib.layers.fully_connected(
+self.ncf_vector,
+num_outputs=1,
+activation_fn=None,
+biases_initializer=None,
+)
 self.output = tf.sigmoid(output)

 with tf.variable_scope("loss", reuse=tf.AUTO_REUSE):
@@ -153,7 +196,9 @@ class NCF:
 with tf.variable_scope("optimizer", reuse=tf.AUTO_REUSE):

 # set optimizer
-self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
+self.optimizer = tf.train.AdamOptimizer(
+learning_rate=self.learning_rate
+).minimize(self.loss)

 def save(self, dir_name):
 """ save model parameters in `dir_name`
@@ -210,23 +255,27 @@ class NCF:
 # load gmf part
 variables = tf.global_variables()
 # get variables with 'gmf'
-var_flow_restore = [val for val in variables if 'gmf' in val.name and 'ncf' not in val.name]
+var_flow_restore = [
+val for val in variables if "gmf" in val.name and "ncf" not in val.name
+]
 # load 'gmf' variable
-saver = tf.train.Saver(var_flow_restore)
+saver = tf.train.Saver(var_flow_restore)
 # restore
-saver.restore(self.sess, os.path.join(gmf_dir, "model.ckpt"))
+saver.restore(self.sess, os.path.join(gmf_dir, "model.ckpt"))

 # load mlp part
 variables = tf.global_variables()
 # get variables with 'gmf'
-var_flow_restore = [val for val in variables if 'mlp' in val.name and 'ncf' not in val.name]
+var_flow_restore = [
+val for val in variables if "mlp" in val.name and "ncf" not in val.name
+]
 # load 'gmf' variable
-saver = tf.train.Saver(var_flow_restore)
+saver = tf.train.Saver(var_flow_restore)
 # restore
-saver.restore(self.sess, os.path.join(mlp_dir, "model.ckpt"))
+saver.restore(self.sess, os.path.join(mlp_dir, "model.ckpt"))

 # concat pretrain h_from_gmf and h_from_mlp
-vars_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='ncf')
+vars_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="ncf")

 assert len(vars_list) == 1
 ncf_fc = vars_list[0]
@@ -236,10 +285,11 @@ class NCF:
 mlp_fc = tf.contrib.framework.load_variable(mlp_dir, ncf_fc.name)

 # load fc layer by tf.concat
-assign_op = tf.assign(ncf_fc, tf.concat([alpha*gmf_fc, (1-alpha)*mlp_fc], axis=0))
+assign_op = tf.assign(
+ncf_fc, tf.concat([alpha * gmf_fc, (1 - alpha) * mlp_fc], axis=0)
+)
 self.sess.run(assign_op)

-
 def fit(self, data):
 """ fit model with training data
 Args:
@@ -256,7 +306,7 @@ class NCF:
 print("Training model: %s" % self.model_type)

 # loop for n_epochs
-for epoch_count in range(1, self.n_epochs+1):
+for epoch_count in range(1, self.n_epochs + 1):

 # negative sampling for training
 train_begin = time()
@@ -275,7 +325,7 @@ class NCF:
 feed_dict = {
 self.user_input: user_input[..., None],
 self.item_input: item_input[..., None],
-self.labels: labels[..., None]
+self.labels: labels[..., None],
 }

 # get loss and execute optimization
@@ -286,16 +336,12 @@ class NCF:
 # output every self.verbose
 if self.verbose and epoch_count % self.verbose == 0:

-print("Epoch %d [%.2fs]: train_loss = %.6f " % (
-epoch_count, train_time, sum(train_loss) / len(train_loss)))
+print(
+"Epoch %d [%.2fs]: train_loss = %.6f "
+% (epoch_count, train_time, sum(train_loss) / len(train_loss))
+)

-
-def predict(
-self,
-user_input,
-item_input,
-is_list=False,
-):
+def predict(self, user_input, item_input, is_list=False):
 """ predict function of this trained model
 Args:
 user_input ( list or element of list ): userID or userID list
@@ -313,13 +359,8 @@ class NCF:
 else:
 output = self._predict(np.array([user_input]), np.array([item_input]))
 return float(output.reshape(-1)[0])

-
-def _predict(
-self,
-user_input,
-item_input,
-):
+def _predict(self, user_input, item_input):

 # index converting
 user_input = np.array([self.user2id[x] for x in user_input])
@@ -335,4 +376,3 @@ class NCF:
 output = self.sess.run(self.output, feed_dict)
 return output

-
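For orientation, the class these hunks reformat builds the NeuMF scoring path: a GMF branch (element-wise product of user and item embeddings), an MLP branch (dense ReLU layers over their concatenation), and a final linear layer plus sigmoid over the concatenated branch outputs. A tiny numpy sketch of that forward pass with made-up weights, only to show the data flow (the real code uses TF 1.x variables and tf.contrib layers, as above):

import numpy as np

rng = np.random.RandomState(42)
n_factors, mlp_in = 8, 8          # GMF factor size; MLP input = first layer size

# made-up embeddings and weights for one (user, item) pair
p_gmf, q_gmf = rng.randn(n_factors), rng.randn(n_factors)
p_mlp, q_mlp = rng.randn(mlp_in // 2), rng.randn(mlp_in // 2)
w_mlp = rng.randn(mlp_in, 4)      # one hidden MLP layer of width 4
w_out = rng.randn(n_factors + 4)  # final layer over [gmf_vector, mlp_vector]

gmf_vector = p_gmf * q_gmf                                             # GMF branch
mlp_vector = np.maximum(np.concatenate([p_mlp, q_mlp]) @ w_mlp, 0.0)   # MLP branch (ReLU)
ncf_vector = np.concatenate([gmf_vector, mlp_vector])                  # NeuMF concatenation
score = 1.0 / (1.0 + np.exp(-ncf_vector @ w_out))                      # linear layer + sigmoid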
@@ -71,6 +71,7 @@ class RBM:
 sampling_protocol=[50, 70, 80, 90, 100],
 debug=False,
 with_metrics=False,
+seed=42
 ):

 # RBM parameters
@@ -105,6 +106,9 @@ class RBM:
 # Initialize the start time
 self.start_time = None

+# Seed
+self.seed = seed
+
 log.info("TensorFlow version: {}".format(tf.__version__))

 # =========================
@@ -154,7 +158,7 @@ class RBM:
 h_sampled (tensor, float32): sampled units. The value is 1 if pr>g and 0 otherwise.
 """

-np.random.seed(1)
+np.random.seed(self.seed)

 # sample from a Bernoulli distribution with same dimensions as input distribution
 g = tf.convert_to_tensor(np.random.uniform(size=pr.shape[1]), dtype=tf.float32)
@@ -190,7 +194,7 @@ class RBM:
 v_samp (tensor, float32): an (m,n) tensor of sampled rankings from 1 to r .
 """

-np.random.seed(1)
+np.random.seed(self.seed)

 g = np.random.uniform(size=pr.shape[2]) # sample from a uniform distribution
 f = tf.convert_to_tensor(
@@ -279,14 +283,14 @@ class RBM:
 bh (tensor, float32): (1, Nhidden) hidden units' bias, initiliazed to zero.
 """

-tf.set_random_seed(1) # set the seed for the random number generator
+tf.set_random_seed(self.seed) # set the seed for the random number generator

 with tf.variable_scope("Network_parameters"):

 self.w = tf.get_variable(
 "weight",
 [self.Nvisible, self.Nhidden],
-initializer=tf.random_normal_initializer(stddev=self.stdv, seed=1),
+initializer=tf.random_normal_initializer(stddev=self.stdv, seed=self.seed),
 dtype="float32",
 )

@@ -581,7 +585,7 @@ class RBM:
 self.dataset = tf.data.Dataset.from_tensor_slices(self.vu)

 self.dataset = self.dataset.shuffle(
-buffer_size=50, reshuffle_each_iteration=True, seed=123
+buffer_size=50, reshuffle_each_iteration=True, seed=self.seed
 ) # randomize the batch

 self.dataset = self.dataset.batch(batch_size=self.batch_size).repeat()
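The RBM hunks replace the hard-coded seeds (1 and 123) with the single seed argument added to the constructor, so one value now drives numpy sampling, the TF graph-level seed, the weight initializer, and dataset shuffling. A numpy-only sketch of the sampling step the diff touches, showing why a shared seed makes it reproducible:

import numpy as np

def bernoulli_sample(probabilities, seed):
    # Compare activation probabilities against seeded uniform noise, as in the hunks above.
    np.random.seed(seed)
    g = np.random.uniform(size=probabilities.shape)
    return (probabilities > g).astype(np.float32)

pr = np.array([0.1, 0.5, 0.9, 0.7])
assert np.array_equal(bernoulli_sample(pr, seed=42), bernoulli_sample(pr, seed=42))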