This commit is contained in:
miguelgfierro 2019-02-25 11:38:15 +00:00
Parent 96ce535ae3
Commit cec5efadcc
16 changed files with 224 additions and 160 deletions

View file

@ -226,7 +226,7 @@
" regParam=0.05,\n",
" coldStartStrategy='drop',\n",
" nonnegative=False,\n",
" seed=0,\n",
" seed=42,\n",
" **header\n",
")"
]
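
For context, a minimal sketch of how these ALS parameters are typically assembled, assuming an active SparkSession and a `header` dict that maps the rating DataFrame's column names; the rank and iteration values, and the column names, are illustrative and not taken from this diff.

from pyspark.ml.recommendation import ALS

# Assumed column mapping; the notebook passes something similar via **header.
header = {"userCol": "UserId", "itemCol": "MovieId", "ratingCol": "Rating"}

als = ALS(
    rank=10,                      # illustrative value
    maxIter=15,                   # illustrative value
    regParam=0.05,
    coldStartStrategy="drop",
    nonnegative=False,
    seed=42,                      # fixed seed so the factorization is reproducible
    **header,
)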

View file

@ -134,7 +134,7 @@
"metadata": {},
"outputs": [],
"source": [
"data = NCFDataset(train=train, test=test, seed=123)"
"data = NCFDataset(train=train, test=test, seed=42)"
]
},
{

View file

@ -398,7 +398,7 @@
" train, test = python_random_split(\n",
" data.drop('Genres_string', axis=1), # We don't need Genres original string column\n",
" ratio=0.75,\n",
" seed=123 \n",
" seed=42 \n",
" )\n",
" data_loaded = True\n",
"\n",

View file

@ -276,7 +276,7 @@
"metadata": {},
"outputs": [],
"source": [
"dfs_train, dfs_test = spark_random_split(dfs, ratio=0.75, seed=123)"
"dfs_train, dfs_test = spark_random_split(dfs, ratio=0.75, seed=42)"
]
},
{

View file

@ -204,7 +204,7 @@
"metadata": {},
"outputs": [],
"source": [
"train, test = python_random_split(data, ratio=0.75, seed=123)"
"train, test = python_random_split(data, ratio=0.75, seed=42)"
]
},
{

View file

@ -317,7 +317,7 @@
"metadata": {},
"outputs": [],
"source": [
"data = NCFDataset(train=train, test=test, seed=123)"
"data = NCFDataset(train=train, test=test, seed=42)"
]
},
{

View file

@ -381,7 +381,7 @@
],
"source": [
"# Use the same seed to make the train and test sets identical across other notebooks in the repo.\n",
"train, test = python_random_split(data, ratio=0.75, seed=123)\n",
"train, test = python_random_split(data, ratio=0.75, seed=42)\n",
"# Further split the train set into train and validation set.\n",
"train, valid = python_random_split(train)\n",
"\n",

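A small sketch of the chained split above, assuming `data` is the pandas DataFrame already loaded by the notebook; with a fixed seed the outer 75/25 split is identical across notebooks, and the inner call falls back to the defaults (ratio=0.75, seed=42).

from reco_utils.dataset.python_splitters import python_random_split

train, test = python_random_split(data, ratio=0.75, seed=42)  # reproducible outer split
train, valid = python_random_split(train)                     # defaults: ratio=0.75, seed=42
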
View file

@ -889,7 +889,7 @@
" regParam=reg,\n",
" coldStartStrategy='drop',\n",
" nonnegative=False,\n",
" seed=0,\n",
" seed=42,\n",
" **HEADER_ALS\n",
" )\n",
" \n",
@ -1083,7 +1083,7 @@
" alpha=0.1,\n",
" coldStartStrategy='drop',\n",
" nonnegative=False,\n",
" seed=0,\n",
" seed=42,\n",
" **HEADER_ALS\n",
")\n",
" \n",
@ -1130,7 +1130,7 @@
" regParam=reg,\n",
" coldStartStrategy='drop',\n",
" nonnegative=False,\n",
" seed=0,\n",
" seed=42,\n",
" **header\n",
" )\n",
" \n",
@ -1310,7 +1310,7 @@
" alpha=0.1,\n",
" coldStartStrategy='drop',\n",
" nonnegative=False,\n",
" seed=0,\n",
" seed=42,\n",
" **HEADER_ALS\n",
")\n",
" \n",

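Sketch of the surrounding regularization sweep, assuming the notebook iterates over a list of `reg` values and that `dfs_train` is the training DataFrame; pinning seed=42 keeps every configuration's factorization deterministic so the comparison across `reg` values is fair. The grid of values is illustrative only.

from pyspark.ml.recommendation import ALS

for reg in [0.001, 0.01, 0.05, 0.1]:      # illustrative grid, not from this diff
    als = ALS(
        regParam=reg,
        coldStartStrategy="drop",
        nonnegative=False,
        seed=42,
        **HEADER_ALS,                     # column mapping defined earlier in the notebook
    )
    model = als.fit(dfs_train)
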
View file

@ -368,7 +368,7 @@
},
"outputs": [],
"source": [
"train, test = spark_random_split(data, ratio=0.75, seed=123)\n",
"train, test = spark_random_split(data, ratio=0.75, seed=42)\n",
"print (\"N train\", train.cache().count())\n",
"print (\"N test\", test.cache().count())"
]

View file

@ -16,7 +16,7 @@ from reco_utils.dataset.split_utils import (
)
def python_random_split(data, ratio=0.75, seed=123):
def python_random_split(data, ratio=0.75, seed=42):
"""Pandas random splitter
The splitter randomly splits the input data.
@ -135,7 +135,7 @@ def python_stratified_split(
filter_by="user",
col_user=DEFAULT_USER_COL,
col_item=DEFAULT_ITEM_COL,
seed=1234,
seed=42,
):
"""Pandas stratified splitter
For each user / item, the split function takes proportions of ratings which is
@ -211,7 +211,7 @@ def python_stratified_split(
return splits_list
def numpy_stratified_split(X, ratio=0.75, seed=123):
def numpy_stratified_split(X, ratio=0.75, seed=42):
"""
Split the user/item affinity matrix (sparse matrix) into train and test set matrices while maintaining

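A brief usage sketch of the Python splitters above with the unified default seed; the toy DataFrame and its column names (the repo defaults userID/itemID/rating) are assumptions for illustration.

import pandas as pd
from reco_utils.dataset.python_splitters import (
    python_random_split,
    python_stratified_split,
)

df = pd.DataFrame({                       # toy data with assumed default column names
    "userID": [1, 1, 1, 2, 2, 2],
    "itemID": [1, 2, 3, 1, 2, 3],
    "rating": [5, 4, 3, 5, 4, 3],
})

train, test = python_random_split(df, ratio=0.75)                            # seed defaults to 42
train_s, test_s = python_stratified_split(df, ratio=0.75, filter_by="user")  # per-user split, seed=42
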
View file

@ -15,7 +15,7 @@ from reco_utils.common.constants import (
from reco_utils.dataset.split_utils import process_split_ratio, min_rating_filter_spark
def spark_random_split(data, ratio=0.75, seed=123):
def spark_random_split(data, ratio=0.75, seed=42):
"""Spark random splitter
Randomly split the data into several splits.
@ -128,7 +128,7 @@ def spark_stratified_split(
col_user=DEFAULT_USER_COL,
col_item=DEFAULT_ITEM_COL,
col_rating=DEFAULT_RATING_COL,
seed=1234,
seed=42,
):
"""Spark stratified splitter
For each user / item, the split function takes proportions of ratings which is
@ -207,7 +207,7 @@ def spark_timestamp_split(
ratio=0.75,
col_user=DEFAULT_USER_COL,
col_item=DEFAULT_ITEM_COL,
col_timestamp=DEFAULT_TIMESTAMP_COL
col_timestamp=DEFAULT_TIMESTAMP_COL,
):
"""Spark timestamp based splitter
The splitter splits the data into sets by timestamps without stratification on either
@ -241,14 +241,14 @@ def spark_timestamp_split(
rating = data.withColumn("rank", row_number().over(window_spec))
data_count = rating.count()
rating_rank = rating.withColumn(
"rank", row_number().over(window_spec) / data_count
)
rating_rank = rating.withColumn("rank", row_number().over(window_spec) / data_count)
splits = []
for i, _ in enumerate(ratio_index):
if i == 0:
rating_split = rating_rank.filter(col("rank") <= ratio_index[i]).drop("rank")
rating_split = rating_rank.filter(col("rank") <= ratio_index[i]).drop(
"rank"
)
else:
rating_split = rating_rank.filter(
(col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1])

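A minimal sketch of the Spark splitters above with the new defaults, assuming `dfs` is a Spark DataFrame that uses the repo's default user/item/rating/timestamp column names.

from reco_utils.dataset.spark_splitters import (
    spark_random_split,
    spark_stratified_split,
    spark_timestamp_split,
)

train, test = spark_random_split(dfs, ratio=0.75)            # seed defaults to 42
train_s, test_s = spark_stratified_split(dfs, ratio=0.75)    # per-user stratified, seed=42
train_t, test_t = spark_timestamp_split(dfs, ratio=0.75)     # chronological, no randomness involved
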
View file

@ -135,7 +135,7 @@ def _check_min_rating_filter(filter_by, min_rating, col_user, col_item):
return split_by_column, split_with_column
def split_pandas_data_with_ratios(data, ratios, seed=1234, shuffle=False):
def split_pandas_data_with_ratios(data, ratios, seed=42, shuffle=False):
"""Helper function to split pandas DataFrame with given ratios
Note:

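A short sketch of this helper, assuming a pandas DataFrame; the seed (now 42 by default) only matters when shuffle=True, otherwise rows are split in their existing order.

import pandas as pd
from reco_utils.dataset.split_utils import split_pandas_data_with_ratios

df = pd.DataFrame({"x": range(10)})                               # toy data
splits = split_pandas_data_with_ratios(df, [0.6, 0.2, 0.2], shuffle=True)
print([len(s) for s in splits])                                   # expected sizes: [6, 2, 2]
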
View file

@ -14,7 +14,7 @@ __all__ = ["BaseModel"]
class BaseModel(object):
def __init__(self, hparams, iterator_creator, graph=None, seed=123):
def __init__(self, hparams, iterator_creator, graph=None, seed=42):
"""Initializing the model. Create common logics which are needed by all deeprec models, such as loss function,
parameter set.

View file

@ -12,25 +12,25 @@ from reco_utils.common.constants import (
class Dataset(object):
'''
"""
classdocs
'''
"""
def __init__(
self,
train,
test=None,
n_neg=4,
n_neg_test=100,
col_user=DEFAULT_USER_COL,
col_item=DEFAULT_ITEM_COL,
col_rating=DEFAULT_RATING_COL,
col_timestamp=DEFAULT_TIMESTAMP_COL,
seed=1234,
self,
train,
test=None,
n_neg=4,
n_neg_test=100,
col_user=DEFAULT_USER_COL,
col_item=DEFAULT_ITEM_COL,
col_rating=DEFAULT_RATING_COL,
col_timestamp=DEFAULT_TIMESTAMP_COL,
seed=42,
):
'''
"""
Constructor
'''
"""
# initialize user and item index
self.user_idx = None
@ -40,7 +40,7 @@ class Dataset(object):
self.n_neg_test = n_neg_test
# get col name of user, item and rating
self.col_user = col_user
self.col_item = col_item
self.col_item = col_item
self.col_rating = col_rating
self.col_timestamp = col_timestamp
# data preprocessing for training and test data
@ -75,8 +75,10 @@ class Dataset(object):
self.n_users = len(user_idx)
self.user_idx = user_idx
self.user2id = dict(zip(user_idx[self.col_user], user_idx[self.col_user+"_idx"]))
self.id2user = {self.user2id[k]:k for k in self.user2id}
self.user2id = dict(
zip(user_idx[self.col_user], user_idx[self.col_user + "_idx"])
)
self.id2user = {self.user2id[k]: k for k in self.user2id}
if self.item_idx is None:
# Map item id
@ -85,8 +87,10 @@ class Dataset(object):
self.n_items = len(item_idx)
self.item_idx = item_idx
self.item2id = dict(zip(item_idx[self.col_item], item_idx[self.col_item+"_idx"]))
self.id2item = {self.item2id[k]:k for k in self.item2id}
self.item2id = dict(
zip(item_idx[self.col_item], item_idx[self.col_item + "_idx"])
)
self.id2item = {self.item2id[k]: k for k in self.item2id}
return self._reindex(train, implicit), self._reindex(test, implicit)
@ -107,15 +111,17 @@ class Dataset(object):
return None
# Map user_idx and item_idx
df = pd.merge(df, self.user_idx, on=self.col_user, how='left')
df = pd.merge(df, self.item_idx, on=self.col_item, how='left')
df = pd.merge(df, self.user_idx, on=self.col_user, how="left")
df = pd.merge(df, self.item_idx, on=self.col_item, how="left")
# If implicit feedback, set rating as 1.0 or 0.0
if implicit:
df[self.col_rating] = df[self.col_rating].apply(lambda x: float(x > 0))
# Select relevant columns
df_reindex = df[[self.col_user + "_idx", self.col_item + "_idx", self.col_rating]]
df_reindex = df[
[self.col_user + "_idx", self.col_item + "_idx", self.col_rating]
]
df_reindex.columns = [self.col_user, self.col_item, self.col_rating]
return df_reindex
@ -127,13 +133,15 @@ class Dataset(object):
"""
self.item_pool = set(self.train[self.col_item].unique())
self.interact_status = self.train\
.groupby(self.col_user)[self.col_item]\
.apply(set)\
.reset_index()\
.rename(columns={self.col_item: self.col_item + '_interacted'})
self.interact_status[self.col_item + '_negative'] = self.interact_status[self.col_item + '_interacted']\
.apply(lambda x: self.item_pool - x)
self.interact_status = (
self.train.groupby(self.col_user)[self.col_item]
.apply(set)
.reset_index()
.rename(columns={self.col_item: self.col_item + "_interacted"})
)
self.interact_status[self.col_item + "_negative"] = self.interact_status[
self.col_item + "_interacted"
].apply(lambda x: self.item_pool - x)
self.users, self.items, self.ratings = [], [], []
@ -146,7 +154,6 @@ class Dataset(object):
self.users = np.array(self.users)
self.items = np.array(self.items)
self.ratings = np.array(self.ratings)
def _init_test_data(self):
""" initialize self.test using 'leave-one-out' evaluation protocol in
@ -154,41 +161,46 @@ class Dataset(object):
"""
if self.test is not None:
# get test positive set for every user
test_interact_status = self.test\
.groupby(self.col_user)[self.col_item]\
.apply(set)\
.reset_index().rename(columns={self.col_item: self.col_item + '_interacted_test'})
test_interact_status = (
self.test.groupby(self.col_user)[self.col_item]
.apply(set)
.reset_index()
.rename(columns={self.col_item: self.col_item + "_interacted_test"})
)
# get negative pools for every user based on training and test interactions
test_interact_status = pd.merge(
test_interact_status,
self.interact_status,
on=self.col_user,
how="left"
test_interact_status, self.interact_status, on=self.col_user, how="left"
)
test_interact_status[self.col_item + '_negative'] = test_interact_status.apply(
lambda row: row[self.col_item + '_negative'] - row[self.col_item + '_interacted_test'],
axis=1,
test_interact_status[
self.col_item + "_negative"
] = test_interact_status.apply(
lambda row: row[self.col_item + "_negative"]
- row[self.col_item + "_interacted_test"],
axis=1,
)
test_ratings = pd.merge(
self.test,
test_interact_status[[self.col_user, self.col_item + '_negative']],
test_interact_status[[self.col_user, self.col_item + "_negative"]],
on=self.col_user,
how="left"
how="left",
)
# sample n_neg_test negative samples for testing
try:
test_ratings[self.col_item + '_negative'] = test_ratings[self.col_item + '_negative'].apply(
lambda x: random.sample(x, self.n_neg_test)
)
test_ratings[self.col_item + "_negative"] = test_ratings[
self.col_item + "_negative"
].apply(lambda x: random.sample(x, self.n_neg_test))
except:
min_num = min(map(len, list(test_ratings[self.col_item + '_negative'])))
warnings.warn("n_neg_test is larger than negative items set size! We will set n_neg as the smallest size: %d" % min_num)
test_ratings[self.col_item + '_negative'] = test_ratings[self.col_item + '_negative'].apply(
lambda x: random.sample(x, min_num)
min_num = min(map(len, list(test_ratings[self.col_item + "_negative"])))
warnings.warn(
"n_neg_test is larger than negative items set size! We will set n_neg as the smallest size: %d"
% min_num
)
test_ratings[self.col_item + "_negative"] = test_ratings[
self.col_item + "_negative"
].apply(lambda x: random.sample(x, min_num))
self.test_data = []
@ -200,15 +212,16 @@ class Dataset(object):
self.test_items.append(int(getattr(row, self.col_item)))
self.test_ratings.append(float(getattr(row, self.col_rating)))
for i in getattr(row, self.col_item + '_negative'):
for i in getattr(row, self.col_item + "_negative"):
self.test_users.append(int(getattr(row, self.col_user)))
self.test_items.append(int(i))
self.test_ratings.append(float(0))
self.test_data.append ( [
self.test_data.append(
[
[self.id2user[x] for x in self.test_users],
[self.id2item[x] for x in self.test_items],
self.test_ratings
self.test_ratings,
]
)
@ -219,25 +232,32 @@ class Dataset(object):
self.users, self.items, self.ratings = [], [], []
# sample n_neg negative samples for training
train_ratings = pd.merge(self.train, self.interact_status[[self.col_user, self.col_item + '_negative']],
on=self.col_user)
train_ratings = pd.merge(
self.train,
self.interact_status[[self.col_user, self.col_item + "_negative"]],
on=self.col_user,
)
try:
train_ratings[self.col_item + '_negative'] = train_ratings[self.col_item + '_negative'].apply(
lambda x: random.sample(x, self.n_neg))
train_ratings[self.col_item + "_negative"] = train_ratings[
self.col_item + "_negative"
].apply(lambda x: random.sample(x, self.n_neg))
except:
min_num = min(map(len, list(train_ratings[self.col_item + '_negative'])))
warnings.warn("n_neg is larger than negative items set size! We will set n_neg as the smallest size: %d" % min_num)
train_ratings[self.col_item + '_negative'] = train_ratings[self.col_item + '_negative'].apply(
lambda x: random.sample(x, min_num)
min_num = min(map(len, list(train_ratings[self.col_item + "_negative"])))
warnings.warn(
"n_neg is larger than negative items set size! We will set n_neg as the smallest size: %d"
% min_num
)
train_ratings[self.col_item + "_negative"] = train_ratings[
self.col_item + "_negative"
].apply(lambda x: random.sample(x, min_num))
# generate training data
for row in train_ratings.itertuples():
self.users.append(int(getattr(row, self.col_user)))
self.items.append(int(getattr(row, self.col_item)))
self.ratings.append(float(getattr(row, self.col_rating)))
for i in getattr(row, self.col_item + '_negative'):
for i in getattr(row, self.col_item + "_negative"):
self.users.append(int(getattr(row, self.col_user)))
self.items.append(int(i))
self.ratings.append(float(0))
@ -265,12 +285,12 @@ class Dataset(object):
for i in range(len(indices) // batch_size):
begin_idx = i * batch_size
end_idx = (i + 1) * batch_size
batch_indices = indices[begin_idx: end_idx]
batch_indices = indices[begin_idx:end_idx]
# train_loader() could be called and used by our users in other situations,
# train_loader() could be called and used by our users in other situations,
# who expect the not re-indexed data. So we convert id --> orignal user and item
# when returning batch
yield [
[self.id2user[x] for x in self.users[batch_indices]],
[self.id2item[x] for x in self.items[batch_indices]],

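Rough usage sketch of the Dataset class above (imported as NCFDataset in the notebooks), assuming `train` and `test` are pandas DataFrames with the default column names; the per-epoch resampling method is assumed to be negative_sampling(), and train_loader() yields original (not re-indexed) user and item IDs, as the comment above notes.

data = Dataset(train=train, test=test, n_neg=4, n_neg_test=100, seed=42)

data.negative_sampling()                             # assumed method: redraw n_neg negatives per positive
for users, items, labels in data.train_loader(batch_size=64):
    # users/items are original IDs; labels are 1.0 for observed interactions, 0.0 for sampled negatives
    break
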
View file

@ -13,20 +13,20 @@ class NCF:
"""NCF implementation"""
def __init__(
self,
n_users,
n_items,
model_type="NeuMF",
random_state=0,
n_factors=8,
layer_sizes=[16,8,4],
n_epochs=50,
batch_size=64,
learning_rate=5e-3,
verbose=1,
save=False,
pretrain=False,
seed=123
self,
n_users,
n_items,
model_type="NeuMF",
random_state=0,
n_factors=8,
layer_sizes=[16, 8, 4],
n_epochs=50,
batch_size=64,
learning_rate=5e-3,
verbose=1,
save=False,
pretrain=False,
seed=42,
):
# number of users in dataset
self.n_users = n_users
@ -37,7 +37,11 @@ class NCF:
# check model type
model_options = ["gmf", "mlp", "neumf"]
if self.model_type not in model_options:
raise ValueError("Wrong model type, please select one of this list: {}".format(model_options))
raise ValueError(
"Wrong model type, please select one of this list: {}".format(
model_options
)
)
# seed
tf.set_random_seed(seed)
np.random.seed(seed)
@ -64,10 +68,7 @@ class NCF:
# parameters initialization
self.sess.run(tf.global_variables_initializer())
def _create_model(
self,
):
def _create_model(self,):
# reset graph
tf.reset_default_graph()
@ -82,27 +83,51 @@ class NCF:
# set embedding table
self.embedding_gmf_P = tf.Variable(
tf.truncated_normal(shape=[self.n_users, self.n_factors], mean=0.0, stddev=0.01),
name='embedding_gmf_P', dtype=tf.float32)
tf.truncated_normal(
shape=[self.n_users, self.n_factors], mean=0.0, stddev=0.01
),
name="embedding_gmf_P",
dtype=tf.float32,
)
self.embedding_gmf_Q = tf.Variable(
tf.truncated_normal(shape=[self.n_items, self.n_factors], mean=0.0, stddev=0.01),
name='embedding_gmf_Q', dtype=tf.float32)
tf.truncated_normal(
shape=[self.n_items, self.n_factors], mean=0.0, stddev=0.01
),
name="embedding_gmf_Q",
dtype=tf.float32,
)
# set embedding table
self.embedding_mlp_P = tf.Variable(
tf.truncated_normal(shape=[self.n_users, int(self.layer_sizes[0]/2)], mean=0.0, stddev=0.01),
name='embedding_mlp_P', dtype=tf.float32)
tf.truncated_normal(
shape=[self.n_users, int(self.layer_sizes[0] / 2)],
mean=0.0,
stddev=0.01,
),
name="embedding_mlp_P",
dtype=tf.float32,
)
self.embedding_mlp_Q = tf.Variable(
tf.truncated_normal(shape=[self.n_items, int(self.layer_sizes[0]/2)], mean=0.0, stddev=0.01),
name='embedding_mlp_Q', dtype=tf.float32)
tf.truncated_normal(
shape=[self.n_items, int(self.layer_sizes[0] / 2)],
mean=0.0,
stddev=0.01,
),
name="embedding_mlp_Q",
dtype=tf.float32,
)
with tf.variable_scope("gmf", reuse=tf.AUTO_REUSE):
# get user embedding p and item embedding q
self.gmf_p = tf.reduce_sum(tf.nn.embedding_lookup(self.embedding_gmf_P, self.user_input), 1)
self.gmf_q = tf.reduce_sum(tf.nn.embedding_lookup(self.embedding_gmf_Q, self.item_input), 1)
self.gmf_p = tf.reduce_sum(
tf.nn.embedding_lookup(self.embedding_gmf_P, self.user_input), 1
)
self.gmf_q = tf.reduce_sum(
tf.nn.embedding_lookup(self.embedding_gmf_Q, self.item_input), 1
)
# get gmf vector
self.gmf_vector = self.gmf_p * self.gmf_q
@ -110,15 +135,21 @@ class NCF:
with tf.variable_scope("mlp", reuse=tf.AUTO_REUSE):
# get user embedding p and item embedding q
self.mlp_p = tf.reduce_sum(tf.nn.embedding_lookup(self.embedding_mlp_P, self.user_input), 1)
self.mlp_q = tf.reduce_sum(tf.nn.embedding_lookup(self.embedding_mlp_Q, self.item_input), 1)
self.mlp_p = tf.reduce_sum(
tf.nn.embedding_lookup(self.embedding_mlp_P, self.user_input), 1
)
self.mlp_q = tf.reduce_sum(
tf.nn.embedding_lookup(self.embedding_mlp_Q, self.item_input), 1
)
# concatenate user and item vector
output = tf.concat([self.mlp_p, self.mlp_q], 1)
# MLP Layers
for layer_size in self.layer_sizes[1:]:
output = tf.contrib.layers.fully_connected(output, num_outputs=layer_size, activation_fn=tf.nn.relu)
output = tf.contrib.layers.fully_connected(
output, num_outputs=layer_size, activation_fn=tf.nn.relu
)
self.mlp_vector = output
# self.output = tf.sigmoid(tf.reduce_sum(self.mlp_vector, axis=1, keepdims=True))
@ -127,22 +158,34 @@ class NCF:
if self.model_type == "gmf":
# GMF only
output = tf.contrib.layers.fully_connected(self.gmf_vector, num_outputs=1,
activation_fn=None, biases_initializer=None)
output = tf.contrib.layers.fully_connected(
self.gmf_vector,
num_outputs=1,
activation_fn=None,
biases_initializer=None,
)
self.output = tf.sigmoid(output)
elif self.model_type == "mlp":
# MLP only
output = tf.contrib.layers.fully_connected(self.mlp_vector, num_outputs=1,
activation_fn=None, biases_initializer=None)
output = tf.contrib.layers.fully_connected(
self.mlp_vector,
num_outputs=1,
activation_fn=None,
biases_initializer=None,
)
self.output = tf.sigmoid(output)
elif self.model_type == "neumf":
# concatenate GMF and MLP vector
self.ncf_vector = tf.concat([self.gmf_vector, self.mlp_vector], 1)
# get predicted rating score
output = tf.contrib.layers.fully_connected(self.ncf_vector, num_outputs=1,
activation_fn=None, biases_initializer=None)
output = tf.contrib.layers.fully_connected(
self.ncf_vector,
num_outputs=1,
activation_fn=None,
biases_initializer=None,
)
self.output = tf.sigmoid(output)
with tf.variable_scope("loss", reuse=tf.AUTO_REUSE):
@ -153,7 +196,9 @@ class NCF:
with tf.variable_scope("optimizer", reuse=tf.AUTO_REUSE):
# set optimizer
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
self.optimizer = tf.train.AdamOptimizer(
learning_rate=self.learning_rate
).minimize(self.loss)
def save(self, dir_name):
""" save model parameters in `dir_name`
@ -210,23 +255,27 @@ class NCF:
# load gmf part
variables = tf.global_variables()
# get variables with 'gmf'
var_flow_restore = [val for val in variables if 'gmf' in val.name and 'ncf' not in val.name]
var_flow_restore = [
val for val in variables if "gmf" in val.name and "ncf" not in val.name
]
# load 'gmf' variable
saver = tf.train.Saver(var_flow_restore)
saver = tf.train.Saver(var_flow_restore)
# restore
saver.restore(self.sess, os.path.join(gmf_dir, "model.ckpt"))
saver.restore(self.sess, os.path.join(gmf_dir, "model.ckpt"))
# load mlp part
variables = tf.global_variables()
# get variables with 'gmf'
var_flow_restore = [val for val in variables if 'mlp' in val.name and 'ncf' not in val.name]
var_flow_restore = [
val for val in variables if "mlp" in val.name and "ncf" not in val.name
]
# load 'gmf' variable
saver = tf.train.Saver(var_flow_restore)
saver = tf.train.Saver(var_flow_restore)
# restore
saver.restore(self.sess, os.path.join(mlp_dir, "model.ckpt"))
saver.restore(self.sess, os.path.join(mlp_dir, "model.ckpt"))
# concat pretrain h_from_gmf and h_from_mlp
vars_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='ncf')
vars_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="ncf")
assert len(vars_list) == 1
ncf_fc = vars_list[0]
@ -236,10 +285,11 @@ class NCF:
mlp_fc = tf.contrib.framework.load_variable(mlp_dir, ncf_fc.name)
# load fc layer by tf.concat
assign_op = tf.assign(ncf_fc, tf.concat([alpha*gmf_fc, (1-alpha)*mlp_fc], axis=0))
assign_op = tf.assign(
ncf_fc, tf.concat([alpha * gmf_fc, (1 - alpha) * mlp_fc], axis=0)
)
self.sess.run(assign_op)
def fit(self, data):
""" fit model with training data
Args:
@ -256,7 +306,7 @@ class NCF:
print("Training model: %s" % self.model_type)
# loop for n_epochs
for epoch_count in range(1, self.n_epochs+1):
for epoch_count in range(1, self.n_epochs + 1):
# negative sampling for training
train_begin = time()
@ -275,7 +325,7 @@ class NCF:
feed_dict = {
self.user_input: user_input[..., None],
self.item_input: item_input[..., None],
self.labels: labels[..., None]
self.labels: labels[..., None],
}
# get loss and execute optimization
@ -286,16 +336,12 @@ class NCF:
# output every self.verbose
if self.verbose and epoch_count % self.verbose == 0:
print("Epoch %d [%.2fs]: train_loss = %.6f " % (
epoch_count, train_time, sum(train_loss) / len(train_loss)))
print(
"Epoch %d [%.2fs]: train_loss = %.6f "
% (epoch_count, train_time, sum(train_loss) / len(train_loss))
)
def predict(
self,
user_input,
item_input,
is_list=False,
):
def predict(self, user_input, item_input, is_list=False):
""" predict function of this trained model
Args:
user_input ( list or element of list ): userID or userID list
@ -313,13 +359,8 @@ class NCF:
else:
output = self._predict(np.array([user_input]), np.array([item_input]))
return float(output.reshape(-1)[0])
def _predict(
self,
user_input,
item_input,
):
def _predict(self, user_input, item_input):
# index converting
user_input = np.array([self.user2id[x] for x in user_input])
@ -335,4 +376,3 @@ class NCF:
output = self.sess.run(self.output, feed_dict)
return output
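
A condensed usage sketch of the NCF class above; `data` is assumed to be the Dataset instance from the previous file, and the hyperparameter values are illustrative.

model = NCF(
    n_users=data.n_users,
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=8,
    layer_sizes=[16, 8, 4],
    n_epochs=50,
    batch_size=64,
    learning_rate=5e-3,
    seed=42,
)

model.fit(data)                                        # resamples negatives and trains each epoch
score = model.predict(user_input=1, item_input=10)     # single (user, item) pair; use is_list=True for batches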

View file

@ -71,6 +71,7 @@ class RBM:
sampling_protocol=[50, 70, 80, 90, 100],
debug=False,
with_metrics=False,
seed=42
):
# RBM parameters
@ -105,6 +106,9 @@ class RBM:
# Initialize the start time
self.start_time = None
# Seed
self.seed = seed
log.info("TensorFlow version: {}".format(tf.__version__))
# =========================
@ -154,7 +158,7 @@ class RBM:
h_sampled (tensor, float32): sampled units. The value is 1 if pr>g and 0 otherwise.
"""
np.random.seed(1)
np.random.seed(self.seed)
# sample from a Bernoulli distribution with same dimensions as input distribution
g = tf.convert_to_tensor(np.random.uniform(size=pr.shape[1]), dtype=tf.float32)
@ -190,7 +194,7 @@ class RBM:
v_samp (tensor, float32): an (m,n) tensor of sampled rankings from 1 to r .
"""
np.random.seed(1)
np.random.seed(self.seed)
g = np.random.uniform(size=pr.shape[2]) # sample from a uniform distribution
f = tf.convert_to_tensor(
@ -279,14 +283,14 @@ class RBM:
bh (tensor, float32): (1, Nhidden) hidden units' bias, initiliazed to zero.
"""
tf.set_random_seed(1) # set the seed for the random number generator
tf.set_random_seed(self.seed) # set the seed for the random number generator
with tf.variable_scope("Network_parameters"):
self.w = tf.get_variable(
"weight",
[self.Nvisible, self.Nhidden],
initializer=tf.random_normal_initializer(stddev=self.stdv, seed=1),
initializer=tf.random_normal_initializer(stddev=self.stdv, seed=self.seed),
dtype="float32",
)
@ -581,7 +585,7 @@ class RBM:
self.dataset = tf.data.Dataset.from_tensor_slices(self.vu)
self.dataset = self.dataset.shuffle(
buffer_size=50, reshuffle_each_iteration=True, seed=123
buffer_size=50, reshuffle_each_iteration=True, seed=self.seed
) # randomize the batch
self.dataset = self.dataset.batch(batch_size=self.batch_size).repeat()
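
For completeness, a hedged sketch of passing the new seed argument when instantiating the RBM; the other keyword names and values are assumptions, since this diff only shows the tail of the constructor signature.

model = RBM(
    hidden_units=600,        # assumed parameter, not shown in this diff
    training_epoch=30,       # assumed parameter, not shown in this diff
    minibatch_size=60,       # assumed parameter, not shown in this diff
    with_metrics=True,
    seed=42,                 # replaces the previously hard-coded np/tf seeds
)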