blacked
Parent: 8106c0354e
Commit: 1564033ebb
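The commit message "blacked" indicates these Python sources were reformatted with the black code formatter, which explains the pattern of the changes below: single quotes become double quotes, trailing commas are added, and long call sites are wrapped across lines. As a hedged sketch (the exact invocation is not recorded in this commit), the same transformation can be reproduced programmatically; black.format_str and black.FileMode are real black APIs, while the sample source line is simply one taken from the diff:

import black

# One of the lines changed below, before formatting.
src = "log = logging.getLogger('sarplus')\n"

# black normalizes quotes (and would also rewrap long lines, add trailing
# commas, etc.) without changing runtime behavior.
print(black.format_str(src, mode=black.FileMode()))
# -> log = logging.getLogger("sarplus")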
|
@@ -29,4 +29,3 @@ class SARModel:
|
||||
def predict(self, items, ratings, top_k, remove_seen):
|
||||
return self.model.predict(items, ratings, top_k, remove_seen)
|
||||
|
||||
|
|
|
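The predict wrapper above simply forwards to the native pysarplus backend. A minimal usage sketch, mirroring the tests further down in this diff (the cache file path is an assumption for illustration):

import pandas as pd
from pysarplus import SARModel

# Load a similarity cache previously written by the com.microsoft.sarplus Spark writer.
model = SARModel("tests/sample-output.sar")

# Seed items and their affinity scores for one user.
item_scores = pd.DataFrame([(0, 2.3), (1, 3.1)], columns=["itemID", "score"])

top = model.predict(
    item_scores["itemID"].values,
    item_scores["score"].values,
    top_k=10,
    remove_seen=False,
)
# Each prediction exposes .id and .score, as asserted in the tests below.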
@@ -5,7 +5,14 @@ This is the one and only (to rule them all) implementation of SAR.
import logging
|
||||
import pyspark.sql.functions as F
|
||||
import pandas as pd
|
||||
from pyspark.sql.types import StringType, DoubleType, StructType, StructField, IntegerType, FloatType
|
||||
from pyspark.sql.types import (
|
||||
StringType,
|
||||
DoubleType,
|
||||
StructType,
|
||||
StructField,
|
||||
IntegerType,
|
||||
FloatType,
|
||||
)
|
||||
from pyspark.sql.functions import pandas_udf, PandasUDFType
|
||||
from pysarplus import SARModel
|
||||
|
||||
|
@@ -14,7 +21,8 @@ SIM_JACCARD = "jaccard"
SIM_LIFT = "lift"
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
log = logging.getLogger('sarplus')
|
||||
log = logging.getLogger("sarplus")
|
||||
|
||||
|
||||
class SARPlus:
|
||||
"""SAR implementation for PySpark"""
|
||||
|
@@ -31,7 +39,7 @@ class SARPlus:
time_decay_coefficient=30,
|
||||
time_now=None,
|
||||
timedecay_formula=False,
|
||||
threshold=1
|
||||
threshold=1,
|
||||
):
|
||||
assert threshold > 0
|
||||
|
||||
|
@@ -44,7 +52,7 @@ class SARPlus:
"prefix": table_prefix,
|
||||
"time_now": time_now,
|
||||
"time_decay_coefficient": time_decay_coefficient,
|
||||
"threshold": threshold
|
||||
"threshold": threshold,
|
||||
}
|
||||
|
||||
self.similarity_type = similarity_type
|
||||
|
@@ -83,7 +91,7 @@ class SARPlus:
# the following is the query we want to run
|
||||
|
||||
query = self.f(
|
||||
"""
|
||||
"""
|
||||
SELECT
|
||||
{col_user}, {col_item},
|
||||
SUM({col_rating} * EXP(-log(2) * (latest_timestamp - CAST({col_timestamp} AS long)) / ({time_decay_coefficient} * 3600 * 24))) as {col_rating}
|
||||
|
@@ -91,16 +99,19 @@ class SARPlus:
(SELECT CAST(MAX({col_timestamp}) AS long) latest_timestamp FROM {prefix}df_train_input)
|
||||
GROUP BY {col_user}, {col_item}
|
||||
CLUSTER BY {col_user}
|
||||
""")
|
||||
"""
|
||||
)
|
||||
|
||||
# replace with timedecayed version
|
||||
df = self.spark.sql(query)
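# Reading of the SQL above: each rating is weighted by
#   2 ** (-(latest_timestamp - t) / half_life_in_seconds)
# since EXP(-log(2) * dt / (time_decay_coefficient * 3600 * 24)) uses dt in
# seconds and a half life given in days; a rating exactly one half-life old
# therefore contributes half of its original value to the user/item affinity.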
|
||||
else:
|
||||
# since SQL is case-insensitive, this check also needs to be case-insensitive
|
||||
if self.header['col_timestamp'].lower() in [s.name.lower() for s in df.schema]:
|
||||
if self.header["col_timestamp"].lower() in [
|
||||
s.name.lower() for s in df.schema
|
||||
]:
|
||||
# we need to de-duplicate items by using the latest item
|
||||
query = self.f(
|
||||
"""
|
||||
"""
|
||||
SELECT {col_user}, {col_item}, {col_rating}
|
||||
FROM
|
||||
(
|
||||
|
@@ -112,7 +123,7 @@ class SARPlus:
WHERE latest = 1
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
df = self.spark.sql(query)
|
||||
|
||||
df.createOrReplaceTempView(self.f("{prefix}df_train"))
|
||||
|
@@ -128,7 +139,8 @@ class SARPlus:
GROUP BY A.{col_item}, B.{col_item}
|
||||
HAVING COUNT(*) >= {threshold}
|
||||
CLUSTER BY i1, i2
|
||||
""")
|
||||
"""
|
||||
)
|
||||
|
||||
item_cooccurrence = self.spark.sql(query)
|
||||
item_cooccurrence.write.mode("overwrite").saveAsTable(
|
||||
|
@@ -148,7 +160,7 @@ class SARPlus:
self.item_similarity = item_cooccurrence
|
||||
elif self.similarity_type == SIM_JACCARD:
|
||||
query = self.f(
|
||||
"""
|
||||
"""
|
||||
SELECT i1, i2, value / (M1.margin + M2.margin - value) AS value
|
||||
FROM {prefix}item_cooccurrence A
|
||||
INNER JOIN {prefix}item_marginal M1 ON A.i1 = M1.i
|
||||
|
@@ -159,7 +171,7 @@ class SARPlus:
self.item_similarity = self.spark.sql(query)
|
||||
elif self.similarity_type == SIM_LIFT:
|
||||
query = self.f(
|
||||
"""
|
||||
"""
|
||||
SELECT i1, i2, value / (M1.margin * M2.margin) AS value
|
||||
FROM {prefix}item_cooccurrence A
|
||||
INNER JOIN {prefix}item_marginal M1 ON A.i1 = M1.i
|
||||
|
@@ -169,11 +181,14 @@ class SARPlus:
)
|
||||
self.item_similarity = self.spark.sql(query)
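# Both rescalings above are derived from the raw co-occurrence counts:
# Jaccard divides cooccurrence(i1, i2) by the size of the union
# (margin(i1) + margin(i2) - cooccurrence), while lift divides it by
# margin(i1) * margin(i2), which discounts pairs of very popular items
# more aggressively than Jaccard does.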
|
||||
else:
|
||||
raise ValueError("Unknown similarity type: {0}".format(self.similarity_type))
|
||||
|
||||
raise ValueError(
|
||||
"Unknown similarity type: {0}".format(self.similarity_type)
|
||||
)
|
||||
|
||||
# store upper triangular
|
||||
log.info("sarplus.fit 2/2: compute similiarity metric %s..." % self.similarity_type)
|
||||
log.info(
|
||||
"sarplus.fit 2/2: compute similiarity metric %s..." % self.similarity_type
|
||||
)
|
||||
self.item_similarity.write.mode("overwrite").saveAsTable(
|
||||
self.f("{prefix}item_similarity_upper")
|
||||
)
|
||||
|
@@ -181,7 +196,7 @@ class SARPlus:
# expand upper triangular to full matrix
|
||||
|
||||
query = self.f(
|
||||
"""
|
||||
"""
|
||||
SELECT i1, i2, value
|
||||
FROM
|
||||
(
|
||||
|
@@ -223,7 +238,7 @@ class SARPlus:
)
|
||||
|
||||
query = self.f(
|
||||
"""
|
||||
"""
|
||||
SELECT a.{col_user}, a.{col_item}, CAST(a.{col_rating} AS double) {col_rating}
|
||||
FROM {prefix}df_train a INNER JOIN {prefix}df_test_users b ON a.{col_user} = b.{col_user}
|
||||
DISTRIBUTE BY {col_user}
|
||||
|
@@ -233,39 +248,59 @@ class SARPlus:
|
||||
return self.spark.sql(query)
|
||||
|
||||
def recommend_k_items(self, test, cache_path, top_k=10, remove_seen=True, n_user_prediction_partitions=200):
|
||||
def recommend_k_items(
|
||||
self,
|
||||
test,
|
||||
cache_path,
|
||||
top_k=10,
|
||||
remove_seen=True,
|
||||
n_user_prediction_partitions=200,
|
||||
):
|
||||
|
||||
# create item id to continuous index mapping
|
||||
log.info("sarplus.recommend_k_items 1/3: create item index")
|
||||
self.spark.sql(self.f("SELECT i1, row_number() OVER(ORDER BY i1)-1 idx FROM (SELECT DISTINCT i1 FROM {prefix}item_similarity) CLUSTER BY i1"))\
|
||||
.write.mode("overwrite").saveAsTable(self.f("{prefix}item_mapping"))
|
||||
self.spark.sql(
|
||||
self.f(
|
||||
"SELECT i1, row_number() OVER(ORDER BY i1)-1 idx FROM (SELECT DISTINCT i1 FROM {prefix}item_similarity) CLUSTER BY i1"
|
||||
)
|
||||
).write.mode("overwrite").saveAsTable(self.f("{prefix}item_mapping"))
|
||||
|
||||
# map similarity matrix into index space
|
||||
self.spark.sql(self.f("""
|
||||
self.spark.sql(
|
||||
self.f(
|
||||
"""
|
||||
SELECT a.idx i1, b.idx i2, is.value
|
||||
FROM {prefix}item_similarity is, {prefix}item_mapping a, {prefix}item_mapping b
|
||||
WHERE is.i1 = a.i1 AND i2 = b.i1
|
||||
"""))\
|
||||
.write.mode("overwrite").saveAsTable(self.f("{prefix}item_similarity_mapped"))
|
||||
"""
|
||||
)
|
||||
).write.mode("overwrite").saveAsTable(self.f("{prefix}item_similarity_mapped"))
|
||||
|
||||
cache_path_output = cache_path
|
||||
if cache_path.startswith('dbfs:'):
|
||||
cache_path_input = '/dbfs' + cache_path[5:]
|
||||
if cache_path.startswith("dbfs:"):
|
||||
cache_path_input = "/dbfs" + cache_path[5:]
|
||||
else:
|
||||
cache_path_input = cache_path
|
||||
|
||||
# export similarity matrix for C++ backed UDF
|
||||
log.info("sarplus.recommend_k_items 2/3: prepare similarity matrix")
|
||||
|
||||
self.spark.sql(self.f("SELECT i1, i2, CAST(value AS DOUBLE) value FROM {prefix}item_similarity_mapped ORDER BY i1, i2"))\
|
||||
.coalesce(1)\
|
||||
.write.format("com.microsoft.sarplus").mode("overwrite")\
|
||||
.save(cache_path_output)
|
||||
self.spark.sql(
|
||||
self.f(
|
||||
"SELECT i1, i2, CAST(value AS DOUBLE) value FROM {prefix}item_similarity_mapped ORDER BY i1, i2"
|
||||
)
|
||||
).coalesce(1).write.format("com.microsoft.sarplus").mode("overwrite").save(
|
||||
cache_path_output
|
||||
)
|
||||
|
||||
self.get_user_affinity(test).createOrReplaceTempView(self.f("{prefix}user_affinity"))
|
||||
self.get_user_affinity(test).createOrReplaceTempView(
|
||||
self.f("{prefix}user_affinity")
|
||||
)
|
||||
|
||||
# map item ids to index space
|
||||
pred_input = self.spark.sql(self.f("""
|
||||
pred_input = self.spark.sql(
|
||||
self.f(
|
||||
"""
|
||||
SELECT {col_user}, idx, rating
|
||||
FROM
|
||||
(
|
||||
|
@@ -273,13 +308,19 @@ class SARPlus:
FROM {prefix}user_affinity JOIN {prefix}item_mapping b ON {col_item} = b.i1
|
||||
)
|
||||
CLUSTER BY {col_user}
|
||||
"""))
|
||||
"""
|
||||
)
|
||||
)
|
||||
|
||||
schema = StructType([
|
||||
StructField("userID", pred_input.schema[self.header['col_user']].dataType, True),
|
||||
StructField("itemID", IntegerType(), True),
|
||||
StructField("score", FloatType(), True)
|
||||
])
|
||||
schema = StructType(
|
||||
[
|
||||
StructField(
|
||||
"userID", pred_input.schema[self.header["col_user"]].dataType, True
|
||||
),
|
||||
StructField("itemID", IntegerType(), True),
|
||||
StructField("score", FloatType(), True),
|
||||
]
|
||||
)
|
||||
|
||||
# make sure only the header is pickled
|
||||
local_header = self.header
|
||||
|
@@ -291,33 +332,42 @@ class SARPlus:
# The cache_path points to the file written by com.microsoft.sarplus
|
||||
# This has exactly the memory layout we need and since the file is
|
||||
# memory mapped, the memory consumption only happens once per worker
|
||||
# for all python processes
|
||||
# for all python processes
|
||||
model = SARModel(cache_path_input)
|
||||
preds = model.predict(df['idx'].values, df['rating'].values, top_k, remove_seen)
|
||||
|
||||
user = df[local_header['col_user']].iloc[0]
|
||||
preds = model.predict(
|
||||
df["idx"].values, df["rating"].values, top_k, remove_seen
|
||||
)
|
||||
|
||||
user = df[local_header["col_user"]].iloc[0]
|
||||
|
||||
preds_ret = pd.DataFrame(
|
||||
[(user, x.id, x.score) for x in preds],
|
||||
columns=range(3))
|
||||
[(user, x.id, x.score) for x in preds], columns=range(3)
|
||||
)
|
||||
|
||||
return preds_ret
|
||||
|
||||
|
||||
log.info("sarplus.recommend_k_items 3/3: compute recommendations")
|
||||
|
||||
df_preds = pred_input\
|
||||
.repartition(n_user_prediction_partitions, self.header['col_user'])\
|
||||
.groupby(self.header['col_user'])\
|
||||
df_preds = (
|
||||
pred_input.repartition(
|
||||
n_user_prediction_partitions, self.header["col_user"]
|
||||
)
|
||||
.groupby(self.header["col_user"])
|
||||
.apply(sar_predict_udf)
|
||||
)
|
||||
|
||||
df_preds.createOrReplaceTempView(self.f("{prefix}predictions"))
|
||||
|
||||
return self.spark.sql(self.f("""
|
||||
return self.spark.sql(
|
||||
self.f(
|
||||
"""
|
||||
SELECT userID {col_user}, b.i1 {col_item}, score
|
||||
FROM {prefix}predictions p, {prefix}item_mapping b
|
||||
WHERE p.itemID = b.idx
|
||||
"""))
|
||||
|
||||
"""
|
||||
)
|
||||
)
|
||||
|
||||
def recommend_k_items_slow(self, test, top_k=10, remove_seen=True):
|
||||
"""Recommend top K items for all users which are in the test set.
|
||||
|
||||
|
@@ -331,9 +381,9 @@ class SARPlus:
if remove_seen:
|
||||
raise ValueError("Not implemented")
|
||||
|
||||
self.get_user_affinity(test)\
|
||||
.write.mode("overwrite")\
|
||||
.saveAsTable(self.f("{prefix}user_affinity"))
|
||||
self.get_user_affinity(test).write.mode("overwrite").saveAsTable(
|
||||
self.f("{prefix}user_affinity")
|
||||
)
|
||||
|
||||
# user_affinity * item_similarity
|
||||
# filter top-k
|
||||
|
@@ -357,4 +407,4 @@ class SARPlus:
top_k=top_k,
|
||||
)
|
||||
|
||||
return self.spark.sql(query)
|
||||
return self.spark.sql(query)
|
||||
|
|
|
@@ -1,2 +1,2 @@
from .SARModel import SARModel
|
||||
from .SARPlus import SARPlus
|
||||
from .SARPlus import SARPlus
|
||||
|
|
|
@@ -11,9 +11,13 @@ from pyspark.sql import SparkSession
|
||||
from pysarplus import SARPlus, SARModel
|
||||
|
||||
|
||||
def assert_compare(expected_id, expected_score, actual_prediction):
|
||||
assert expected_id == actual_prediction.id
|
||||
assert math.isclose(expected_score, actual_prediction.score, rel_tol=1e-3, abs_tol=1e-3)
|
||||
assert math.isclose(
|
||||
expected_score, actual_prediction.score, rel_tol=1e-3, abs_tol=1e-3
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def spark(app_name="Sample", url="local[*]", memory="1G"):
|
||||
|
@@ -27,7 +31,11 @@ def spark(app_name="Sample", url="local[*]", memory="1G"):
spark = (
|
||||
SparkSession.builder.appName(app_name)
|
||||
.master(url)
|
||||
.config("spark.jars", os.path.dirname(__file__) + "/../../scala/target/scala-2.11/sarplus_2.11-0.2.6.jar")
|
||||
.config(
|
||||
"spark.jars",
|
||||
os.path.dirname(__file__)
|
||||
+ "/../../scala/target/scala-2.11/sarplus_2.11-0.2.6.jar",
|
||||
)
|
||||
.config("spark.driver.memory", memory)
|
||||
.config("spark.sql.shuffle.partitions", "1")
|
||||
.config("spark.default.parallelism", "1")
|
||||
|
@@ -39,19 +47,18 @@ def spark(app_name="Sample", url="local[*]", memory="1G"):
|
||||
return spark
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def sample_cache(spark):
|
||||
df = spark.read.csv("tests/sample-input.txt", header=True, inferSchema=True)
|
||||
|
||||
path = "tests/sample-output.sar"
|
||||
|
||||
df.coalesce(1)\
|
||||
.write.format("com.microsoft.sarplus")\
|
||||
.mode("overwrite")\
|
||||
.save(path)
|
||||
df.coalesce(1).write.format("com.microsoft.sarplus").mode("overwrite").save(path)
|
||||
|
||||
return path
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def header():
|
||||
header = {
|
||||
|
@@ -62,6 +69,7 @@ def header():
}
|
||||
return header
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def pandas_dummy_dataset(header):
|
||||
"""Load sample dataset in pandas for testing; can be used to create a Spark dataframe
|
||||
|
@@ -75,6 +83,7 @@ def pandas_dummy_dataset(header):
}
|
||||
return pd.DataFrame(ratings_dict)
|
||||
|
||||
|
||||
@pytest.mark.spark
|
||||
def test_good(spark, sample_cache):
|
||||
model = SARModel(sample_cache)
|
||||
|
@@ -84,6 +93,7 @@ def test_good(spark, sample_cache):
assert_compare(1, 44, y[1])
|
||||
assert_compare(2, 64, y[2])
|
||||
|
||||
|
||||
@pytest.mark.spark
|
||||
def test_good_less(spark, sample_cache):
|
||||
model = SARModel(sample_cache)
|
||||
|
@@ -93,6 +103,7 @@ def test_good_less(spark, sample_cache):
assert_compare(1, 11.6, y[1])
|
||||
assert_compare(2, 12.3, y[2])
|
||||
|
||||
|
||||
@pytest.mark.spark
|
||||
def test_good_require_sort(spark, sample_cache):
|
||||
model = SARModel(sample_cache)
|
||||
|
@@ -104,6 +115,7 @@ def test_good_require_sort(spark, sample_cache):
|
||||
assert 3 == len(y)
|
||||
|
||||
|
||||
@pytest.mark.spark
|
||||
def test_good_require_sort_remove_seen(spark, sample_cache):
|
||||
model = SARModel(sample_cache)
|
||||
|
@@ -112,52 +124,65 @@ def test_good_require_sort_remove_seen(spark, sample_cache):
assert_compare(2, 64, y[0])
|
||||
assert 1 == len(y)
|
||||
|
||||
|
||||
@pytest.mark.spark
|
||||
def test_pandas(spark, sample_cache):
|
||||
item_scores = pd.DataFrame([(0, 2.3), (1, 3.1)], columns=["itemID", "score"])
|
||||
|
||||
model = SARModel(sample_cache)
|
||||
y = model.predict(item_scores["itemID"].values, item_scores["score"].values, top_k=10, remove_seen=False)
|
||||
y = model.predict(
|
||||
item_scores["itemID"].values,
|
||||
item_scores["score"].values,
|
||||
top_k=10,
|
||||
remove_seen=False,
|
||||
)
|
||||
|
||||
assert_compare(0, 0.85, y[0])
|
||||
assert_compare(1, 6.9699, y[1])
|
||||
assert_compare(2, 9.92, y[2])
|
||||
|
||||
|
||||
@pytest.mark.spark
|
||||
def test_e2e(spark, pandas_dummy_dataset, header):
|
||||
sar = SARPlus(spark, **header)
|
||||
|
||||
|
||||
df = spark.createDataFrame(pandas_dummy_dataset)
|
||||
sar.fit(df)
|
||||
sar.fit(df)
|
||||
|
||||
# assert 4*4 + 32 == sar.item_similarity.count()
|
||||
|
||||
# print(sar.item_similarity
|
||||
# .toPandas()
|
||||
# .pivot_table(index='i1', columns='i2', values='value'))
|
||||
# .toPandas()
|
||||
# .pivot_table(index='i1', columns='i2', values='value'))
|
||||
|
||||
test_df = spark.createDataFrame(pd.DataFrame({
|
||||
header['col_user']: [3],
|
||||
header['col_item']: [2]
|
||||
}))
|
||||
|
||||
r1 = sar.recommend_k_items_slow(test_df, top_k=3, remove_seen=False)\
|
||||
.toPandas()\
|
||||
.sort_values([header['col_user'], header['col_item']])\
|
||||
.reset_index(drop=True)
|
||||
|
||||
r2 = sar.recommend_k_items(test_df, "tests/test_e2e_cache", top_k=3, n_user_prediction_partitions=2, remove_seen=False)\
|
||||
.toPandas()\
|
||||
.sort_values([header['col_user'], header['col_item']])\
|
||||
.reset_index(drop=True)
|
||||
|
||||
assert (r1.iloc[:,:2] == r2.iloc[:,:2]).all().all()
|
||||
assert np.allclose(
|
||||
r1.score.values,
|
||||
r2.score.values,
|
||||
1e-3
|
||||
test_df = spark.createDataFrame(
|
||||
pd.DataFrame({header["col_user"]: [3], header["col_item"]: [2]})
|
||||
)
|
||||
|
||||
r1 = (
|
||||
sar.recommend_k_items_slow(test_df, top_k=3, remove_seen=False)
|
||||
.toPandas()
|
||||
.sort_values([header["col_user"], header["col_item"]])
|
||||
.reset_index(drop=True)
|
||||
)
|
||||
|
||||
r2 = (
|
||||
sar.recommend_k_items(
|
||||
test_df,
|
||||
"tests/test_e2e_cache",
|
||||
top_k=3,
|
||||
n_user_prediction_partitions=2,
|
||||
remove_seen=False,
|
||||
)
|
||||
.toPandas()
|
||||
.sort_values([header["col_user"], header["col_item"]])
|
||||
.reset_index(drop=True)
|
||||
)
|
||||
|
||||
assert (r1.iloc[:, :2] == r2.iloc[:, :2]).all().all()
|
||||
assert np.allclose(r1.score.values, r2.score.values, 1e-3)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def pandas_dummy(header):
|
||||
ratings_dict = {
|
||||
|
@@ -233,10 +258,16 @@ def sar_settings():
@pytest.mark.parametrize(
|
||||
"similarity_type, timedecay_formula", [("jaccard", False), ("lift", True)]
|
||||
)
|
||||
def test_fit(spark, similarity_type, timedecay_formula, train_test_dummy_timestamp, header):
|
||||
model = SARPlus(spark, **header, timedecay_formula=timedecay_formula,
|
||||
similarity_type=similarity_type)
|
||||
|
||||
def test_fit(
|
||||
spark, similarity_type, timedecay_formula, train_test_dummy_timestamp, header
|
||||
):
|
||||
model = SARPlus(
|
||||
spark,
|
||||
**header,
|
||||
timedecay_formula=timedecay_formula,
|
||||
similarity_type=similarity_type
|
||||
)
|
||||
|
||||
trainset, testset = train_test_dummy_timestamp
|
||||
|
||||
df = spark.createDataFrame(trainset)
|
||||
|
@@ -244,7 +275,7 @@ def test_fit(spark, similarity_type, timedecay_formula, train_test_dummy_timesta
|
||||
df = spark.table("trainset")
|
||||
|
||||
model.fit(df)
|
||||
model.fit(df)
|
||||
|
||||
|
||||
"""
|
||||
|
@@ -267,77 +298,98 @@ def test_sar_item_similarity(
spark, threshold, similarity_type, file, demo_usage_data, sar_settings, header
|
||||
):
|
||||
|
||||
model = SARPlus(spark,
|
||||
**header,
|
||||
timedecay_formula=False,
|
||||
time_decay_coefficient=30,
|
||||
time_now=None,
|
||||
threshold=threshold,
|
||||
similarity_type=similarity_type)
|
||||
model = SARPlus(
|
||||
spark,
|
||||
**header,
|
||||
timedecay_formula=False,
|
||||
time_decay_coefficient=30,
|
||||
time_now=None,
|
||||
threshold=threshold,
|
||||
similarity_type=similarity_type
|
||||
)
|
||||
|
||||
df = spark.createDataFrame(demo_usage_data)
|
||||
model.fit(df)
|
||||
|
||||
# reference
|
||||
item_similarity_ref = pd.read_csv(sar_settings["FILE_DIR"] + "sim_" + file + str(threshold) + ".csv")
|
||||
item_similarity_ref = pd.read_csv(
|
||||
sar_settings["FILE_DIR"] + "sim_" + file + str(threshold) + ".csv"
|
||||
)
|
||||
|
||||
item_similarity_ref = pd.melt(item_similarity_ref,
|
||||
item_similarity_ref = pd.melt(
|
||||
item_similarity_ref,
|
||||
item_similarity_ref.columns[0],
|
||||
item_similarity_ref.columns[1:],
|
||||
'i2',
|
||||
'value')
|
||||
item_similarity_ref.columns = ['i1', 'i2', 'value']
|
||||
"i2",
|
||||
"value",
|
||||
)
|
||||
item_similarity_ref.columns = ["i1", "i2", "value"]
|
||||
|
||||
item_similarity_ref = item_similarity_ref[item_similarity_ref.value > 0]\
|
||||
.sort_values(['i1', 'i2'])\
|
||||
.reset_index(drop=True)\
|
||||
|
||||
# actual
|
||||
item_similarity = model.item_similarity\
|
||||
.toPandas()\
|
||||
.sort_values(['i1', 'i2'])\
|
||||
item_similarity_ref = (
|
||||
item_similarity_ref[item_similarity_ref.value > 0]
|
||||
.sort_values(["i1", "i2"])
|
||||
.reset_index(drop=True)
|
||||
)
|
||||
# actual
|
||||
item_similarity = (
|
||||
model.item_similarity.toPandas()
|
||||
.sort_values(["i1", "i2"])
|
||||
.reset_index(drop=True)
|
||||
)
|
||||
|
||||
if similarity_type == "cooccurrence":
|
||||
assert((item_similarity_ref == item_similarity).all().all())
|
||||
assert (item_similarity_ref == item_similarity).all().all()
|
||||
else:
|
||||
assert((item_similarity.iloc[:,:1] == item_similarity_ref.iloc[:,:1]).all().all())
|
||||
assert (
|
||||
(item_similarity.iloc[:, :1] == item_similarity_ref.iloc[:, :1]).all().all()
|
||||
)
|
||||
|
||||
assert np.allclose(
|
||||
item_similarity.value.values,
|
||||
item_similarity_ref.value.values
|
||||
item_similarity.value.values, item_similarity_ref.value.values
|
||||
)
|
||||
|
||||
|
||||
# Test 7
|
||||
def test_user_affinity(spark, demo_usage_data, sar_settings, header):
|
||||
time_now = demo_usage_data[header["col_timestamp"]].max()
|
||||
|
||||
model = SARPlus(spark,
|
||||
**header,
|
||||
timedecay_formula=True,
|
||||
time_decay_coefficient=30,
|
||||
time_now=time_now,
|
||||
similarity_type="cooccurrence")
|
||||
model = SARPlus(
|
||||
spark,
|
||||
**header,
|
||||
timedecay_formula=True,
|
||||
time_decay_coefficient=30,
|
||||
time_now=time_now,
|
||||
similarity_type="cooccurrence"
|
||||
)
|
||||
|
||||
df = spark.createDataFrame(demo_usage_data)
|
||||
model.fit(df)
|
||||
|
||||
user_affinity_ref = pd.read_csv(sar_settings["FILE_DIR"] + "user_aff.csv")
|
||||
user_affinity_ref = pd.melt(user_affinity_ref, user_affinity_ref.columns[0], user_affinity_ref.columns[1:], 'ItemId', 'Rating')
|
||||
user_affinity_ref = user_affinity_ref[user_affinity_ref.Rating > 0]\
|
||||
.reset_index(drop=True)
|
||||
user_affinity_ref = pd.melt(
|
||||
user_affinity_ref,
|
||||
user_affinity_ref.columns[0],
|
||||
user_affinity_ref.columns[1:],
|
||||
"ItemId",
|
||||
"Rating",
|
||||
)
|
||||
user_affinity_ref = user_affinity_ref[user_affinity_ref.Rating > 0].reset_index(
|
||||
drop=True
|
||||
)
|
||||
|
||||
# construct dataframe with test user id we'd like to get the affinity for
|
||||
df_test = spark.createDataFrame(pd.DataFrame({header['col_user']:[sar_settings["TEST_USER_ID"]]}))
|
||||
df_test = spark.createDataFrame(
|
||||
pd.DataFrame({header["col_user"]: [sar_settings["TEST_USER_ID"]]})
|
||||
)
|
||||
user_affinity = model.get_user_affinity(df_test).toPandas().reset_index(drop=True)
|
||||
|
||||
# verify that the item ids are the same
|
||||
assert (user_affinity[header['col_item']] == user_affinity_ref.ItemId).all()
|
||||
assert (user_affinity[header["col_item"]] == user_affinity_ref.ItemId).all()
|
||||
|
||||
assert np.allclose(
|
||||
user_affinity_ref[header['col_rating']].values,
|
||||
user_affinity['Rating'].values,
|
||||
atol=sar_settings["ATOL"]
|
||||
user_affinity_ref[header["col_rating"]].values,
|
||||
user_affinity["Rating"].values,
|
||||
atol=sar_settings["ATOL"],
|
||||
)
|
||||
|
||||
|
||||
|
@@ -351,43 +403,52 @@ def test_userpred(
):
|
||||
time_now = demo_usage_data[header["col_timestamp"]].max()
|
||||
|
||||
test_id = '{0}_{1}_{2}'.format(threshold, similarity_type, file)
|
||||
test_id = "{0}_{1}_{2}".format(threshold, similarity_type, file)
|
||||
|
||||
model = SARPlus(spark,
|
||||
**header,
|
||||
table_prefix=test_id,
|
||||
timedecay_formula=True,
|
||||
time_decay_coefficient=30,
|
||||
time_now=time_now,
|
||||
threshold=threshold,
|
||||
similarity_type=similarity_type)
|
||||
model = SARPlus(
|
||||
spark,
|
||||
**header,
|
||||
table_prefix=test_id,
|
||||
timedecay_formula=True,
|
||||
time_decay_coefficient=30,
|
||||
time_now=time_now,
|
||||
threshold=threshold,
|
||||
similarity_type=similarity_type
|
||||
)
|
||||
|
||||
df = spark.createDataFrame(demo_usage_data)
|
||||
model.fit(df)
|
||||
|
||||
url = (sar_settings["FILE_DIR"]
|
||||
url = (
|
||||
sar_settings["FILE_DIR"]
|
||||
+ "userpred_"
|
||||
+ file
|
||||
+ str(threshold)
|
||||
+ "_userid_only.csv")
|
||||
+ "_userid_only.csv"
|
||||
)
|
||||
|
||||
pred_ref = pd.read_csv(url)
|
||||
pred_ref = pd.wide_to_long(pred_ref, ['rec','score'], 'user', 'idx')\
|
||||
.sort_values('score', ascending=False)\
|
||||
pred_ref = (
|
||||
pd.wide_to_long(pred_ref, ["rec", "score"], "user", "idx")
|
||||
.sort_values("score", ascending=False)
|
||||
.reset_index(drop=True)
|
||||
)
|
||||
|
||||
# Note: it's important to have a separate cache_path for each run, as otherwise the runs interfere with each other
|
||||
pred = model.recommend_k_items(
|
||||
spark.createDataFrame(demo_usage_data[
|
||||
demo_usage_data[header["col_user"]] == sar_settings["TEST_USER_ID"]
|
||||
]),
|
||||
cache_path='test_userpred-' + test_id,
|
||||
spark.createDataFrame(
|
||||
demo_usage_data[
|
||||
demo_usage_data[header["col_user"]] == sar_settings["TEST_USER_ID"]
|
||||
]
|
||||
),
|
||||
cache_path="test_userpred-" + test_id,
|
||||
top_k=10,
|
||||
n_user_prediction_partitions=1)
|
||||
n_user_prediction_partitions=1,
|
||||
)
|
||||
|
||||
pred = pred.toPandas()\
|
||||
.sort_values('score', ascending=False)\
|
||||
.reset_index(drop=True)
|
||||
pred = pred.toPandas().sort_values("score", ascending=False).reset_index(drop=True)
|
||||
|
||||
assert (pred.MovieId.values == pred_ref.rec.values).all()
|
||||
assert np.allclose(pred.score.values, pred_ref.score.values, atol=sar_settings["ATOL"])
|
||||
assert np.allclose(
|
||||
pred.score.values, pred_ref.score.values, atol=sar_settings["ATOL"]
|
||||
)
|
||||
|
|
|
@@ -1,10 +1,11 @@
from distutils.core import setup
|
||||
|
||||
setup(name='pysarplus_dummy',
|
||||
version='0.2',
|
||||
description='pysarplus dummy package to trigger spark packaging',
|
||||
author='Markus Cozowicz',
|
||||
author_email='marcozo@microsoft.com',
|
||||
url='https://github.com/Microsoft/Recommenders/contrib/sarplus',
|
||||
packages=['pysarplus_dummy'],
|
||||
)
|
||||
setup(
|
||||
name="pysarplus_dummy",
|
||||
version="0.2",
|
||||
description="pysarplus dummy package to trigger spark packaging",
|
||||
author="Markus Cozowicz",
|
||||
author_email="marcozo@microsoft.com",
|
||||
url="https://github.com/Microsoft/Recommenders/contrib/sarplus",
|
||||
packages=["pysarplus_dummy"],
|
||||
)
|
||||
|
|
|
@@ -91,4 +91,3 @@ if __name__ == "__main__":
score_result,
|
||||
schema=DataFrameSchema.data_frame_to_dict(score_result),
|
||||
)
|
||||
|
||||
|
|
|
@@ -91,4 +91,3 @@ if __name__ == "__main__":
score_result,
|
||||
schema=DataFrameSchema.data_frame_to_dict(score_result),
|
||||
)
|
||||
|
||||
|
|
|
@@ -91,4 +91,3 @@ if __name__ == "__main__":
score_result,
|
||||
schema=DataFrameSchema.data_frame_to_dict(score_result),
|
||||
)
|
||||
|
||||
|
|
|
@@ -91,4 +91,3 @@ if __name__ == "__main__":
score_result,
|
||||
schema=DataFrameSchema.data_frame_to_dict(score_result),
|
||||
)
|
||||
|
||||
|
|
|
@@ -6,29 +6,32 @@ import joblib
|
||||
from azureml.studio.core.data_frame_schema import DataFrameSchema
|
||||
from azureml.studio.core.logger import module_logger as logger
|
||||
from azureml.studio.core.io.data_frame_directory import load_data_frame_from_directory, save_data_frame_to_directory
|
||||
from azureml.studio.core.io.data_frame_directory import (
|
||||
load_data_frame_from_directory,
|
||||
save_data_frame_to_directory,
|
||||
)
|
||||
from azureml.studio.core.io.model_directory import load_model_from_directory
|
||||
|
||||
|
||||
class ScoreType(Enum):
|
||||
ITEM_RECOMMENDATION = 'Item recommendation'
|
||||
RATING_PREDICTION = 'Rating prediction'
|
||||
ITEM_RECOMMENDATION = "Item recommendation"
|
||||
RATING_PREDICTION = "Rating prediction"
|
||||
|
||||
|
||||
class RankingMetric(Enum):
|
||||
RATING = 'Rating'
|
||||
SIMILARITY = 'Similarity'
|
||||
POPULARITY = 'Popularity'
|
||||
RATING = "Rating"
|
||||
SIMILARITY = "Similarity"
|
||||
POPULARITY = "Popularity"
|
||||
|
||||
|
||||
class ItemSet(Enum):
|
||||
TRAIN_ONLY = 'Items in training set'
|
||||
SCORE_ONLY = 'Items in score set'
|
||||
TRAIN_ONLY = "Items in training set"
|
||||
SCORE_ONLY = "Items in score set"
|
||||
|
||||
|
||||
def joblib_loader(load_from_dir, model_spec):
|
||||
file_name = model_spec['file_name']
|
||||
with open(Path(load_from_dir) / file_name, 'rb') as fin:
|
||||
file_name = model_spec["file_name"]
|
||||
with open(Path(load_from_dir) / file_name, "rb") as fin:
|
||||
return joblib.load(fin)
|
||||
|
||||
|
||||
|
@@ -45,56 +48,87 @@ class ScoreSARModule:
def input_data(self):
|
||||
return self._input_data
|
||||
|
||||
def recommend_items(self, ranking_metric, top_k, sort_top_k, remove_seen, normalize):
|
||||
def recommend_items(
|
||||
self, ranking_metric, top_k, sort_top_k, remove_seen, normalize
|
||||
):
|
||||
if ranking_metric == RankingMetric.RATING:
|
||||
return self.model.recommend_k_items(test=self.input_data, top_k=top_k, sort_top_k=sort_top_k,
|
||||
remove_seen=remove_seen, normalize=normalize)
|
||||
return self.model.recommend_k_items(
|
||||
test=self.input_data,
|
||||
top_k=top_k,
|
||||
sort_top_k=sort_top_k,
|
||||
remove_seen=remove_seen,
|
||||
normalize=normalize,
|
||||
)
|
||||
if ranking_metric == RankingMetric.SIMILARITY:
|
||||
return self.model.get_item_based_topk(items=self.input_data, top_k=top_k, sort_top_k=sort_top_k)
|
||||
return self.model.get_item_based_topk(
|
||||
items=self.input_data, top_k=top_k, sort_top_k=sort_top_k
|
||||
)
|
||||
if ranking_metric == RankingMetric.POPULARITY:
|
||||
return self.model.get_popularity_based_topk(top_k=top_k, sort_top_k=sort_top_k)
|
||||
return self.model.get_popularity_based_topk(
|
||||
top_k=top_k, sort_top_k=sort_top_k
|
||||
)
|
||||
raise ValueError(f"Got unexpected ranking metric: {ranking_metric}.")
|
||||
|
||||
def predict_ratings(self, items_to_predict, normalize):
|
||||
if items_to_predict == ItemSet.TRAIN_ONLY:
|
||||
return self.model.predict_training_items(test=self.input_data, normalize=normalize)
|
||||
return self.model.predict_training_items(
|
||||
test=self.input_data, normalize=normalize
|
||||
)
|
||||
if items_to_predict == ItemSet.SCORE_ONLY:
|
||||
return self.model.predict(test=self.input_data, normalize=normalize)
|
||||
raise ValueError(f"Got unexpected 'items to predict': {items_to_predict}.")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument(
|
||||
'--trained-model', help='The directory contains trained SAR model.')
|
||||
"--trained-model", help="The directory contains trained SAR model."
|
||||
)
|
||||
parser.add_argument("--dataset-to-score", help="Dataset to score")
|
||||
parser.add_argument(
|
||||
'--dataset-to-score', help='Dataset to score')
|
||||
"--score-type",
|
||||
type=str,
|
||||
help="The type of score which the recommender should output",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--score-type', type=str, help='The type of score which the recommender should output')
|
||||
"--items-to-predict",
|
||||
type=str,
|
||||
help="The set of items to predict for test users",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--items-to-predict', type=str, help='The set of items to predict for test users')
|
||||
"--normalize",
|
||||
type=str,
|
||||
help="Normalize predictions to scale of original ratings",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--normalize', type=str, help='Normalize predictions to scale of original ratings')
|
||||
"--ranking-metric",
|
||||
type=str,
|
||||
help="The metric of ranking used in item recommendation",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--ranking-metric', type=str, help='The metric of ranking used in item recommendation')
|
||||
"--top-k", type=int, help="The number of top items to recommend."
|
||||
)
|
||||
parser.add_argument("--sort-top-k", type=str, help="Sort top k results.")
|
||||
parser.add_argument(
|
||||
'--top-k', type=int, help='The number of top items to recommend.')
|
||||
parser.add_argument(
|
||||
'--sort-top-k', type=str, help='Sort top k results.')
|
||||
parser.add_argument(
|
||||
'--remove-seen-items', type=str, help='Remove items seen in training from recommendation')
|
||||
parser.add_argument(
|
||||
'--score-result', help='Ratings or items to output')
|
||||
"--remove-seen-items",
|
||||
type=str,
|
||||
help="Remove items seen in training from recommendation",
|
||||
)
|
||||
parser.add_argument("--score-result", help="Ratings or items to output")
|
||||
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
logger.info(f"Arguments: {args}")
|
||||
sort_top_k = strtobool(args.sort_top_k) if args.sort_top_k else None
|
||||
remove_seen_items = strtobool(args.remove_seen_items) if args.remove_seen_items else None
|
||||
remove_seen_items = (
|
||||
strtobool(args.remove_seen_items) if args.remove_seen_items else None
|
||||
)
|
||||
normalize = strtobool(args.normalize) if args.normalize else None
|
||||
|
||||
sar_model = load_model_from_directory(args.trained_model, model_loader=joblib_loader).data
|
||||
sar_model = load_model_from_directory(
|
||||
args.trained_model, model_loader=joblib_loader
|
||||
).data
|
||||
dataset_to_score = load_data_frame_from_directory(args.dataset_to_score).data
|
||||
logger.debug(f"Shape of loaded DataFrame: {dataset_to_score.shape}")
|
||||
|
||||
|
@@ -102,14 +136,22 @@ if __name__ == '__main__':
|
||||
score_type = ScoreType(args.score_type)
|
||||
if score_type == ScoreType.ITEM_RECOMMENDATION:
|
||||
score_result = score_sar_module.recommend_items(ranking_metric=RankingMetric(args.ranking_metric),
|
||||
top_k=args.top_k, sort_top_k=sort_top_k,
|
||||
remove_seen=args.remove_seen_items, normalize=normalize)
|
||||
score_result = score_sar_module.recommend_items(
|
||||
ranking_metric=RankingMetric(args.ranking_metric),
|
||||
top_k=args.top_k,
|
||||
sort_top_k=sort_top_k,
|
||||
remove_seen=args.remove_seen_items,
|
||||
normalize=normalize,
|
||||
)
|
||||
elif score_type == ScoreType.RATING_PREDICTION:
|
||||
score_result = score_sar_module.predict_ratings(items_to_predict=ItemSet(args.items_to_predict),
|
||||
normalize=normalize)
|
||||
score_result = score_sar_module.predict_ratings(
|
||||
items_to_predict=ItemSet(args.items_to_predict), normalize=normalize
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Got unexpected score type: {score_type}.")
|
||||
|
||||
save_data_frame_to_directory(args.score_result, data=score_result,
|
||||
schema=DataFrameSchema.data_frame_to_dict(score_result))
|
||||
save_data_frame_to_directory(
|
||||
args.score_result,
|
||||
data=score_result,
|
||||
schema=DataFrameSchema.data_frame_to_dict(score_result),
|
||||
)
|
||||
|
|
|
@@ -3,44 +3,40 @@ import argparse
from azureml.studio.core.logger import module_logger as logger
|
||||
from reco_utils.dataset.python_splitters import python_stratified_split
|
||||
from azureml.studio.core.data_frame_schema import DataFrameSchema
|
||||
from azureml.studio.core.io.data_frame_directory import load_data_frame_from_directory, save_data_frame_to_directory
|
||||
from azureml.studio.core.io.data_frame_directory import (
|
||||
load_data_frame_from_directory,
|
||||
save_data_frame_to_directory,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument(
|
||||
'--input-path',
|
||||
help='The input directory.',
|
||||
"--input-path", help="The input directory.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--ratio', type=float,
|
||||
help='A float parameter.',
|
||||
"--ratio", type=float, help="A float parameter.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--col-user', type=str,
|
||||
help='A string parameter.',
|
||||
"--col-user", type=str, help="A string parameter.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--col-item', type=str,
|
||||
help='A string parameter.',
|
||||
"--col-item", type=str, help="A string parameter.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--seed', type=int,
|
||||
help='An int parameter.',
|
||||
"--seed", type=int, help="An int parameter.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--output-train',
|
||||
help='The output training data directory.',
|
||||
"--output-train", help="The output training data directory.",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--output-test',
|
||||
help='The output test data directory.',
|
||||
"--output-test", help="The output test data directory.",
|
||||
)
|
||||
|
||||
args, _ = parser.parse_known_args()
|
||||
|
@@ -62,12 +58,24 @@ if __name__ == '__main__':
logger.debug(f"Shape of loaded DataFrame: {input_df.shape}")
|
||||
logger.debug(f"Cols of DataFrame: {input_df.columns}")
|
||||
|
||||
output_train, output_test = python_stratified_split(input_df, ratio=args.ratio, col_user=args.col_user,
|
||||
col_item=args.col_item, seed=args.seed)
|
||||
output_train, output_test = python_stratified_split(
|
||||
input_df,
|
||||
ratio=args.ratio,
|
||||
col_user=args.col_user,
|
||||
col_item=args.col_item,
|
||||
seed=args.seed,
|
||||
)
|
||||
|
||||
logger.debug(f"Output path: {args.output_train}")
|
||||
logger.debug(f"Output path: {args.output_test}")
|
||||
|
||||
save_data_frame_to_directory(args.output_train, output_train, schema=DataFrameSchema.data_frame_to_dict(output_train))
|
||||
save_data_frame_to_directory(args.output_test, output_test, schema=DataFrameSchema.data_frame_to_dict(output_test))
|
||||
|
||||
save_data_frame_to_directory(
|
||||
args.output_train,
|
||||
output_train,
|
||||
schema=DataFrameSchema.data_frame_to_dict(output_train),
|
||||
)
|
||||
save_data_frame_to_directory(
|
||||
args.output_test,
|
||||
output_test,
|
||||
schema=DataFrameSchema.data_frame_to_dict(output_test),
|
||||
)
|
||||
|
|
|
@@ -9,10 +9,12 @@ import shutil
|
||||
import papermill as pm
|
||||
import tensorflow as tf
|
||||
|
||||
print("TensorFlow version:", tf.VERSION)
|
||||
|
||||
try:
|
||||
from azureml.core import Run
|
||||
|
||||
run = Run.get_context()
|
||||
except ImportError:
|
||||
run = None
|
||||
|
@@ -20,15 +22,11 @@ except ImportError:
from reco_utils.common.constants import (
|
||||
DEFAULT_USER_COL,
|
||||
DEFAULT_ITEM_COL,
|
||||
DEFAULT_RATING_COL
|
||||
DEFAULT_RATING_COL,
|
||||
)
|
||||
|
||||
|
||||
NOTEBOOK_NAME = os.path.join(
|
||||
"notebooks",
|
||||
"00_quick_start",
|
||||
"wide_deep_movielens.ipynb"
|
||||
)
|
||||
NOTEBOOK_NAME = os.path.join("notebooks", "00_quick_start", "wide_deep_movielens.ipynb")
|
||||
OUTPUT_NOTEBOOK = "wide_deep.ipynb"
|
||||
|
||||
|
||||
|
@@ -39,7 +37,11 @@ def _log(metric, value):
Otherwise, record as a single value of the metric.
|
||||
"""
|
||||
if run is not None:
|
||||
if isinstance(value, list) and len(value) > 0 and isinstance(value[0], (int, float)):
|
||||
if (
|
||||
isinstance(value, list)
|
||||
and len(value) > 0
|
||||
and isinstance(value[0], (int, float))
|
||||
):
|
||||
run.log_list(metric, value)
|
||||
else:
|
||||
# Force cast to str since run.log will raise an error if the value is iterable.
|
||||
|
@@ -50,58 +52,96 @@ def _log(metric, value):
# Parse arguments passed by Hyperdrive
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument('--top-k', type=int, dest='TOP_K', help="Top k recommendation", default=10)
|
||||
parser.add_argument(
|
||||
"--top-k", type=int, dest="TOP_K", help="Top k recommendation", default=10
|
||||
)
|
||||
# Data path
|
||||
parser.add_argument('--datastore', type=str, dest='DATA_DIR', help="Datastore path")
|
||||
parser.add_argument('--train-datapath', type=str, dest='TRAIN_PICKLE_PATH')
|
||||
parser.add_argument('--test-datapath', type=str, dest='TEST_PICKLE_PATH')
|
||||
parser.add_argument('--model-dir', type=str, dest='MODEL_DIR', default='model_checkpoints')
|
||||
parser.add_argument("--datastore", type=str, dest="DATA_DIR", help="Datastore path")
|
||||
parser.add_argument("--train-datapath", type=str, dest="TRAIN_PICKLE_PATH")
|
||||
parser.add_argument("--test-datapath", type=str, dest="TEST_PICKLE_PATH")
|
||||
parser.add_argument(
|
||||
"--model-dir", type=str, dest="MODEL_DIR", default="model_checkpoints"
|
||||
)
|
||||
# Data column names
|
||||
parser.add_argument('--user-col', type=str, dest='USER_COL', default=DEFAULT_USER_COL)
|
||||
parser.add_argument('--item-col', type=str, dest='ITEM_COL', default=DEFAULT_ITEM_COL)
|
||||
parser.add_argument('--rating-col', type=str, dest='RATING_COL', default=DEFAULT_RATING_COL)
|
||||
parser.add_argument('--item-feat-col', type=str, dest='ITEM_FEAT_COL') # Optional
|
||||
parser.add_argument('--ranking-metrics', type=str, nargs='*', dest='RANKING_METRICS', default=['ndcg_at_k'])
|
||||
parser.add_argument('--rating-metrics', type=str, nargs='*', dest='RATING_METRICS', default=['rmse'])
|
||||
parser.add_argument("--user-col", type=str, dest="USER_COL", default=DEFAULT_USER_COL)
|
||||
parser.add_argument("--item-col", type=str, dest="ITEM_COL", default=DEFAULT_ITEM_COL)
|
||||
parser.add_argument(
|
||||
"--rating-col", type=str, dest="RATING_COL", default=DEFAULT_RATING_COL
|
||||
)
|
||||
parser.add_argument("--item-feat-col", type=str, dest="ITEM_FEAT_COL") # Optional
|
||||
parser.add_argument(
|
||||
"--ranking-metrics",
|
||||
type=str,
|
||||
nargs="*",
|
||||
dest="RANKING_METRICS",
|
||||
default=["ndcg_at_k"],
|
||||
)
|
||||
parser.add_argument(
|
||||
"--rating-metrics", type=str, nargs="*", dest="RATING_METRICS", default=["rmse"]
|
||||
)
|
||||
# Model type: either 'wide', 'deep', or 'wide_deep'
|
||||
parser.add_argument('--model-type', type=str, dest='MODEL_TYPE', default='wide_deep')
|
||||
parser.add_argument("--model-type", type=str, dest="MODEL_TYPE", default="wide_deep")
|
||||
# Wide model params
|
||||
parser.add_argument('--linear-optimizer', type=str, dest='LINEAR_OPTIMIZER', default='Ftrl')
|
||||
parser.add_argument('--linear-optimizer-lr', type=float, dest='LINEAR_OPTIMIZER_LR', default=0.01)
|
||||
parser.add_argument('--linear-l1-reg', type=float, dest='LINEAR_L1_REG', default=0.0)
|
||||
parser.add_argument('--linear-l2-reg', type=float, dest='LINEAR_L2_REG', default=0.0)
|
||||
parser.add_argument('--linear-momentum', type=float, dest='LINEAR_MOMENTUM', default=0.9)
|
||||
parser.add_argument(
|
||||
"--linear-optimizer", type=str, dest="LINEAR_OPTIMIZER", default="Ftrl"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--linear-optimizer-lr", type=float, dest="LINEAR_OPTIMIZER_LR", default=0.01
|
||||
)
|
||||
parser.add_argument("--linear-l1-reg", type=float, dest="LINEAR_L1_REG", default=0.0)
|
||||
parser.add_argument("--linear-l2-reg", type=float, dest="LINEAR_L2_REG", default=0.0)
|
||||
parser.add_argument(
|
||||
"--linear-momentum", type=float, dest="LINEAR_MOMENTUM", default=0.9
|
||||
)
|
||||
# Deep model params
|
||||
parser.add_argument('--dnn-optimizer', type=str, dest='DNN_OPTIMIZER', default='Adagrad')
|
||||
parser.add_argument('--dnn-optimizer-lr', type=float, dest='DNN_OPTIMIZER_LR', default=0.01)
|
||||
parser.add_argument('--dnn-l1-reg', type=float, dest='DNN_L1_REG', default=0.0)
|
||||
parser.add_argument('--dnn-l2-reg', type=float, dest='DNN_L2_REG', default=0.0)
|
||||
parser.add_argument('--dnn-momentum', type=float, dest='DNN_MOMENTUM', default=0.9)
|
||||
parser.add_argument('--dnn-hidden-layer-1', type=int, dest='DNN_HIDDEN_LAYER_1', default=0)
|
||||
parser.add_argument('--dnn-hidden-layer-2', type=int, dest='DNN_HIDDEN_LAYER_2', default=0)
|
||||
parser.add_argument('--dnn-hidden-layer-3', type=int, dest='DNN_HIDDEN_LAYER_3', default=128)
|
||||
parser.add_argument('--dnn-hidden-layer-4', type=int, dest='DNN_HIDDEN_LAYER_4', default=128)
|
||||
parser.add_argument('--dnn-user-embedding-dim', type=int, dest='DNN_USER_DIM', default=8)
|
||||
parser.add_argument('--dnn-item-embedding-dim', type=int, dest='DNN_ITEM_DIM', default=8)
|
||||
parser.add_argument('--dnn-batch-norm', type=int, dest='DNN_BATCH_NORM', default=1)
|
||||
parser.add_argument('--dnn-dropout', type=float, dest='DNN_DROPOUT', default=0.0)
|
||||
parser.add_argument(
|
||||
"--dnn-optimizer", type=str, dest="DNN_OPTIMIZER", default="Adagrad"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dnn-optimizer-lr", type=float, dest="DNN_OPTIMIZER_LR", default=0.01
|
||||
)
|
||||
parser.add_argument("--dnn-l1-reg", type=float, dest="DNN_L1_REG", default=0.0)
|
||||
parser.add_argument("--dnn-l2-reg", type=float, dest="DNN_L2_REG", default=0.0)
|
||||
parser.add_argument("--dnn-momentum", type=float, dest="DNN_MOMENTUM", default=0.9)
|
||||
parser.add_argument(
|
||||
"--dnn-hidden-layer-1", type=int, dest="DNN_HIDDEN_LAYER_1", default=0
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dnn-hidden-layer-2", type=int, dest="DNN_HIDDEN_LAYER_2", default=0
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dnn-hidden-layer-3", type=int, dest="DNN_HIDDEN_LAYER_3", default=128
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dnn-hidden-layer-4", type=int, dest="DNN_HIDDEN_LAYER_4", default=128
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dnn-user-embedding-dim", type=int, dest="DNN_USER_DIM", default=8
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dnn-item-embedding-dim", type=int, dest="DNN_ITEM_DIM", default=8
|
||||
)
|
||||
parser.add_argument("--dnn-batch-norm", type=int, dest="DNN_BATCH_NORM", default=1)
|
||||
parser.add_argument("--dnn-dropout", type=float, dest="DNN_DROPOUT", default=0.0)
|
||||
# Training parameters
|
||||
parser.add_argument('--steps', type=int, dest='STEPS', default=10000)
|
||||
parser.add_argument('--batch-size', type=int, dest='BATCH_SIZE', default=128)
|
||||
parser.add_argument('--evaluate-while-training', dest='EVALUATE_WHILE_TRAINING', action='store_true')
|
||||
parser.add_argument("--steps", type=int, dest="STEPS", default=10000)
|
||||
parser.add_argument("--batch-size", type=int, dest="BATCH_SIZE", default=128)
|
||||
parser.add_argument(
|
||||
"--evaluate-while-training", dest="EVALUATE_WHILE_TRAINING", action="store_true"
|
||||
)
|
||||
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
params = vars(args)
|
||||
|
||||
if params['TOP_K'] <= 0:
|
||||
if params["TOP_K"] <= 0:
|
||||
raise ValueError("Top K should be larger than 0")
|
||||
|
||||
if params['MODEL_TYPE'] not in {'wide', 'deep', 'wide_deep'}:
|
||||
if params["MODEL_TYPE"] not in {"wide", "deep", "wide_deep"}:
|
||||
raise ValueError("Model type should be either 'wide', 'deep', or 'wide_deep'")
|
||||
|
||||
if params['DATA_DIR'] is None:
|
||||
if params["DATA_DIR"] is None:
|
||||
raise ValueError("Datastore path should be given")
|
||||
|
||||
print("Args:")
|
||||
|
@@ -111,10 +151,7 @@ for k, v in params.items():
print("Run", NOTEBOOK_NAME)
|
||||
|
||||
pm.execute_notebook(
|
||||
NOTEBOOK_NAME,
|
||||
OUTPUT_NOTEBOOK,
|
||||
parameters=params,
|
||||
kernel_name='python3'
|
||||
NOTEBOOK_NAME, OUTPUT_NOTEBOOK, parameters=params, kernel_name="python3"
|
||||
)
|
||||
nb = pm.read_notebook(OUTPUT_NOTEBOOK)
|
||||
|
||||
|
@@ -123,4 +160,4 @@ for m, v in nb.data.items():
|
||||
# clean-up
|
||||
os.remove(OUTPUT_NOTEBOOK)
|
||||
shutil.rmtree(params['MODEL_DIR'], ignore_errors=True)
|
||||
shutil.rmtree(params["MODEL_DIR"], ignore_errors=True)
|
||||
|
|
|
@@ -9,10 +9,10 @@ DEFAULT_LABEL_COL = "label"
DEFAULT_TIMESTAMP_COL = "timestamp"
|
||||
DEFAULT_PREDICTION_COL = "prediction"
|
||||
COL_DICT = {
|
||||
"col_user": DEFAULT_USER_COL,
|
||||
"col_item": DEFAULT_ITEM_COL,
|
||||
"col_rating": DEFAULT_RATING_COL,
|
||||
"col_prediction": DEFAULT_PREDICTION_COL
|
||||
"col_user": DEFAULT_USER_COL,
|
||||
"col_item": DEFAULT_ITEM_COL,
|
||||
"col_rating": DEFAULT_RATING_COL,
|
||||
"col_prediction": DEFAULT_PREDICTION_COL,
|
||||
}
|
||||
|
||||
# Filtering variables
|
||||
|
|
|
@@ -125,4 +125,3 @@ def get_cudnn_version():
else:
|
||||
raise ValueError("Not in Windows, Linux or Mac")
|
||||
return find_cudnn_in_headers(candidates)
|
||||
|
||||
|
|
|
@@ -15,7 +15,7 @@ from __future__ import division # 1/2 == 0.5, as in Py3
from __future__ import absolute_import # avoid hiding global modules with locals
|
||||
from __future__ import print_function # force use of print("hello")
|
||||
from __future__ import (
|
||||
unicode_literals
|
||||
unicode_literals,
|
||||
) # force unadorned strings "" to be Unicode without prepending u""
|
||||
import time
|
||||
import memory_profiler
|
||||
|
@@ -96,4 +96,3 @@ def pre_run_cell():
"""Capture current time before we execute the current command"""
|
||||
global t1
|
||||
t1 = time.time()
|
||||
|
||||
|
|
|
@@ -34,10 +34,14 @@ def line_graph(
# Setup figure only once
|
||||
if subplot[2] == 1:
|
||||
if plot_size:
|
||||
plt.figure(figsize=(
|
||||
plot_size[0]*subplot[1], # fig width = plot width * num columns
|
||||
plot_size[1]*subplot[0] # fig height = plot height * num rows
|
||||
))
|
||||
plt.figure(
|
||||
figsize=(
|
||||
plot_size[0]
|
||||
* subplot[1], # fig width = plot width * num columns
|
||||
plot_size[1]
|
||||
* subplot[0], # fig height = plot height * num rows
|
||||
)
|
||||
)
|
||||
plt.subplots_adjust(wspace=0.5)
|
||||
plt.subplot(*subplot)
|
||||
else:
|
||||
|
|
|
@@ -354,4 +354,4 @@ class MetricsLogger:
Returns:
|
||||
dict: Log metrics.
|
||||
"""
|
||||
return self._log
|
||||
return self._log
|
||||
|
|
|
@@ -68,4 +68,3 @@ class Timer(object):
raise ValueError("Timer has not been stopped, please use stop().")
|
||||
else:
|
||||
return self._interval
|
||||
|
||||
|
|
|
@@ -96,4 +96,3 @@ def find_database(client, id):
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
|
|
|
@@ -174,4 +174,3 @@ def get_spark_schema(header=DEFAULT_HEADER):
for i in range(26):
|
||||
schema.add(StructField(header[i + n_ints], StringType()))
|
||||
return schema
|
||||
|
||||
|
|
|
@@ -78,4 +78,4 @@ def download_path(path=None):
tmp_dir.cleanup()
|
||||
else:
|
||||
path = os.path.realpath(path)
|
||||
yield path
|
||||
yield path
|
||||
|
|
|
@@ -199,7 +199,7 @@ def load_pandas_df(
movie_col = header[1]
|
||||
|
||||
with download_path(local_cache_path) as path:
|
||||
filepath = os.path.join(path, "ml-{}.zip".format(size))
|
||||
filepath = os.path.join(path, "ml-{}.zip".format(size))
|
||||
datapath, item_datapath = _maybe_download_and_extract(size, filepath)
|
||||
|
||||
# Load movie features such as title, genres, and release year
|
||||
|
@@ -256,7 +256,7 @@ def load_item_df(
raise ValueError(ERROR_MOVIE_LENS_SIZE)
|
||||
|
||||
with download_path(local_cache_path) as path:
|
||||
filepath = os.path.join(path, "ml-{}.zip".format(size))
|
||||
filepath = os.path.join(path, "ml-{}.zip".format(size))
|
||||
_, item_datapath = _maybe_download_and_extract(size, filepath)
|
||||
item_df = _load_item_df(
|
||||
size, item_datapath, movie_col, title_col, genres_col, year_col
|
||||
|
@@ -404,14 +404,16 @@ def load_spark_df(
movie_col = schema[1].name
|
||||
|
||||
with download_path(local_cache_path) as path:
|
||||
filepath = os.path.join(path, "ml-{}.zip".format(size))
|
||||
filepath = os.path.join(path, "ml-{}.zip".format(size))
|
||||
datapath, item_datapath = _maybe_download_and_extract(size, filepath)
|
||||
spark_datapath = "file:///" + datapath # shorten form of file://localhost/
|
||||
|
||||
# Load movie features such as title, genres, and release year.
|
||||
# Since the file size is small, we directly load as pd.DataFrame from the driver node
|
||||
# and then convert into spark.DataFrame
|
||||
item_pd_df = _load_item_df(size, item_datapath, movie_col, title_col, genres_col, year_col)
|
||||
item_pd_df = _load_item_df(
|
||||
size, item_datapath, movie_col, title_col, genres_col, year_col
|
||||
)
|
||||
item_df = spark.createDataFrame(item_pd_df) if item_pd_df is not None else None
|
||||
|
||||
if is_databricks():
|
||||
|
@@ -467,8 +469,7 @@ def _get_schema(header, schema):
schema = StructType()
|
||||
try:
|
||||
(
|
||||
schema
|
||||
.add(StructField(header[0], IntegerType()))
|
||||
schema.add(StructField(header[0], IntegerType()))
|
||||
.add(StructField(header[1], IntegerType()))
|
||||
.add(StructField(header[2], FloatType()))
|
||||
.add(StructField(header[3], LongType()))
|
||||
|
|
|
@@ -220,7 +220,7 @@ class LibffmConverter:
|
||||
def _convert(field, feature, field_index, field_feature_index_dict):
|
||||
field_feature_index = field_feature_index_dict[(field, feature)]
|
||||
if isinstance(feature, str):
|
||||
if isinstance(feature, str):
|
||||
feature = 1
|
||||
return "{}:{}:{}".format(field_index, field_feature_index, feature)
|
||||
|
||||
|
|
|
@@ -623,7 +623,9 @@ def map_at_k(
|
||||
# calculate the precision at each hit position for each user and sum them up
|
||||
df_hit_sorted = df_hit.copy()
|
||||
df_hit_sorted["rr"] = (df_hit_sorted.groupby(col_user).cumcount() + 1) / df_hit_sorted["rank"]
|
||||
df_hit_sorted["rr"] = (
|
||||
df_hit_sorted.groupby(col_user).cumcount() + 1
|
||||
) / df_hit_sorted["rank"]
|
||||
df_hit_sorted = df_hit_sorted.groupby(col_user).agg({"rr": "sum"}).reset_index()
|
||||
|
||||
df_merge = pd.merge(df_hit_sorted, df_hit_count, on=col_user)
|
||||
|
|
|
@@ -486,6 +486,7 @@ def ndcg_score(y_true, y_score, k=10):
actual = dcg_score(y_true, y_score, k)
|
||||
return actual / best
|
||||
|
||||
|
||||
def hit_score(y_true, y_score, k=10):
|
||||
"""Computing hit score metric at k.
|
||||
|
||||
|
@@ -503,6 +504,7 @@ def hit_score(y_true, y_score, k=10):
return 1
|
||||
return 0
|
||||
|
||||
|
||||
def dcg_score(y_true, y_score, k=10):
|
||||
"""Computing dcg score metric at k.
|
||||
|
||||
|
@@ -521,7 +523,6 @@ def dcg_score(y_true, y_score, k=10):
return np.sum(gains / discounts)
|
||||
|
||||
|
||||
|
||||
def cal_metric(labels, preds, metrics):
|
||||
"""Calculate metrics,such as auc, logloss.
|
||||
|
||||
|
@@ -555,7 +556,7 @@ def cal_metric(labels, preds, metrics):
res["f1"] = round(f1, 4)
|
||||
elif metric == "mean_mrr":
|
||||
mean_mrr = np.mean(
|
||||
[
|
||||
[
|
||||
mrr_score(each_labels, each_preds)
|
||||
for each_labels, each_preds in zip(labels, preds)
|
||||
]
|
||||
|
@@ -563,12 +564,12 @@ def cal_metric(labels, preds, metrics):
res["mean_mrr"] = round(mean_mrr, 4)
|
||||
elif metric.startswith("ndcg"): # format like: ndcg@2;4;6;8
|
||||
ndcg_list = [1, 2]
|
||||
ks = metric.split('@')
|
||||
ks = metric.split("@")
|
||||
if len(ks) > 1:
|
||||
ndcg_list = [int(token) for token in ks[1].split(';')]
|
||||
ndcg_list = [int(token) for token in ks[1].split(";")]
|
||||
for k in ndcg_list:
|
||||
ndcg_temp = np.mean(
|
||||
[
|
||||
[
|
||||
ndcg_score(each_labels, each_preds, k)
|
||||
for each_labels, each_preds in zip(labels, preds)
|
||||
]
|
||||
|
@@ -576,13 +577,13 @@ def cal_metric(labels, preds, metrics):
res["ndcg@{0}".format(k)] = round(ndcg_temp, 4)
|
||||
elif metric.startswith("hit"): # format like: hit@2;4;6;8
|
||||
hit_list = [1, 2]
|
||||
ks = metric.split('@')
|
||||
ks = metric.split("@")
|
||||
if len(ks) > 1:
|
||||
hit_list = [int(token) for token in ks[1].split(';')]
|
||||
hit_list = [int(token) for token in ks[1].split(";")]
|
||||
for k in hit_list:
|
||||
hit_temp = np.mean(
|
||||
[
|
||||
hit_score(each_labels, each_preds, k)
|
||||
hit_score(each_labels, each_preds, k)
|
||||
for each_labels, each_preds in zip(labels, preds)
|
||||
]
|
||||
)
|
||||
|
|
|
@@ -146,9 +146,16 @@ class DKNTextIterator(BaseIterator):
if not line:
|
||||
break
|
||||
|
||||
label, candidate_news_index, candidate_news_val, click_news_index, click_news_val, candidate_news_entity_index, click_news_entity_index, impression_id = self.parser_one_line(
|
||||
line
|
||||
)
|
||||
(
|
||||
label,
|
||||
candidate_news_index,
|
||||
candidate_news_val,
|
||||
click_news_index,
|
||||
click_news_val,
|
||||
candidate_news_entity_index,
|
||||
click_news_entity_index,
|
||||
impression_id,
|
||||
) = self.parser_one_line(line)
|
||||
|
||||
candidate_news_index_batch.append(candidate_news_index)
|
||||
candidate_news_val_batch.append(candidate_news_val)
|
||||
|
|
|
@@ -29,6 +29,7 @@ class FFMTextIterator(BaseIterator):
Iterator will not load the whole data into memory. Instead, it loads data into memory
|
||||
per mini-batch, so that large files can be used as input data.
|
||||
"""
|
||||
|
||||
def __init__(self, hparams, graph, col_spliter=" ", ID_spliter="%"):
|
||||
"""Initialize an iterator. Create necessary placeholders for the model.
|
||||
|
||||
|
|
|
@@ -16,6 +16,7 @@ class DKN(BaseModel):
Recommendation", in Proceedings of the 2018 World Wide Web Conference on World
|
||||
Wide Web, 2018.
|
||||
"""
|
||||
|
||||
def __init__(self, hparams, iterator_creator):
|
||||
"""Initialization steps for DKN.
|
||||
Compared with the BaseModel, DKN requires two different pre-computed embeddings,
|
||||
|
|
|
@ -23,6 +23,7 @@ class A2SVDModel(SequentialBaseModel):
|
|||
the 28th International Joint Conferences on Artificial Intelligence, IJCAI’19,
|
||||
Pages 4213-4219, AAAI Press, 2019.
|
||||
"""
|
||||
|
||||
def _build_seq_graph(self):
|
||||
"""The main function to create A2SVD model.
|
||||
|
||||
|
|
|
@ -16,6 +16,7 @@ class CaserModel(SequentialBaseModel):
|
|||
sequence embedding", in Proceedings of the Eleventh ACM International Conference on
|
||||
Web Search and Data Mining, ACM, 2018.
|
||||
"""
|
||||
|
||||
def __init__(self, hparams, iterator_creator):
|
||||
"""Initialization of variables for caser
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@ class GRU4RecModel(SequentialBaseModel):
|
|||
B. Hidasi, A. Karatzoglou, L. Baltrunas, D. Tikk, "Session-based Recommendations
|
||||
with Recurrent Neural Networks", ICLR (Poster), 2016.
|
||||
"""
|
||||
|
||||
def _build_seq_graph(self):
|
||||
"""The main function to create GRU4Rec model.
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@ class SLI_RECModel(SequentialBaseModel):
|
|||
the 28th International Joint Conferences on Artificial Intelligence, IJCAI’19,
|
||||
Pages 4213-4219, AAAI Press, 2019.
|
||||
"""
|
||||
|
||||
def _build_seq_graph(self):
|
||||
"""The main function to create sli_rec model.
|
||||
|
||||
|
@ -112,7 +113,7 @@ class SLI_RECModel(SequentialBaseModel):
|
|||
shape=[user_embedding.shape.as_list()[-1], query_size],
|
||||
initializer=self.initializer,
|
||||
)
|
||||
att_inputs = tf.tensordot(user_embedding, attention_mat, [[2],[0]])
|
||||
att_inputs = tf.tensordot(user_embedding, attention_mat, [[2], [0]])
|
||||
|
||||
queries = tf.reshape(
|
||||
tf.tile(query, [1, att_inputs.shape[1].value]), tf.shape(att_inputs)
|
||||
|
|
|
@ -18,6 +18,7 @@ class XDeepFMModel(BaseModel):
|
|||
24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining,
|
||||
KDD 2018, London, 2018.
|
||||
"""
|
||||
|
||||
def _build_graph(self):
|
||||
"""The main function to create xdeepfm's logic.
|
||||
|
||||
|
|
|
@ -81,4 +81,3 @@ def hide_fastai_progress_bar():
|
|||
master_bar,
|
||||
progress_bar,
|
||||
)
|
||||
|
||||
|
|
|
@ -15,11 +15,11 @@ def model_perf_plots(df):
Returns:
matplotlib axes
"""
g = sns.FacetGrid(df, col="metric", hue='stage', col_wrap=2, sharey=False)
g = g.map(sns.scatterplot, "epoch", "value").add_legend()
g = sns.FacetGrid(df, col="metric", hue="stage", col_wrap=2, sharey=False)
g = g.map(sns.scatterplot, "epoch", "value").add_legend()


def compare_metric(df_list, metric='prec', stage='test'):
def compare_metric(df_list, metric="prec", stage="test"):
"""Function to combine and prepare list of dataframes into tidy format
Args:
df_list (list): List of dataframes

@ -29,18 +29,29 @@ def compare_metric(df_list, metric='prec', stage='test'):
Returns:
Pandas dataframe
"""
colnames = ['model'+str(x) for x in list(range(1,len(df_list)+1))]
models = [df[(df['stage']==stage) & (df['metric']==metric)]['value'].reset_index(
drop=True).values for df in df_list]
colnames = ["model" + str(x) for x in list(range(1, len(df_list) + 1))]
models = [
df[(df["stage"] == stage) & (df["metric"] == metric)]["value"]
.reset_index(drop=True)
.values
for df in df_list
]

output = pd.DataFrame(zip(*models),
columns=colnames).stack().reset_index()
output.columns = ['epoch','data','value']
return output
output = pd.DataFrame(zip(*models), columns=colnames).stack().reset_index()
output.columns = ["epoch", "data", "value"]
return output


def track_model_metrics(model, train_interactions, test_interactions, k=10,
no_epochs=100, no_threads=8, show_plot=True, **kwargs):

def track_model_metrics(
model,
train_interactions,
test_interactions,
k=10,
no_epochs=100,
no_threads=8,
show_plot=True,
**kwargs
):
"""Function to record model's performance at each epoch, formats the performance into tidy format,
plots the performance and outputs the performance data
Args:
@ -58,36 +69,51 @@ def track_model_metrics(model, train_interactions, test_interactions, k=10,
matplotlib axes: side effect of the method
"""
# initialising temp data storage
model_prec_train = [0]*no_epochs
model_prec_test = [0]*no_epochs
model_prec_train = [0] * no_epochs
model_prec_test = [0] * no_epochs

model_rec_train = [0]*no_epochs
model_rec_test = [0]*no_epochs

# fit model and store train/test metrics at each epoch
model_rec_train = [0] * no_epochs
model_rec_test = [0] * no_epochs

# fit model and store train/test metrics at each epoch
for epoch in range(no_epochs):
# print(f'Epoch: {epoch}/{epochs}')
model.fit_partial(interactions=train_interactions, epochs=1,
num_threads=no_threads, **kwargs)
model_prec_train[epoch] = precision_at_k(model, train_interactions, k=k, **kwargs).mean()
model_prec_test[epoch] = precision_at_k(model, test_interactions, k=k, **kwargs).mean()
# print(f'Epoch: {epoch}/{epochs}')
model.fit_partial(
interactions=train_interactions, epochs=1, num_threads=no_threads, **kwargs
)
model_prec_train[epoch] = precision_at_k(
model, train_interactions, k=k, **kwargs
).mean()
model_prec_test[epoch] = precision_at_k(
model, test_interactions, k=k, **kwargs
).mean()

model_rec_train[epoch] = recall_at_k(
model, train_interactions, k=k, **kwargs
).mean()
model_rec_test[epoch] = recall_at_k(
model, test_interactions, k=k, **kwargs
).mean()

model_rec_train[epoch] = recall_at_k(model, train_interactions, k=k, **kwargs).mean()
model_rec_test[epoch] = recall_at_k(model, test_interactions, k=k, **kwargs).mean()

# collect the performance metrics into a dataframe
fitting_metrics = pd.DataFrame(zip(model_prec_train, model_prec_test,
model_rec_train, model_rec_test),
columns=['model_prec_train', 'model_prec_test', 'model_rec_train', 'model_rec_test'])
fitting_metrics = pd.DataFrame(
zip(model_prec_train, model_prec_test, model_rec_train, model_rec_test),
columns=[
"model_prec_train",
"model_prec_test",
"model_rec_train",
"model_rec_test",
],
)
# convert into tidy format
fitting_metrics = fitting_metrics.stack().reset_index()
fitting_metrics.columns = ['epoch','level','value']
fitting_metrics.columns = ["epoch", "level", "value"]
# exact the labels for each observation
fitting_metrics['stage'] = fitting_metrics.level.str.split('_').str[-1]
fitting_metrics['metric'] = fitting_metrics.level.str.split('_').str[1]
fitting_metrics.drop(['level'], axis = 1, inplace=True)
fitting_metrics["stage"] = fitting_metrics.level.str.split("_").str[-1]
fitting_metrics["metric"] = fitting_metrics.level.str.split("_").str[1]
fitting_metrics.drop(["level"], axis=1, inplace=True)
# replace the metric keys to improve visualisation
metric_keys = {'prec':'Precision', 'rec':'Recall'}
metric_keys = {"prec": "Precision", "rec": "Recall"}
fitting_metrics.metric.replace(metric_keys, inplace=True)
# plots the performance data
if show_plot == True:
@ -115,9 +141,11 @@ def similar_users(user_id, user_features, model, N=10):
user_norms[user_norms == 0] = 1e-10
scores /= user_norms

best = np.argpartition(scores, -(N+1))[-(N+1):]
return pd.DataFrame(sorted(zip(best, scores[best] / user_norms[user_id]),
key=lambda x: -x[1])[1:], columns = ['userID', 'score'])
best = np.argpartition(scores, -(N + 1))[-(N + 1) :]
return pd.DataFrame(
sorted(zip(best, scores[best] / user_norms[user_id]), key=lambda x: -x[1])[1:],
columns=["userID", "score"],
)


def similar_items(item_id, item_features, model, N=10):

@ -133,16 +161,18 @@ def similar_items(item_id, item_features, model, N=10):
Pandas dataframe of top N most similar items with score
"""
_, item_representations = model.get_item_representations(features=item_features)

# Cosine similarity
scores = item_representations.dot(item_representations[item_id, :])
item_norms = np.linalg.norm(item_representations, axis=1)
item_norms[item_norms == 0] = 1e-10
scores /= item_norms

best = np.argpartition(scores, -(N+1))[-(N+1):]
return pd.DataFrame(sorted(zip(best, scores[best] / item_norms[item_id]),
key=lambda x: -x[1])[1:], columns = ['itemID', 'score'])
best = np.argpartition(scores, -(N + 1))[-(N + 1) :]
return pd.DataFrame(
sorted(zip(best, scores[best] / item_norms[item_id]), key=lambda x: -x[1])[1:],
columns=["itemID", "score"],
)


def prepare_test_df(test_idx, uids, iids, uid_map, iid_map, weights):
@ -158,24 +188,32 @@ def prepare_test_df(test_idx, uids, iids, uid_map, iid_map, weights):
Returns:
Pandas dataframe of user-item selected for testing
"""
test_df = pd.DataFrame(zip(
uids[test_idx],
iids[test_idx],
[list(uid_map.keys())[x] for x in uids[test_idx]],
[list(iid_map.keys())[x] for x in iids[test_idx]]),
columns=['uid','iid','userID','itemID'])
test_df = pd.DataFrame(
zip(
uids[test_idx],
iids[test_idx],
[list(uid_map.keys())[x] for x in uids[test_idx]],
[list(iid_map.keys())[x] for x in iids[test_idx]],
),
columns=["uid", "iid", "userID", "itemID"],
)

dok_weights = weights.todok()
test_df['rating'] = test_df.apply(
lambda x:dok_weights[x.uid,x.iid], axis=1)
test_df["rating"] = test_df.apply(lambda x: dok_weights[x.uid, x.iid], axis=1)

return test_df[['userID', 'itemID', 'rating']]
return test_df[["userID", "itemID", "rating"]]


def prepare_all_predictions(data, uid_map, iid_map, interactions,
model, num_threads,
user_features=None,
item_features=None):

def prepare_all_predictions(
data,
uid_map,
iid_map,
interactions,
model,
num_threads,
user_features=None,
item_features=None,
):
"""Function to prepare all predictions for evaluation
Args:
data (pandas df): dataframe of all users, items and ratings as loaded

@ -193,25 +231,31 @@ def prepare_all_predictions(data, uid_map, iid_map, interactions,
users, items, preds = [], [], []
item = list(data.itemID.unique())
for user in data.userID.unique():
user = [user] * len(item)
user = [user] * len(item)
users.extend(user)
items.extend(item)

all_predictions = pd.DataFrame(data={"userID": users, "itemID":items})
all_predictions['uid'] = all_predictions.userID.map(uid_map)
all_predictions['iid'] = all_predictions.itemID.map(iid_map)
all_predictions = pd.DataFrame(data={"userID": users, "itemID": items})
all_predictions["uid"] = all_predictions.userID.map(uid_map)
all_predictions["iid"] = all_predictions.itemID.map(iid_map)

dok_weights = interactions.todok()
all_predictions['rating'] = all_predictions.apply(
lambda x: dok_weights[x.uid,x.iid], axis=1)

all_predictions["rating"] = all_predictions.apply(
lambda x: dok_weights[x.uid, x.iid], axis=1
)

all_predictions = all_predictions[all_predictions.rating < 1].reset_index(drop=True)
all_predictions = all_predictions.drop('rating', axis=1)

all_predictions['prediction'] = all_predictions.apply(lambda x: model.predict(
user_ids=x['uid'], item_ids=[x['iid']],
user_features=user_features,
item_features=item_features,
num_threads=num_threads)[0], axis=1)

return all_predictions[['userID','itemID','prediction']]
all_predictions = all_predictions.drop("rating", axis=1)

all_predictions["prediction"] = all_predictions.apply(
lambda x: model.predict(
user_ids=x["uid"],
item_ids=[x["iid"]],
user_features=user_features,
item_features=item_features,
num_threads=num_threads,
)[0],
axis=1,
)

return all_predictions[["userID", "itemID", "prediction"]]
@ -53,12 +53,12 @@ class NCF:
seed (int): Seed.

"""

# seed
tf.set_random_seed(seed)
np.random.seed(seed)
self.seed = seed

self.n_users = n_users
self.n_items = n_items
self.model_type = model_type.lower()

@ -105,7 +105,10 @@ class NCF:
# set embedding table
self.embedding_gmf_P = tf.Variable(
tf.truncated_normal(
shape=[self.n_users, self.n_factors], mean=0.0, stddev=0.01, seed=self.seed,
shape=[self.n_users, self.n_factors],
mean=0.0,
stddev=0.01,
seed=self.seed,
),
name="embedding_gmf_P",
dtype=tf.float32,

@ -113,7 +116,10 @@ class NCF:
self.embedding_gmf_Q = tf.Variable(
tf.truncated_normal(
shape=[self.n_items, self.n_factors], mean=0.0, stddev=0.01, seed=self.seed,
shape=[self.n_items, self.n_factors],
mean=0.0,
stddev=0.01,
seed=self.seed,
),
name="embedding_gmf_Q",
dtype=tf.float32,

@ -174,7 +180,9 @@ class NCF:
output,
num_outputs=layer_size,
activation_fn=tf.nn.relu,
weights_initializer=tf.contrib.layers.xavier_initializer(seed=self.seed),
weights_initializer=tf.contrib.layers.xavier_initializer(
seed=self.seed
),
)
self.mlp_vector = output

@ -189,7 +197,9 @@ class NCF:
num_outputs=1,
activation_fn=None,
biases_initializer=None,
weights_initializer=tf.contrib.layers.xavier_initializer(seed=self.seed),
weights_initializer=tf.contrib.layers.xavier_initializer(
seed=self.seed
),
)
self.output = tf.sigmoid(output)

@ -200,7 +210,9 @@ class NCF:
num_outputs=1,
activation_fn=None,
biases_initializer=None,
weights_initializer=tf.contrib.layers.xavier_initializer(seed=self.seed),
weights_initializer=tf.contrib.layers.xavier_initializer(
seed=self.seed
),
)
self.output = tf.sigmoid(output)

@ -213,7 +225,9 @@ class NCF:
num_outputs=1,
activation_fn=None,
biases_initializer=None,
weights_initializer=tf.contrib.layers.xavier_initializer(seed=self.seed),
weights_initializer=tf.contrib.layers.xavier_initializer(
seed=self.seed
),
)
self.output = tf.sigmoid(output)
@ -120,7 +120,7 @@ class LSTURModel(BaseModel):
recurrent_initializer=keras.initializers.glorot_uniform(seed=self.seed),
bias_initializer=keras.initializers.Zeros(),
)(layers.Masking(mask_value=0.0)(click_title_presents))

user_present = layers.Concatenate()([short_uemb, long_u_emb])
user_present = layers.Dense(
hparams.gru_unit,

@ -51,7 +51,7 @@ class NAMLModel(BaseModel):
batch_data["candidate_title_batch"],
batch_data["candidate_body_batch"],
batch_data["candidate_vert_batch"],
batch_data["candidate_subvert_batch"]
batch_data["candidate_subvert_batch"],
]
input_label = batch_data["labels"]
return input_feat, input_label

@ -96,7 +96,9 @@ class NAMLModel(BaseModel):
click_news_presents = layers.TimeDistributed(newsencoder)(
his_input_title_body_verts
)
user_present = AttLayer2(hparams.attention_hidden_dim, seed=self.seed)(click_news_presents)
user_present = AttLayer2(hparams.attention_hidden_dim, seed=self.seed)(
click_news_presents
)

model = keras.Model(
his_input_title_body_verts, user_present, name="user_encoder"

@ -145,7 +147,9 @@ class NAMLModel(BaseModel):
concate_repr = layers.Concatenate(axis=-2)(
[title_repr, body_repr, vert_repr, subvert_repr]
)
news_repr = AttLayer2(hparams.attention_hidden_dim, seed=self.seed)(concate_repr)
news_repr = AttLayer2(hparams.attention_hidden_dim, seed=self.seed)(
concate_repr
)

model = keras.Model(input_title_body_verts, news_repr, name="news_encoder")
return model

@ -170,7 +174,7 @@ class NAMLModel(BaseModel):
activation=hparams.cnn_activation,
padding="same",
bias_initializer=keras.initializers.Zeros(),
kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed)
kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
)(y)
y = layers.Dropout(hparams.dropout)(y)
pred_title = AttLayer2(hparams.attention_hidden_dim, seed=self.seed)(y)

@ -199,7 +203,7 @@ class NAMLModel(BaseModel):
activation=hparams.cnn_activation,
padding="same",
bias_initializer=keras.initializers.Zeros(),
kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed)
kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
)(y)
y = layers.Dropout(hparams.dropout)(y)
pred_body = AttLayer2(hparams.attention_hidden_dim, seed=self.seed)(y)

@ -223,10 +227,10 @@ class NAMLModel(BaseModel):
vert_emb = vert_embedding(input_vert)
pred_vert = layers.Dense(
hparams.filter_num,
hparams.filter_num,
activation=hparams.dense_activation,
bias_initializer=keras.initializers.Zeros(),
kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed)
kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
)(vert_emb)
pred_vert = layers.Reshape((1, hparams.filter_num))(pred_vert)

@ -248,10 +252,10 @@ class NAMLModel(BaseModel):
subvert_emb = subvert_embedding(input_subvert)
pred_subvert = layers.Dense(
hparams.filter_num,
hparams.filter_num,
activation=hparams.dense_activation,
bias_initializer=keras.initializers.Zeros(),
kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed)
kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
)(subvert_emb)
pred_subvert = layers.Reshape((1, hparams.filter_num))(pred_subvert)
@ -15,6 +15,7 @@ log = logging.getLogger(__name__)

class RBM:
"""Restricted Boltzmann Machine"""

def __init__(
self,
hidden_units=500,

@ -27,7 +28,7 @@ class RBM:
sampling_protocol=[50, 70, 80, 90, 100],
debug=False,
with_metrics=False,
seed=42
seed=42,
):
"""Implementation of a multinomial Restricted Boltzmann Machine for collaborative filtering
in numpy/pandas/tensorflow

@ -146,8 +147,6 @@ class RBM:
tf.Tensor: Float32 tensor of sampled units. The value is 1 if pr>g and 0 otherwise.
"""

# sample from a Bernoulli distribution with same dimensions as input distribution
g = tf.convert_to_tensor(np.random.uniform(size=pr.shape[1]), dtype=tf.float32)

@ -260,7 +259,9 @@ class RBM:
self.w = tf.get_variable(
"weight",
[self.Nvisible, self.Nhidden],
initializer=tf.random_normal_initializer(stddev=self.stdv, seed=self.seed),
initializer=tf.random_normal_initializer(
stddev=self.stdv, seed=self.seed
),
dtype="float32",
)

@ -278,7 +279,6 @@ class RBM:
dtype="float32",
)

def sample_hidden_units(self, vv):
"""Sampling: In RBM we use Contrastive divergence to sample the parameter space. In order to do that we need
to initialize the two conditional probabilities:

@ -355,7 +355,6 @@ class RBM:

return pvh, v_

def gibbs_sampling(self):
"""Gibbs sampling: Determines an estimate of the model configuration via sampling. In the binary
RBM we need to impose that unseen movies stay as such, i.e. the sampling phase should not modify

@ -431,7 +430,6 @@ class RBM:
if self.debug:
log.info("percentage of epochs covered so far %f2" % (epoch_percentage))

def accuracy(self, vp):
"""Train/Test Mean average precision
@ -17,8 +17,17 @@ from reco_utils.recommender.ncf.ncf_singlenode import NCF
|
|||
from reco_utils.recommender.ncf.dataset import Dataset as NCFDataset
|
||||
from reco_utils.dataset import movielens
|
||||
from reco_utils.dataset.python_splitters import python_chrono_split
|
||||
from reco_utils.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k,
|
||||
recall_at_k, get_top_k_items)
|
||||
from reco_utils.evaluation.python_evaluation import (
|
||||
rmse,
|
||||
mae,
|
||||
rsquared,
|
||||
exp_var,
|
||||
map_at_k,
|
||||
ndcg_at_k,
|
||||
precision_at_k,
|
||||
recall_at_k,
|
||||
get_top_k_items,
|
||||
)
|
||||
from reco_utils.common.constants import SEED as DEFAULT_SEED
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
@ -39,16 +48,16 @@ def ncf_training(params):
|
|||
|
||||
data = NCFDataset(train=train_data, test=validation_data, seed=DEFAULT_SEED)
|
||||
|
||||
model = NCF (
|
||||
n_users=data.n_users,
|
||||
model = NCF(
|
||||
n_users=data.n_users,
|
||||
n_items=data.n_items,
|
||||
model_type="NeuMF",
|
||||
n_factors=params["n_factors"],
|
||||
layer_sizes=[16,8,4],
|
||||
layer_sizes=[16, 8, 4],
|
||||
n_epochs=params["n_epochs"],
|
||||
learning_rate=params["learning_rate"],
|
||||
verbose=params["verbose"],
|
||||
seed=DEFAULT_SEED
|
||||
seed=DEFAULT_SEED,
|
||||
)
|
||||
|
||||
model.fit(data)
|
||||
|
@ -58,12 +67,18 @@ def ncf_training(params):
|
|||
metrics_dict = {}
|
||||
rating_metrics = params["rating_metrics"]
|
||||
if len(rating_metrics) > 0:
|
||||
predictions = [[row.userID, row.itemID, model.predict(row.userID, row.itemID)]
|
||||
for (_, row) in validation_data.iterrows()]
|
||||
predictions = [
|
||||
[row.userID, row.itemID, model.predict(row.userID, row.itemID)]
|
||||
for (_, row) in validation_data.iterrows()
|
||||
]
|
||||
|
||||
predictions = pd.DataFrame(
|
||||
predictions, columns=["userID", "itemID", "prediction"]
|
||||
)
|
||||
predictions = predictions.astype(
|
||||
{"userID": "int64", "itemID": "int64", "prediction": "float64"}
|
||||
)
|
||||
|
||||
predictions = pd.DataFrame(predictions, columns=['userID', 'itemID', 'prediction'])
|
||||
predictions = predictions.astype({'userID': 'int64', 'itemID': 'int64', 'prediction': 'float64'})
|
||||
|
||||
for metric in rating_metrics:
|
||||
result = getattr(evaluation, metric)(validation_data, predictions)
|
||||
logger.debug("%s = %g", metric, result)
|
||||
|
@ -77,18 +92,25 @@ def ncf_training(params):
|
|||
users, items, preds = [], [], []
|
||||
item = list(train_data.itemID.unique())
|
||||
for user in train_data.userID.unique():
|
||||
user = [user] * len(item)
|
||||
user = [user] * len(item)
|
||||
users.extend(user)
|
||||
items.extend(item)
|
||||
preds.extend(list(model.predict(user, item, is_list=True)))
|
||||
|
||||
all_predictions = pd.DataFrame(data={"userID": users, "itemID": items, "prediction": preds})
|
||||
all_predictions = pd.DataFrame(
|
||||
data={"userID": users, "itemID": items, "prediction": preds}
|
||||
)
|
||||
|
||||
merged = pd.merge(train_data, all_predictions, on=["userID", "itemID"], how="outer")
|
||||
all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1)
|
||||
merged = pd.merge(
|
||||
train_data, all_predictions, on=["userID", "itemID"], how="outer"
|
||||
)
|
||||
all_predictions = merged[merged.rating.isnull()].drop("rating", axis=1)
|
||||
for metric in ranking_metrics:
|
||||
result = getattr(evaluation, metric)(
|
||||
validation_data, all_predictions, col_prediction="prediction", k=params["k"]
|
||||
validation_data,
|
||||
all_predictions,
|
||||
col_prediction="prediction",
|
||||
k=params["k"],
|
||||
)
|
||||
logger.debug("%s@%d = %g", metric, params["k"], result)
|
||||
if metric == params["primary_metric"]:
|
||||
|
|
|
@ -109,7 +109,10 @@ def get_trials(optimize_mode):
|
|||
raise ValueError("optimize_mode should equal either minimize or maximize")
|
||||
all_trials = requests.get(NNI_TRIAL_JOBS_URL).json()
|
||||
trials = [
|
||||
(ast.literal_eval(ast.literal_eval(trial['finalMetricData'][0]['data'])), trial["logPath"].split(":")[-1])
|
||||
(
|
||||
ast.literal_eval(ast.literal_eval(trial["finalMetricData"][0]["data"])),
|
||||
trial["logPath"].split(":")[-1],
|
||||
)
|
||||
for trial in all_trials
|
||||
]
|
||||
sorted_trials = sorted(
|
||||
|
@ -142,8 +145,10 @@ def start_nni(config_path, wait=WAITING_TIME, max_retries=MAX_RETRIES):
|
|||
max_retries (int): max number of retries
|
||||
"""
|
||||
nni_env = os.environ.copy()
|
||||
nni_env['PATH'] = sys.prefix + '/bin:' + nni_env['PATH']
|
||||
proc = subprocess.run([sys.prefix + '/bin/nnictl', 'create', '--config', config_path], env=nni_env)
|
||||
nni_env["PATH"] = sys.prefix + "/bin:" + nni_env["PATH"]
|
||||
proc = subprocess.run(
|
||||
[sys.prefix + "/bin/nnictl", "create", "--config", config_path], env=nni_env
|
||||
)
|
||||
# proc = subprocess.run(["nnictl", "create", "--config", config_path], env=nni_env)
|
||||
if proc.returncode != 0:
|
||||
raise RuntimeError("'nnictl create' failed with code %d" % proc.returncode)
|
||||
|
|
|
@ -53,4 +53,3 @@ def generate_param_grid(params):
|
|||
params_exp.append(param_exp)
|
||||
|
||||
return params_exp
|
||||
|
||||
|
|
|
@ -27,4 +27,3 @@ if __name__ == "__main__":
|
|||
deps += list(PIP_WIN32.values())
|
||||
with open("requirements.txt", "w") as f:
|
||||
f.write("\n".join(set(deps)))
|
||||
|
||||
|
|
|
@ -16,59 +16,70 @@ from azureml.core import Run
|
|||
|
||||
|
||||
def create_arg_parser():
|
||||
parser = argparse.ArgumentParser(description='Process inputs')
|
||||
parser = argparse.ArgumentParser(description="Process inputs")
|
||||
# test folder
|
||||
parser.add_argument("--testfolder", "-f",
|
||||
action="store",
|
||||
default="./tests/unit",
|
||||
help="Folder where tests are located")
|
||||
parser.add_argument("--num",
|
||||
action="store",
|
||||
default="99",
|
||||
help="test num")
|
||||
parser.add_argument(
|
||||
"--testfolder",
|
||||
"-f",
|
||||
action="store",
|
||||
default="./tests/unit",
|
||||
help="Folder where tests are located",
|
||||
)
|
||||
parser.add_argument("--num", action="store", default="99", help="test num")
|
||||
# test markers
|
||||
parser.add_argument("--testmarkers", "-m",
|
||||
action="store",
|
||||
default="not notebooks and not spark and not gpu",
|
||||
help="Specify test markers for test selection")
|
||||
parser.add_argument(
|
||||
"--testmarkers",
|
||||
"-m",
|
||||
action="store",
|
||||
default="not notebooks and not spark and not gpu",
|
||||
help="Specify test markers for test selection",
|
||||
)
|
||||
# test results file
|
||||
parser.add_argument("--xmlname", "-j",
|
||||
action="store",
|
||||
default="reports/test-unit.xml",
|
||||
help="Test results")
|
||||
parser.add_argument(
|
||||
"--xmlname",
|
||||
"-j",
|
||||
action="store",
|
||||
default="reports/test-unit.xml",
|
||||
help="Test results",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logger = logging.getLogger('submit_azureml_pytest.py')
|
||||
logger = logging.getLogger("submit_azureml_pytest.py")
|
||||
args = create_arg_parser()
|
||||
|
||||
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
||||
|
||||
logger.debug('junit_xml {}'.format(args.xmlname))
|
||||
logger.debug("junit_xml {}".format(args.xmlname))
|
||||
|
||||
# Run.get_context() is needed to save context as pytest causes corruption
|
||||
# of env vars
|
||||
run = Run.get_context()
|
||||
'''
|
||||
"""
|
||||
This is an example of a working subprocess.run for a unit test run:
|
||||
subprocess.run(["pytest", "tests/unit",
|
||||
"-m", "not notebooks and not spark and not gpu",
|
||||
"--junitxml=reports/test-unit.xml"])
|
||||
'''
|
||||
logger.debug('args.junitxml {}'.format(args.xmlname))
|
||||
logger.debug('junit= --junitxml={}'.format(args.xmlname))
|
||||
pytest_cmd = ['pytest', args.testfolder, '-m', args.testmarkers,
|
||||
'--junitxml={}'.format(args.xmlname)]
|
||||
logger.info('pytest run:{}'.format(' '.join(pytest_cmd)))
|
||||
"""
|
||||
logger.debug("args.junitxml {}".format(args.xmlname))
|
||||
logger.debug("junit= --junitxml={}".format(args.xmlname))
|
||||
pytest_cmd = [
|
||||
"pytest",
|
||||
args.testfolder,
|
||||
"-m",
|
||||
args.testmarkers,
|
||||
"--junitxml={}".format(args.xmlname),
|
||||
]
|
||||
logger.info("pytest run:{}".format(" ".join(pytest_cmd)))
|
||||
subprocess.run(pytest_cmd)
|
||||
|
||||
#
|
||||
# Leveraged code from this notebook:
|
||||
# https://msdata.visualstudio.com/Vienna/_search?action=contents&text=upload_folder&type=code&lp=code-Project&filters=ProjectFilters%7BVienna%7DRepositoryFilters%7BAzureMlCli%7D&pageSize=25&sortOptions=%5B%7B%22field%22%3A%22relevance%22%2C%22sortOrder%22%3A%22desc%22%7D%5D&result=DefaultCollection%2FVienna%2FAzureMlCli%2FGBmaster%2F%2Fsrc%2Fazureml-core%2Fazureml%2Fcore%2Frun.py
|
||||
logger.debug('os.listdir files {}'.format(os.listdir('.')))
|
||||
logger.debug("os.listdir files {}".format(os.listdir(".")))
|
||||
|
||||
# files for AzureML
|
||||
name_of_upload = "reports"
|
||||
|
|
|
@ -48,8 +48,9 @@ from azureml.core.compute_target import ComputeTargetException
|
|||
from azureml.core.workspace import WorkspaceException
|
||||
|
||||
|
||||
def setup_workspace(workspace_name, subscription_id, resource_group, cli_auth,
|
||||
location):
|
||||
def setup_workspace(
|
||||
workspace_name, subscription_id, resource_group, cli_auth, location
|
||||
):
|
||||
"""
|
||||
This sets up an Azure Workspace.
|
||||
An existing Azure Workspace is used or a new one is created if needed for
|
||||
|
@ -71,35 +72,34 @@ def setup_workspace(workspace_name, subscription_id, resource_group, cli_auth,
|
|||
Returns:
|
||||
ws: workspace reference
|
||||
"""
|
||||
logger.debug('setup: workspace_name is {}'.format(workspace_name))
|
||||
logger.debug('setup: resource_group is {}'.format(resource_group))
|
||||
logger.debug('setup: subid is {}'.format(subscription_id))
|
||||
logger.debug('setup: location is {}'.format(location))
|
||||
logger.debug("setup: workspace_name is {}".format(workspace_name))
|
||||
logger.debug("setup: resource_group is {}".format(resource_group))
|
||||
logger.debug("setup: subid is {}".format(subscription_id))
|
||||
logger.debug("setup: location is {}".format(location))
|
||||
|
||||
try:
|
||||
# use existing workspace if there is one
|
||||
ws = Workspace.get(
|
||||
name=workspace_name,
|
||||
subscription_id=subscription_id,
|
||||
resource_group=resource_group,
|
||||
auth=cli_auth
|
||||
)
|
||||
# use existing workspace if there is one
|
||||
ws = Workspace.get(
|
||||
name=workspace_name,
|
||||
subscription_id=subscription_id,
|
||||
resource_group=resource_group,
|
||||
auth=cli_auth,
|
||||
)
|
||||
except WorkspaceException:
|
||||
# this call might take a minute or two.
|
||||
logger.debug('Creating new workspace')
|
||||
ws = Workspace.create(
|
||||
name=workspace_name,
|
||||
subscription_id=subscription_id,
|
||||
resource_group=resource_group,
|
||||
# create_resource_group=True,
|
||||
location=location,
|
||||
auth=cli_auth
|
||||
)
|
||||
# this call might take a minute or two.
|
||||
logger.debug("Creating new workspace")
|
||||
ws = Workspace.create(
|
||||
name=workspace_name,
|
||||
subscription_id=subscription_id,
|
||||
resource_group=resource_group,
|
||||
# create_resource_group=True,
|
||||
location=location,
|
||||
auth=cli_auth,
|
||||
)
|
||||
return ws
|
||||
|
||||
|
||||
def setup_persistent_compute_target(workspace, cluster_name, vm_size,
|
||||
max_nodes):
|
||||
def setup_persistent_compute_target(workspace, cluster_name, vm_size, max_nodes):
|
||||
"""
|
||||
Set up a persistent compute target on AzureML.
|
||||
A persistent compute target runs noticeably faster than a
|
||||
|
@ -127,15 +127,13 @@ def setup_persistent_compute_target(workspace, cluster_name, vm_size,
|
|||
|
||||
try:
|
||||
cpu_cluster = ComputeTarget(workspace=workspace, name=cluster_name)
|
||||
logger.debug('setup: Found existing cluster, use it.')
|
||||
logger.debug("setup: Found existing cluster, use it.")
|
||||
except ComputeTargetException:
|
||||
logger.debug('setup: create cluster')
|
||||
logger.debug("setup: create cluster")
|
||||
compute_config = AmlCompute.provisioning_configuration(
|
||||
vm_size=vm_size,
|
||||
max_nodes=max_nodes)
|
||||
cpu_cluster = ComputeTarget.create(workspace,
|
||||
cluster_name,
|
||||
compute_config)
|
||||
vm_size=vm_size, max_nodes=max_nodes
|
||||
)
|
||||
cpu_cluster = ComputeTarget.create(workspace, cluster_name, compute_config)
|
||||
cpu_cluster.wait_for_completion(show_output=True)
|
||||
return cpu_cluster
|
||||
|
||||
|
@ -173,7 +171,8 @@ def create_run_config(cpu_cluster, docker_proc_type, conda_env_file):
|
|||
# True means the user will manually configure the environment
|
||||
run_amlcompute.environment.python.user_managed_dependencies = False
|
||||
run_amlcompute.environment.python.conda_dependencies = CondaDependencies(
|
||||
conda_dependencies_file_path=conda_env_file)
|
||||
conda_dependencies_file_path=conda_env_file
|
||||
)
|
||||
return run_amlcompute
|
||||
|
||||
|
||||
|
@ -190,13 +189,14 @@ def create_experiment(workspace, experiment_name):
|
|||
exp - AzureML experiment
|
||||
"""
|
||||
|
||||
logger.debug('create: experiment_name {}'.format(experiment_name))
|
||||
logger.debug("create: experiment_name {}".format(experiment_name))
|
||||
exp = Experiment(workspace=workspace, name=experiment_name)
|
||||
return(exp)
|
||||
return exp
|
||||
|
||||
|
||||
def submit_experiment_to_azureml(test, test_folder, test_markers, junitxml,
|
||||
run_config, experiment):
|
||||
def submit_experiment_to_azureml(
|
||||
test, test_folder, test_markers, junitxml, run_config, experiment
|
||||
):
|
||||
|
||||
"""
|
||||
Submitting the experiment to AzureML actually runs the script.
|
||||
|
@ -219,20 +219,23 @@ def submit_experiment_to_azureml(test, test_folder, test_markers, junitxml,
|
|||
run : AzureML run or trial
|
||||
"""
|
||||
|
||||
logger.debug('submit: testfolder {}'.format(test_folder))
|
||||
logger.debug('junitxml: {}'.format(junitxml))
|
||||
logger.debug("submit: testfolder {}".format(test_folder))
|
||||
logger.debug("junitxml: {}".format(junitxml))
|
||||
project_folder = "."
|
||||
|
||||
script_run_config = ScriptRunConfig(source_directory=project_folder,
|
||||
script=test,
|
||||
run_config=run_config,
|
||||
arguments=["--testfolder",
|
||||
test_folder,
|
||||
"--testmarkers",
|
||||
test_markers,
|
||||
"--xmlname",
|
||||
junitxml]
|
||||
)
|
||||
script_run_config = ScriptRunConfig(
|
||||
source_directory=project_folder,
|
||||
script=test,
|
||||
run_config=run_config,
|
||||
arguments=[
|
||||
"--testfolder",
|
||||
test_folder,
|
||||
"--testmarkers",
|
||||
test_markers,
|
||||
"--xmlname",
|
||||
junitxml,
|
||||
],
|
||||
)
|
||||
run = experiment.submit(script_run_config)
|
||||
# waits only for configuration to complete
|
||||
run.wait_for_completion(show_output=True, wait_post_processing=True)
|
||||
|
@ -240,7 +243,7 @@ def submit_experiment_to_azureml(test, test_folder, test_markers, junitxml,
|
|||
# test logs can also be found on azure
|
||||
# go to azure portal to see log in azure ws and look for experiment name
|
||||
# and look for individual run
|
||||
logger.debug('files {}'.format(run.get_file_names))
|
||||
logger.debug("files {}".format(run.get_file_names))
|
||||
|
||||
return run
|
||||
|
||||
|
@ -251,92 +254,113 @@ def create_arg_parser():
|
|||
use defaults. The user has many options they can select.
|
||||
"""
|
||||
|
||||
parser = argparse.ArgumentParser(description='Process some inputs')
|
||||
parser = argparse.ArgumentParser(description="Process some inputs")
|
||||
# script to run pytest
|
||||
parser.add_argument("--test",
|
||||
action="store",
|
||||
default="./tests/ci/run_pytest.py",
|
||||
help="location of script to run pytest")
|
||||
parser.add_argument(
|
||||
"--test",
|
||||
action="store",
|
||||
default="./tests/ci/run_pytest.py",
|
||||
help="location of script to run pytest",
|
||||
)
|
||||
# test folder
|
||||
parser.add_argument("--testfolder",
|
||||
action="store",
|
||||
default="./tests/unit",
|
||||
help="folder where tests are stored")
|
||||
parser.add_argument(
|
||||
"--testfolder",
|
||||
action="store",
|
||||
default="./tests/unit",
|
||||
help="folder where tests are stored",
|
||||
)
|
||||
# pytest test markers
|
||||
parser.add_argument("--testmarkers",
|
||||
action="store",
|
||||
default="not notebooks and not spark and not gpu",
|
||||
help="pytest markers indicate tests to run")
|
||||
parser.add_argument(
|
||||
"--testmarkers",
|
||||
action="store",
|
||||
default="not notebooks and not spark and not gpu",
|
||||
help="pytest markers indicate tests to run",
|
||||
)
|
||||
# test summary file
|
||||
parser.add_argument("--junitxml",
|
||||
action="store",
|
||||
default="reports/test-unit.xml",
|
||||
help="file for returned test results")
|
||||
parser.add_argument(
|
||||
"--junitxml",
|
||||
action="store",
|
||||
default="reports/test-unit.xml",
|
||||
help="file for returned test results",
|
||||
)
|
||||
# max num nodes in Azure cluster
|
||||
parser.add_argument("--maxnodes",
|
||||
action="store",
|
||||
default=4,
|
||||
help="specify the maximum number of nodes for the run")
|
||||
parser.add_argument(
|
||||
"--maxnodes",
|
||||
action="store",
|
||||
default=4,
|
||||
help="specify the maximum number of nodes for the run",
|
||||
)
|
||||
# Azure resource group
|
||||
parser.add_argument("--rg",
|
||||
action="store",
|
||||
default="recommender",
|
||||
help="Azure Resource Group")
|
||||
parser.add_argument(
|
||||
"--rg", action="store", default="recommender", help="Azure Resource Group"
|
||||
)
|
||||
# AzureML workspace Name
|
||||
parser.add_argument("--wsname",
|
||||
action="store",
|
||||
default="RecoWS",
|
||||
help="AzureML workspace name")
|
||||
parser.add_argument(
|
||||
"--wsname", action="store", default="RecoWS", help="AzureML workspace name"
|
||||
)
|
||||
# AzureML clustername
|
||||
parser.add_argument("--clustername",
|
||||
action="store",
|
||||
default="amlcompute",
|
||||
help="Set name of Azure cluster")
|
||||
parser.add_argument(
|
||||
"--clustername",
|
||||
action="store",
|
||||
default="amlcompute",
|
||||
help="Set name of Azure cluster",
|
||||
)
|
||||
# Azure VM size
|
||||
parser.add_argument("--vmsize",
|
||||
action="store",
|
||||
default="STANDARD_D3_V2",
|
||||
help="Set the size of the VM either STANDARD_D3_V2")
|
||||
parser.add_argument(
|
||||
"--vmsize",
|
||||
action="store",
|
||||
default="STANDARD_D3_V2",
|
||||
help="Set the size of the VM either STANDARD_D3_V2",
|
||||
)
|
||||
# cpu or gpu
|
||||
parser.add_argument("--dockerproc",
|
||||
action="store",
|
||||
default="cpu",
|
||||
help="Base image used in docker container")
|
||||
parser.add_argument(
|
||||
"--dockerproc",
|
||||
action="store",
|
||||
default="cpu",
|
||||
help="Base image used in docker container",
|
||||
)
|
||||
# Azure subscription id, when used in a pipeline, it is stored in keyvault
|
||||
parser.add_argument("--subid",
|
||||
action="store",
|
||||
default="123456",
|
||||
help="Azure Subscription ID")
|
||||
parser.add_argument(
|
||||
"--subid", action="store", default="123456", help="Azure Subscription ID"
|
||||
)
|
||||
# ./reco.yaml is created in the azure devops pipeline.
|
||||
# Not recommended to change this.
|
||||
parser.add_argument("--condafile",
|
||||
action="store",
|
||||
default="./reco.yaml",
|
||||
help="file with environment variables")
|
||||
parser.add_argument(
|
||||
"--condafile",
|
||||
action="store",
|
||||
default="./reco.yaml",
|
||||
help="file with environment variables",
|
||||
)
|
||||
# AzureML experiment name
|
||||
parser.add_argument("--expname",
|
||||
action="store",
|
||||
default="persistentAML",
|
||||
help="experiment name on Azure")
|
||||
parser.add_argument(
|
||||
"--expname",
|
||||
action="store",
|
||||
default="persistentAML",
|
||||
help="experiment name on Azure",
|
||||
)
|
||||
# Azure datacenter location
|
||||
parser.add_argument("--location",
|
||||
default="EastUS",
|
||||
help="Azure location")
|
||||
parser.add_argument("--location", default="EastUS", help="Azure location")
|
||||
# github repo, stored in AzureML experiment for info purposes
|
||||
parser.add_argument("--reponame",
|
||||
action="store",
|
||||
default="--reponame MyGithubRepo",
|
||||
help="GitHub repo being tested")
|
||||
parser.add_argument(
|
||||
"--reponame",
|
||||
action="store",
|
||||
default="--reponame MyGithubRepo",
|
||||
help="GitHub repo being tested",
|
||||
)
|
||||
# github branch, stored in AzureML experiment for info purposes
|
||||
parser.add_argument("--branch",
|
||||
action="store",
|
||||
default="--branch MyGithubBranch",
|
||||
help=" Identify the branch test test is run on")
|
||||
parser.add_argument(
|
||||
"--branch",
|
||||
action="store",
|
||||
default="--branch MyGithubBranch",
|
||||
help=" Identify the branch test test is run on",
|
||||
)
|
||||
# github pull request, stored in AzureML experiment for info purposes
|
||||
parser.add_argument("--pr",
|
||||
action="store",
|
||||
default="--pr PRTestRun",
|
||||
help="If a pr triggered the test, list it here")
|
||||
parser.add_argument(
|
||||
"--pr",
|
||||
action="store",
|
||||
default="--pr PRTestRun",
|
||||
help="If a pr triggered the test, list it here",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
@ -344,52 +368,60 @@ def create_arg_parser():
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logger = logging.getLogger('submit_azureml_pytest.py')
|
||||
logger = logging.getLogger("submit_azureml_pytest.py")
|
||||
# logger.setLevel(logging.DEBUG)
|
||||
# logging.basicConfig(level=logging.DEBUG)
|
||||
args = create_arg_parser()
|
||||
|
||||
if args.dockerproc == "cpu":
|
||||
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
|
||||
|
||||
docker_proc_type = DEFAULT_CPU_IMAGE
|
||||
else:
|
||||
from azureml.core.runconfig import DEFAULT_GPU_IMAGE
|
||||
|
||||
docker_proc_type = DEFAULT_GPU_IMAGE
|
||||
|
||||
cli_auth = AzureCliAuthentication()
|
||||
|
||||
workspace = setup_workspace(workspace_name=args.wsname,
|
||||
subscription_id=args.subid,
|
||||
resource_group=args.rg,
|
||||
cli_auth=cli_auth,
|
||||
location=args.location)
|
||||
workspace = setup_workspace(
|
||||
workspace_name=args.wsname,
|
||||
subscription_id=args.subid,
|
||||
resource_group=args.rg,
|
||||
cli_auth=cli_auth,
|
||||
location=args.location,
|
||||
)
|
||||
|
||||
cpu_cluster = setup_persistent_compute_target(
|
||||
workspace=workspace,
|
||||
cluster_name=args.clustername,
|
||||
vm_size=args.vmsize,
|
||||
max_nodes=args.maxnodes)
|
||||
workspace=workspace,
|
||||
cluster_name=args.clustername,
|
||||
vm_size=args.vmsize,
|
||||
max_nodes=args.maxnodes,
|
||||
)
|
||||
|
||||
run_config = create_run_config(cpu_cluster=cpu_cluster,
|
||||
docker_proc_type=docker_proc_type,
|
||||
conda_env_file=args.condafile)
|
||||
run_config = create_run_config(
|
||||
cpu_cluster=cpu_cluster,
|
||||
docker_proc_type=docker_proc_type,
|
||||
conda_env_file=args.condafile,
|
||||
)
|
||||
|
||||
logger.info('exp: In Azure, look for experiment named {}'.format(
|
||||
args.expname))
|
||||
logger.info("exp: In Azure, look for experiment named {}".format(args.expname))
|
||||
|
||||
# create new or use existing experiment
|
||||
experiment = Experiment(workspace=workspace, name=args.expname)
|
||||
run = submit_experiment_to_azureml(test=args.test,
|
||||
test_folder=args.testfolder,
|
||||
test_markers=args.testmarkers,
|
||||
junitxml=args.junitxml,
|
||||
run_config=run_config,
|
||||
experiment=experiment)
|
||||
run = submit_experiment_to_azureml(
|
||||
test=args.test,
|
||||
test_folder=args.testfolder,
|
||||
test_markers=args.testmarkers,
|
||||
junitxml=args.junitxml,
|
||||
run_config=run_config,
|
||||
experiment=experiment,
|
||||
)
|
||||
|
||||
# add helpful information to experiment on Azure
|
||||
run.tag('RepoName', args.reponame)
|
||||
run.tag('Branch', args.branch)
|
||||
run.tag('PR', args.pr)
|
||||
run.tag("RepoName", args.reponame)
|
||||
run.tag("Branch", args.branch)
|
||||
run.tag("PR", args.pr)
|
||||
# download files from AzureML
|
||||
run.download_files(prefix='reports', output_paths='./reports')
|
||||
run.download_files(prefix="reports", output_paths="./reports")
|
||||
run.complete()
|
||||
|
|
|
@ -23,4 +23,3 @@ def test_criteo_load_spark_df(spark, criteo_first_row):
|
|||
assert len(df.columns) == 40
|
||||
first_row = df.limit(1).collect()[0].asDict()
|
||||
assert first_row == criteo_first_row
|
||||
|
||||
|
|
|
@ -137,13 +137,7 @@ def test_load_pandas_df(
|
|||
],
|
||||
)
|
||||
def test_load_item_df(
|
||||
size,
|
||||
num_movies,
|
||||
movie_example,
|
||||
title_example,
|
||||
genres_example,
|
||||
year_example,
|
||||
tmp,
|
||||
size, num_movies, movie_example, title_example, genres_example, year_example, tmp,
|
||||
):
|
||||
"""Test movielens item data load (not rating data)
|
||||
"""
|
||||
|
@ -154,7 +148,13 @@ def test_load_item_df(
|
|||
assert df["title"][0] == title_example
|
||||
|
||||
# Test title and genres
|
||||
df = load_item_df(size, local_cache_path=tmp, movie_col="item", genres_col="genres", year_col="year")
|
||||
df = load_item_df(
|
||||
size,
|
||||
local_cache_path=tmp,
|
||||
movie_col="item",
|
||||
genres_col="genres",
|
||||
year_col="year",
|
||||
)
|
||||
assert len(df) == num_movies
|
||||
# movile_col, genres_col and year_col
|
||||
assert len(df.columns) == 3
|
||||
|
@ -214,10 +214,7 @@ def test_load_spark_df(
|
|||
# Test if correct data are loaded
|
||||
header = ["1", "2", "3"]
|
||||
schema = StructType(
|
||||
[
|
||||
StructField("u", IntegerType()),
|
||||
StructField("m", IntegerType()),
|
||||
]
|
||||
[StructField("u", IntegerType()), StructField("m", IntegerType()),]
|
||||
)
|
||||
with pytest.warns(Warning):
|
||||
df = load_spark_df(
|
||||
|
|
|
@ -11,9 +11,7 @@ from reco_utils.common.constants import (
|
|||
DEFAULT_RATING_COL,
|
||||
DEFAULT_TIMESTAMP_COL,
|
||||
)
|
||||
from reco_utils.dataset.python_splitters import (
|
||||
python_chrono_split,
|
||||
)
|
||||
from reco_utils.dataset.python_splitters import python_chrono_split
|
||||
|
||||
# ncf data generation
|
||||
@pytest.fixture(scope="module")
|
||||
|
|
|
@ -14,4 +14,3 @@ def path_notebooks():
|
|||
return os.path.abspath(
|
||||
os.path.join(os.path.dirname(__file__), os.path.pardir, "notebooks")
|
||||
)
|
||||
|
||||
|
|
|
@ -49,4 +49,3 @@ def read_matrix(file, row_map=None, col_map=None):
|
|||
array = array[row_index, :]
|
||||
array = array[:, col_index]
|
||||
return array, row_ids, col_ids
|
||||
|
||||
|
|
|
@ -37,4 +37,4 @@ def test_extract_criteo(tmp_path):
|
|||
filepath = criteo.download_criteo(size="sample", work_directory=tmp_path)
|
||||
filename = criteo.extract_criteo(size="sample", compressed_file=filepath)
|
||||
statinfo = os.stat(filename)
|
||||
assert statinfo.st_size == 24328072
|
||||
assert statinfo.st_size == 24328072
|
||||
|
|
|
@ -101,13 +101,7 @@ def test_load_pandas_df(
|
|||
[("100k", 1682, 1, "Toy Story (1995)", "Animation|Children's|Comedy", "1995")],
|
||||
)
|
||||
def test_load_item_df(
|
||||
size,
|
||||
num_movies,
|
||||
movie_example,
|
||||
title_example,
|
||||
genres_example,
|
||||
year_example,
|
||||
tmp,
|
||||
size, num_movies, movie_example, title_example, genres_example, year_example, tmp,
|
||||
):
|
||||
"""Test movielens item data load (not rating data)
|
||||
"""
|
||||
|
@ -118,7 +112,13 @@ def test_load_item_df(
|
|||
assert df["title"][0] == title_example
|
||||
|
||||
# Test title and genres
|
||||
df = load_item_df(size, local_cache_path=tmp, movie_col="item", genres_col="genres", year_col="year")
|
||||
df = load_item_df(
|
||||
size,
|
||||
local_cache_path=tmp,
|
||||
movie_col="item",
|
||||
genres_col="genres",
|
||||
year_col="year",
|
||||
)
|
||||
assert len(df) == num_movies
|
||||
# movile_col, genres_col and year_col
|
||||
assert len(df.columns) == 3
|
||||
|
@ -160,10 +160,7 @@ def test_load_spark_df(
|
|||
# Test if correct data are loaded
|
||||
header = ["1", "2", "3"]
|
||||
schema = StructType(
|
||||
[
|
||||
StructField("u", IntegerType()),
|
||||
StructField("m", IntegerType()),
|
||||
]
|
||||
[StructField("u", IntegerType()), StructField("m", IntegerType()),]
|
||||
)
|
||||
with pytest.warns(Warning):
|
||||
df = load_spark_df(
|
||||
|
|
|
@ -36,18 +36,14 @@ def test_als_pyspark_smoke(notebooks):
|
|||
|
||||
@pytest.mark.smoke
|
||||
@pytest.mark.spark
|
||||
@pytest.mark.skipif(sys.platform == 'win32', reason="Not implemented on Windows")
|
||||
@pytest.mark.skipif(sys.platform == "win32", reason="Not implemented on Windows")
|
||||
def test_mmlspark_lightgbm_criteo_smoke(notebooks):
|
||||
notebook_path = notebooks["mmlspark_lightgbm_criteo"]
|
||||
pm.execute_notebook(
|
||||
notebook_path,
|
||||
OUTPUT_NOTEBOOK,
|
||||
kernel_name=KERNEL_NAME,
|
||||
parameters=dict(
|
||||
DATA_SIZE="sample",
|
||||
NUM_ITERATIONS=50,
|
||||
EARLY_STOPPING_ROUND=10
|
||||
)
|
||||
parameters=dict(DATA_SIZE="sample", NUM_ITERATIONS=50, EARLY_STOPPING_ROUND=10),
|
||||
)
|
||||
nb = pm.read_notebook(OUTPUT_NOTEBOOK)
|
||||
results = nb.dataframe.set_index("name")["value"]
|
||||
|
|
|
@ -30,4 +30,3 @@ def test_download_path():
|
|||
with download_path(tmp_dir.name) as path:
|
||||
assert os.path.isdir(path)
|
||||
assert os.path.isdir(path)
|
||||
|
||||
|
|
|
@ -29,7 +29,7 @@ def test_clear_memory_all_gpus():
|
|||
|
||||
|
||||
@pytest.mark.gpu
|
||||
@pytest.mark.skipif(sys.platform == 'win32', reason="Not implemented on Windows")
|
||||
@pytest.mark.skipif(sys.platform == "win32", reason="Not implemented on Windows")
|
||||
def test_get_cuda_version():
|
||||
assert get_cuda_version() > "9.0.0"
|
||||
|
||||
|
|
|
@ -3,7 +3,11 @@
|
|||
|
||||
import pytest
|
||||
from reco_utils.recommender.lightfm.lightfm_utils import (
|
||||
compare_metric, track_model_metrics, similar_users, similar_items)
|
||||
compare_metric,
|
||||
track_model_metrics,
|
||||
similar_users,
|
||||
similar_items,
|
||||
)
|
||||
import itertools
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
@ -20,95 +24,124 @@ TEST_ITEM_ID = 1
|
|||
# note user and item ID need to be sequential for similar users and similar items to work
|
||||
@pytest.fixture(scope="module")
|
||||
def df():
|
||||
mock_data = {
|
||||
'userID':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
|
||||
'itemID':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
|
||||
'rating':[2.0,4.0,1.0,4.0,1.0,2.0,5.0,1.0,1.0,2.0],
|
||||
'genre':['Action|Comedy','Drama','Drama|Romance|War',
|
||||
'Drama|Sci-Fi','Horror','Action|Horror|Sci-Fi|Thriller',
|
||||
'Drama|Romance|War','Western','Comedy','Horror'],
|
||||
'occupation':['engineer','student','retired',
|
||||
'administrator','writer','administrator','student','executive','student','other']
|
||||
}
|
||||
return pd.DataFrame(mock_data)
|
||||
mock_data = {
|
||||
"userID": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
|
||||
"itemID": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
|
||||
"rating": [2.0, 4.0, 1.0, 4.0, 1.0, 2.0, 5.0, 1.0, 1.0, 2.0],
|
||||
"genre": [
|
||||
"Action|Comedy",
|
||||
"Drama",
|
||||
"Drama|Romance|War",
|
||||
"Drama|Sci-Fi",
|
||||
"Horror",
|
||||
"Action|Horror|Sci-Fi|Thriller",
|
||||
"Drama|Romance|War",
|
||||
"Western",
|
||||
"Comedy",
|
||||
"Horror",
|
||||
],
|
||||
"occupation": [
|
||||
"engineer",
|
||||
"student",
|
||||
"retired",
|
||||
"administrator",
|
||||
"writer",
|
||||
"administrator",
|
||||
"student",
|
||||
"executive",
|
||||
"student",
|
||||
"other",
|
||||
],
|
||||
}
|
||||
return pd.DataFrame(mock_data)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def interactions(df):
|
||||
movie_genre = [x.split('|') for x in df['genre']]
|
||||
all_movie_genre = sorted(list(set(itertools.chain.from_iterable(movie_genre))))
|
||||
movie_genre = [x.split("|") for x in df["genre"]]
|
||||
all_movie_genre = sorted(list(set(itertools.chain.from_iterable(movie_genre))))

all_occupations = sorted(list(set(df['occupation'])))
all_occupations = sorted(list(set(df["occupation"])))

dataset = Dataset()
dataset.fit(df['userID'],
df['itemID'],
item_features=all_movie_genre,
user_features=all_occupations)
dataset = Dataset()
dataset.fit(
df["userID"],
df["itemID"],
item_features=all_movie_genre,
user_features=all_occupations,
)

item_features = dataset.build_item_features(
(x, y) for x,y in zip(df.itemID, movie_genre))
item_features = dataset.build_item_features(
(x, y) for x, y in zip(df.itemID, movie_genre)
)

user_features = dataset.build_user_features(
(x, [y]) for x,y in zip(df.userID, df['occupation']))
user_features = dataset.build_user_features(
(x, [y]) for x, y in zip(df.userID, df["occupation"])
)

(interactions, _) = dataset.build_interactions(df.iloc[:, 0:3].values)

train_interactions, test_interactions = cross_validation.random_train_test_split(
interactions, test_percentage=TEST_PERCENTAGE,
random_state=np.random.RandomState(SEEDNO))
return train_interactions, test_interactions, item_features, user_features
train_interactions, test_interactions = cross_validation.random_train_test_split(
interactions,
test_percentage=TEST_PERCENTAGE,
random_state=np.random.RandomState(SEEDNO),
)
return train_interactions, test_interactions, item_features, user_features


@pytest.fixture(scope="module")
def model():
return LightFM(loss='warp', random_state=np.random.RandomState(SEEDNO))
return LightFM(loss="warp", random_state=np.random.RandomState(SEEDNO))


@pytest.fixture(scope="module")
def fitting(model, interactions, df):
train_interactions, test_interactions, item_features, user_features = interactions
output, fitted_model = track_model_metrics(model=model,
train_interactions=train_interactions,
test_interactions=test_interactions,
user_features = user_features,
item_features = item_features,
show_plot=False)
return output, fitted_model
train_interactions, test_interactions, item_features, user_features = interactions
output, fitted_model = track_model_metrics(
model=model,
train_interactions=train_interactions,
test_interactions=test_interactions,
user_features=user_features,
item_features=item_features,
show_plot=False,
)
return output, fitted_model


@pytest.fixture(scope="module")
def sim_users(interactions, fitting):
_, _, _, user_features = interactions
_, fitted_model = fitting
return similar_users(user_id=TEST_USER_ID, user_features=user_features,
model=fitted_model, N=5)
_, _, _, user_features = interactions
_, fitted_model = fitting
return similar_users(
user_id=TEST_USER_ID, user_features=user_features, model=fitted_model, N=5
)


@pytest.fixture(scope="module")
def sim_items(interactions, fitting):
_, _, item_features, _ = interactions
_, fitted_model = fitting
return similar_items(item_id=TEST_ITEM_ID, item_features=item_features,
model=fitted_model, N=5)
_, _, item_features, _ = interactions
_, fitted_model = fitting
return similar_items(
item_id=TEST_ITEM_ID, item_features=item_features, model=fitted_model, N=5
)


def test_interactions(interactions):
train_interactions, test_interactions, item_features, user_features = interactions
assert train_interactions.shape == (10, 10)
assert test_interactions.shape == (10, 10)
assert item_features.shape == (10, 19)
assert user_features.shape == (10, 17)


def test_fitting(fitting):
output, _ = fitting
assert output.shape == (600, 4)


def test_sim_users(sim_users):
assert sim_users.shape == (5, 2)


def test_sim_items(sim_items):
assert sim_items.shape == (5, 2)

@ -17,8 +17,10 @@ BATCH_SIZE = 32
def test_data_preprocessing(python_dataset_ncf):
train, test = python_dataset_ncf
data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST, seed=SEED)
data = Dataset(
train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST, seed=SEED
)

# shape
assert len(data.train) == len(train)
assert len(data.test) == len(test)

@ -39,7 +41,9 @@ def test_data_preprocessing(python_dataset_ncf):
def test_train_loader(python_dataset_ncf):
train, test = python_dataset_ncf
data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST, seed=SEED)
data = Dataset(
train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST, seed=SEED
)

# collect positvie user-item dict
positive_pool = {}

@ -49,7 +53,7 @@ def test_train_loader(python_dataset_ncf):
# without negative sampling
for batch in data.train_loader(batch_size=BATCH_SIZE, shuffle=False):
user, item, labels = batch
#shape
# shape
assert len(user) == BATCH_SIZE
assert len(item) == BATCH_SIZE
assert len(labels) == BATCH_SIZE

@ -58,9 +62,9 @@ def test_train_loader(python_dataset_ncf):
# right labels
for u, i, is_pos in zip(user, item, labels):
if is_pos:
assert i in positive_pool[u]
else:
assert i not in positive_pool[u]

data.negative_sampling()
label_list = []

@ -73,9 +77,9 @@ def test_train_loader(python_dataset_ncf):
# right labels
for u, i, is_pos in zip(user, item, labels):
if is_pos:
assert i in positive_pool[u]
else:
assert i not in positive_pool[u]

label_list.append(is_pos)

@ -90,7 +94,7 @@ def test_test_loader(python_dataset_ncf):
# positive user-item dict, noting that the pool is train+test
positive_pool = {}
df = train.append(test)
for u in df[DEFAULT_USER_COL].unique():
positive_pool[u] = set(df[df[DEFAULT_USER_COL] == u][DEFAULT_ITEM_COL])

for batch in data.test_loader():

@ -104,9 +108,9 @@ def test_test_loader(python_dataset_ncf):
for u, i, is_pos in zip(user, item, labels):
if is_pos:
assert i in positive_pool[u]
else:
assert i not in positive_pool[u]

label_list.append(is_pos)

@ -24,7 +24,9 @@ N_NEG_TEST = 10
"model_type, n_users, n_items", [("NeuMF", 1, 1), ("GMF", 10, 10), ("MLP", 4, 8)]
)
def test_init(model_type, n_users, n_items):
model = NCF(n_users=n_users, n_items=n_items, model_type=model_type, n_epochs=1, seed=SEED)
model = NCF(
n_users=n_users, n_items=n_items, model_type=model_type, n_epochs=1, seed=SEED
)
# model type
assert model.model_type == model_type.lower()
# number of users in dataset

@ -39,7 +41,7 @@ def test_init(model_type, n_users, n_items):
assert model.embedding_mlp_P.shape == [n_users, model.n_factors]
# dimension of mlp item embedding
assert model.embedding_mlp_Q.shape == [n_items, model.n_factors]


# TODO: more parameters


@ -52,7 +54,9 @@ def test_regular_save_load(model_type, n_users, n_items):
if os.path.exists(ckpt):
shutil.rmtree(ckpt)

model = NCF(n_users=n_users, n_items=n_items, model_type=model_type, n_epochs=1, seed=SEED)
model = NCF(
n_users=n_users, n_items=n_items, model_type=model_type, n_epochs=1, seed=SEED
)
model.save(ckpt)
if model.model_type == "neumf":
P = model.sess.run(model.embedding_gmf_P)

@ -65,7 +69,9 @@ def test_regular_save_load(model_type, n_users, n_items):
Q = model.sess.run(model.embedding_mlp_Q)

del model
model = NCF(n_users=n_users, n_items=n_items, model_type=model_type, n_epochs=1, seed=SEED)
model = NCF(
n_users=n_users, n_items=n_items, model_type=model_type, n_epochs=1, seed=SEED
)

if model.model_type == "neumf":
model.load(neumf_dir=ckpt)

@ -89,9 +95,7 @@ def test_regular_save_load(model_type, n_users, n_items):

@pytest.mark.gpu
@pytest.mark.parametrize(
"n_users, n_items", [(5, 5), (4, 8)]
)
@pytest.mark.parametrize("n_users, n_items", [(5, 5), (4, 8)])
def test_neumf_save_load(n_users, n_items):
model_type = "gmf"
ckpt_gmf = ".%s" % model_type

@ -137,31 +141,31 @@ def test_neumf_save_load(n_users, n_items):

@pytest.mark.gpu
@pytest.mark.parametrize(
"model_type", ["NeuMF", "GMF", "MLP"]
)
@pytest.mark.parametrize("model_type", ["NeuMF", "GMF", "MLP"])
def test_fit(python_dataset_ncf, model_type):
train, test = python_dataset_ncf
data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST)
model = NCF(n_users=data.n_users, n_items=data.n_items, model_type=model_type, n_epochs=1)
model = NCF(
n_users=data.n_users, n_items=data.n_items, model_type=model_type, n_epochs=1
)
model.fit(data)


@pytest.mark.gpu
@pytest.mark.parametrize(
"model_type", ["NeuMF", "GMF", "MLP"]
)
@pytest.mark.parametrize("model_type", ["NeuMF", "GMF", "MLP"])
def test_predict(python_dataset_ncf, model_type):
# test data format
train, test = python_dataset_ncf
data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST)
model = NCF(n_users=data.n_users, n_items=data.n_items, model_type=model_type, n_epochs=1)
model = NCF(
n_users=data.n_users, n_items=data.n_items, model_type=model_type, n_epochs=1
)
model.fit(data)

test_users, test_items = list(test[DEFAULT_USER_COL]), list(test[DEFAULT_ITEM_COL])

assert type(model.predict(test_users[0], test_items[0])) == float

res = model.predict(test_users, test_items, is_list=True)

assert type(res) == list

@ -15,7 +15,7 @@ from reco_utils.tuning.nni.nni_utils import (
|
|||
check_metrics_written,
|
||||
get_trials,
|
||||
NNI_STATUS_URL,
|
||||
NNI_TRIAL_JOBS_URL
|
||||
NNI_TRIAL_JOBS_URL,
|
||||
)
|
||||
|
||||
|
||||
|
@ -33,6 +33,7 @@ def mocked_status_get(url, content, error):
|
|||
assert url.startswith(NNI_STATUS_URL)
|
||||
return MockResponse(content, error)
|
||||
|
||||
|
||||
class MockResponseTrials:
|
||||
# Class that mocks requests.models.Response
|
||||
def __init__(self, content):
|
||||
|
@ -41,10 +42,11 @@ class MockResponseTrials:
|
|||
def json(self):
|
||||
return self._content
|
||||
|
||||
|
||||
def mocked_trials_get(url, content):
|
||||
assert url.startswith(NNI_TRIAL_JOBS_URL)
|
||||
return MockResponseTrials(content)
|
||||
|
||||
|
||||
|
||||
def mock_exception():
|
||||
raise Exception()
|
||||
|
@ -54,17 +56,21 @@ def mock_exception():
|
|||
def test_get_experiment_status():
|
||||
content = "some_status"
|
||||
error = ""
|
||||
with patch("requests.get", side_effect=lambda url: mocked_status_get(url, content, error)):
|
||||
with patch(
|
||||
"requests.get", side_effect=lambda url: mocked_status_get(url, content, error)
|
||||
):
|
||||
nni_status = get_experiment_status(NNI_STATUS_URL)
|
||||
assert nni_status["status"] == "some_status"
|
||||
assert nni_status["errors"] == [""]
|
||||
assert nni_status["errors"] == [""]
|
||||
|
||||
|
||||
@pytest.mark.skipif(sys.platform == "win32", reason="nni not installable on windows")
|
||||
def test_check_experiment_status_done():
|
||||
content = "DONE"
|
||||
error = ""
|
||||
with patch("requests.get", side_effect=lambda url: mocked_status_get(url, content, error)):
|
||||
with patch(
|
||||
"requests.get", side_effect=lambda url: mocked_status_get(url, content, error)
|
||||
):
|
||||
check_experiment_status(wait=0.1, max_retries=1)
|
||||
|
||||
|
||||
|
@ -72,7 +78,9 @@ def test_check_experiment_status_done():
|
|||
def test_check_experiment_status_tuner_no_more_trial():
|
||||
content = "TUNER_NO_MORE_TRIAL"
|
||||
error = ""
|
||||
with patch("requests.get", side_effect=lambda url: mocked_status_get(url, content, error)):
|
||||
with patch(
|
||||
"requests.get", side_effect=lambda url: mocked_status_get(url, content, error)
|
||||
):
|
||||
check_experiment_status(wait=0.1, max_retries=1)
|
||||
|
||||
|
||||
|
@ -81,7 +89,10 @@ def test_check_experiment_status_running():
|
|||
content = "RUNNING"
|
||||
error = ""
|
||||
with pytest.raises(TimeoutError) as excinfo:
|
||||
with patch("requests.get", side_effect=lambda url: mocked_status_get(url, content, error)):
|
||||
with patch(
|
||||
"requests.get",
|
||||
side_effect=lambda url: mocked_status_get(url, content, error),
|
||||
):
|
||||
check_experiment_status(wait=0.1, max_retries=1)
|
||||
assert "check_experiment_status() timed out" == str(excinfo.value)
|
||||
|
||||
|
@ -91,7 +102,10 @@ def test_check_experiment_status_no_more_trial():
|
|||
content = "NO_MORE_TRIAL"
|
||||
error = ""
|
||||
with pytest.raises(TimeoutError) as excinfo:
|
||||
with patch("requests.get", side_effect=lambda url: mocked_status_get(url, content, error)):
|
||||
with patch(
|
||||
"requests.get",
|
||||
side_effect=lambda url: mocked_status_get(url, content, error),
|
||||
):
|
||||
check_experiment_status(wait=0.1, max_retries=1)
|
||||
assert "check_experiment_status() timed out" == str(excinfo.value)
|
||||
|
||||
|
@ -101,9 +115,15 @@ def test_check_experiment_status_failed():
|
|||
content = "some_failed_status"
|
||||
error = "NNI_ERROR"
|
||||
with pytest.raises(RuntimeError) as excinfo:
|
||||
with patch("requests.get", side_effect=lambda url: mocked_status_get(url, content, error)):
|
||||
with patch(
|
||||
"requests.get",
|
||||
side_effect=lambda url: mocked_status_get(url, content, error),
|
||||
):
|
||||
check_experiment_status(wait=0.1, max_retries=1)
|
||||
assert "NNI experiment failed to complete with status some_failed_status - NNI_ERROR" == str(excinfo.value)
|
||||
assert (
|
||||
"NNI experiment failed to complete with status some_failed_status - NNI_ERROR"
|
||||
== str(excinfo.value)
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(sys.platform == "win32", reason="nni not installable on windows")
|
||||
|
@ -111,30 +131,35 @@ def test_check_stopped_timeout():
|
|||
content = "some_status"
|
||||
error = ""
|
||||
with pytest.raises(TimeoutError) as excinfo:
|
||||
with patch("requests.get", side_effect=lambda url: mocked_status_get(url, content, error)):
|
||||
check_stopped(wait=.1, max_retries=1)
|
||||
with patch(
|
||||
"requests.get",
|
||||
side_effect=lambda url: mocked_status_get(url, content, error),
|
||||
):
|
||||
check_stopped(wait=0.1, max_retries=1)
|
||||
assert "check_stopped() timed out" == str(excinfo.value)
|
||||
|
||||
|
||||
@pytest.mark.skipif(sys.platform == "win32", reason="nni not installable on windows")
|
||||
def test_check_stopped():
|
||||
with patch("requests.get", side_effect=mock_exception):
|
||||
check_stopped(wait=.1, max_retries=1)
|
||||
check_stopped(wait=0.1, max_retries=1)
|
||||
|
||||
|
||||
@pytest.mark.skipif(sys.platform == "win32", reason="nni not installable on windows")
|
||||
def test_check_metrics_written():
|
||||
content = [{"finalMetricData": None}, {"finalMetricData": None}]
|
||||
with patch("requests.get", side_effect=lambda url: mocked_trials_get(url, content)):
|
||||
check_metrics_written(wait=.1, max_retries=1)
|
||||
check_metrics_written(wait=0.1, max_retries=1)
|
||||
|
||||
|
||||
@pytest.mark.skipif(sys.platform == "win32", reason="nni not installable on windows")
|
||||
def test_check_metrics_written_timeout():
|
||||
content = [{"logPath": "/p"}, {"logPath": "/q"}]
|
||||
with pytest.raises(TimeoutError) as excinfo:
|
||||
with patch("requests.get", side_effect=lambda url: mocked_trials_get(url, content)):
|
||||
check_metrics_written(wait=.1, max_retries=1)
|
||||
with patch(
|
||||
"requests.get", side_effect=lambda url: mocked_trials_get(url, content)
|
||||
):
|
||||
check_metrics_written(wait=0.1, max_retries=1)
|
||||
assert "check_metrics_written() timed out" == str(excinfo.value)
|
||||
|
||||
|
||||
|
@ -142,31 +167,47 @@ def test_check_metrics_written_timeout():
|
|||
def test_get_trials():
|
||||
with TemporaryDirectory() as tmp_dir1, TemporaryDirectory() as tmp_dir2:
|
||||
mock_trials = [
|
||||
{"finalMetricData": [{"data": '{"rmse":0.8,"default":0.3}'}],
|
||||
"logPath": "file://localhost:{}".format(tmp_dir1)},
|
||||
{"finalMetricData": [{"data": '{"rmse":0.9,"default":0.2}'}],
|
||||
"logPath": "file://localhost:{}".format(tmp_dir2)},
|
||||
{
|
||||
"finalMetricData": [{"data": '{"rmse":0.8,"default":0.3}'}],
|
||||
"logPath": "file://localhost:{}".format(tmp_dir1),
|
||||
},
|
||||
{
|
||||
"finalMetricData": [{"data": '{"rmse":0.9,"default":0.2}'}],
|
||||
"logPath": "file://localhost:{}".format(tmp_dir2),
|
||||
},
|
||||
]
|
||||
metrics1 = {"rmse": 0.8, "precision_at_k": 0.3}
|
||||
with open(os.path.join(tmp_dir1, "metrics.json"), "w") as f:
|
||||
json.dump(metrics1, f)
|
||||
params1 = {"parameter_id": 1, "parameter_source": "algorithm",
|
||||
"parameters": {"n_factors": 100, "reg": 0.1}}
|
||||
params1 = {
|
||||
"parameter_id": 1,
|
||||
"parameter_source": "algorithm",
|
||||
"parameters": {"n_factors": 100, "reg": 0.1},
|
||||
}
|
||||
with open(os.path.join(tmp_dir1, "parameter.cfg"), "w") as f:
|
||||
json.dump(params1, f)
|
||||
metrics2 = {"rmse": 0.9, "precision_at_k": 0.2}
|
||||
with open(os.path.join(tmp_dir2, "metrics.json"), "w") as f:
|
||||
json.dump(metrics2, f)
|
||||
params2 = {"parameter_id": 2, "parameter_source": "algorithm",
|
||||
"parameters": {"n_factors": 50, "reg": 0.02}}
|
||||
params2 = {
|
||||
"parameter_id": 2,
|
||||
"parameter_source": "algorithm",
|
||||
"parameters": {"n_factors": 50, "reg": 0.02},
|
||||
}
|
||||
with open(os.path.join(tmp_dir2, "parameter.cfg"), "w") as f:
|
||||
json.dump(params2, f)
|
||||
|
||||
with patch("requests.get", side_effect=lambda url: mocked_trials_get(url, mock_trials)):
|
||||
trials, best_metrics, best_params, best_trial_path = get_trials(optimize_mode="maximize")
|
||||
with patch(
|
||||
"requests.get", side_effect=lambda url: mocked_trials_get(url, mock_trials)
|
||||
):
|
||||
trials, best_metrics, best_params, best_trial_path = get_trials(
|
||||
optimize_mode="maximize"
|
||||
)
|
||||
|
||||
expected_trials = [({"rmse": 0.8, "default": 0.3}, tmp_dir1),
|
||||
({"rmse": 0.9, "default": 0.2}, tmp_dir2)]
|
||||
expected_trials = [
|
||||
({"rmse": 0.8, "default": 0.3}, tmp_dir1),
|
||||
({"rmse": 0.9, "default": 0.2}, tmp_dir2),
|
||||
]
|
||||
assert trials == expected_trials
|
||||
assert best_metrics == metrics1
|
||||
assert best_params == params1
|
||||
|
|
|
@ -17,9 +17,7 @@ def test_is_jupyter():
# Test on Jupyter notebook
path = os.path.join("tests", "unit", "test_notebook_utils.ipynb")
pm.execute_notebook(
path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME,
)
nb = pm.read_notebook(OUTPUT_NOTEBOOK)
df = nb.dataframe

@ -28,6 +26,7 @@ def test_is_jupyter():
result_is_databricks = df.loc[df["name"] == "is_databricks", "value"].values[0]
assert result_is_databricks is False


# @pytest.mark.notebooks
# def test_is_databricks():
# TODO Currently, we cannot pytest modules on Databricks

@ -80,37 +80,31 @@ def test_wide_deep(notebooks, tmp):
model_dir = os.path.join(tmp, "wide_deep_0")
os.mkdir(model_dir)
params = {
'MOVIELENS_DATA_SIZE': '100k',
'STEPS': 1,
'EVALUATE_WHILE_TRAINING': False,
'MODEL_DIR': model_dir,
'EXPORT_DIR_BASE': model_dir,
'RATING_METRICS': ['rmse'],
'RANKING_METRICS': ['ndcg_at_k'],
"MOVIELENS_DATA_SIZE": "100k",
"STEPS": 1,
"EVALUATE_WHILE_TRAINING": False,
"MODEL_DIR": model_dir,
"EXPORT_DIR_BASE": model_dir,
"RATING_METRICS": ["rmse"],
"RANKING_METRICS": ["ndcg_at_k"],
}
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
parameters=params,
notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params,
)

# Test with different parameters
model_dir = os.path.join(tmp, "wide_deep_1")
os.mkdir(model_dir)
params = {
'MOVIELENS_DATA_SIZE': '100k',
'STEPS': 1,
'ITEM_FEAT_COL': None,
'EVALUATE_WHILE_TRAINING': True,
'MODEL_DIR': model_dir,
'EXPORT_DIR_BASE': model_dir,
'RATING_METRICS': ['rsquared'],
'RANKING_METRICS': ['map_at_k'],
"MOVIELENS_DATA_SIZE": "100k",
"STEPS": 1,
"ITEM_FEAT_COL": None,
"EVALUATE_WHILE_TRAINING": True,
"MODEL_DIR": model_dir,
"EXPORT_DIR_BASE": model_dir,
"RATING_METRICS": ["rsquared"],
"RANKING_METRICS": ["map_at_k"],
}
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
parameters=params,
notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params,
)

@ -20,15 +20,13 @@ from reco_utils.dataset.pandas_df_utils import (
|
|||
@pytest.fixture(scope="module")
|
||||
def user_item_dataset():
|
||||
"""Get users and items dataframe"""
|
||||
user_df = pd.DataFrame({
|
||||
'user_id': [1, 2, 3, 4, 5],
|
||||
'user_age': [23, 24, 25, 26, 27]
|
||||
})
|
||||
user_df = pd.DataFrame(
|
||||
{"user_id": [1, 2, 3, 4, 5], "user_age": [23, 24, 25, 26, 27]}
|
||||
)
|
||||
|
||||
item_df = pd.DataFrame({
|
||||
'item_id': [6, 7, 8],
|
||||
'item_feat': [[0.1, 0.1], [0.2, 0.2], [0.3, 0.3]]
|
||||
})
|
||||
item_df = pd.DataFrame(
|
||||
{"item_id": [6, 7, 8], "item_feat": [[0.1, 0.1], [0.2, 0.2], [0.3, 0.3]]}
|
||||
)
|
||||
|
||||
return user_df, item_df
|
||||
|
||||
|
@ -39,14 +37,15 @@ def test_user_item_pairs(user_item_dataset):
|
|||
user_item = user_item_pairs(
|
||||
user_df=user_df,
|
||||
item_df=item_df,
|
||||
user_col='user_id',
|
||||
item_col='item_id',
|
||||
shuffle=False
|
||||
user_col="user_id",
|
||||
item_col="item_id",
|
||||
shuffle=False,
|
||||
)
|
||||
# Validate cross-join
|
||||
assert len(user_df) * len(item_df) == len(user_item)
|
||||
assert user_item.loc[(user_item['user_id'] == 3) & (user_item['item_id'] == 7)].values.tolist()[0]\
|
||||
== [3, 25, 7, [0.2, 0.2]]
|
||||
assert user_item.loc[
|
||||
(user_item["user_id"] == 3) & (user_item["item_id"] == 7)
|
||||
].values.tolist()[0] == [3, 25, 7, [0.2, 0.2]]
|
||||
|
||||
# Check if result is deterministic
|
||||
assert user_item.iloc[0].values.tolist() == [1, 23, 6, [0.1, 0.1]]
|
||||
|
@ -55,124 +54,153 @@ def test_user_item_pairs(user_item_dataset):
|
|||
user_item_shuffled = user_item_pairs(
|
||||
user_df=user_df,
|
||||
item_df=item_df,
|
||||
user_col='user_id',
|
||||
item_col='item_id',
|
||||
shuffle=True
|
||||
user_col="user_id",
|
||||
item_col="item_id",
|
||||
shuffle=True,
|
||||
)
|
||||
# Check shuffled result is still valid
|
||||
assert len(user_df) * len(item_df) == len(user_item_shuffled)
|
||||
row = user_item.loc[(user_item['user_id'] == 2) & (user_item['item_id'] == 6)]
|
||||
assert row['user_age'].iloc[0] == 24
|
||||
assert row['item_feat'].iloc[0] == [0.1, 0.1]
|
||||
row = user_item.loc[(user_item["user_id"] == 2) & (user_item["item_id"] == 6)]
|
||||
assert row["user_age"].iloc[0] == 24
|
||||
assert row["item_feat"].iloc[0] == [0.1, 0.1]
|
||||
# Check shuffled result is different from not-shuffled dataframe
|
||||
assert [*user_item_shuffled['user_id'].values] != [*user_item['user_id'].values]
|
||||
assert [*user_item_shuffled["user_id"].values] != [*user_item["user_id"].values]
|
||||
|
||||
# Check filter
|
||||
seen_df = pd.DataFrame({
|
||||
'user_id': [1, 9, 3, 5, 5, 1],
|
||||
'item_id': [1, 6, 7, 6, 8, 9]
|
||||
})
|
||||
seen_df = pd.DataFrame(
|
||||
{"user_id": [1, 9, 3, 5, 5, 1], "item_id": [1, 6, 7, 6, 8, 9]}
|
||||
)
|
||||
user_item_filtered = user_item_pairs(
|
||||
user_df=user_df,
|
||||
item_df=item_df,
|
||||
user_col='user_id',
|
||||
item_col='item_id',
|
||||
user_col="user_id",
|
||||
item_col="item_id",
|
||||
user_item_filter_df=seen_df,
|
||||
shuffle=False
|
||||
shuffle=False,
|
||||
)
|
||||
# Check filtered out number
|
||||
assert len(user_item_filtered) == len(user_item) - 3
|
||||
# Check filtered out record
|
||||
assert len(user_item_filtered.loc[(user_item['user_id'] == 3) & (user_item['item_id'] == 7)]) == 0
|
||||
assert (
|
||||
len(
|
||||
user_item_filtered.loc[
|
||||
(user_item["user_id"] == 3) & (user_item["item_id"] == 7)
|
||||
]
|
||||
)
|
||||
== 0
|
||||
)
|
||||
|
||||
|
||||
def test_filter_by():
|
||||
user_df = pd.DataFrame({
|
||||
'user_id': [1, 9, 3, 5, 5, 1],
|
||||
'item_id': [1, 6, 7, 6, 8, 9]
|
||||
})
|
||||
user_df = pd.DataFrame(
|
||||
{"user_id": [1, 9, 3, 5, 5, 1], "item_id": [1, 6, 7, 6, 8, 9]}
|
||||
)
|
||||
|
||||
seen_df = pd.DataFrame({
|
||||
'user_id': [1, 2, 4],
|
||||
})
|
||||
seen_df = pd.DataFrame({"user_id": [1, 2, 4],})
|
||||
|
||||
filtered_df = filter_by(user_df, seen_df, ['user_id'])
|
||||
filtered_df = filter_by(user_df, seen_df, ["user_id"])
|
||||
|
||||
# Check filtered out number
|
||||
assert len(filtered_df) == len(user_df) - 2
|
||||
# Check filtered out record
|
||||
assert len(filtered_df.loc[(user_df['user_id'] == 1)]) == 0
|
||||
assert len(filtered_df.loc[(user_df["user_id"] == 1)]) == 0
|
||||
|
||||
|
||||
def test_csv_to_libffm():
|
||||
df_feature = pd.DataFrame({
|
||||
'rating': [1, 0, 0, 1, 1],
|
||||
'field1': ['xxx1', 'xxx2', 'xxx4', 'xxx4', 'xxx4'],
|
||||
'field2': [3, 4, 5, 6, 7],
|
||||
'field3': [1.0, 2.0, 3.0, 4.0, 5.0],
|
||||
'field4': ['1', '2', '3', '4', '5']
|
||||
})
|
||||
df_feature = pd.DataFrame(
|
||||
{
|
||||
"rating": [1, 0, 0, 1, 1],
|
||||
"field1": ["xxx1", "xxx2", "xxx4", "xxx4", "xxx4"],
|
||||
"field2": [3, 4, 5, 6, 7],
|
||||
"field3": [1.0, 2.0, 3.0, 4.0, 5.0],
|
||||
"field4": ["1", "2", "3", "4", "5"],
|
||||
}
|
||||
)
|
||||
|
||||
with TemporaryDirectory() as td:
|
||||
filepath = os.path.join(td, "test")
|
||||
|
||||
converter = LibffmConverter(filepath=filepath).fit(df_feature)
|
||||
df_feature_libffm = converter.transform(df_feature)
|
||||
|
||||
|
||||
# Check the input column types. For example, a bool type is not allowed.
|
||||
df_feature_wrong_type = df_feature.copy()
|
||||
df_feature_wrong_type['field4'] = True
|
||||
df_feature_wrong_type["field4"] = True
|
||||
with pytest.raises(TypeError) as e:
|
||||
LibffmConverter().fit(df_feature_wrong_type)
|
||||
assert e.value == "Input columns should be only object and/or numeric types."
|
||||
assert (
|
||||
e.value == "Input columns should be only object and/or numeric types."
|
||||
)
|
||||
|
||||
# Check if the dim is the same.
|
||||
assert df_feature_libffm.shape == df_feature.shape
|
||||
|
||||
# Check if the columns are converted successfully.
|
||||
assert df_feature_libffm.iloc[0, :].values.tolist() == [1, '1:1:1', '2:4:3', '3:5:1.0', '4:6:1']
|
||||
assert df_feature_libffm.iloc[0, :].values.tolist() == [
|
||||
1,
|
||||
"1:1:1",
|
||||
"2:4:3",
|
||||
"3:5:1.0",
|
||||
"4:6:1",
|
||||
]
|
||||
|
||||
# Check if the duplicated column entries are indexed correctly.
|
||||
# It should skip counting the duplicated features in a field column.
|
||||
assert df_feature_libffm.iloc[-1, :].values.tolist() == [1, '1:3:1', '2:4:7', '3:5:5.0', '4:10:1']
|
||||
assert df_feature_libffm.iloc[-1, :].values.tolist() == [
|
||||
1,
|
||||
"1:3:1",
|
||||
"2:4:7",
|
||||
"3:5:5.0",
|
||||
"4:10:1",
|
||||
]
|
||||
|
||||
# Check if the file is written successfully.
|
||||
assert os.path.isfile(filepath)
|
||||
|
||||
with open(filepath, 'r') as f:
|
||||
with open(filepath, "r") as f:
|
||||
line = f.readline()
|
||||
assert line == '1 1:1:1 2:4:3 3:5:1.0 4:6:1\n'
|
||||
assert line == "1 1:1:1 2:4:3 3:5:1.0 4:6:1\n"
|
||||
|
||||
# Parameters in the transformation should be reported correctly.
|
||||
params = converter.get_params()
|
||||
assert params == {
|
||||
'field count': 4,
|
||||
'feature count': 10,
|
||||
'file path': filepath
|
||||
}
|
||||
assert params == {"field count": 4, "feature count": 10, "file path": filepath}
|
||||
|
||||
# Dataset with the same columns should be transformable with a fitted converter.
|
||||
df_feature_new = pd.DataFrame({
|
||||
'rating': [1, 0, 0, 1, 1, 1],
|
||||
'field1': ['xxx1', 'xxx2', 'xxx4', 'xxx4', 'xxx4', 'xxx3'],
|
||||
'field2': [3, 4, 5, 6, 7, 8],
|
||||
'field3': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
|
||||
'field4': ['1', '2', '3', '4', '5', '6']
|
||||
})
|
||||
df_feature_new = pd.DataFrame(
|
||||
{
|
||||
"rating": [1, 0, 0, 1, 1, 1],
|
||||
"field1": ["xxx1", "xxx2", "xxx4", "xxx4", "xxx4", "xxx3"],
|
||||
"field2": [3, 4, 5, 6, 7, 8],
|
||||
"field3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
|
||||
"field4": ["1", "2", "3", "4", "5", "6"],
|
||||
}
|
||||
)
|
||||
df_feature_new_libffm = converter.transform(df_feature_new)
|
||||
|
||||
assert df_feature_new_libffm.iloc[0, :].values.tolist() == [1, '1:1:1', '2:5:3', '3:6:1.0', '4:7:1']
|
||||
assert df_feature_new_libffm.iloc[-1, :].values.tolist() == [1, '1:4:1', '2:5:8', '3:6:6.0', '4:12:1']
|
||||
assert df_feature_new_libffm.iloc[0, :].values.tolist() == [
|
||||
1,
|
||||
"1:1:1",
|
||||
"2:5:3",
|
||||
"3:6:1.0",
|
||||
"4:7:1",
|
||||
]
|
||||
assert df_feature_new_libffm.iloc[-1, :].values.tolist() == [
|
||||
1,
|
||||
"1:4:1",
|
||||
"2:5:8",
|
||||
"3:6:6.0",
|
||||
"4:12:1",
|
||||
]
|
||||
|
||||
|
||||
def test_has_columns():
|
||||
df_1 = pd.DataFrame(dict(a=[1, 2, 3]))
|
||||
df_2 = pd.DataFrame(dict(b=[7, 8, 9], a=[1, 2, 3]))
|
||||
|
||||
assert has_columns(df_1, ['a'])
|
||||
assert has_columns(df_2, ['a'])
|
||||
assert has_columns(df_2, ['a', 'b'])
|
||||
assert not has_columns(df_2, ['a', 'b', 'c'])
|
||||
assert has_columns(df_1, ["a"])
|
||||
assert has_columns(df_2, ["a"])
|
||||
assert has_columns(df_2, ["a", "b"])
|
||||
assert not has_columns(df_2, ["a", "b", "c"])
|
||||
|
||||
|
||||
def test_has_same_base_dtype():
|
||||
|
@ -180,7 +208,7 @@ def test_has_same_base_dtype():
|
|||
arr_int64 = np.array([1, 2, 3], dtype=np.int64)
|
||||
arr_float32 = np.array([1, 2, 3], dtype=np.float32)
|
||||
arr_float64 = np.array([1, 2, 3], dtype=np.float64)
|
||||
arr_str = ['a', 'b', 'c']
|
||||
arr_str = ["a", "b", "c"]
|
||||
|
||||
df_1 = pd.DataFrame(dict(a=arr_int32, b=arr_int64))
|
||||
df_2 = pd.DataFrame(dict(a=arr_int64, b=arr_int32))
|
||||
|
@ -192,42 +220,60 @@ def test_has_same_base_dtype():
|
|||
# all columns match
|
||||
assert has_same_base_dtype(df_1, df_2)
|
||||
# specific column matches
|
||||
assert has_same_base_dtype(df_3, df_4, columns=['a'])
|
||||
assert has_same_base_dtype(df_3, df_4, columns=["a"])
|
||||
# some column types do not match
|
||||
assert not has_same_base_dtype(df_3, df_4)
|
||||
# column types do not match
|
||||
assert not has_same_base_dtype(df_1, df_3, columns=['a'])
|
||||
assert not has_same_base_dtype(df_1, df_3, columns=["a"])
|
||||
# all columns are not shared
|
||||
assert not has_same_base_dtype(df_4, df_5)
|
||||
# column types do not match
|
||||
assert not has_same_base_dtype(df_5, df_6, columns=['a'])
|
||||
assert not has_same_base_dtype(df_5, df_6, columns=["a"])
|
||||
# assert string columns match
|
||||
assert has_same_base_dtype(df_6, df_6)
|
||||
|
||||
|
||||
def test_lru_cache_df():
|
||||
df1 = pd.DataFrame(dict(a=[1, 2, 3], b=['a', 'b', 'c']))
|
||||
df2 = pd.DataFrame(dict(a=[1, 2, 3], c=['a', 'b', 'c']))
|
||||
df3 = pd.DataFrame(dict(a=[1, 2, 3], b=['a', 'b', 'd']))
|
||||
df1 = pd.DataFrame(dict(a=[1, 2, 3], b=["a", "b", "c"]))
|
||||
df2 = pd.DataFrame(dict(a=[1, 2, 3], c=["a", "b", "c"]))
|
||||
df3 = pd.DataFrame(dict(a=[1, 2, 3], b=["a", "b", "d"]))
|
||||
|
||||
@lru_cache_df(maxsize=2)
|
||||
def cached_func(df):
|
||||
pass
|
||||
|
||||
assert 'CacheInfo(hits=0, misses=0, maxsize=2, currsize=0)' == str(cached_func.cache_info())
|
||||
assert "CacheInfo(hits=0, misses=0, maxsize=2, currsize=0)" == str(
|
||||
cached_func.cache_info()
|
||||
)
|
||||
cached_func(df1)
|
||||
assert 'CacheInfo(hits=0, misses=1, maxsize=2, currsize=1)' == str(cached_func.cache_info())
|
||||
assert "CacheInfo(hits=0, misses=1, maxsize=2, currsize=1)" == str(
|
||||
cached_func.cache_info()
|
||||
)
|
||||
cached_func(df1)
|
||||
assert 'CacheInfo(hits=1, misses=1, maxsize=2, currsize=1)' == str(cached_func.cache_info())
|
||||
assert "CacheInfo(hits=1, misses=1, maxsize=2, currsize=1)" == str(
|
||||
cached_func.cache_info()
|
||||
)
|
||||
cached_func(df2)
|
||||
assert 'CacheInfo(hits=1, misses=2, maxsize=2, currsize=2)' == str(cached_func.cache_info())
|
||||
assert "CacheInfo(hits=1, misses=2, maxsize=2, currsize=2)" == str(
|
||||
cached_func.cache_info()
|
||||
)
|
||||
cached_func(df2)
|
||||
assert 'CacheInfo(hits=2, misses=2, maxsize=2, currsize=2)' == str(cached_func.cache_info())
|
||||
assert "CacheInfo(hits=2, misses=2, maxsize=2, currsize=2)" == str(
|
||||
cached_func.cache_info()
|
||||
)
|
||||
cached_func(df3)
|
||||
assert 'CacheInfo(hits=2, misses=3, maxsize=2, currsize=2)' == str(cached_func.cache_info())
|
||||
assert "CacheInfo(hits=2, misses=3, maxsize=2, currsize=2)" == str(
|
||||
cached_func.cache_info()
|
||||
)
|
||||
cached_func(df1)
|
||||
assert 'CacheInfo(hits=2, misses=4, maxsize=2, currsize=2)' == str(cached_func.cache_info())
|
||||
assert "CacheInfo(hits=2, misses=4, maxsize=2, currsize=2)" == str(
|
||||
cached_func.cache_info()
|
||||
)
|
||||
cached_func(df3)
|
||||
assert 'CacheInfo(hits=3, misses=4, maxsize=2, currsize=2)' == str(cached_func.cache_info())
|
||||
assert "CacheInfo(hits=3, misses=4, maxsize=2, currsize=2)" == str(
|
||||
cached_func.cache_info()
|
||||
)
|
||||
cached_func.cache_clear()
|
||||
assert 'CacheInfo(hits=0, misses=0, maxsize=2, currsize=0)' == str(cached_func.cache_info())
|
||||
assert "CacheInfo(hits=0, misses=0, maxsize=2, currsize=0)" == str(
|
||||
cached_func.cache_info()
|
||||
)
|
||||
|
|
|
@ -11,15 +11,13 @@ def test_line_graph():
x_guides=[0, 1],
x_name="Epoch",
y_name="Accuracy",
legend_loc='best'
legend_loc="best",
)
plt.close()

# Single graph as a subplot
line_graph(
values=[1, 2, 3],
labels="Train",
subplot=(1, 1, 1),
values=[1, 2, 3], labels="Train", subplot=(1, 1, 1),
)
plt.close()

@ -393,14 +393,22 @@ def test_python_errors(rating_true, rating_pred):
rmse(rating_true, rating_true, col_user="not_user")

with pytest.raises(ValueError):
mae(rating_pred, rating_pred, col_rating=DEFAULT_PREDICTION_COL, col_user="not_user")
mae(
rating_pred,
rating_pred,
col_rating=DEFAULT_PREDICTION_COL,
col_user="not_user",
)

with pytest.raises(ValueError):
rsquared(rating_true, rating_pred, col_item="not_item")

with pytest.raises(ValueError):
exp_var(
rating_pred, rating_pred, col_rating=DEFAULT_PREDICTION_COL, col_item="not_item"
rating_pred,
rating_pred,
col_rating=DEFAULT_PREDICTION_COL,
col_item="not_item",
)

with pytest.raises(ValueError):

@ -414,5 +422,8 @@ def test_python_errors(rating_true, rating_pred):

with pytest.raises(ValueError):
map_at_k(
rating_pred, rating_pred, col_rating=DEFAULT_PREDICTION_COL, col_user="not_user"
rating_pred,
rating_pred,
col_rating=DEFAULT_PREDICTION_COL,
col_user="not_user",
)

@ -119,7 +119,7 @@ def test_min_rating_filter():
{
DEFAULT_USER_COL: [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5],
DEFAULT_ITEM_COL: [5, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1],
DEFAULT_RATING_COL: np.random.randint(1, 6, 15)
DEFAULT_RATING_COL: np.random.randint(1, 6, 15),
}
)

@ -198,10 +198,11 @@ def test_random_splitter(test_specs, python_dataset):

# check values sum to 1
splits = python_random_split(
python_dataset, ratio=[.7, .2, .1], seed=test_specs["seed"]
python_dataset, ratio=[0.7, 0.2, 0.1], seed=test_specs["seed"]
)

assert(len(splits)) == 3
assert (len(splits)) == 3


def test_chrono_splitter(test_specs, python_dataset):
splits = python_chrono_split(

@ -435,4 +436,3 @@ def test_float_numpy_stratified_splitter(test_specs, python_float_dataset):
assert Xtst_rated / X_rated == pytest.approx(
(1 - test_specs["ratio"]), rel=test_specs["fluctuation"]
)

@ -17,37 +17,43 @@ TOL = 0.0001
|
|||
|
||||
@pytest.fixture
|
||||
def target_matrices(scope="module"):
|
||||
J1 = np.array([[1.0, 0.0, 0.5],
|
||||
[0.0, 1.0, 0.33333],
|
||||
[0.5, 0.33333, 1.0]])
|
||||
J2 = np.array([[1.0, 0.0, 0.0, 0.2],
|
||||
[0.0, 1.0, 0.0, 0.0],
|
||||
[0.0, 0.0, 1.0, 0.5],
|
||||
[0.2, 0.0, 0.5, 1.0]])
|
||||
L1 = np.array([[1.0, 0.0, 0.5],
|
||||
[0.0, 0.5, 0.25],
|
||||
[0.5, 0.25, 0.5]])
|
||||
L2 = np.array([[0.5, 0.0, 0.0, 0.125],
|
||||
[0.0, 0.33333, 0.0, 0.0],
|
||||
[0.0, 0.0, 0.5, 0.25],
|
||||
[0.125, 0.0, 0.25, 0.25]])
|
||||
J1 = np.array([[1.0, 0.0, 0.5], [0.0, 1.0, 0.33333], [0.5, 0.33333, 1.0]])
|
||||
J2 = np.array(
|
||||
[
|
||||
[1.0, 0.0, 0.0, 0.2],
|
||||
[0.0, 1.0, 0.0, 0.0],
|
||||
[0.0, 0.0, 1.0, 0.5],
|
||||
[0.2, 0.0, 0.5, 1.0],
|
||||
]
|
||||
)
|
||||
L1 = np.array([[1.0, 0.0, 0.5], [0.0, 0.5, 0.25], [0.5, 0.25, 0.5]])
|
||||
L2 = np.array(
|
||||
[
|
||||
[0.5, 0.0, 0.0, 0.125],
|
||||
[0.0, 0.33333, 0.0, 0.0],
|
||||
[0.0, 0.0, 0.5, 0.25],
|
||||
[0.125, 0.0, 0.25, 0.25],
|
||||
]
|
||||
)
|
||||
return {
|
||||
"jaccard1": pytest.approx(J1, TOL),
|
||||
"jaccard2": pytest.approx(J2, TOL),
|
||||
"lift1": pytest.approx(L1, TOL),
|
||||
"lift2": pytest.approx(L2, TOL)
|
||||
"lift2": pytest.approx(L2, TOL),
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def python_data():
|
||||
cooccurrence1 = np.array([[1.0, 0.0, 1.0],
|
||||
[0.0, 2.0, 1.0],
|
||||
[1.0, 1.0, 2.0]])
|
||||
cooccurrence2 = np.array([[2.0, 0.0, 0.0, 1.0],
|
||||
[0.0, 3.0, 0.0, 0.0],
|
||||
[0.0, 0.0, 2.0, 2.0],
|
||||
[1.0, 0.0, 2.0, 4.0]])
|
||||
cooccurrence1 = np.array([[1.0, 0.0, 1.0], [0.0, 2.0, 1.0], [1.0, 1.0, 2.0]])
|
||||
cooccurrence2 = np.array(
|
||||
[
|
||||
[2.0, 0.0, 0.0, 1.0],
|
||||
[0.0, 3.0, 0.0, 0.0],
|
||||
[0.0, 0.0, 2.0, 2.0],
|
||||
[1.0, 0.0, 2.0, 4.0],
|
||||
]
|
||||
)
|
||||
return cooccurrence1, cooccurrence2
|
||||
|
||||
|
||||
|
@ -75,14 +81,16 @@ def test_python_lift(python_data, target_matrices):
|
|||
|
||||
def test_exponential_decay():
|
||||
values = np.array([1, 2, 3, 4, 5, 6])
|
||||
expected = np.array([0.25, 0.35355339, 0.5, 0.70710678, 1., 1.])
|
||||
expected = np.array([0.25, 0.35355339, 0.5, 0.70710678, 1.0, 1.0])
|
||||
actual = exponential_decay(value=values, max_val=5, half_life=2)
|
||||
assert np.allclose(actual, expected, atol=TOL)
|
||||
|
||||
|
||||
def test_get_top_k_scored_items():
|
||||
scores = np.array([[1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [1, 5, 3, 4, 2]])
|
||||
top_items, top_scores = get_top_k_scored_items(scores=scores, top_k=3, sort_top_k=True)
|
||||
top_items, top_scores = get_top_k_scored_items(
|
||||
scores=scores, top_k=3, sort_top_k=True
|
||||
)
|
||||
|
||||
assert np.array_equal(top_items, np.array([[4, 3, 2], [0, 1, 2], [1, 3, 2]]))
|
||||
assert np.array_equal(top_scores, np.array([[5, 4, 3], [5, 4, 3], [5, 4, 3]]))
|
||||
|
|
|
@ -274,30 +274,42 @@ def test_get_popularity_based_topk(header):
|
|||
|
||||
|
||||
def test_get_normalized_scores(header):
|
||||
train = pd.DataFrame({header["col_user"]: [1, 1, 1, 1, 2, 2, 2, 2],
|
||||
header["col_item"]: [1, 2, 3, 4, 1, 5, 6, 7],
|
||||
header["col_rating"]: [3., 4., 5., 4., 3., 2., 1., 5.],
|
||||
header["col_timestamp"]: [1, 20, 30, 400, 50, 60, 70, 800]})
|
||||
test = pd.DataFrame({header["col_user"]: [1, 1, 1, 2, 2, 2],
|
||||
header["col_item"]: [5, 6, 7, 2, 3, 4],
|
||||
header["col_rating"]: [2., 1., 5., 3., 4., 5.]})
|
||||
train = pd.DataFrame(
|
||||
{
|
||||
header["col_user"]: [1, 1, 1, 1, 2, 2, 2, 2],
|
||||
header["col_item"]: [1, 2, 3, 4, 1, 5, 6, 7],
|
||||
header["col_rating"]: [3.0, 4.0, 5.0, 4.0, 3.0, 2.0, 1.0, 5.0],
|
||||
header["col_timestamp"]: [1, 20, 30, 400, 50, 60, 70, 800],
|
||||
}
|
||||
)
|
||||
test = pd.DataFrame(
|
||||
{
|
||||
header["col_user"]: [1, 1, 1, 2, 2, 2],
|
||||
header["col_item"]: [5, 6, 7, 2, 3, 4],
|
||||
header["col_rating"]: [2.0, 1.0, 5.0, 3.0, 4.0, 5.0],
|
||||
}
|
||||
)
|
||||
|
||||
model = SARSingleNode(**header, timedecay_formula=True, normalize=True)
|
||||
model.fit(train)
|
||||
actual = model.score(test, remove_seen=True, normalize=True)
|
||||
expected = np.array([
|
||||
[-np.inf, -np.inf, -np.inf, -np.inf, 3., 3., 3.],
|
||||
[-np.inf, 3., 3., 3., -np.inf, -np.inf, -np.inf],
|
||||
])
|
||||
expected = np.array(
|
||||
[
|
||||
[-np.inf, -np.inf, -np.inf, -np.inf, 3.0, 3.0, 3.0],
|
||||
[-np.inf, 3.0, 3.0, 3.0, -np.inf, -np.inf, -np.inf],
|
||||
]
|
||||
)
|
||||
assert actual.shape == (2, 7)
|
||||
assert isinstance(actual, np.ndarray)
|
||||
assert np.isclose(expected, actual).all()
|
||||
|
||||
actual = model.score(test, normalize=True)
|
||||
expected = np.array([
|
||||
[3.80000633, 4.14285448, 4.14285448, 4.14285448, 3., 3., 3.],
|
||||
[2.8000859, 3., 3., 3., 2.71441353, 2.71441353, 2.71441353]
|
||||
])
|
||||
expected = np.array(
|
||||
[
|
||||
[3.80000633, 4.14285448, 4.14285448, 4.14285448, 3.0, 3.0, 3.0],
|
||||
[2.8000859, 3.0, 3.0, 3.0, 2.71441353, 2.71441353, 2.71441353],
|
||||
]
|
||||
)
|
||||
|
||||
assert actual.shape == (2, 7)
|
||||
assert isinstance(actual, np.ndarray)
|
||||
|
|
|
@ -29,16 +29,23 @@ from reco_utils.common.constants import (
|
|||
)
|
||||
from reco_utils.evaluation.python_evaluation import rmse
|
||||
|
||||
ITEM_FEAT_COL = 'itemFeat'
|
||||
ITEM_FEAT_COL = "itemFeat"
|
||||
|
||||
|
||||
@pytest.fixture(scope='module')
|
||||
@pytest.fixture(scope="module")
|
||||
def pd_df():
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
DEFAULT_USER_COL: [1, 1, 1, 2, 2, 2],
|
||||
DEFAULT_ITEM_COL: [1, 2, 3, 1, 4, 5],
|
||||
ITEM_FEAT_COL: [[1, 1, 1], [2, 2, 2], [3, 3, 3], [1, 1, 1], [4, 4, 4], [5, 5, 5]],
|
||||
ITEM_FEAT_COL: [
|
||||
[1, 1, 1],
|
||||
[2, 2, 2],
|
||||
[3, 3, 3],
|
||||
[1, 1, 1],
|
||||
[4, 4, 4],
|
||||
[5, 5, 5],
|
||||
],
|
||||
DEFAULT_RATING_COL: [5, 4, 3, 5, 5, 3],
|
||||
}
|
||||
)
|
||||
|
@ -56,10 +63,10 @@ def test_pandas_input_fn(pd_df):
|
|||
batch = dataset.make_one_shot_iterator().get_next()
|
||||
with tf.Session() as sess:
|
||||
features = sess.run(batch)
|
||||
|
||||
|
||||
# check the input function returns all the columns
|
||||
assert len(features) == len(df.columns)
|
||||
|
||||
|
||||
for k, v in features.items():
|
||||
assert k in df.columns.values
|
||||
# check if a list feature column converted correctly
|
||||
|
@ -67,7 +74,7 @@ def test_pandas_input_fn(pd_df):
|
|||
assert np.array_equal(v, df[k].values)
|
||||
elif len(v.shape) == 2:
|
||||
assert v.shape[1] == len(df[k][0])
|
||||
|
||||
|
||||
# check dataset with shuffles
|
||||
dataset = pandas_input_fn(df, shuffle=True, seed=SEED)()
|
||||
batch = dataset.make_one_shot_iterator().get_next()
|
||||
|
@ -76,7 +83,7 @@ def test_pandas_input_fn(pd_df):
|
|||
print(features)
|
||||
# check the input function returns all the columns
|
||||
assert len(features) == len(df.columns)
|
||||
|
||||
|
||||
for k, v in features.items():
|
||||
assert k in df.columns.values
|
||||
# check if a list feature column converted correctly
|
||||
|
@ -90,30 +97,32 @@ def test_pandas_input_fn(pd_df):
|
|||
batch = dataset_with_label.make_one_shot_iterator().get_next()
|
||||
with tf.Session() as sess:
|
||||
features, label = sess.run(batch)
|
||||
assert len(features) == len(df.columns) - 1 # label should not be in the features
|
||||
assert (
|
||||
len(features) == len(df.columns) - 1
|
||||
) # label should not be in the features
|
||||
|
||||
|
||||
@pytest.mark.gpu
|
||||
def test_build_optimizer():
|
||||
adadelta = build_optimizer('Adadelta')
|
||||
adadelta = build_optimizer("Adadelta")
|
||||
assert isinstance(adadelta, tf.train.AdadeltaOptimizer)
|
||||
|
||||
adagrad = build_optimizer('Adagrad')
|
||||
adagrad = build_optimizer("Adagrad")
|
||||
assert isinstance(adagrad, tf.train.AdagradOptimizer)
|
||||
|
||||
adam = build_optimizer('Adam')
|
||||
adam = build_optimizer("Adam")
|
||||
assert isinstance(adam, tf.train.AdamOptimizer)
|
||||
|
||||
ftrl = build_optimizer('Ftrl', **{'l1_regularization_strength': 0.001})
|
||||
assert isinstance(ftrl, tf.train.FtrlOptimizer)
|
||||
ftrl = build_optimizer("Ftrl", **{"l1_regularization_strength": 0.001})
|
||||
assert isinstance(ftrl, tf.train.FtrlOptimizer)
|
||||
|
||||
momentum = build_optimizer('Momentum', **{'momentum': 0.5})
|
||||
momentum = build_optimizer("Momentum", **{"momentum": 0.5})
|
||||
assert isinstance(momentum, tf.train.MomentumOptimizer)
|
||||
|
||||
rmsprop = build_optimizer('RMSProp')
|
||||
rmsprop = build_optimizer("RMSProp")
|
||||
assert isinstance(rmsprop, tf.train.RMSPropOptimizer)
|
||||
|
||||
sgd = build_optimizer('SGD')
|
||||
sgd = build_optimizer("SGD")
|
||||
assert isinstance(sgd, tf.train.GradientDescentOptimizer)
|
||||
|
||||
|
||||
|
@ -125,12 +134,12 @@ def test_evaluation_log_hook(pd_df, tmp):
|
|||
hook_frequency = 10
|
||||
train_steps = 10
|
||||
|
||||
_, deep_columns = build_feature_columns(users, items, model_type='deep')
|
||||
_, deep_columns = build_feature_columns(users, items, model_type="deep")
|
||||
|
||||
model = build_model(
|
||||
tmp,
|
||||
deep_columns=deep_columns,
|
||||
save_checkpoints_steps=train_steps//hook_frequency
|
||||
save_checkpoints_steps=train_steps // hook_frequency,
|
||||
)
|
||||
|
||||
evaluation_logger = MetricsLogger()
|
||||
|
@ -143,7 +152,7 @@ def test_evaluation_log_hook(pd_df, tmp):
|
|||
true_df=data,
|
||||
y_col=DEFAULT_RATING_COL,
|
||||
eval_df=data.drop(DEFAULT_RATING_COL, axis=1),
|
||||
every_n_iter=train_steps//hook_frequency,
|
||||
every_n_iter=train_steps // hook_frequency,
|
||||
model_dir=tmp,
|
||||
eval_fns=[rmse],
|
||||
)
|
||||
|
@ -154,12 +163,12 @@ def test_evaluation_log_hook(pd_df, tmp):
|
|||
y_col=DEFAULT_RATING_COL,
|
||||
batch_size=1,
|
||||
num_epochs=None,
|
||||
shuffle=True
|
||||
shuffle=True,
|
||||
),
|
||||
hooks=hooks,
|
||||
steps=train_steps
|
||||
steps=train_steps,
|
||||
)
|
||||
|
||||
|
||||
# Check if hook logged the given metric
|
||||
assert rmse.__name__ in evaluation_logger.get_log()
|
||||
assert len(evaluation_logger.get_log()[rmse.__name__]) == hook_frequency
|
||||
|
@ -175,20 +184,13 @@ def test_pandas_input_fn_for_saved_model(pd_df, tmp):
|
|||
data, users, items = pd_df
|
||||
model_dir = os.path.join(tmp, "model")
|
||||
export_dir = os.path.join(tmp, "export")
|
||||
|
||||
_, deep_columns = build_feature_columns(users, items, model_type='deep')
|
||||
|
||||
_, deep_columns = build_feature_columns(users, items, model_type="deep")
|
||||
|
||||
# Train a model
|
||||
model = build_model(
|
||||
model_dir,
|
||||
deep_columns=deep_columns,
|
||||
)
|
||||
model = build_model(model_dir, deep_columns=deep_columns,)
|
||||
train_fn = pandas_input_fn(
|
||||
df=data,
|
||||
y_col=DEFAULT_RATING_COL,
|
||||
batch_size=1,
|
||||
num_epochs=None,
|
||||
shuffle=True
|
||||
df=data, y_col=DEFAULT_RATING_COL, batch_size=1, num_epochs=None, shuffle=True
|
||||
)
|
||||
model.train(input_fn=train_fn, steps=1)
|
||||
|
||||
|
@ -196,32 +198,31 @@ def test_pandas_input_fn_for_saved_model(pd_df, tmp):
|
|||
exported_path = export_model(
|
||||
model=model,
|
||||
train_input_fn=train_fn,
|
||||
eval_input_fn=pandas_input_fn(
|
||||
df=data, y_col=DEFAULT_RATING_COL
|
||||
),
|
||||
eval_input_fn=pandas_input_fn(df=data, y_col=DEFAULT_RATING_COL),
|
||||
tf_feat_cols=deep_columns,
|
||||
base_dir=export_dir
|
||||
base_dir=export_dir,
|
||||
)
|
||||
saved_model = tf.contrib.estimator.SavedModelEstimator(exported_path)
|
||||
|
||||
# Test pandas_input_fn_for_saved_model with the saved model
|
||||
test = data.drop(DEFAULT_RATING_COL, axis=1)
|
||||
test.reset_index(drop=True, inplace=True)
|
||||
list(itertools.islice(
|
||||
saved_model.predict(
|
||||
pandas_input_fn_for_saved_model(
|
||||
df=test,
|
||||
feat_name_type={
|
||||
DEFAULT_USER_COL: int,
|
||||
DEFAULT_ITEM_COL: int,
|
||||
ITEM_FEAT_COL: list
|
||||
}
|
||||
)
|
||||
),
|
||||
len(test)
|
||||
))
|
||||
list(
|
||||
itertools.islice(
|
||||
saved_model.predict(
|
||||
pandas_input_fn_for_saved_model(
|
||||
df=test,
|
||||
feat_name_type={
|
||||
DEFAULT_USER_COL: int,
|
||||
DEFAULT_ITEM_COL: int,
|
||||
ITEM_FEAT_COL: list,
|
||||
},
|
||||
)
|
||||
),
|
||||
len(test),
|
||||
)
|
||||
)
|
||||
|
||||
# Close the event file so that the model folder can be cleaned up.
|
||||
summary_writer = tf.summary.FileWriterCache.get(model.model_dir)
|
||||
summary_writer.close()
|
||||
|
||||
|
|
|
@ -132,4 +132,3 @@ def test_wide_deep_model(pd_df, tmp):
|
|||
# Close the event file so that the model folder can be cleaned up.
|
||||
summary_writer = tf.summary.FileWriterCache.get(model.model_dir)
|
||||
summary_writer.close()
|
||||
|
||||
|
|