miguelgfierro 2020-05-18 16:56:15 +00:00
Parent 8106c0354e
Commit 1564033ebb
69 changed files: 1434 additions and 962 deletions

View file

@ -29,4 +29,3 @@ class SARModel:
def predict(self, items, ratings, top_k, remove_seen):
return self.model.predict(items, ratings, top_k, remove_seen)

View file

@ -5,7 +5,14 @@ This is the one and only (to rule them all) implementation of SAR.
import logging
import pyspark.sql.functions as F
import pandas as pd
from pyspark.sql.types import StringType, DoubleType, StructType, StructField, IntegerType, FloatType
from pyspark.sql.types import (
StringType,
DoubleType,
StructType,
StructField,
IntegerType,
FloatType,
)
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pysarplus import SARModel
@ -14,7 +21,8 @@ SIM_JACCARD = "jaccard"
SIM_LIFT = "lift"
logging.basicConfig(level=logging.INFO)
log = logging.getLogger('sarplus')
log = logging.getLogger("sarplus")
class SARPlus:
"""SAR implementation for PySpark"""
@ -31,7 +39,7 @@ class SARPlus:
time_decay_coefficient=30,
time_now=None,
timedecay_formula=False,
threshold=1
threshold=1,
):
assert threshold > 0
@ -44,7 +52,7 @@ class SARPlus:
"prefix": table_prefix,
"time_now": time_now,
"time_decay_coefficient": time_decay_coefficient,
"threshold": threshold
"threshold": threshold,
}
self.similarity_type = similarity_type
@ -83,7 +91,7 @@ class SARPlus:
# the following is the query we want to run
query = self.f(
"""
"""
SELECT
{col_user}, {col_item},
SUM({col_rating} * EXP(-log(2) * (latest_timestamp - CAST({col_timestamp} AS long)) / ({time_decay_coefficient} * 3600 * 24))) as {col_rating}
@ -91,16 +99,19 @@ class SARPlus:
(SELECT CAST(MAX({col_timestamp}) AS long) latest_timestamp FROM {prefix}df_train_input)
GROUP BY {col_user}, {col_item}
CLUSTER BY {col_user}
""")
"""
)
# replace with timedecayed version
df = self.spark.sql(query)
else:
# since SQL is case-insensitive, this check needs to be case-insensitive as well
if self.header['col_timestamp'].lower() in [s.name.lower() for s in df.schema]:
if self.header["col_timestamp"].lower() in [
s.name.lower() for s in df.schema
]:
# we need to de-duplicate items by using the latest item
query = self.f(
"""
"""
SELECT {col_user}, {col_item}, {col_rating}
FROM
(
@ -112,7 +123,7 @@ class SARPlus:
WHERE latest = 1
"""
)
df = self.spark.sql(query)
df.createOrReplaceTempView(self.f("{prefix}df_train"))
@ -128,7 +139,8 @@ class SARPlus:
GROUP BY A.{col_item}, B.{col_item}
HAVING COUNT(*) >= {threshold}
CLUSTER BY i1, i2
""")
"""
)
item_cooccurrence = self.spark.sql(query)
item_cooccurrence.write.mode("overwrite").saveAsTable(
@ -148,7 +160,7 @@ class SARPlus:
self.item_similarity = item_cooccurrence
elif self.similarity_type == SIM_JACCARD:
query = self.f(
"""
"""
SELECT i1, i2, value / (M1.margin + M2.margin - value) AS value
FROM {prefix}item_cooccurrence A
INNER JOIN {prefix}item_marginal M1 ON A.i1 = M1.i
@ -159,7 +171,7 @@ class SARPlus:
self.item_similarity = self.spark.sql(query)
elif self.similarity_type == SIM_LIFT:
query = self.f(
"""
"""
SELECT i1, i2, value / (M1.margin * M2.margin) AS value
FROM {prefix}item_cooccurrence A
INNER JOIN {prefix}item_marginal M1 ON A.i1 = M1.i
@ -169,11 +181,14 @@ class SARPlus:
)
self.item_similarity = self.spark.sql(query)
else:
raise ValueError("Unknown similarity type: {0}".format(self.similarity_type))
raise ValueError(
"Unknown similarity type: {0}".format(self.similarity_type)
)
# store upper triangular
log.info("sarplus.fit 2/2: compute similarity metric %s..." % self.similarity_type)
log.info(
"sarplus.fit 2/2: compute similarity metric %s..." % self.similarity_type
)
self.item_similarity.write.mode("overwrite").saveAsTable(
self.f("{prefix}item_similarity_upper")
)
@ -181,7 +196,7 @@ class SARPlus:
# expand upper triangular to full matrix
query = self.f(
"""
"""
SELECT i1, i2, value
FROM
(
@ -223,7 +238,7 @@ class SARPlus:
)
query = self.f(
"""
"""
SELECT a.{col_user}, a.{col_item}, CAST(a.{col_rating} AS double) {col_rating}
FROM {prefix}df_train a INNER JOIN {prefix}df_test_users b ON a.{col_user} = b.{col_user}
DISTRIBUTE BY {col_user}
@ -233,39 +248,59 @@ class SARPlus:
return self.spark.sql(query)
def recommend_k_items(self, test, cache_path, top_k=10, remove_seen=True, n_user_prediction_partitions=200):
def recommend_k_items(
self,
test,
cache_path,
top_k=10,
remove_seen=True,
n_user_prediction_partitions=200,
):
# create item id to continuous index mapping
log.info("sarplus.recommend_k_items 1/3: create item index")
self.spark.sql(self.f("SELECT i1, row_number() OVER(ORDER BY i1)-1 idx FROM (SELECT DISTINCT i1 FROM {prefix}item_similarity) CLUSTER BY i1"))\
.write.mode("overwrite").saveAsTable(self.f("{prefix}item_mapping"))
self.spark.sql(
self.f(
"SELECT i1, row_number() OVER(ORDER BY i1)-1 idx FROM (SELECT DISTINCT i1 FROM {prefix}item_similarity) CLUSTER BY i1"
)
).write.mode("overwrite").saveAsTable(self.f("{prefix}item_mapping"))
# map similarity matrix into index space
self.spark.sql(self.f("""
self.spark.sql(
self.f(
"""
SELECT a.idx i1, b.idx i2, is.value
FROM {prefix}item_similarity is, {prefix}item_mapping a, {prefix}item_mapping b
WHERE is.i1 = a.i1 AND i2 = b.i1
"""))\
.write.mode("overwrite").saveAsTable(self.f("{prefix}item_similarity_mapped"))
"""
)
).write.mode("overwrite").saveAsTable(self.f("{prefix}item_similarity_mapped"))
cache_path_output = cache_path
if cache_path.startswith('dbfs:'):
cache_path_input = '/dbfs' + cache_path[5:]
if cache_path.startswith("dbfs:"):
cache_path_input = "/dbfs" + cache_path[5:]
else:
cache_path_input = cache_path
# export similarity matrix for C++ backed UDF
log.info("sarplus.recommend_k_items 2/3: prepare similarity matrix")
self.spark.sql(self.f("SELECT i1, i2, CAST(value AS DOUBLE) value FROM {prefix}item_similarity_mapped ORDER BY i1, i2"))\
.coalesce(1)\
.write.format("com.microsoft.sarplus").mode("overwrite")\
.save(cache_path_output)
self.spark.sql(
self.f(
"SELECT i1, i2, CAST(value AS DOUBLE) value FROM {prefix}item_similarity_mapped ORDER BY i1, i2"
)
).coalesce(1).write.format("com.microsoft.sarplus").mode("overwrite").save(
cache_path_output
)
self.get_user_affinity(test).createOrReplaceTempView(self.f("{prefix}user_affinity"))
self.get_user_affinity(test).createOrReplaceTempView(
self.f("{prefix}user_affinity")
)
# map item ids to index space
pred_input = self.spark.sql(self.f("""
pred_input = self.spark.sql(
self.f(
"""
SELECT {col_user}, idx, rating
FROM
(
@ -273,13 +308,19 @@ class SARPlus:
FROM {prefix}user_affinity JOIN {prefix}item_mapping b ON {col_item} = b.i1
)
CLUSTER BY {col_user}
"""))
"""
)
)
schema = StructType([
StructField("userID", pred_input.schema[self.header['col_user']].dataType, True),
StructField("itemID", IntegerType(), True),
StructField("score", FloatType(), True)
])
schema = StructType(
[
StructField(
"userID", pred_input.schema[self.header["col_user"]].dataType, True
),
StructField("itemID", IntegerType(), True),
StructField("score", FloatType(), True),
]
)
# make sure only the header is pickled
local_header = self.header
@ -291,33 +332,42 @@ class SARPlus:
# The cache_path points to the file written by com.microsoft.sarplus
# This has exactly the memory layout we need and since the file is
# memory mapped, the memory consumption only happens once per worker
# for all python processes
# for all python processes
model = SARModel(cache_path_input)
preds = model.predict(df['idx'].values, df['rating'].values, top_k, remove_seen)
user = df[local_header['col_user']].iloc[0]
preds = model.predict(
df["idx"].values, df["rating"].values, top_k, remove_seen
)
user = df[local_header["col_user"]].iloc[0]
preds_ret = pd.DataFrame(
[(user, x.id, x.score) for x in preds],
columns=range(3))
[(user, x.id, x.score) for x in preds], columns=range(3)
)
return preds_ret
log.info("sarplus.recommend_k_items 3/3: compute recommendations")
df_preds = pred_input\
.repartition(n_user_prediction_partitions, self.header['col_user'])\
.groupby(self.header['col_user'])\
df_preds = (
pred_input.repartition(
n_user_prediction_partitions, self.header["col_user"]
)
.groupby(self.header["col_user"])
.apply(sar_predict_udf)
)
df_preds.createOrReplaceTempView(self.f("{prefix}predictions"))
return self.spark.sql(self.f("""
return self.spark.sql(
self.f(
"""
SELECT userID {col_user}, b.i1 {col_item}, score
FROM {prefix}predictions p, {prefix}item_mapping b
WHERE p.itemID = b.idx
"""))
"""
)
)
def recommend_k_items_slow(self, test, top_k=10, remove_seen=True):
"""Recommend top K items for all users which are in the test set.
@ -331,9 +381,9 @@ class SARPlus:
if remove_seen:
raise ValueError("Not implemented")
self.get_user_affinity(test)\
.write.mode("overwrite")\
.saveAsTable(self.f("{prefix}user_affinity"))
self.get_user_affinity(test).write.mode("overwrite").saveAsTable(
self.f("{prefix}user_affinity")
)
# user_affinity * item_similarity
# filter top-k
@ -357,4 +407,4 @@ class SARPlus:
top_k=top_k,
)
return self.spark.sql(query)
return self.spark.sql(query)
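For reference, a minimal end-to-end sketch of the SARPlus API reformatted above. It assumes a local Spark session with the com.microsoft.sarplus jar on the classpath; the column names, toy data, and cache path are illustrative only, not part of the diff.

import pandas as pd
from pyspark.sql import SparkSession
from pysarplus import SARPlus

spark = SparkSession.builder.appName("sarplus-sketch").getOrCreate()

# column names are passed the same way the tests below pass their header dict
model = SARPlus(
    spark,
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    col_timestamp="timestamp",
    similarity_type="jaccard",
)

train = spark.createDataFrame(
    pd.DataFrame(
        {
            "userID": [1, 1, 2, 2, 3],
            "itemID": [1, 2, 1, 3, 2],
            "rating": [1.0, 1.0, 1.0, 1.0, 1.0],
            "timestamp": [0, 0, 0, 0, 0],
        }
    )
)
model.fit(train)

# writes the similarity cache through com.microsoft.sarplus, then scores via the C++ backed UDF
recs = model.recommend_k_items(
    train,
    cache_path="tests/sketch_cache",
    top_k=3,
    remove_seen=False,
    n_user_prediction_partitions=1,
)
recs.show()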

View file

@ -1,2 +1,2 @@
from .SARModel import SARModel
from .SARPlus import SARPlus
from .SARPlus import SARPlus

View file

@ -11,9 +11,13 @@ from pyspark.sql import SparkSession
from pysarplus import SARPlus, SARModel
def assert_compare(expected_id, expected_score, actual_prediction):
assert expected_id == actual_prediction.id
assert math.isclose(expected_score, actual_prediction.score, rel_tol=1e-3, abs_tol=1e-3)
assert math.isclose(
expected_score, actual_prediction.score, rel_tol=1e-3, abs_tol=1e-3
)
@pytest.fixture(scope="module")
def spark(app_name="Sample", url="local[*]", memory="1G"):
@ -27,7 +31,11 @@ def spark(app_name="Sample", url="local[*]", memory="1G"):
spark = (
SparkSession.builder.appName(app_name)
.master(url)
.config("spark.jars", os.path.dirname(__file__) + "/../../scala/target/scala-2.11/sarplus_2.11-0.2.6.jar")
.config(
"spark.jars",
os.path.dirname(__file__)
+ "/../../scala/target/scala-2.11/sarplus_2.11-0.2.6.jar",
)
.config("spark.driver.memory", memory)
.config("spark.sql.shuffle.partitions", "1")
.config("spark.default.parallelism", "1")
@ -39,19 +47,18 @@ def spark(app_name="Sample", url="local[*]", memory="1G"):
return spark
@pytest.fixture(scope="module")
def sample_cache(spark):
df = spark.read.csv("tests/sample-input.txt", header=True, inferSchema=True)
path = "tests/sample-output.sar"
df.coalesce(1)\
.write.format("com.microsoft.sarplus")\
.mode("overwrite")\
.save(path)
df.coalesce(1).write.format("com.microsoft.sarplus").mode("overwrite").save(path)
return path
@pytest.fixture(scope="module")
def header():
header = {
@ -62,6 +69,7 @@ def header():
}
return header
@pytest.fixture(scope="module")
def pandas_dummy_dataset(header):
"""Load sample dataset in pandas for testing; can be used to create a Spark dataframe
@ -75,6 +83,7 @@ def pandas_dummy_dataset(header):
}
return pd.DataFrame(ratings_dict)
@pytest.mark.spark
def test_good(spark, sample_cache):
model = SARModel(sample_cache)
@ -84,6 +93,7 @@ def test_good(spark, sample_cache):
assert_compare(1, 44, y[1])
assert_compare(2, 64, y[2])
@pytest.mark.spark
def test_good_less(spark, sample_cache):
model = SARModel(sample_cache)
@ -93,6 +103,7 @@ def test_good_less(spark, sample_cache):
assert_compare(1, 11.6, y[1])
assert_compare(2, 12.3, y[2])
@pytest.mark.spark
def test_good_require_sort(spark, sample_cache):
model = SARModel(sample_cache)
@ -104,6 +115,7 @@ def test_good_require_sort(spark, sample_cache):
assert 3 == len(y)
@pytest.mark.spark
def test_good_require_sort_remove_seen(spark, sample_cache):
model = SARModel(sample_cache)
@ -112,52 +124,65 @@ def test_good_require_sort_remove_seen(spark, sample_cache):
assert_compare(2, 64, y[0])
assert 1 == len(y)
@pytest.mark.spark
def test_pandas(spark, sample_cache):
item_scores = pd.DataFrame([(0, 2.3), (1, 3.1)], columns=["itemID", "score"])
model = SARModel(sample_cache)
y = model.predict(item_scores["itemID"].values, item_scores["score"].values, top_k=10, remove_seen=False)
y = model.predict(
item_scores["itemID"].values,
item_scores["score"].values,
top_k=10,
remove_seen=False,
)
assert_compare(0, 0.85, y[0])
assert_compare(1, 6.9699, y[1])
assert_compare(2, 9.92, y[2])
@pytest.mark.spark
def test_e2e(spark, pandas_dummy_dataset, header):
sar = SARPlus(spark, **header)
df = spark.createDataFrame(pandas_dummy_dataset)
sar.fit(df)
sar.fit(df)
# assert 4*4 + 32 == sar.item_similarity.count()
# print(sar.item_similarity
# .toPandas()
# .pivot_table(index='i1', columns='i2', values='value'))
# .toPandas()
# .pivot_table(index='i1', columns='i2', values='value'))
test_df = spark.createDataFrame(pd.DataFrame({
header['col_user']: [3],
header['col_item']: [2]
}))
r1 = sar.recommend_k_items_slow(test_df, top_k=3, remove_seen=False)\
.toPandas()\
.sort_values([header['col_user'], header['col_item']])\
.reset_index(drop=True)
r2 = sar.recommend_k_items(test_df, "tests/test_e2e_cache", top_k=3, n_user_prediction_partitions=2, remove_seen=False)\
.toPandas()\
.sort_values([header['col_user'], header['col_item']])\
.reset_index(drop=True)
assert (r1.iloc[:,:2] == r2.iloc[:,:2]).all().all()
assert np.allclose(
r1.score.values,
r2.score.values,
1e-3
test_df = spark.createDataFrame(
pd.DataFrame({header["col_user"]: [3], header["col_item"]: [2]})
)
r1 = (
sar.recommend_k_items_slow(test_df, top_k=3, remove_seen=False)
.toPandas()
.sort_values([header["col_user"], header["col_item"]])
.reset_index(drop=True)
)
r2 = (
sar.recommend_k_items(
test_df,
"tests/test_e2e_cache",
top_k=3,
n_user_prediction_partitions=2,
remove_seen=False,
)
.toPandas()
.sort_values([header["col_user"], header["col_item"]])
.reset_index(drop=True)
)
assert (r1.iloc[:, :2] == r2.iloc[:, :2]).all().all()
assert np.allclose(r1.score.values, r2.score.values, 1e-3)
@pytest.fixture(scope="module")
def pandas_dummy(header):
ratings_dict = {
@ -233,10 +258,16 @@ def sar_settings():
@pytest.mark.parametrize(
"similarity_type, timedecay_formula", [("jaccard", False), ("lift", True)]
)
def test_fit(spark, similarity_type, timedecay_formula, train_test_dummy_timestamp, header):
model = SARPlus(spark, **header, timedecay_formula=timedecay_formula,
similarity_type=similarity_type)
def test_fit(
spark, similarity_type, timedecay_formula, train_test_dummy_timestamp, header
):
model = SARPlus(
spark,
**header,
timedecay_formula=timedecay_formula,
similarity_type=similarity_type
)
trainset, testset = train_test_dummy_timestamp
df = spark.createDataFrame(trainset)
@ -244,7 +275,7 @@ def test_fit(spark, similarity_type, timedecay_formula, train_test_dummy_timesta
df = spark.table("trainset")
model.fit(df)
model.fit(df)
"""
@ -267,77 +298,98 @@ def test_sar_item_similarity(
spark, threshold, similarity_type, file, demo_usage_data, sar_settings, header
):
model = SARPlus(spark,
**header,
timedecay_formula=False,
time_decay_coefficient=30,
time_now=None,
threshold=threshold,
similarity_type=similarity_type)
model = SARPlus(
spark,
**header,
timedecay_formula=False,
time_decay_coefficient=30,
time_now=None,
threshold=threshold,
similarity_type=similarity_type
)
df = spark.createDataFrame(demo_usage_data)
model.fit(df)
# reference
item_similarity_ref = pd.read_csv(sar_settings["FILE_DIR"] + "sim_" + file + str(threshold) + ".csv")
item_similarity_ref = pd.read_csv(
sar_settings["FILE_DIR"] + "sim_" + file + str(threshold) + ".csv"
)
item_similarity_ref = pd.melt(item_similarity_ref,
item_similarity_ref = pd.melt(
item_similarity_ref,
item_similarity_ref.columns[0],
item_similarity_ref.columns[1:],
'i2',
'value')
item_similarity_ref.columns = ['i1', 'i2', 'value']
"i2",
"value",
)
item_similarity_ref.columns = ["i1", "i2", "value"]
item_similarity_ref = item_similarity_ref[item_similarity_ref.value > 0]\
.sort_values(['i1', 'i2'])\
.reset_index(drop=True)\
# actual
item_similarity = model.item_similarity\
.toPandas()\
.sort_values(['i1', 'i2'])\
item_similarity_ref = (
item_similarity_ref[item_similarity_ref.value > 0]
.sort_values(["i1", "i2"])
.reset_index(drop=True)
)
# actual
item_similarity = (
model.item_similarity.toPandas()
.sort_values(["i1", "i2"])
.reset_index(drop=True)
)
if similarity_type == "cooccurrence":
assert((item_similarity_ref == item_similarity).all().all())
assert (item_similarity_ref == item_similarity).all().all()
else:
assert((item_similarity.iloc[:,:1] == item_similarity_ref.iloc[:,:1]).all().all())
assert (
(item_similarity.iloc[:, :1] == item_similarity_ref.iloc[:, :1]).all().all()
)
assert np.allclose(
item_similarity.value.values,
item_similarity_ref.value.values
item_similarity.value.values, item_similarity_ref.value.values
)
# Test 7
def test_user_affinity(spark, demo_usage_data, sar_settings, header):
time_now = demo_usage_data[header["col_timestamp"]].max()
model = SARPlus(spark,
**header,
timedecay_formula=True,
time_decay_coefficient=30,
time_now=time_now,
similarity_type="cooccurrence")
model = SARPlus(
spark,
**header,
timedecay_formula=True,
time_decay_coefficient=30,
time_now=time_now,
similarity_type="cooccurrence"
)
df = spark.createDataFrame(demo_usage_data)
model.fit(df)
user_affinity_ref = pd.read_csv(sar_settings["FILE_DIR"] + "user_aff.csv")
user_affinity_ref = pd.melt(user_affinity_ref, user_affinity_ref.columns[0], user_affinity_ref.columns[1:], 'ItemId', 'Rating')
user_affinity_ref = user_affinity_ref[user_affinity_ref.Rating > 0]\
.reset_index(drop=True)
user_affinity_ref = pd.melt(
user_affinity_ref,
user_affinity_ref.columns[0],
user_affinity_ref.columns[1:],
"ItemId",
"Rating",
)
user_affinity_ref = user_affinity_ref[user_affinity_ref.Rating > 0].reset_index(
drop=True
)
# construct dataframe with test user id we'd like to get the affinity for
df_test = spark.createDataFrame(pd.DataFrame({header['col_user']:[sar_settings["TEST_USER_ID"]]}))
df_test = spark.createDataFrame(
pd.DataFrame({header["col_user"]: [sar_settings["TEST_USER_ID"]]})
)
user_affinity = model.get_user_affinity(df_test).toPandas().reset_index(drop=True)
# verify that the item ids are the same
assert (user_affinity[header['col_item']] == user_affinity_ref.ItemId).all()
assert (user_affinity[header["col_item"]] == user_affinity_ref.ItemId).all()
assert np.allclose(
user_affinity_ref[header['col_rating']].values,
user_affinity['Rating'].values,
atol=sar_settings["ATOL"]
user_affinity_ref[header["col_rating"]].values,
user_affinity["Rating"].values,
atol=sar_settings["ATOL"],
)
@ -351,43 +403,52 @@ def test_userpred(
):
time_now = demo_usage_data[header["col_timestamp"]].max()
test_id = '{0}_{1}_{2}'.format(threshold, similarity_type, file)
test_id = "{0}_{1}_{2}".format(threshold, similarity_type, file)
model = SARPlus(spark,
**header,
table_prefix=test_id,
timedecay_formula=True,
time_decay_coefficient=30,
time_now=time_now,
threshold=threshold,
similarity_type=similarity_type)
model = SARPlus(
spark,
**header,
table_prefix=test_id,
timedecay_formula=True,
time_decay_coefficient=30,
time_now=time_now,
threshold=threshold,
similarity_type=similarity_type
)
df = spark.createDataFrame(demo_usage_data)
model.fit(df)
url = (sar_settings["FILE_DIR"]
url = (
sar_settings["FILE_DIR"]
+ "userpred_"
+ file
+ str(threshold)
+ "_userid_only.csv")
+ "_userid_only.csv"
)
pred_ref = pd.read_csv(url)
pred_ref = pd.wide_to_long(pred_ref, ['rec','score'], 'user', 'idx')\
.sort_values('score', ascending=False)\
pred_ref = (
pd.wide_to_long(pred_ref, ["rec", "score"], "user", "idx")
.sort_values("score", ascending=False)
.reset_index(drop=True)
)
# Note: it's important to have a separate cache_path for each run as they're interfering with each other
pred = model.recommend_k_items(
spark.createDataFrame(demo_usage_data[
demo_usage_data[header["col_user"]] == sar_settings["TEST_USER_ID"]
]),
cache_path='test_userpred-' + test_id,
spark.createDataFrame(
demo_usage_data[
demo_usage_data[header["col_user"]] == sar_settings["TEST_USER_ID"]
]
),
cache_path="test_userpred-" + test_id,
top_k=10,
n_user_prediction_partitions=1)
n_user_prediction_partitions=1,
)
pred = pred.toPandas()\
.sort_values('score', ascending=False)\
.reset_index(drop=True)
pred = pred.toPandas().sort_values("score", ascending=False).reset_index(drop=True)
assert (pred.MovieId.values == pred_ref.rec.values).all()
assert np.allclose(pred.score.values, pred_ref.score.values, atol=sar_settings["ATOL"])
assert np.allclose(
pred.score.values, pred_ref.score.values, atol=sar_settings["ATOL"]
)
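A condensed sketch of the low-level SARModel call these tests exercise; it reuses the cache written by the sample_cache fixture, so running it standalone assumes that directory already exists.

import pandas as pd
from pysarplus import SARModel

item_scores = pd.DataFrame([(0, 2.3), (1, 3.1)], columns=["itemID", "score"])

# memory-maps the cache previously written by the com.microsoft.sarplus writer
model = SARModel("tests/sample-output.sar")
preds = model.predict(
    item_scores["itemID"].values,
    item_scores["score"].values,
    top_k=10,
    remove_seen=False,
)
for p in preds:
    print(p.id, p.score)  # each prediction carries an item id and a score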

View file

@ -1,10 +1,11 @@
from distutils.core import setup
setup(name='pysarplus_dummy',
version='0.2',
description='pysarplus dummy package to trigger spark packaging',
author='Markus Cozowicz',
author_email='marcozo@microsoft.com',
url='https://github.com/Microsoft/Recommenders/contrib/sarplus',
packages=['pysarplus_dummy'],
)
setup(
name="pysarplus_dummy",
version="0.2",
description="pysarplus dummy package to trigger spark packaging",
author="Markus Cozowicz",
author_email="marcozo@microsoft.com",
url="https://github.com/Microsoft/Recommenders/contrib/sarplus",
packages=["pysarplus_dummy"],
)

View file

@ -91,4 +91,3 @@ if __name__ == "__main__":
score_result,
schema=DataFrameSchema.data_frame_to_dict(score_result),
)

View file

@ -91,4 +91,3 @@ if __name__ == "__main__":
score_result,
schema=DataFrameSchema.data_frame_to_dict(score_result),
)

View file

@ -91,4 +91,3 @@ if __name__ == "__main__":
score_result,
schema=DataFrameSchema.data_frame_to_dict(score_result),
)

View file

@ -91,4 +91,3 @@ if __name__ == "__main__":
score_result,
schema=DataFrameSchema.data_frame_to_dict(score_result),
)

View file

@ -6,29 +6,32 @@ import joblib
from azureml.studio.core.data_frame_schema import DataFrameSchema
from azureml.studio.core.logger import module_logger as logger
from azureml.studio.core.io.data_frame_directory import load_data_frame_from_directory, save_data_frame_to_directory
from azureml.studio.core.io.data_frame_directory import (
load_data_frame_from_directory,
save_data_frame_to_directory,
)
from azureml.studio.core.io.model_directory import load_model_from_directory
class ScoreType(Enum):
ITEM_RECOMMENDATION = 'Item recommendation'
RATING_PREDICTION = 'Rating prediction'
ITEM_RECOMMENDATION = "Item recommendation"
RATING_PREDICTION = "Rating prediction"
class RankingMetric(Enum):
RATING = 'Rating'
SIMILARITY = 'Similarity'
POPULARITY = 'Popularity'
RATING = "Rating"
SIMILARITY = "Similarity"
POPULARITY = "Popularity"
class ItemSet(Enum):
TRAIN_ONLY = 'Items in training set'
SCORE_ONLY = 'Items in score set'
TRAIN_ONLY = "Items in training set"
SCORE_ONLY = "Items in score set"
def joblib_loader(load_from_dir, model_spec):
file_name = model_spec['file_name']
with open(Path(load_from_dir) / file_name, 'rb') as fin:
file_name = model_spec["file_name"]
with open(Path(load_from_dir) / file_name, "rb") as fin:
return joblib.load(fin)
@ -45,56 +48,87 @@ class ScoreSARModule:
def input_data(self):
return self._input_data
def recommend_items(self, ranking_metric, top_k, sort_top_k, remove_seen, normalize):
def recommend_items(
self, ranking_metric, top_k, sort_top_k, remove_seen, normalize
):
if ranking_metric == RankingMetric.RATING:
return self.model.recommend_k_items(test=self.input_data, top_k=top_k, sort_top_k=sort_top_k,
remove_seen=remove_seen, normalize=normalize)
return self.model.recommend_k_items(
test=self.input_data,
top_k=top_k,
sort_top_k=sort_top_k,
remove_seen=remove_seen,
normalize=normalize,
)
if ranking_metric == RankingMetric.SIMILARITY:
return self.model.get_item_based_topk(items=self.input_data, top_k=top_k, sort_top_k=sort_top_k)
return self.model.get_item_based_topk(
items=self.input_data, top_k=top_k, sort_top_k=sort_top_k
)
if ranking_metric == RankingMetric.POPULARITY:
return self.model.get_popularity_based_topk(top_k=top_k, sort_top_k=sort_top_k)
return self.model.get_popularity_based_topk(
top_k=top_k, sort_top_k=sort_top_k
)
raise ValueError(f"Got unexpected ranking metric: {ranking_metric}.")
def predict_ratings(self, items_to_predict, normalize):
if items_to_predict == ItemSet.TRAIN_ONLY:
return self.model.predict_training_items(test=self.input_data, normalize=normalize)
return self.model.predict_training_items(
test=self.input_data, normalize=normalize
)
if items_to_predict == ItemSet.SCORE_ONLY:
return self.model.predict(test=self.input_data, normalize=normalize)
raise ValueError(f"Got unexpected 'items to predict': {items_to_predict}.")
if __name__ == '__main__':
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
'--trained-model', help='The directory contains trained SAR model.')
"--trained-model", help="The directory contains trained SAR model."
)
parser.add_argument("--dataset-to-score", help="Dataset to score")
parser.add_argument(
'--dataset-to-score', help='Dataset to score')
"--score-type",
type=str,
help="The type of score which the recommender should output",
)
parser.add_argument(
'--score-type', type=str, help='The type of score which the recommender should output')
"--items-to-predict",
type=str,
help="The set of items to predict for test users",
)
parser.add_argument(
'--items-to-predict', type=str, help='The set of items to predict for test users')
"--normalize",
type=str,
help="Normalize predictions to scale of original ratings",
)
parser.add_argument(
'--normalize', type=str, help='Normalize predictions to scale of original ratings')
"--ranking-metric",
type=str,
help="The metric of ranking used in item recommendation",
)
parser.add_argument(
'--ranking-metric', type=str, help='The metric of ranking used in item recommendation')
"--top-k", type=int, help="The number of top items to recommend."
)
parser.add_argument("--sort-top-k", type=str, help="Sort top k results.")
parser.add_argument(
'--top-k', type=int, help='The number of top items to recommend.')
parser.add_argument(
'--sort-top-k', type=str, help='Sort top k results.')
parser.add_argument(
'--remove-seen-items', type=str, help='Remove items seen in training from recommendation')
parser.add_argument(
'--score-result', help='Ratings or items to output')
"--remove-seen-items",
type=str,
help="Remove items seen in training from recommendation",
)
parser.add_argument("--score-result", help="Ratings or items to output")
args, _ = parser.parse_known_args()
logger.info(f"Arguments: {args}")
sort_top_k = strtobool(args.sort_top_k) if args.sort_top_k else None
remove_seen_items = strtobool(args.remove_seen_items) if args.remove_seen_items else None
remove_seen_items = (
strtobool(args.remove_seen_items) if args.remove_seen_items else None
)
normalize = strtobool(args.normalize) if args.normalize else None
sar_model = load_model_from_directory(args.trained_model, model_loader=joblib_loader).data
sar_model = load_model_from_directory(
args.trained_model, model_loader=joblib_loader
).data
dataset_to_score = load_data_frame_from_directory(args.dataset_to_score).data
logger.debug(f"Shape of loaded DataFrame: {dataset_to_score.shape}")
@ -102,14 +136,22 @@ if __name__ == '__main__':
score_type = ScoreType(args.score_type)
if score_type == ScoreType.ITEM_RECOMMENDATION:
score_result = score_sar_module.recommend_items(ranking_metric=RankingMetric(args.ranking_metric),
top_k=args.top_k, sort_top_k=sort_top_k,
remove_seen=args.remove_seen_items, normalize=normalize)
score_result = score_sar_module.recommend_items(
ranking_metric=RankingMetric(args.ranking_metric),
top_k=args.top_k,
sort_top_k=sort_top_k,
remove_seen=args.remove_seen_items,
normalize=normalize,
)
elif score_type == ScoreType.RATING_PREDICTION:
score_result = score_sar_module.predict_ratings(items_to_predict=ItemSet(args.items_to_predict),
normalize=normalize)
score_result = score_sar_module.predict_ratings(
items_to_predict=ItemSet(args.items_to_predict), normalize=normalize
)
else:
raise ValueError(f"Got unexpected score type: {score_type}.")
save_data_frame_to_directory(args.score_result, data=score_result,
schema=DataFrameSchema.data_frame_to_dict(score_result))
save_data_frame_to_directory(
args.score_result,
data=score_result,
schema=DataFrameSchema.data_frame_to_dict(score_result),
)
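The boolean-like flags above arrive as strings, so the module coerces them with strtobool. A small sketch of that pattern follows; the distutils import is an assumption, since the diff does not show the module's import block.

from distutils.util import strtobool

for raw in ("True", "true", "1", "False", "no", None):
    # mirrors `strtobool(args.sort_top_k) if args.sort_top_k else None` above;
    # note that strtobool returns 1/0 rather than True/False
    value = strtobool(raw) if raw else None
    print(raw, "->", value)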

View file

@ -3,44 +3,40 @@ import argparse
from azureml.studio.core.logger import module_logger as logger
from reco_utils.dataset.python_splitters import python_stratified_split
from azureml.studio.core.data_frame_schema import DataFrameSchema
from azureml.studio.core.io.data_frame_directory import load_data_frame_from_directory, save_data_frame_to_directory
from azureml.studio.core.io.data_frame_directory import (
load_data_frame_from_directory,
save_data_frame_to_directory,
)
if __name__ == '__main__':
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
'--input-path',
help='The input directory.',
"--input-path", help="The input directory.",
)
parser.add_argument(
'--ratio', type=float,
help='A float parameter.',
"--ratio", type=float, help="A float parameter.",
)
parser.add_argument(
'--col-user', type=str,
help='A string parameter.',
"--col-user", type=str, help="A string parameter.",
)
parser.add_argument(
'--col-item', type=str,
help='A string parameter.',
"--col-item", type=str, help="A string parameter.",
)
parser.add_argument(
'--seed', type=int,
help='An int parameter.',
"--seed", type=int, help="An int parameter.",
)
parser.add_argument(
'--output-train',
help='The output training data directory.',
"--output-train", help="The output training data directory.",
)
parser.add_argument(
'--output-test',
help='The output test data directory.',
"--output-test", help="The output test data directory.",
)
args, _ = parser.parse_known_args()
@ -62,12 +58,24 @@ if __name__ == '__main__':
logger.debug(f"Shape of loaded DataFrame: {input_df.shape}")
logger.debug(f"Cols of DataFrame: {input_df.columns}")
output_train, output_test = python_stratified_split(input_df, ratio=args.ratio, col_user=args.col_user,
col_item=args.col_item, seed=args.seed)
output_train, output_test = python_stratified_split(
input_df,
ratio=args.ratio,
col_user=args.col_user,
col_item=args.col_item,
seed=args.seed,
)
logger.debug(f"Output path: {args.output_train}")
logger.debug(f"Output path: {args.output_test}")
save_data_frame_to_directory(args.output_train, output_train, schema=DataFrameSchema.data_frame_to_dict(output_train))
save_data_frame_to_directory(args.output_test, output_test, schema=DataFrameSchema.data_frame_to_dict(output_test))
save_data_frame_to_directory(
args.output_train,
output_train,
schema=DataFrameSchema.data_frame_to_dict(output_train),
)
save_data_frame_to_directory(
args.output_test,
output_test,
schema=DataFrameSchema.data_frame_to_dict(output_test),
)
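For context, a minimal sketch of the splitter this module wraps; the toy frame and column names are illustrative.

import pandas as pd
from reco_utils.dataset.python_splitters import python_stratified_split

ratings = pd.DataFrame(
    {
        "UserId": [1, 1, 1, 2, 2, 2, 3, 3, 3],
        "MovieId": [1, 2, 3, 1, 2, 3, 1, 2, 3],
        "Rating": [5, 4, 3, 3, 4, 5, 4, 4, 4],
    }
)

# stratified per user, roughly a 75/25 split
train, test = python_stratified_split(
    ratings, ratio=0.75, col_user="UserId", col_item="MovieId", seed=42
)
print(train.shape, test.shape)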

View file

@ -9,10 +9,12 @@ import shutil
import papermill as pm
import tensorflow as tf
print("TensorFlow version:", tf.VERSION)
try:
from azureml.core import Run
run = Run.get_context()
except ImportError:
run = None
@ -20,15 +22,11 @@ except ImportError:
from reco_utils.common.constants import (
DEFAULT_USER_COL,
DEFAULT_ITEM_COL,
DEFAULT_RATING_COL
DEFAULT_RATING_COL,
)
NOTEBOOK_NAME = os.path.join(
"notebooks",
"00_quick_start",
"wide_deep_movielens.ipynb"
)
NOTEBOOK_NAME = os.path.join("notebooks", "00_quick_start", "wide_deep_movielens.ipynb")
OUTPUT_NOTEBOOK = "wide_deep.ipynb"
@ -39,7 +37,11 @@ def _log(metric, value):
Otherwise, record as a single value of the metric.
"""
if run is not None:
if isinstance(value, list) and len(value) > 0 and isinstance(value[0], (int, float)):
if (
isinstance(value, list)
and len(value) > 0
and isinstance(value[0], (int, float))
):
run.log_list(metric, value)
else:
# Force cast to str since run.log will raise an error if the value is iterable.
@ -50,58 +52,96 @@ def _log(metric, value):
# Parse arguments passed by Hyperdrive
parser = argparse.ArgumentParser()
parser.add_argument('--top-k', type=int, dest='TOP_K', help="Top k recommendation", default=10)
parser.add_argument(
"--top-k", type=int, dest="TOP_K", help="Top k recommendation", default=10
)
# Data path
parser.add_argument('--datastore', type=str, dest='DATA_DIR', help="Datastore path")
parser.add_argument('--train-datapath', type=str, dest='TRAIN_PICKLE_PATH')
parser.add_argument('--test-datapath', type=str, dest='TEST_PICKLE_PATH')
parser.add_argument('--model-dir', type=str, dest='MODEL_DIR', default='model_checkpoints')
parser.add_argument("--datastore", type=str, dest="DATA_DIR", help="Datastore path")
parser.add_argument("--train-datapath", type=str, dest="TRAIN_PICKLE_PATH")
parser.add_argument("--test-datapath", type=str, dest="TEST_PICKLE_PATH")
parser.add_argument(
"--model-dir", type=str, dest="MODEL_DIR", default="model_checkpoints"
)
# Data column names
parser.add_argument('--user-col', type=str, dest='USER_COL', default=DEFAULT_USER_COL)
parser.add_argument('--item-col', type=str, dest='ITEM_COL', default=DEFAULT_ITEM_COL)
parser.add_argument('--rating-col', type=str, dest='RATING_COL', default=DEFAULT_RATING_COL)
parser.add_argument('--item-feat-col', type=str, dest='ITEM_FEAT_COL') # Optional
parser.add_argument('--ranking-metrics', type=str, nargs='*', dest='RANKING_METRICS', default=['ndcg_at_k'])
parser.add_argument('--rating-metrics', type=str, nargs='*', dest='RATING_METRICS', default=['rmse'])
parser.add_argument("--user-col", type=str, dest="USER_COL", default=DEFAULT_USER_COL)
parser.add_argument("--item-col", type=str, dest="ITEM_COL", default=DEFAULT_ITEM_COL)
parser.add_argument(
"--rating-col", type=str, dest="RATING_COL", default=DEFAULT_RATING_COL
)
parser.add_argument("--item-feat-col", type=str, dest="ITEM_FEAT_COL") # Optional
parser.add_argument(
"--ranking-metrics",
type=str,
nargs="*",
dest="RANKING_METRICS",
default=["ndcg_at_k"],
)
parser.add_argument(
"--rating-metrics", type=str, nargs="*", dest="RATING_METRICS", default=["rmse"]
)
# Model type: either 'wide', 'deep', or 'wide_deep'
parser.add_argument('--model-type', type=str, dest='MODEL_TYPE', default='wide_deep')
parser.add_argument("--model-type", type=str, dest="MODEL_TYPE", default="wide_deep")
# Wide model params
parser.add_argument('--linear-optimizer', type=str, dest='LINEAR_OPTIMIZER', default='Ftrl')
parser.add_argument('--linear-optimizer-lr', type=float, dest='LINEAR_OPTIMIZER_LR', default=0.01)
parser.add_argument('--linear-l1-reg', type=float, dest='LINEAR_L1_REG', default=0.0)
parser.add_argument('--linear-l2-reg', type=float, dest='LINEAR_L2_REG', default=0.0)
parser.add_argument('--linear-momentum', type=float, dest='LINEAR_MOMENTUM', default=0.9)
parser.add_argument(
"--linear-optimizer", type=str, dest="LINEAR_OPTIMIZER", default="Ftrl"
)
parser.add_argument(
"--linear-optimizer-lr", type=float, dest="LINEAR_OPTIMIZER_LR", default=0.01
)
parser.add_argument("--linear-l1-reg", type=float, dest="LINEAR_L1_REG", default=0.0)
parser.add_argument("--linear-l2-reg", type=float, dest="LINEAR_L2_REG", default=0.0)
parser.add_argument(
"--linear-momentum", type=float, dest="LINEAR_MOMENTUM", default=0.9
)
# Deep model params
parser.add_argument('--dnn-optimizer', type=str, dest='DNN_OPTIMIZER', default='Adagrad')
parser.add_argument('--dnn-optimizer-lr', type=float, dest='DNN_OPTIMIZER_LR', default=0.01)
parser.add_argument('--dnn-l1-reg', type=float, dest='DNN_L1_REG', default=0.0)
parser.add_argument('--dnn-l2-reg', type=float, dest='DNN_L2_REG', default=0.0)
parser.add_argument('--dnn-momentum', type=float, dest='DNN_MOMENTUM', default=0.9)
parser.add_argument('--dnn-hidden-layer-1', type=int, dest='DNN_HIDDEN_LAYER_1', default=0)
parser.add_argument('--dnn-hidden-layer-2', type=int, dest='DNN_HIDDEN_LAYER_2', default=0)
parser.add_argument('--dnn-hidden-layer-3', type=int, dest='DNN_HIDDEN_LAYER_3', default=128)
parser.add_argument('--dnn-hidden-layer-4', type=int, dest='DNN_HIDDEN_LAYER_4', default=128)
parser.add_argument('--dnn-user-embedding-dim', type=int, dest='DNN_USER_DIM', default=8)
parser.add_argument('--dnn-item-embedding-dim', type=int, dest='DNN_ITEM_DIM', default=8)
parser.add_argument('--dnn-batch-norm', type=int, dest='DNN_BATCH_NORM', default=1)
parser.add_argument('--dnn-dropout', type=float, dest='DNN_DROPOUT', default=0.0)
parser.add_argument(
"--dnn-optimizer", type=str, dest="DNN_OPTIMIZER", default="Adagrad"
)
parser.add_argument(
"--dnn-optimizer-lr", type=float, dest="DNN_OPTIMIZER_LR", default=0.01
)
parser.add_argument("--dnn-l1-reg", type=float, dest="DNN_L1_REG", default=0.0)
parser.add_argument("--dnn-l2-reg", type=float, dest="DNN_L2_REG", default=0.0)
parser.add_argument("--dnn-momentum", type=float, dest="DNN_MOMENTUM", default=0.9)
parser.add_argument(
"--dnn-hidden-layer-1", type=int, dest="DNN_HIDDEN_LAYER_1", default=0
)
parser.add_argument(
"--dnn-hidden-layer-2", type=int, dest="DNN_HIDDEN_LAYER_2", default=0
)
parser.add_argument(
"--dnn-hidden-layer-3", type=int, dest="DNN_HIDDEN_LAYER_3", default=128
)
parser.add_argument(
"--dnn-hidden-layer-4", type=int, dest="DNN_HIDDEN_LAYER_4", default=128
)
parser.add_argument(
"--dnn-user-embedding-dim", type=int, dest="DNN_USER_DIM", default=8
)
parser.add_argument(
"--dnn-item-embedding-dim", type=int, dest="DNN_ITEM_DIM", default=8
)
parser.add_argument("--dnn-batch-norm", type=int, dest="DNN_BATCH_NORM", default=1)
parser.add_argument("--dnn-dropout", type=float, dest="DNN_DROPOUT", default=0.0)
# Training parameters
parser.add_argument('--steps', type=int, dest='STEPS', default=10000)
parser.add_argument('--batch-size', type=int, dest='BATCH_SIZE', default=128)
parser.add_argument('--evaluate-while-training', dest='EVALUATE_WHILE_TRAINING', action='store_true')
parser.add_argument("--steps", type=int, dest="STEPS", default=10000)
parser.add_argument("--batch-size", type=int, dest="BATCH_SIZE", default=128)
parser.add_argument(
"--evaluate-while-training", dest="EVALUATE_WHILE_TRAINING", action="store_true"
)
args = parser.parse_args()
params = vars(args)
if params['TOP_K'] <= 0:
if params["TOP_K"] <= 0:
raise ValueError("Top K should be larger than 0")
if params['MODEL_TYPE'] not in {'wide', 'deep', 'wide_deep'}:
if params["MODEL_TYPE"] not in {"wide", "deep", "wide_deep"}:
raise ValueError("Model type should be either 'wide', 'deep', or 'wide_deep'")
if params['DATA_DIR'] is None:
if params["DATA_DIR"] is None:
raise ValueError("Datastore path should be given")
print("Args:")
@ -111,10 +151,7 @@ for k, v in params.items():
print("Run", NOTEBOOK_NAME)
pm.execute_notebook(
NOTEBOOK_NAME,
OUTPUT_NOTEBOOK,
parameters=params,
kernel_name='python3'
NOTEBOOK_NAME, OUTPUT_NOTEBOOK, parameters=params, kernel_name="python3"
)
nb = pm.read_notebook(OUTPUT_NOTEBOOK)
@ -123,4 +160,4 @@ for m, v in nb.data.items():
# clean-up
os.remove(OUTPUT_NOTEBOOK)
shutil.rmtree(params['MODEL_DIR'], ignore_errors=True)
shutil.rmtree(params["MODEL_DIR"], ignore_errors=True)

View file

@ -9,10 +9,10 @@ DEFAULT_LABEL_COL = "label"
DEFAULT_TIMESTAMP_COL = "timestamp"
DEFAULT_PREDICTION_COL = "prediction"
COL_DICT = {
"col_user": DEFAULT_USER_COL,
"col_item": DEFAULT_ITEM_COL,
"col_rating": DEFAULT_RATING_COL,
"col_prediction": DEFAULT_PREDICTION_COL
"col_user": DEFAULT_USER_COL,
"col_item": DEFAULT_ITEM_COL,
"col_rating": DEFAULT_RATING_COL,
"col_prediction": DEFAULT_PREDICTION_COL,
}
# Filtering variables

View file

@ -125,4 +125,3 @@ def get_cudnn_version():
else:
raise ValueError("Not in Windows, Linux or Mac")
return find_cudnn_in_headers(candidates)

View file

@ -15,7 +15,7 @@ from __future__ import division # 1/2 == 0.5, as in Py3
from __future__ import absolute_import # avoid hiding global modules with locals
from __future__ import print_function # force use of print("hello")
from __future__ import (
unicode_literals
unicode_literals,
) # force unadorned strings "" to be Unicode without prepending u""
import time
import memory_profiler
@ -96,4 +96,3 @@ def pre_run_cell():
"""Capture current time before we execute the current command"""
global t1
t1 = time.time()

View file

@ -34,10 +34,14 @@ def line_graph(
# Setup figure only once
if subplot[2] == 1:
if plot_size:
plt.figure(figsize=(
plot_size[0]*subplot[1], # fig width = plot width * num columns
plot_size[1]*subplot[0] # fig height = plot height * num rows
))
plt.figure(
figsize=(
plot_size[0]
* subplot[1], # fig width = plot width * num columns
plot_size[1]
* subplot[0], # fig height = plot height * num rows
)
)
plt.subplots_adjust(wspace=0.5)
plt.subplot(*subplot)
else:

View file

@ -354,4 +354,4 @@ class MetricsLogger:
Returns:
dict: Log metrics.
"""
return self._log
return self._log

View file

@ -68,4 +68,3 @@ class Timer(object):
raise ValueError("Timer has not been stopped, please use stop().")
else:
return self._interval

View file

@ -96,4 +96,3 @@ def find_database(client, id):
return True
else:
return False

View file

@ -174,4 +174,3 @@ def get_spark_schema(header=DEFAULT_HEADER):
for i in range(26):
schema.add(StructField(header[i + n_ints], StringType()))
return schema

View file

@ -78,4 +78,4 @@ def download_path(path=None):
tmp_dir.cleanup()
else:
path = os.path.realpath(path)
yield path
yield path

View file

@ -199,7 +199,7 @@ def load_pandas_df(
movie_col = header[1]
with download_path(local_cache_path) as path:
filepath = os.path.join(path, "ml-{}.zip".format(size))
filepath = os.path.join(path, "ml-{}.zip".format(size))
datapath, item_datapath = _maybe_download_and_extract(size, filepath)
# Load movie features such as title, genres, and release year
@ -256,7 +256,7 @@ def load_item_df(
raise ValueError(ERROR_MOVIE_LENS_SIZE)
with download_path(local_cache_path) as path:
filepath = os.path.join(path, "ml-{}.zip".format(size))
filepath = os.path.join(path, "ml-{}.zip".format(size))
_, item_datapath = _maybe_download_and_extract(size, filepath)
item_df = _load_item_df(
size, item_datapath, movie_col, title_col, genres_col, year_col
@ -404,14 +404,16 @@ def load_spark_df(
movie_col = schema[1].name
with download_path(local_cache_path) as path:
filepath = os.path.join(path, "ml-{}.zip".format(size))
filepath = os.path.join(path, "ml-{}.zip".format(size))
datapath, item_datapath = _maybe_download_and_extract(size, filepath)
spark_datapath = "file:///" + datapath # shorten form of file://localhost/
# Load movie features such as title, genres, and release year.
# Since the file size is small, we directly load as pd.DataFrame from the driver node
# and then convert into spark.DataFrame
item_pd_df = _load_item_df(size, item_datapath, movie_col, title_col, genres_col, year_col)
item_pd_df = _load_item_df(
size, item_datapath, movie_col, title_col, genres_col, year_col
)
item_df = spark.createDataFrame(item_pd_df) if item_pd_df is not None else None
if is_databricks():
@ -467,8 +469,7 @@ def _get_schema(header, schema):
schema = StructType()
try:
(
schema
.add(StructField(header[0], IntegerType()))
schema.add(StructField(header[0], IntegerType()))
.add(StructField(header[1], IntegerType()))
.add(StructField(header[2], FloatType()))
.add(StructField(header[3], LongType()))
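A short usage sketch for the MovieLens loaders touched above; the custom header and title column are illustrative, and the call downloads and caches the 100k archive on first use.

from reco_utils.dataset import movielens

# load ratings plus the movie title as an extra item feature column
df = movielens.load_pandas_df(
    size="100k",
    header=["UserId", "MovieId", "Rating", "Timestamp"],
    title_col="Title",
)
print(df.head())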

View file

@ -220,7 +220,7 @@ class LibffmConverter:
def _convert(field, feature, field_index, field_feature_index_dict):
field_feature_index = field_feature_index_dict[(field, feature)]
if isinstance(feature, str):
if isinstance(feature, str):
feature = 1
return "{}:{}:{}".format(field_index, field_feature_index, feature)

View file

@ -623,7 +623,9 @@ def map_at_k(
# calculate reciprocal rank of items for each user and sum them up
df_hit_sorted = df_hit.copy()
df_hit_sorted["rr"] = (df_hit_sorted.groupby(col_user).cumcount() + 1) / df_hit_sorted["rank"]
df_hit_sorted["rr"] = (
df_hit_sorted.groupby(col_user).cumcount() + 1
) / df_hit_sorted["rank"]
df_hit_sorted = df_hit_sorted.groupby(col_user).agg({"rr": "sum"}).reset_index()
df_merge = pd.merge(df_hit_sorted, df_hit_count, on=col_user)
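A toy illustration of the reciprocal-rank expression reformatted above; df_hit here is a hypothetical stand-in with the columns the real computation expects.

import pandas as pd

col_user = "userID"
# one row per hit, with the rank each hit had in the user's top-k recommendation list
df_hit = pd.DataFrame({"userID": [1, 1, 1, 2], "rank": [1, 3, 4, 2]})

df_hit_sorted = df_hit.copy()
df_hit_sorted["rr"] = (
    df_hit_sorted.groupby(col_user).cumcount() + 1
) / df_hit_sorted["rank"]
print(df_hit_sorted)
# user 1 -> rr = [1/1, 2/3, 3/4]; user 2 -> rr = [1/2]; summing per user gives the MAP numerator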

View file

@ -486,6 +486,7 @@ def ndcg_score(y_true, y_score, k=10):
actual = dcg_score(y_true, y_score, k)
return actual / best
def hit_score(y_true, y_score, k=10):
"""Computing hit score metric at k.
@ -503,6 +504,7 @@ def hit_score(y_true, y_score, k=10):
return 1
return 0
def dcg_score(y_true, y_score, k=10):
"""Computing dcg score metric at k.
@ -521,7 +523,6 @@ def dcg_score(y_true, y_score, k=10):
return np.sum(gains / discounts)
def cal_metric(labels, preds, metrics):
"""Calculate metrics, such as auc, logloss.
@ -555,7 +556,7 @@ def cal_metric(labels, preds, metrics):
res["f1"] = round(f1, 4)
elif metric == "mean_mrr":
mean_mrr = np.mean(
[
[
mrr_score(each_labels, each_preds)
for each_labels, each_preds in zip(labels, preds)
]
@ -563,12 +564,12 @@ def cal_metric(labels, preds, metrics):
res["mean_mrr"] = round(mean_mrr, 4)
elif metric.startswith("ndcg"): # format like: ndcg@2;4;6;8
ndcg_list = [1, 2]
ks = metric.split('@')
ks = metric.split("@")
if len(ks) > 1:
ndcg_list = [int(token) for token in ks[1].split(';')]
ndcg_list = [int(token) for token in ks[1].split(";")]
for k in ndcg_list:
ndcg_temp = np.mean(
[
[
ndcg_score(each_labels, each_preds, k)
for each_labels, each_preds in zip(labels, preds)
]
@ -576,13 +577,13 @@ def cal_metric(labels, preds, metrics):
res["ndcg@{0}".format(k)] = round(ndcg_temp, 4)
elif metric.startswith("hit"): # format like: hit@2;4;6;8
hit_list = [1, 2]
ks = metric.split('@')
ks = metric.split("@")
if len(ks) > 1:
hit_list = [int(token) for token in ks[1].split(';')]
hit_list = [int(token) for token in ks[1].split(";")]
for k in hit_list:
hit_temp = np.mean(
[
hit_score(each_labels, each_preds, k)
hit_score(each_labels, each_preds, k)
for each_labels, each_preds in zip(labels, preds)
]
)
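The ndcg and hit branches above share the same cutoff-string convention; parse_k_list below is a hypothetical helper that simply restates that parsing in isolation.

def parse_k_list(metric, default=(1, 2)):
    """Return the cutoffs encoded after '@' (e.g. 'ndcg@2;4;6;8'), or the default."""
    k_list = list(default)
    ks = metric.split("@")
    if len(ks) > 1:
        k_list = [int(token) for token in ks[1].split(";")]
    return k_list

print(parse_k_list("ndcg@2;4;6;8"))  # [2, 4, 6, 8]
print(parse_k_list("hit"))           # [1, 2]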

View file

@ -146,9 +146,16 @@ class DKNTextIterator(BaseIterator):
if not line:
break
label, candidate_news_index, candidate_news_val, click_news_index, click_news_val, candidate_news_entity_index, click_news_entity_index, impression_id = self.parser_one_line(
line
)
(
label,
candidate_news_index,
candidate_news_val,
click_news_index,
click_news_val,
candidate_news_entity_index,
click_news_entity_index,
impression_id,
) = self.parser_one_line(line)
candidate_news_index_batch.append(candidate_news_index)
candidate_news_val_batch.append(candidate_news_val)

View file

@ -29,6 +29,7 @@ class FFMTextIterator(BaseIterator):
Iterator will not load the whole data into memory. Instead, it loads data into memory
per mini-batch, so that large files can be used as input data.
"""
def __init__(self, hparams, graph, col_spliter=" ", ID_spliter="%"):
"""Initialize an iterator. Create necessary placeholders for the model.

View file

@ -16,6 +16,7 @@ class DKN(BaseModel):
Recommendation", in Proceedings of the 2018 World Wide Web Conference on World
Wide Web, 2018.
"""
def __init__(self, hparams, iterator_creator):
"""Initialization steps for DKN.
Compared with the BaseModel, DKN requires two different pre-computed embeddings,

View file

@ -23,6 +23,7 @@ class A2SVDModel(SequentialBaseModel):
the 28th International Joint Conferences on Artificial Intelligence, IJCAI19,
Pages 4213-4219, AAAI Press, 2019.
"""
def _build_seq_graph(self):
"""The main function to create A2SVD model.

View file

@ -16,6 +16,7 @@ class CaserModel(SequentialBaseModel):
sequence embedding", in Proceedings of the Eleventh ACM International Conference on
Web Search and Data Mining, ACM, 2018.
"""
def __init__(self, hparams, iterator_creator):
"""Initialization of variables for caser

View file

@ -17,6 +17,7 @@ class GRU4RecModel(SequentialBaseModel):
B. Hidasi, A. Karatzoglou, L. Baltrunas, D. Tikk, "Session-based Recommendations
with Recurrent Neural Networks", ICLR (Poster), 2016.
"""
def _build_seq_graph(self):
"""The main function to create GRU4Rec model.

View file

@ -21,6 +21,7 @@ class SLI_RECModel(SequentialBaseModel):
the 28th International Joint Conferences on Artificial Intelligence, IJCAI19,
Pages 4213-4219, AAAI Press, 2019.
"""
def _build_seq_graph(self):
"""The main function to create sli_rec model.
@ -112,7 +113,7 @@ class SLI_RECModel(SequentialBaseModel):
shape=[user_embedding.shape.as_list()[-1], query_size],
initializer=self.initializer,
)
att_inputs = tf.tensordot(user_embedding, attention_mat, [[2],[0]])
att_inputs = tf.tensordot(user_embedding, attention_mat, [[2], [0]])
queries = tf.reshape(
tf.tile(query, [1, att_inputs.shape[1].value]), tf.shape(att_inputs)

View file

@ -18,6 +18,7 @@ class XDeepFMModel(BaseModel):
24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining,
KDD 2018, London, 2018.
"""
def _build_graph(self):
"""The main function to create xdeepfm's logic.

View file

@ -81,4 +81,3 @@ def hide_fastai_progress_bar():
master_bar,
progress_bar,
)

View file

@ -15,11 +15,11 @@ def model_perf_plots(df):
Returns:
matplotlib axes
"""
g = sns.FacetGrid(df, col="metric", hue='stage', col_wrap=2, sharey=False)
g = g.map(sns.scatterplot, "epoch", "value").add_legend()
g = sns.FacetGrid(df, col="metric", hue="stage", col_wrap=2, sharey=False)
g = g.map(sns.scatterplot, "epoch", "value").add_legend()
def compare_metric(df_list, metric='prec', stage='test'):
def compare_metric(df_list, metric="prec", stage="test"):
"""Function to combine and prepare list of dataframes into tidy format
Args:
df_list (list): List of dataframes
@ -29,18 +29,29 @@ def compare_metric(df_list, metric='prec', stage='test'):
Returns:
Pandas dataframe
"""
colnames = ['model'+str(x) for x in list(range(1,len(df_list)+1))]
models = [df[(df['stage']==stage) & (df['metric']==metric)]['value'].reset_index(
drop=True).values for df in df_list]
colnames = ["model" + str(x) for x in list(range(1, len(df_list) + 1))]
models = [
df[(df["stage"] == stage) & (df["metric"] == metric)]["value"]
.reset_index(drop=True)
.values
for df in df_list
]
output = pd.DataFrame(zip(*models),
columns=colnames).stack().reset_index()
output.columns = ['epoch','data','value']
return output
output = pd.DataFrame(zip(*models), columns=colnames).stack().reset_index()
output.columns = ["epoch", "data", "value"]
return output
def track_model_metrics(model, train_interactions, test_interactions, k=10,
no_epochs=100, no_threads=8, show_plot=True, **kwargs):
def track_model_metrics(
model,
train_interactions,
test_interactions,
k=10,
no_epochs=100,
no_threads=8,
show_plot=True,
**kwargs
):
"""Function to record the model's performance at each epoch, format the results into tidy format,
plot the performance, and output the performance data
Args:
@ -58,36 +69,51 @@ def track_model_metrics(model, train_interactions, test_interactions, k=10,
matplotlib axes: side effect of the method
"""
# initialising temp data storage
model_prec_train = [0]*no_epochs
model_prec_test = [0]*no_epochs
model_prec_train = [0] * no_epochs
model_prec_test = [0] * no_epochs
model_rec_train = [0]*no_epochs
model_rec_test = [0]*no_epochs
# fit model and store train/test metrics at each epoch
model_rec_train = [0] * no_epochs
model_rec_test = [0] * no_epochs
# fit model and store train/test metrics at each epoch
for epoch in range(no_epochs):
# print(f'Epoch: {epoch}/{epochs}')
model.fit_partial(interactions=train_interactions, epochs=1,
num_threads=no_threads, **kwargs)
model_prec_train[epoch] = precision_at_k(model, train_interactions, k=k, **kwargs).mean()
model_prec_test[epoch] = precision_at_k(model, test_interactions, k=k, **kwargs).mean()
# print(f'Epoch: {epoch}/{epochs}')
model.fit_partial(
interactions=train_interactions, epochs=1, num_threads=no_threads, **kwargs
)
model_prec_train[epoch] = precision_at_k(
model, train_interactions, k=k, **kwargs
).mean()
model_prec_test[epoch] = precision_at_k(
model, test_interactions, k=k, **kwargs
).mean()
model_rec_train[epoch] = recall_at_k(
model, train_interactions, k=k, **kwargs
).mean()
model_rec_test[epoch] = recall_at_k(
model, test_interactions, k=k, **kwargs
).mean()
model_rec_train[epoch] = recall_at_k(model, train_interactions, k=k, **kwargs).mean()
model_rec_test[epoch] = recall_at_k(model, test_interactions, k=k, **kwargs).mean()
# collect the performance metrics into a dataframe
fitting_metrics = pd.DataFrame(zip(model_prec_train, model_prec_test,
model_rec_train, model_rec_test),
columns=['model_prec_train', 'model_prec_test', 'model_rec_train', 'model_rec_test'])
fitting_metrics = pd.DataFrame(
zip(model_prec_train, model_prec_test, model_rec_train, model_rec_test),
columns=[
"model_prec_train",
"model_prec_test",
"model_rec_train",
"model_rec_test",
],
)
# convert into tidy format
fitting_metrics = fitting_metrics.stack().reset_index()
fitting_metrics.columns = ['epoch','level','value']
fitting_metrics.columns = ["epoch", "level", "value"]
# extract the labels for each observation
fitting_metrics['stage'] = fitting_metrics.level.str.split('_').str[-1]
fitting_metrics['metric'] = fitting_metrics.level.str.split('_').str[1]
fitting_metrics.drop(['level'], axis = 1, inplace=True)
fitting_metrics["stage"] = fitting_metrics.level.str.split("_").str[-1]
fitting_metrics["metric"] = fitting_metrics.level.str.split("_").str[1]
fitting_metrics.drop(["level"], axis=1, inplace=True)
# replace the metric keys to improve visualisation
metric_keys = {'prec':'Precision', 'rec':'Recall'}
metric_keys = {"prec": "Precision", "rec": "Recall"}
fitting_metrics.metric.replace(metric_keys, inplace=True)
# plots the performance data
if show_plot == True:
@ -115,9 +141,11 @@ def similar_users(user_id, user_features, model, N=10):
user_norms[user_norms == 0] = 1e-10
scores /= user_norms
best = np.argpartition(scores, -(N+1))[-(N+1):]
return pd.DataFrame(sorted(zip(best, scores[best] / user_norms[user_id]),
key=lambda x: -x[1])[1:], columns = ['userID', 'score'])
best = np.argpartition(scores, -(N + 1))[-(N + 1) :]
return pd.DataFrame(
sorted(zip(best, scores[best] / user_norms[user_id]), key=lambda x: -x[1])[1:],
columns=["userID", "score"],
)
def similar_items(item_id, item_features, model, N=10):
@ -133,16 +161,18 @@ def similar_items(item_id, item_features, model, N=10):
Pandas dataframe of top N most similar items with score
"""
_, item_representations = model.get_item_representations(features=item_features)
# Cosine similarity
scores = item_representations.dot(item_representations[item_id, :])
item_norms = np.linalg.norm(item_representations, axis=1)
item_norms[item_norms == 0] = 1e-10
scores /= item_norms
best = np.argpartition(scores, -(N+1))[-(N+1):]
return pd.DataFrame(sorted(zip(best, scores[best] / item_norms[item_id]),
key=lambda x: -x[1])[1:], columns = ['itemID', 'score'])
best = np.argpartition(scores, -(N + 1))[-(N + 1) :]
return pd.DataFrame(
sorted(zip(best, scores[best] / item_norms[item_id]), key=lambda x: -x[1])[1:],
columns=["itemID", "score"],
)
def prepare_test_df(test_idx, uids, iids, uid_map, iid_map, weights):
@ -158,24 +188,32 @@ def prepare_test_df(test_idx, uids, iids, uid_map, iid_map, weights):
Returns:
Pandas dataframe of user-item selected for testing
"""
test_df = pd.DataFrame(zip(
uids[test_idx],
iids[test_idx],
[list(uid_map.keys())[x] for x in uids[test_idx]],
[list(iid_map.keys())[x] for x in iids[test_idx]]),
columns=['uid','iid','userID','itemID'])
test_df = pd.DataFrame(
zip(
uids[test_idx],
iids[test_idx],
[list(uid_map.keys())[x] for x in uids[test_idx]],
[list(iid_map.keys())[x] for x in iids[test_idx]],
),
columns=["uid", "iid", "userID", "itemID"],
)
dok_weights = weights.todok()
test_df['rating'] = test_df.apply(
lambda x:dok_weights[x.uid,x.iid], axis=1)
test_df["rating"] = test_df.apply(lambda x: dok_weights[x.uid, x.iid], axis=1)
return test_df[['userID', 'itemID', 'rating']]
return test_df[["userID", "itemID", "rating"]]
def prepare_all_predictions(data, uid_map, iid_map, interactions,
model, num_threads,
user_features=None,
item_features=None):
def prepare_all_predictions(
data,
uid_map,
iid_map,
interactions,
model,
num_threads,
user_features=None,
item_features=None,
):
"""Function to prepare all predictions for evaluation
Args:
data (pandas df): dataframe of all users, items and ratings as loaded
@ -193,25 +231,31 @@ def prepare_all_predictions(data, uid_map, iid_map, interactions,
users, items, preds = [], [], []
item = list(data.itemID.unique())
for user in data.userID.unique():
user = [user] * len(item)
user = [user] * len(item)
users.extend(user)
items.extend(item)
all_predictions = pd.DataFrame(data={"userID": users, "itemID":items})
all_predictions['uid'] = all_predictions.userID.map(uid_map)
all_predictions['iid'] = all_predictions.itemID.map(iid_map)
all_predictions = pd.DataFrame(data={"userID": users, "itemID": items})
all_predictions["uid"] = all_predictions.userID.map(uid_map)
all_predictions["iid"] = all_predictions.itemID.map(iid_map)
dok_weights = interactions.todok()
all_predictions['rating'] = all_predictions.apply(
lambda x: dok_weights[x.uid,x.iid], axis=1)
all_predictions["rating"] = all_predictions.apply(
lambda x: dok_weights[x.uid, x.iid], axis=1
)
all_predictions = all_predictions[all_predictions.rating < 1].reset_index(drop=True)
all_predictions = all_predictions.drop('rating', axis=1)
all_predictions['prediction'] = all_predictions.apply(lambda x: model.predict(
user_ids=x['uid'], item_ids=[x['iid']],
user_features=user_features,
item_features=item_features,
num_threads=num_threads)[0], axis=1)
return all_predictions[['userID','itemID','prediction']]
all_predictions = all_predictions.drop("rating", axis=1)
all_predictions["prediction"] = all_predictions.apply(
lambda x: model.predict(
user_ids=x["uid"],
item_ids=[x["iid"]],
user_features=user_features,
item_features=item_features,
num_threads=num_threads,
)[0],
axis=1,
)
return all_predictions[["userID", "itemID", "prediction"]]

Просмотреть файл

@ -53,12 +53,12 @@ class NCF:
seed (int): Seed.
"""
# seed
tf.set_random_seed(seed)
np.random.seed(seed)
self.seed = seed
self.n_users = n_users
self.n_items = n_items
self.model_type = model_type.lower()
@ -105,7 +105,10 @@ class NCF:
# set embedding table
self.embedding_gmf_P = tf.Variable(
tf.truncated_normal(
shape=[self.n_users, self.n_factors], mean=0.0, stddev=0.01, seed=self.seed,
shape=[self.n_users, self.n_factors],
mean=0.0,
stddev=0.01,
seed=self.seed,
),
name="embedding_gmf_P",
dtype=tf.float32,
@ -113,7 +116,10 @@ class NCF:
self.embedding_gmf_Q = tf.Variable(
tf.truncated_normal(
shape=[self.n_items, self.n_factors], mean=0.0, stddev=0.01, seed=self.seed,
shape=[self.n_items, self.n_factors],
mean=0.0,
stddev=0.01,
seed=self.seed,
),
name="embedding_gmf_Q",
dtype=tf.float32,
@ -174,7 +180,9 @@ class NCF:
output,
num_outputs=layer_size,
activation_fn=tf.nn.relu,
weights_initializer=tf.contrib.layers.xavier_initializer(seed=self.seed),
weights_initializer=tf.contrib.layers.xavier_initializer(
seed=self.seed
),
)
self.mlp_vector = output
@ -189,7 +197,9 @@ class NCF:
num_outputs=1,
activation_fn=None,
biases_initializer=None,
weights_initializer=tf.contrib.layers.xavier_initializer(seed=self.seed),
weights_initializer=tf.contrib.layers.xavier_initializer(
seed=self.seed
),
)
self.output = tf.sigmoid(output)
@ -200,7 +210,9 @@ class NCF:
num_outputs=1,
activation_fn=None,
biases_initializer=None,
weights_initializer=tf.contrib.layers.xavier_initializer(seed=self.seed),
weights_initializer=tf.contrib.layers.xavier_initializer(
seed=self.seed
),
)
self.output = tf.sigmoid(output)
@ -213,7 +225,9 @@ class NCF:
num_outputs=1,
activation_fn=None,
biases_initializer=None,
weights_initializer=tf.contrib.layers.xavier_initializer(seed=self.seed),
weights_initializer=tf.contrib.layers.xavier_initializer(
seed=self.seed
),
)
self.output = tf.sigmoid(output)
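
For orientation, the GMF branch whose initializers are re-wrapped above scores a user-item pair by an element-wise product of the two embeddings followed by a linear layer and a sigmoid. The numpy sketch below is a simplification with untrained stand-in weights (plain normals instead of truncated normals) and ignores the MLP branch entirely.

# Simplified numpy sketch of GMF scoring (illustration; the real model is trained in TF).
import numpy as np

rng = np.random.RandomState(42)
n_users, n_items, n_factors = 10, 15, 4

embedding_gmf_P = rng.normal(0.0, 0.01, size=(n_users, n_factors))  # user table
embedding_gmf_Q = rng.normal(0.0, 0.01, size=(n_items, n_factors))  # item table

def gmf_score(user_id, item_id, w=None):
    gmf_vector = embedding_gmf_P[user_id] * embedding_gmf_Q[item_id]  # element-wise
    if w is None:
        w = np.ones(n_factors)                  # untrained stand-in output weights
    logit = gmf_vector.dot(w)
    return 1.0 / (1.0 + np.exp(-logit))         # sigmoid, as in the output layer above

print(gmf_score(0, 3))
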

Просмотреть файл

@ -120,7 +120,7 @@ class LSTURModel(BaseModel):
recurrent_initializer=keras.initializers.glorot_uniform(seed=self.seed),
bias_initializer=keras.initializers.Zeros(),
)(layers.Masking(mask_value=0.0)(click_title_presents))
user_present = layers.Concatenate()([short_uemb, long_u_emb])
user_present = layers.Dense(
hparams.gru_unit,

Просмотреть файл

@ -51,7 +51,7 @@ class NAMLModel(BaseModel):
batch_data["candidate_title_batch"],
batch_data["candidate_body_batch"],
batch_data["candidate_vert_batch"],
batch_data["candidate_subvert_batch"]
batch_data["candidate_subvert_batch"],
]
input_label = batch_data["labels"]
return input_feat, input_label
@ -96,7 +96,9 @@ class NAMLModel(BaseModel):
click_news_presents = layers.TimeDistributed(newsencoder)(
his_input_title_body_verts
)
user_present = AttLayer2(hparams.attention_hidden_dim, seed=self.seed)(click_news_presents)
user_present = AttLayer2(hparams.attention_hidden_dim, seed=self.seed)(
click_news_presents
)
model = keras.Model(
his_input_title_body_verts, user_present, name="user_encoder"
@ -145,7 +147,9 @@ class NAMLModel(BaseModel):
concate_repr = layers.Concatenate(axis=-2)(
[title_repr, body_repr, vert_repr, subvert_repr]
)
news_repr = AttLayer2(hparams.attention_hidden_dim, seed=self.seed)(concate_repr)
news_repr = AttLayer2(hparams.attention_hidden_dim, seed=self.seed)(
concate_repr
)
model = keras.Model(input_title_body_verts, news_repr, name="news_encoder")
return model
@ -170,7 +174,7 @@ class NAMLModel(BaseModel):
activation=hparams.cnn_activation,
padding="same",
bias_initializer=keras.initializers.Zeros(),
kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed)
kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
)(y)
y = layers.Dropout(hparams.dropout)(y)
pred_title = AttLayer2(hparams.attention_hidden_dim, seed=self.seed)(y)
@ -199,7 +203,7 @@ class NAMLModel(BaseModel):
activation=hparams.cnn_activation,
padding="same",
bias_initializer=keras.initializers.Zeros(),
kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed)
kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
)(y)
y = layers.Dropout(hparams.dropout)(y)
pred_body = AttLayer2(hparams.attention_hidden_dim, seed=self.seed)(y)
@ -223,10 +227,10 @@ class NAMLModel(BaseModel):
vert_emb = vert_embedding(input_vert)
pred_vert = layers.Dense(
hparams.filter_num,
hparams.filter_num,
activation=hparams.dense_activation,
bias_initializer=keras.initializers.Zeros(),
kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed)
kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
)(vert_emb)
pred_vert = layers.Reshape((1, hparams.filter_num))(pred_vert)
@ -248,10 +252,10 @@ class NAMLModel(BaseModel):
subvert_emb = subvert_embedding(input_subvert)
pred_subvert = layers.Dense(
hparams.filter_num,
hparams.filter_num,
activation=hparams.dense_activation,
bias_initializer=keras.initializers.Zeros(),
kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed)
kernel_initializer=keras.initializers.glorot_uniform(seed=self.seed),
)(subvert_emb)
pred_subvert = layers.Reshape((1, hparams.filter_num))(pred_subvert)
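
The AttLayer2 calls being re-wrapped in this file apply a learned soft attention over a sequence of representations (clicked news for the user encoder, the title/body/vert/subvert views for the news encoder). A rough numpy sketch of additive attention with random stand-in weights, purely for illustration:

# Rough sketch of additive attention over a sequence of vectors (illustration only).
import numpy as np

rng = np.random.RandomState(0)
seq_len, dim, attention_hidden_dim = 4, 8, 5

inputs = rng.rand(seq_len, dim)            # e.g. one vector per view or clicked item
W = rng.rand(dim, attention_hidden_dim)    # stand-in learned projection
b = np.zeros(attention_hidden_dim)
q = rng.rand(attention_hidden_dim)         # stand-in query vector

scores = np.tanh(inputs @ W + b) @ q               # one unnormalized score per position
weights = np.exp(scores) / np.exp(scores).sum()    # softmax
attended = weights @ inputs                        # weighted sum, shape (dim,)
print(attended.shape)
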

Просмотреть файл

@ -15,6 +15,7 @@ log = logging.getLogger(__name__)
class RBM:
"""Restricted Boltzmann Machine"""
def __init__(
self,
hidden_units=500,
@ -27,7 +28,7 @@ class RBM:
sampling_protocol=[50, 70, 80, 90, 100],
debug=False,
with_metrics=False,
seed=42
seed=42,
):
"""Implementation of a multinomial Restricted Boltzmann Machine for collaborative filtering
in numpy/pandas/tensorflow
@ -146,8 +147,6 @@ class RBM:
tf.Tensor: Float32 tensor of sampled units. The value is 1 if pr>g and 0 otherwise.
"""
# sample from a Bernoulli distribution with same dimensions as input distribution
g = tf.convert_to_tensor(np.random.uniform(size=pr.shape[1]), dtype=tf.float32)
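
The docstring above describes sampling binary units by comparing activation probabilities against uniform noise. A tiny numpy version of that comparison, for illustration only:

# Bernoulli sampling of hidden units from their activation probabilities (illustration).
import numpy as np

rng = np.random.RandomState(1)
pr = rng.rand(1, 6)                        # activation probabilities, shape (batch, units)
g = rng.uniform(size=pr.shape[1])          # uniform noise, one draw per unit
h_sampled = (pr > g).astype(np.float32)    # 1 where pr > g, 0 otherwise
print(h_sampled)
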
@ -260,7 +259,9 @@ class RBM:
self.w = tf.get_variable(
"weight",
[self.Nvisible, self.Nhidden],
initializer=tf.random_normal_initializer(stddev=self.stdv, seed=self.seed),
initializer=tf.random_normal_initializer(
stddev=self.stdv, seed=self.seed
),
dtype="float32",
)
@ -278,7 +279,6 @@ class RBM:
dtype="float32",
)
def sample_hidden_units(self, vv):
"""Sampling: In RBM we use Contrastive divergence to sample the parameter space. In order to do that we need
to initialize the two conditional probabilities:
@ -355,7 +355,6 @@ class RBM:
return pvh, v_
def gibbs_sampling(self):
"""Gibbs sampling: Determines an estimate of the model configuration via sampling. In the binary
RBM we need to impose that unseen movies stay as such, i.e. the sampling phase should not modify
@ -431,7 +430,6 @@ class RBM:
if self.debug:
log.info("percentage of epochs covered so far %f2" % (epoch_percentage))
def accuracy(self, vp):
"""Train/Test Mean average precision

Просмотреть файл

@ -17,8 +17,17 @@ from reco_utils.recommender.ncf.ncf_singlenode import NCF
from reco_utils.recommender.ncf.dataset import Dataset as NCFDataset
from reco_utils.dataset import movielens
from reco_utils.dataset.python_splitters import python_chrono_split
from reco_utils.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k,
recall_at_k, get_top_k_items)
from reco_utils.evaluation.python_evaluation import (
rmse,
mae,
rsquared,
exp_var,
map_at_k,
ndcg_at_k,
precision_at_k,
recall_at_k,
get_top_k_items,
)
from reco_utils.common.constants import SEED as DEFAULT_SEED
logging.basicConfig(level=logging.DEBUG)
@ -39,16 +48,16 @@ def ncf_training(params):
data = NCFDataset(train=train_data, test=validation_data, seed=DEFAULT_SEED)
model = NCF (
n_users=data.n_users,
model = NCF(
n_users=data.n_users,
n_items=data.n_items,
model_type="NeuMF",
n_factors=params["n_factors"],
layer_sizes=[16,8,4],
layer_sizes=[16, 8, 4],
n_epochs=params["n_epochs"],
learning_rate=params["learning_rate"],
verbose=params["verbose"],
seed=DEFAULT_SEED
seed=DEFAULT_SEED,
)
model.fit(data)
@ -58,12 +67,18 @@ def ncf_training(params):
metrics_dict = {}
rating_metrics = params["rating_metrics"]
if len(rating_metrics) > 0:
predictions = [[row.userID, row.itemID, model.predict(row.userID, row.itemID)]
for (_, row) in validation_data.iterrows()]
predictions = [
[row.userID, row.itemID, model.predict(row.userID, row.itemID)]
for (_, row) in validation_data.iterrows()
]
predictions = pd.DataFrame(
predictions, columns=["userID", "itemID", "prediction"]
)
predictions = predictions.astype(
{"userID": "int64", "itemID": "int64", "prediction": "float64"}
)
predictions = pd.DataFrame(predictions, columns=['userID', 'itemID', 'prediction'])
predictions = predictions.astype({'userID': 'int64', 'itemID': 'int64', 'prediction': 'float64'})
for metric in rating_metrics:
result = getattr(evaluation, metric)(validation_data, predictions)
logger.debug("%s = %g", metric, result)
@ -77,18 +92,25 @@ def ncf_training(params):
users, items, preds = [], [], []
item = list(train_data.itemID.unique())
for user in train_data.userID.unique():
user = [user] * len(item)
user = [user] * len(item)
users.extend(user)
items.extend(item)
preds.extend(list(model.predict(user, item, is_list=True)))
all_predictions = pd.DataFrame(data={"userID": users, "itemID": items, "prediction": preds})
all_predictions = pd.DataFrame(
data={"userID": users, "itemID": items, "prediction": preds}
)
merged = pd.merge(train_data, all_predictions, on=["userID", "itemID"], how="outer")
all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1)
merged = pd.merge(
train_data, all_predictions, on=["userID", "itemID"], how="outer"
)
all_predictions = merged[merged.rating.isnull()].drop("rating", axis=1)
for metric in ranking_metrics:
result = getattr(evaluation, metric)(
validation_data, all_predictions, col_prediction="prediction", k=params["k"]
validation_data,
all_predictions,
col_prediction="prediction",
k=params["k"],
)
logger.debug("%s@%d = %g", metric, params["k"], result)
if metric == params["primary_metric"]:

Просмотреть файл

@ -109,7 +109,10 @@ def get_trials(optimize_mode):
raise ValueError("optimize_mode should equal either minimize or maximize")
all_trials = requests.get(NNI_TRIAL_JOBS_URL).json()
trials = [
(ast.literal_eval(ast.literal_eval(trial['finalMetricData'][0]['data'])), trial["logPath"].split(":")[-1])
(
ast.literal_eval(ast.literal_eval(trial["finalMetricData"][0]["data"])),
trial["logPath"].split(":")[-1],
)
for trial in all_trials
]
sorted_trials = sorted(
@ -142,8 +145,10 @@ def start_nni(config_path, wait=WAITING_TIME, max_retries=MAX_RETRIES):
max_retries (int): max number of retries
"""
nni_env = os.environ.copy()
nni_env['PATH'] = sys.prefix + '/bin:' + nni_env['PATH']
proc = subprocess.run([sys.prefix + '/bin/nnictl', 'create', '--config', config_path], env=nni_env)
nni_env["PATH"] = sys.prefix + "/bin:" + nni_env["PATH"]
proc = subprocess.run(
[sys.prefix + "/bin/nnictl", "create", "--config", config_path], env=nni_env
)
# proc = subprocess.run(["nnictl", "create", "--config", config_path], env=nni_env)
if proc.returncode != 0:
raise RuntimeError("'nnictl create' failed with code %d" % proc.returncode)

Просмотреть файл

@ -53,4 +53,3 @@ def generate_param_grid(params):
params_exp.append(param_exp)
return params_exp

Просмотреть файл

@ -27,4 +27,3 @@ if __name__ == "__main__":
deps += list(PIP_WIN32.values())
with open("requirements.txt", "w") as f:
f.write("\n".join(set(deps)))

Просмотреть файл

@ -16,59 +16,70 @@ from azureml.core import Run
def create_arg_parser():
parser = argparse.ArgumentParser(description='Process inputs')
parser = argparse.ArgumentParser(description="Process inputs")
# test folder
parser.add_argument("--testfolder", "-f",
action="store",
default="./tests/unit",
help="Folder where tests are located")
parser.add_argument("--num",
action="store",
default="99",
help="test num")
parser.add_argument(
"--testfolder",
"-f",
action="store",
default="./tests/unit",
help="Folder where tests are located",
)
parser.add_argument("--num", action="store", default="99", help="test num")
# test markers
parser.add_argument("--testmarkers", "-m",
action="store",
default="not notebooks and not spark and not gpu",
help="Specify test markers for test selection")
parser.add_argument(
"--testmarkers",
"-m",
action="store",
default="not notebooks and not spark and not gpu",
help="Specify test markers for test selection",
)
# test results file
parser.add_argument("--xmlname", "-j",
action="store",
default="reports/test-unit.xml",
help="Test results")
parser.add_argument(
"--xmlname",
"-j",
action="store",
default="reports/test-unit.xml",
help="Test results",
)
args = parser.parse_args()
return args
if __name__ == "__main__":
logger = logging.getLogger('submit_azureml_pytest.py')
logger = logging.getLogger("submit_azureml_pytest.py")
args = create_arg_parser()
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger.debug('junit_xml {}'.format(args.xmlname))
logger.debug("junit_xml {}".format(args.xmlname))
# Run.get_context() is needed to save context as pytest causes corruption
# of env vars
run = Run.get_context()
'''
"""
This is an example of a working subprocess.run for a unit test run:
subprocess.run(["pytest", "tests/unit",
"-m", "not notebooks and not spark and not gpu",
"--junitxml=reports/test-unit.xml"])
'''
logger.debug('args.junitxml {}'.format(args.xmlname))
logger.debug('junit= --junitxml={}'.format(args.xmlname))
pytest_cmd = ['pytest', args.testfolder, '-m', args.testmarkers,
'--junitxml={}'.format(args.xmlname)]
logger.info('pytest run:{}'.format(' '.join(pytest_cmd)))
"""
logger.debug("args.junitxml {}".format(args.xmlname))
logger.debug("junit= --junitxml={}".format(args.xmlname))
pytest_cmd = [
"pytest",
args.testfolder,
"-m",
args.testmarkers,
"--junitxml={}".format(args.xmlname),
]
logger.info("pytest run:{}".format(" ".join(pytest_cmd)))
subprocess.run(pytest_cmd)
#
# Leveraged code from this notebook:
# https://msdata.visualstudio.com/Vienna/_search?action=contents&text=upload_folder&type=code&lp=code-Project&filters=ProjectFilters%7BVienna%7DRepositoryFilters%7BAzureMlCli%7D&pageSize=25&sortOptions=%5B%7B%22field%22%3A%22relevance%22%2C%22sortOrder%22%3A%22desc%22%7D%5D&result=DefaultCollection%2FVienna%2FAzureMlCli%2FGBmaster%2F%2Fsrc%2Fazureml-core%2Fazureml%2Fcore%2Frun.py
logger.debug('os.listdir files {}'.format(os.listdir('.')))
logger.debug("os.listdir files {}".format(os.listdir(".")))
# files for AzureML
name_of_upload = "reports"

Просмотреть файл

@ -48,8 +48,9 @@ from azureml.core.compute_target import ComputeTargetException
from azureml.core.workspace import WorkspaceException
def setup_workspace(workspace_name, subscription_id, resource_group, cli_auth,
location):
def setup_workspace(
workspace_name, subscription_id, resource_group, cli_auth, location
):
"""
This sets up an Azure Workspace.
An existing Azure Workspace is used or a new one is created if needed for
@ -71,35 +72,34 @@ def setup_workspace(workspace_name, subscription_id, resource_group, cli_auth,
Returns:
ws: workspace reference
"""
logger.debug('setup: workspace_name is {}'.format(workspace_name))
logger.debug('setup: resource_group is {}'.format(resource_group))
logger.debug('setup: subid is {}'.format(subscription_id))
logger.debug('setup: location is {}'.format(location))
logger.debug("setup: workspace_name is {}".format(workspace_name))
logger.debug("setup: resource_group is {}".format(resource_group))
logger.debug("setup: subid is {}".format(subscription_id))
logger.debug("setup: location is {}".format(location))
try:
# use existing workspace if there is one
ws = Workspace.get(
name=workspace_name,
subscription_id=subscription_id,
resource_group=resource_group,
auth=cli_auth
)
# use existing workspace if there is one
ws = Workspace.get(
name=workspace_name,
subscription_id=subscription_id,
resource_group=resource_group,
auth=cli_auth,
)
except WorkspaceException:
# this call might take a minute or two.
logger.debug('Creating new workspace')
ws = Workspace.create(
name=workspace_name,
subscription_id=subscription_id,
resource_group=resource_group,
# create_resource_group=True,
location=location,
auth=cli_auth
)
# this call might take a minute or two.
logger.debug("Creating new workspace")
ws = Workspace.create(
name=workspace_name,
subscription_id=subscription_id,
resource_group=resource_group,
# create_resource_group=True,
location=location,
auth=cli_auth,
)
return ws
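
setup_workspace implements a get-or-create pattern: try Workspace.get and fall back to Workspace.create on a WorkspaceException. A condensed sketch of that pattern with placeholder names (assumes the azureml-sdk is installed and the Azure CLI is already authenticated):

# Get-or-create AzureML workspace (placeholder names; azureml-sdk and `az login` assumed).
from azureml.core import Workspace
from azureml.core.authentication import AzureCliAuthentication
from azureml.core.workspace import WorkspaceException

cli_auth = AzureCliAuthentication()
try:
    ws = Workspace.get(
        name="RecoWS",                        # placeholder workspace name
        subscription_id="<subscription-id>",  # placeholder
        resource_group="recommender",         # placeholder
        auth=cli_auth,
    )
except WorkspaceException:
    ws = Workspace.create(
        name="RecoWS",
        subscription_id="<subscription-id>",
        resource_group="recommender",
        location="EastUS",
        auth=cli_auth,
    )
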
def setup_persistent_compute_target(workspace, cluster_name, vm_size,
max_nodes):
def setup_persistent_compute_target(workspace, cluster_name, vm_size, max_nodes):
"""
Set up a persistent compute target on AzureML.
A persistent compute target runs noticeably faster than a
@ -127,15 +127,13 @@ def setup_persistent_compute_target(workspace, cluster_name, vm_size,
try:
cpu_cluster = ComputeTarget(workspace=workspace, name=cluster_name)
logger.debug('setup: Found existing cluster, use it.')
logger.debug("setup: Found existing cluster, use it.")
except ComputeTargetException:
logger.debug('setup: create cluster')
logger.debug("setup: create cluster")
compute_config = AmlCompute.provisioning_configuration(
vm_size=vm_size,
max_nodes=max_nodes)
cpu_cluster = ComputeTarget.create(workspace,
cluster_name,
compute_config)
vm_size=vm_size, max_nodes=max_nodes
)
cpu_cluster = ComputeTarget.create(workspace, cluster_name, compute_config)
cpu_cluster.wait_for_completion(show_output=True)
return cpu_cluster
@ -173,7 +171,8 @@ def create_run_config(cpu_cluster, docker_proc_type, conda_env_file):
# True means the user will manually configure the environment
run_amlcompute.environment.python.user_managed_dependencies = False
run_amlcompute.environment.python.conda_dependencies = CondaDependencies(
conda_dependencies_file_path=conda_env_file)
conda_dependencies_file_path=conda_env_file
)
return run_amlcompute
@ -190,13 +189,14 @@ def create_experiment(workspace, experiment_name):
exp - AzureML experiment
"""
logger.debug('create: experiment_name {}'.format(experiment_name))
logger.debug("create: experiment_name {}".format(experiment_name))
exp = Experiment(workspace=workspace, name=experiment_name)
return(exp)
return exp
def submit_experiment_to_azureml(test, test_folder, test_markers, junitxml,
run_config, experiment):
def submit_experiment_to_azureml(
test, test_folder, test_markers, junitxml, run_config, experiment
):
"""
Submitting the experiment to AzureML actually runs the script.
@ -219,20 +219,23 @@ def submit_experiment_to_azureml(test, test_folder, test_markers, junitxml,
run : AzureML run or trial
"""
logger.debug('submit: testfolder {}'.format(test_folder))
logger.debug('junitxml: {}'.format(junitxml))
logger.debug("submit: testfolder {}".format(test_folder))
logger.debug("junitxml: {}".format(junitxml))
project_folder = "."
script_run_config = ScriptRunConfig(source_directory=project_folder,
script=test,
run_config=run_config,
arguments=["--testfolder",
test_folder,
"--testmarkers",
test_markers,
"--xmlname",
junitxml]
)
script_run_config = ScriptRunConfig(
source_directory=project_folder,
script=test,
run_config=run_config,
arguments=[
"--testfolder",
test_folder,
"--testmarkers",
test_markers,
"--xmlname",
junitxml,
],
)
run = experiment.submit(script_run_config)
# waits only for configuration to complete
run.wait_for_completion(show_output=True, wait_post_processing=True)
@ -240,7 +243,7 @@ def submit_experiment_to_azureml(test, test_folder, test_markers, junitxml,
# test logs can also be found on azure
# go to azure portal to see log in azure ws and look for experiment name
# and look for individual run
logger.debug('files {}'.format(run.get_file_names))
logger.debug("files {}".format(run.get_file_names))
return run
@ -251,92 +254,113 @@ def create_arg_parser():
use defaults. The user has many options they can select.
"""
parser = argparse.ArgumentParser(description='Process some inputs')
parser = argparse.ArgumentParser(description="Process some inputs")
# script to run pytest
parser.add_argument("--test",
action="store",
default="./tests/ci/run_pytest.py",
help="location of script to run pytest")
parser.add_argument(
"--test",
action="store",
default="./tests/ci/run_pytest.py",
help="location of script to run pytest",
)
# test folder
parser.add_argument("--testfolder",
action="store",
default="./tests/unit",
help="folder where tests are stored")
parser.add_argument(
"--testfolder",
action="store",
default="./tests/unit",
help="folder where tests are stored",
)
# pytest test markers
parser.add_argument("--testmarkers",
action="store",
default="not notebooks and not spark and not gpu",
help="pytest markers indicate tests to run")
parser.add_argument(
"--testmarkers",
action="store",
default="not notebooks and not spark and not gpu",
help="pytest markers indicate tests to run",
)
# test summary file
parser.add_argument("--junitxml",
action="store",
default="reports/test-unit.xml",
help="file for returned test results")
parser.add_argument(
"--junitxml",
action="store",
default="reports/test-unit.xml",
help="file for returned test results",
)
# max num nodes in Azure cluster
parser.add_argument("--maxnodes",
action="store",
default=4,
help="specify the maximum number of nodes for the run")
parser.add_argument(
"--maxnodes",
action="store",
default=4,
help="specify the maximum number of nodes for the run",
)
# Azure resource group
parser.add_argument("--rg",
action="store",
default="recommender",
help="Azure Resource Group")
parser.add_argument(
"--rg", action="store", default="recommender", help="Azure Resource Group"
)
# AzureML workspace Name
parser.add_argument("--wsname",
action="store",
default="RecoWS",
help="AzureML workspace name")
parser.add_argument(
"--wsname", action="store", default="RecoWS", help="AzureML workspace name"
)
# AzureML clustername
parser.add_argument("--clustername",
action="store",
default="amlcompute",
help="Set name of Azure cluster")
parser.add_argument(
"--clustername",
action="store",
default="amlcompute",
help="Set name of Azure cluster",
)
# Azure VM size
parser.add_argument("--vmsize",
action="store",
default="STANDARD_D3_V2",
help="Set the size of the VM either STANDARD_D3_V2")
parser.add_argument(
"--vmsize",
action="store",
default="STANDARD_D3_V2",
help="Set the size of the VM either STANDARD_D3_V2",
)
# cpu or gpu
parser.add_argument("--dockerproc",
action="store",
default="cpu",
help="Base image used in docker container")
parser.add_argument(
"--dockerproc",
action="store",
default="cpu",
help="Base image used in docker container",
)
# Azure subscription id, when used in a pipeline, it is stored in keyvault
parser.add_argument("--subid",
action="store",
default="123456",
help="Azure Subscription ID")
parser.add_argument(
"--subid", action="store", default="123456", help="Azure Subscription ID"
)
# ./reco.yaml is created in the azure devops pipeline.
# Not recommended to change this.
parser.add_argument("--condafile",
action="store",
default="./reco.yaml",
help="file with environment variables")
parser.add_argument(
"--condafile",
action="store",
default="./reco.yaml",
help="file with environment variables",
)
# AzureML experiment name
parser.add_argument("--expname",
action="store",
default="persistentAML",
help="experiment name on Azure")
parser.add_argument(
"--expname",
action="store",
default="persistentAML",
help="experiment name on Azure",
)
# Azure datacenter location
parser.add_argument("--location",
default="EastUS",
help="Azure location")
parser.add_argument("--location", default="EastUS", help="Azure location")
# github repo, stored in AzureML experiment for info purposes
parser.add_argument("--reponame",
action="store",
default="--reponame MyGithubRepo",
help="GitHub repo being tested")
parser.add_argument(
"--reponame",
action="store",
default="--reponame MyGithubRepo",
help="GitHub repo being tested",
)
# github branch, stored in AzureML experiment for info purposes
parser.add_argument("--branch",
action="store",
default="--branch MyGithubBranch",
help=" Identify the branch test test is run on")
parser.add_argument(
"--branch",
action="store",
default="--branch MyGithubBranch",
help=" Identify the branch test test is run on",
)
# github pull request, stored in AzureML experiment for info purposes
parser.add_argument("--pr",
action="store",
default="--pr PRTestRun",
help="If a pr triggered the test, list it here")
parser.add_argument(
"--pr",
action="store",
default="--pr PRTestRun",
help="If a pr triggered the test, list it here",
)
args = parser.parse_args()
@ -344,52 +368,60 @@ def create_arg_parser():
if __name__ == "__main__":
logger = logging.getLogger('submit_azureml_pytest.py')
logger = logging.getLogger("submit_azureml_pytest.py")
# logger.setLevel(logging.DEBUG)
# logging.basicConfig(level=logging.DEBUG)
args = create_arg_parser()
if args.dockerproc == "cpu":
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
docker_proc_type = DEFAULT_CPU_IMAGE
else:
from azureml.core.runconfig import DEFAULT_GPU_IMAGE
docker_proc_type = DEFAULT_GPU_IMAGE
cli_auth = AzureCliAuthentication()
workspace = setup_workspace(workspace_name=args.wsname,
subscription_id=args.subid,
resource_group=args.rg,
cli_auth=cli_auth,
location=args.location)
workspace = setup_workspace(
workspace_name=args.wsname,
subscription_id=args.subid,
resource_group=args.rg,
cli_auth=cli_auth,
location=args.location,
)
cpu_cluster = setup_persistent_compute_target(
workspace=workspace,
cluster_name=args.clustername,
vm_size=args.vmsize,
max_nodes=args.maxnodes)
workspace=workspace,
cluster_name=args.clustername,
vm_size=args.vmsize,
max_nodes=args.maxnodes,
)
run_config = create_run_config(cpu_cluster=cpu_cluster,
docker_proc_type=docker_proc_type,
conda_env_file=args.condafile)
run_config = create_run_config(
cpu_cluster=cpu_cluster,
docker_proc_type=docker_proc_type,
conda_env_file=args.condafile,
)
logger.info('exp: In Azure, look for experiment named {}'.format(
args.expname))
logger.info("exp: In Azure, look for experiment named {}".format(args.expname))
# create new or use existing experiment
experiment = Experiment(workspace=workspace, name=args.expname)
run = submit_experiment_to_azureml(test=args.test,
test_folder=args.testfolder,
test_markers=args.testmarkers,
junitxml=args.junitxml,
run_config=run_config,
experiment=experiment)
run = submit_experiment_to_azureml(
test=args.test,
test_folder=args.testfolder,
test_markers=args.testmarkers,
junitxml=args.junitxml,
run_config=run_config,
experiment=experiment,
)
# add helpful information to experiment on Azure
run.tag('RepoName', args.reponame)
run.tag('Branch', args.branch)
run.tag('PR', args.pr)
run.tag("RepoName", args.reponame)
run.tag("Branch", args.branch)
run.tag("PR", args.pr)
# download files from AzureML
run.download_files(prefix='reports', output_paths='./reports')
run.download_files(prefix="reports", output_paths="./reports")
run.complete()

Просмотреть файл

@ -23,4 +23,3 @@ def test_criteo_load_spark_df(spark, criteo_first_row):
assert len(df.columns) == 40
first_row = df.limit(1).collect()[0].asDict()
assert first_row == criteo_first_row

Просмотреть файл

@ -137,13 +137,7 @@ def test_load_pandas_df(
],
)
def test_load_item_df(
size,
num_movies,
movie_example,
title_example,
genres_example,
year_example,
tmp,
size, num_movies, movie_example, title_example, genres_example, year_example, tmp,
):
"""Test movielens item data load (not rating data)
"""
@ -154,7 +148,13 @@ def test_load_item_df(
assert df["title"][0] == title_example
# Test title and genres
df = load_item_df(size, local_cache_path=tmp, movie_col="item", genres_col="genres", year_col="year")
df = load_item_df(
size,
local_cache_path=tmp,
movie_col="item",
genres_col="genres",
year_col="year",
)
assert len(df) == num_movies
# movie_col, genres_col and year_col
assert len(df.columns) == 3
@ -214,10 +214,7 @@ def test_load_spark_df(
# Test if correct data are loaded
header = ["1", "2", "3"]
schema = StructType(
[
StructField("u", IntegerType()),
StructField("m", IntegerType()),
]
[StructField("u", IntegerType()), StructField("m", IntegerType()),]
)
with pytest.warns(Warning):
df = load_spark_df(

Просмотреть файл

@ -11,9 +11,7 @@ from reco_utils.common.constants import (
DEFAULT_RATING_COL,
DEFAULT_TIMESTAMP_COL,
)
from reco_utils.dataset.python_splitters import (
python_chrono_split,
)
from reco_utils.dataset.python_splitters import python_chrono_split
# ncf data generation
@pytest.fixture(scope="module")

Просмотреть файл

@ -14,4 +14,3 @@ def path_notebooks():
return os.path.abspath(
os.path.join(os.path.dirname(__file__), os.path.pardir, "notebooks")
)

Просмотреть файл

@ -49,4 +49,3 @@ def read_matrix(file, row_map=None, col_map=None):
array = array[row_index, :]
array = array[:, col_index]
return array, row_ids, col_ids

Просмотреть файл

@ -37,4 +37,4 @@ def test_extract_criteo(tmp_path):
filepath = criteo.download_criteo(size="sample", work_directory=tmp_path)
filename = criteo.extract_criteo(size="sample", compressed_file=filepath)
statinfo = os.stat(filename)
assert statinfo.st_size == 24328072
assert statinfo.st_size == 24328072

Просмотреть файл

@ -101,13 +101,7 @@ def test_load_pandas_df(
[("100k", 1682, 1, "Toy Story (1995)", "Animation|Children's|Comedy", "1995")],
)
def test_load_item_df(
size,
num_movies,
movie_example,
title_example,
genres_example,
year_example,
tmp,
size, num_movies, movie_example, title_example, genres_example, year_example, tmp,
):
"""Test movielens item data load (not rating data)
"""
@ -118,7 +112,13 @@ def test_load_item_df(
assert df["title"][0] == title_example
# Test title and genres
df = load_item_df(size, local_cache_path=tmp, movie_col="item", genres_col="genres", year_col="year")
df = load_item_df(
size,
local_cache_path=tmp,
movie_col="item",
genres_col="genres",
year_col="year",
)
assert len(df) == num_movies
# movie_col, genres_col and year_col
assert len(df.columns) == 3
@ -160,10 +160,7 @@ def test_load_spark_df(
# Test if correct data are loaded
header = ["1", "2", "3"]
schema = StructType(
[
StructField("u", IntegerType()),
StructField("m", IntegerType()),
]
[StructField("u", IntegerType()), StructField("m", IntegerType()),]
)
with pytest.warns(Warning):
df = load_spark_df(

Просмотреть файл

@ -36,18 +36,14 @@ def test_als_pyspark_smoke(notebooks):
@pytest.mark.smoke
@pytest.mark.spark
@pytest.mark.skipif(sys.platform == 'win32', reason="Not implemented on Windows")
@pytest.mark.skipif(sys.platform == "win32", reason="Not implemented on Windows")
def test_mmlspark_lightgbm_criteo_smoke(notebooks):
notebook_path = notebooks["mmlspark_lightgbm_criteo"]
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
parameters=dict(
DATA_SIZE="sample",
NUM_ITERATIONS=50,
EARLY_STOPPING_ROUND=10
)
parameters=dict(DATA_SIZE="sample", NUM_ITERATIONS=50, EARLY_STOPPING_ROUND=10),
)
nb = pm.read_notebook(OUTPUT_NOTEBOOK)
results = nb.dataframe.set_index("name")["value"]

Просмотреть файл

@ -30,4 +30,3 @@ def test_download_path():
with download_path(tmp_dir.name) as path:
assert os.path.isdir(path)
assert os.path.isdir(path)

Просмотреть файл

@ -29,7 +29,7 @@ def test_clear_memory_all_gpus():
@pytest.mark.gpu
@pytest.mark.skipif(sys.platform == 'win32', reason="Not implemented on Windows")
@pytest.mark.skipif(sys.platform == "win32", reason="Not implemented on Windows")
def test_get_cuda_version():
assert get_cuda_version() > "9.0.0"

Просмотреть файл

@ -3,7 +3,11 @@
import pytest
from reco_utils.recommender.lightfm.lightfm_utils import (
compare_metric, track_model_metrics, similar_users, similar_items)
compare_metric,
track_model_metrics,
similar_users,
similar_items,
)
import itertools
import numpy as np
import pandas as pd
@ -20,95 +24,124 @@ TEST_ITEM_ID = 1
# note user and item ID need to be sequential for similar users and similar items to work
@pytest.fixture(scope="module")
def df():
mock_data = {
'userID':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'itemID':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'rating':[2.0,4.0,1.0,4.0,1.0,2.0,5.0,1.0,1.0,2.0],
'genre':['Action|Comedy','Drama','Drama|Romance|War',
'Drama|Sci-Fi','Horror','Action|Horror|Sci-Fi|Thriller',
'Drama|Romance|War','Western','Comedy','Horror'],
'occupation':['engineer','student','retired',
'administrator','writer','administrator','student','executive','student','other']
}
return pd.DataFrame(mock_data)
mock_data = {
"userID": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
"itemID": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
"rating": [2.0, 4.0, 1.0, 4.0, 1.0, 2.0, 5.0, 1.0, 1.0, 2.0],
"genre": [
"Action|Comedy",
"Drama",
"Drama|Romance|War",
"Drama|Sci-Fi",
"Horror",
"Action|Horror|Sci-Fi|Thriller",
"Drama|Romance|War",
"Western",
"Comedy",
"Horror",
],
"occupation": [
"engineer",
"student",
"retired",
"administrator",
"writer",
"administrator",
"student",
"executive",
"student",
"other",
],
}
return pd.DataFrame(mock_data)
@pytest.fixture(scope="module")
def interactions(df):
movie_genre = [x.split('|') for x in df['genre']]
all_movie_genre = sorted(list(set(itertools.chain.from_iterable(movie_genre))))
movie_genre = [x.split("|") for x in df["genre"]]
all_movie_genre = sorted(list(set(itertools.chain.from_iterable(movie_genre))))
all_occupations = sorted(list(set(df['occupation'])))
all_occupations = sorted(list(set(df["occupation"])))
dataset = Dataset()
dataset.fit(df['userID'],
df['itemID'],
item_features=all_movie_genre,
user_features=all_occupations)
dataset = Dataset()
dataset.fit(
df["userID"],
df["itemID"],
item_features=all_movie_genre,
user_features=all_occupations,
)
item_features = dataset.build_item_features(
(x, y) for x,y in zip(df.itemID, movie_genre))
item_features = dataset.build_item_features(
(x, y) for x, y in zip(df.itemID, movie_genre)
)
user_features = dataset.build_user_features(
(x, [y]) for x,y in zip(df.userID, df['occupation']))
user_features = dataset.build_user_features(
(x, [y]) for x, y in zip(df.userID, df["occupation"])
)
(interactions, _) = dataset.build_interactions(df.iloc[:, 0:3].values)
(interactions, _) = dataset.build_interactions(df.iloc[:, 0:3].values)
train_interactions, test_interactions = cross_validation.random_train_test_split(
interactions, test_percentage=TEST_PERCENTAGE,
random_state=np.random.RandomState(SEEDNO))
return train_interactions, test_interactions, item_features, user_features
train_interactions, test_interactions = cross_validation.random_train_test_split(
interactions,
test_percentage=TEST_PERCENTAGE,
random_state=np.random.RandomState(SEEDNO),
)
return train_interactions, test_interactions, item_features, user_features
@pytest.fixture(scope="module")
def model():
return LightFM(loss='warp', random_state=np.random.RandomState(SEEDNO))
return LightFM(loss="warp", random_state=np.random.RandomState(SEEDNO))
@pytest.fixture(scope="module")
def fitting(model, interactions, df):
train_interactions, test_interactions, item_features, user_features = interactions
output, fitted_model = track_model_metrics(model=model,
train_interactions=train_interactions,
test_interactions=test_interactions,
user_features = user_features,
item_features = item_features,
show_plot=False)
return output, fitted_model
train_interactions, test_interactions, item_features, user_features = interactions
output, fitted_model = track_model_metrics(
model=model,
train_interactions=train_interactions,
test_interactions=test_interactions,
user_features=user_features,
item_features=item_features,
show_plot=False,
)
return output, fitted_model
@pytest.fixture(scope="module")
def sim_users(interactions, fitting):
_, _, _, user_features = interactions
_, fitted_model = fitting
return similar_users(user_id=TEST_USER_ID, user_features=user_features,
model=fitted_model, N=5)
_, _, _, user_features = interactions
_, fitted_model = fitting
return similar_users(
user_id=TEST_USER_ID, user_features=user_features, model=fitted_model, N=5
)
@pytest.fixture(scope="module")
def sim_items(interactions, fitting):
_, _, item_features, _ = interactions
_, fitted_model = fitting
return similar_items(item_id=TEST_ITEM_ID, item_features=item_features,
model=fitted_model, N=5)
_, _, item_features, _ = interactions
_, fitted_model = fitting
return similar_items(
item_id=TEST_ITEM_ID, item_features=item_features, model=fitted_model, N=5
)
def test_interactions(interactions):
train_interactions, test_interactions, item_features, user_features = interactions
assert train_interactions.shape == (10, 10)
assert test_interactions.shape == (10, 10)
assert item_features.shape == (10, 19)
assert user_features.shape == (10, 17)
train_interactions, test_interactions, item_features, user_features = interactions
assert train_interactions.shape == (10, 10)
assert test_interactions.shape == (10, 10)
assert item_features.shape == (10, 19)
assert user_features.shape == (10, 17)
def test_fitting(fitting):
output, _ = fitting
assert output.shape == (600, 4)
output, _ = fitting
assert output.shape == (600, 4)
def test_sim_users(sim_users):
assert sim_users.shape == (5, 2)
assert sim_users.shape == (5, 2)
def test_sim_items(sim_items):
assert sim_items.shape == (5, 2)
assert sim_items.shape == (5, 2)
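
For readers following these fixtures, the end-to-end flow they exercise can be condensed as below: build a LightFM Dataset, fit a WARP model, and query the similar_users/similar_items helpers. The toy ratings are made up, side features are omitted (so the identity representations are used), and the snippet assumes lightfm and the repo's reco_utils package are importable.

# Condensed end-to-end sketch of the flow these fixtures test (toy data, no side features).
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from reco_utils.recommender.lightfm.lightfm_utils import similar_users, similar_items

df = pd.DataFrame(
    {
        "userID": [1, 2, 3, 4, 5],
        "itemID": [1, 2, 3, 4, 5],
        "rating": [2.0, 4.0, 1.0, 4.0, 5.0],
    }
)
dataset = Dataset()
dataset.fit(df["userID"], df["itemID"])
(interactions, _) = dataset.build_interactions(df.values)

model = LightFM(loss="warp", random_state=np.random.RandomState(42))
model.fit(interactions, epochs=10)

print(similar_users(user_id=1, user_features=None, model=model, N=2))
print(similar_items(item_id=1, item_features=None, model=model, N=2))
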

Просмотреть файл

@ -17,8 +17,10 @@ BATCH_SIZE = 32
def test_data_preprocessing(python_dataset_ncf):
train, test = python_dataset_ncf
data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST, seed=SEED)
data = Dataset(
train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST, seed=SEED
)
# shape
assert len(data.train) == len(train)
assert len(data.test) == len(test)
@ -39,7 +41,9 @@ def test_data_preprocessing(python_dataset_ncf):
def test_train_loader(python_dataset_ncf):
train, test = python_dataset_ncf
data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST, seed=SEED)
data = Dataset(
train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST, seed=SEED
)
# collect positive user-item dict
positive_pool = {}
@ -49,7 +53,7 @@ def test_train_loader(python_dataset_ncf):
# without negative sampling
for batch in data.train_loader(batch_size=BATCH_SIZE, shuffle=False):
user, item, labels = batch
#shape
# shape
assert len(user) == BATCH_SIZE
assert len(item) == BATCH_SIZE
assert len(labels) == BATCH_SIZE
@ -58,9 +62,9 @@ def test_train_loader(python_dataset_ncf):
# right labels
for u, i, is_pos in zip(user, item, labels):
if is_pos:
assert i in positive_pool[u]
else:
assert i not in positive_pool[u]
assert i in positive_pool[u]
else:
assert i not in positive_pool[u]
data.negative_sampling()
label_list = []
@ -73,9 +77,9 @@ def test_train_loader(python_dataset_ncf):
# right labels
for u, i, is_pos in zip(user, item, labels):
if is_pos:
assert i in positive_pool[u]
else:
assert i not in positive_pool[u]
assert i in positive_pool[u]
else:
assert i not in positive_pool[u]
label_list.append(is_pos)
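
What these loader tests check is the basic negative-sampling contract: every positive pair keeps label 1 and every sampled negative is an item the user has not interacted with. A toy, dependency-free sketch of that sampling logic:

# Toy per-user negative sampling (illustration of the contract the tests assert).
import random

random.seed(42)
all_items = set(range(1, 11))
positives = {1: {1, 2, 3}, 2: {4, 5}, 3: {2, 6, 7}}   # user -> items with feedback
n_neg = 2

samples = []
for user, pos_items in positives.items():
    samples.extend((user, item, 1) for item in pos_items)              # positives, label 1
    negatives = random.sample(sorted(all_items - pos_items), n_neg)
    samples.extend((user, item, 0) for item in negatives)              # negatives, label 0
print(samples[:6])
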
@ -90,7 +94,7 @@ def test_test_loader(python_dataset_ncf):
# positive user-item dict, noting that the pool is train+test
positive_pool = {}
df = train.append(test)
for u in df[DEFAULT_USER_COL].unique():
for u in df[DEFAULT_USER_COL].unique():
positive_pool[u] = set(df[df[DEFAULT_USER_COL] == u][DEFAULT_ITEM_COL])
for batch in data.test_loader():
@ -104,9 +108,9 @@ def test_test_loader(python_dataset_ncf):
for u, i, is_pos in zip(user, item, labels):
if is_pos:
assert i in positive_pool[u]
else:
assert i not in positive_pool[u]
assert i in positive_pool[u]
else:
assert i not in positive_pool[u]
label_list.append(is_pos)

Просмотреть файл

@ -24,7 +24,9 @@ N_NEG_TEST = 10
"model_type, n_users, n_items", [("NeuMF", 1, 1), ("GMF", 10, 10), ("MLP", 4, 8)]
)
def test_init(model_type, n_users, n_items):
model = NCF(n_users=n_users, n_items=n_items, model_type=model_type, n_epochs=1, seed=SEED)
model = NCF(
n_users=n_users, n_items=n_items, model_type=model_type, n_epochs=1, seed=SEED
)
# model type
assert model.model_type == model_type.lower()
# number of users in dataset
@ -39,7 +41,7 @@ def test_init(model_type, n_users, n_items):
assert model.embedding_mlp_P.shape == [n_users, model.n_factors]
# dimension of mlp item embedding
assert model.embedding_mlp_Q.shape == [n_items, model.n_factors]
# TODO: more parameters
@ -52,7 +54,9 @@ def test_regular_save_load(model_type, n_users, n_items):
if os.path.exists(ckpt):
shutil.rmtree(ckpt)
model = NCF(n_users=n_users, n_items=n_items, model_type=model_type, n_epochs=1, seed=SEED)
model = NCF(
n_users=n_users, n_items=n_items, model_type=model_type, n_epochs=1, seed=SEED
)
model.save(ckpt)
if model.model_type == "neumf":
P = model.sess.run(model.embedding_gmf_P)
@ -65,7 +69,9 @@ def test_regular_save_load(model_type, n_users, n_items):
Q = model.sess.run(model.embedding_mlp_Q)
del model
model = NCF(n_users=n_users, n_items=n_items, model_type=model_type, n_epochs=1, seed=SEED)
model = NCF(
n_users=n_users, n_items=n_items, model_type=model_type, n_epochs=1, seed=SEED
)
if model.model_type == "neumf":
model.load(neumf_dir=ckpt)
@ -89,9 +95,7 @@ def test_regular_save_load(model_type, n_users, n_items):
@pytest.mark.gpu
@pytest.mark.parametrize(
"n_users, n_items", [(5, 5), (4, 8)]
)
@pytest.mark.parametrize("n_users, n_items", [(5, 5), (4, 8)])
def test_neumf_save_load(n_users, n_items):
model_type = "gmf"
ckpt_gmf = ".%s" % model_type
@ -137,31 +141,31 @@ def test_neumf_save_load(n_users, n_items):
@pytest.mark.gpu
@pytest.mark.parametrize(
"model_type", ["NeuMF", "GMF", "MLP"]
)
@pytest.mark.parametrize("model_type", ["NeuMF", "GMF", "MLP"])
def test_fit(python_dataset_ncf, model_type):
train, test = python_dataset_ncf
data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST)
model = NCF(n_users=data.n_users, n_items=data.n_items, model_type=model_type, n_epochs=1)
model = NCF(
n_users=data.n_users, n_items=data.n_items, model_type=model_type, n_epochs=1
)
model.fit(data)
@pytest.mark.gpu
@pytest.mark.parametrize(
"model_type", ["NeuMF", "GMF", "MLP"]
)
@pytest.mark.parametrize("model_type", ["NeuMF", "GMF", "MLP"])
def test_predict(python_dataset_ncf, model_type):
# test data format
train, test = python_dataset_ncf
data = Dataset(train=train, test=test, n_neg=N_NEG, n_neg_test=N_NEG_TEST)
model = NCF(n_users=data.n_users, n_items=data.n_items, model_type=model_type, n_epochs=1)
model = NCF(
n_users=data.n_users, n_items=data.n_items, model_type=model_type, n_epochs=1
)
model.fit(data)
test_users, test_items = list(test[DEFAULT_USER_COL]), list(test[DEFAULT_ITEM_COL])
assert type(model.predict(test_users[0], test_items[0])) == float
res = model.predict(test_users, test_items, is_list=True)
assert type(res) == list
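
Mirroring test_init above, a quick sketch of constructing an untrained NCF graph and checking its embedding shapes; it assumes the repo's reco_utils package and the TF 1.x GPU environment these tests are marked for.

# Build an untrained NCF graph and inspect embedding shapes (assumes reco_utils + TF 1.x).
from reco_utils.recommender.ncf.ncf_singlenode import NCF

model = NCF(n_users=4, n_items=8, model_type="MLP", n_epochs=1, seed=42)
print(model.embedding_mlp_P.shape)   # [n_users, n_factors]
print(model.embedding_mlp_Q.shape)   # [n_items, n_factors]
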

Просмотреть файл

@ -15,7 +15,7 @@ from reco_utils.tuning.nni.nni_utils import (
check_metrics_written,
get_trials,
NNI_STATUS_URL,
NNI_TRIAL_JOBS_URL
NNI_TRIAL_JOBS_URL,
)
@ -33,6 +33,7 @@ def mocked_status_get(url, content, error):
assert url.startswith(NNI_STATUS_URL)
return MockResponse(content, error)
class MockResponseTrials:
# Class that mocks requests.models.Response
def __init__(self, content):
@ -41,10 +42,11 @@ class MockResponseTrials:
def json(self):
return self._content
def mocked_trials_get(url, content):
assert url.startswith(NNI_TRIAL_JOBS_URL)
return MockResponseTrials(content)
def mock_exception():
raise Exception()
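
The pattern used throughout this file is to patch requests.get with a stub whose json() returns whatever the test needs. A minimal self-contained version of that pattern (the payload shape and URL here are made up; the real tests define their own mock classes):

# Minimal illustration of patching requests.get with a stub response.
from unittest.mock import patch
import requests

class StubResponse:
    def __init__(self, payload):
        self._payload = payload

    def json(self):
        return self._payload

def stub_get(url):
    # payload shape invented for illustration
    return StubResponse({"status": "DONE", "errors": [""]})

with patch("requests.get", side_effect=stub_get):
    print(requests.get("http://localhost:8080/placeholder-status-url").json())
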
@ -54,17 +56,21 @@ def mock_exception():
def test_get_experiment_status():
content = "some_status"
error = ""
with patch("requests.get", side_effect=lambda url: mocked_status_get(url, content, error)):
with patch(
"requests.get", side_effect=lambda url: mocked_status_get(url, content, error)
):
nni_status = get_experiment_status(NNI_STATUS_URL)
assert nni_status["status"] == "some_status"
assert nni_status["errors"] == [""]
assert nni_status["errors"] == [""]
@pytest.mark.skipif(sys.platform == "win32", reason="nni not installable on windows")
def test_check_experiment_status_done():
content = "DONE"
error = ""
with patch("requests.get", side_effect=lambda url: mocked_status_get(url, content, error)):
with patch(
"requests.get", side_effect=lambda url: mocked_status_get(url, content, error)
):
check_experiment_status(wait=0.1, max_retries=1)
@ -72,7 +78,9 @@ def test_check_experiment_status_done():
def test_check_experiment_status_tuner_no_more_trial():
content = "TUNER_NO_MORE_TRIAL"
error = ""
with patch("requests.get", side_effect=lambda url: mocked_status_get(url, content, error)):
with patch(
"requests.get", side_effect=lambda url: mocked_status_get(url, content, error)
):
check_experiment_status(wait=0.1, max_retries=1)
@ -81,7 +89,10 @@ def test_check_experiment_status_running():
content = "RUNNING"
error = ""
with pytest.raises(TimeoutError) as excinfo:
with patch("requests.get", side_effect=lambda url: mocked_status_get(url, content, error)):
with patch(
"requests.get",
side_effect=lambda url: mocked_status_get(url, content, error),
):
check_experiment_status(wait=0.1, max_retries=1)
assert "check_experiment_status() timed out" == str(excinfo.value)
@ -91,7 +102,10 @@ def test_check_experiment_status_no_more_trial():
content = "NO_MORE_TRIAL"
error = ""
with pytest.raises(TimeoutError) as excinfo:
with patch("requests.get", side_effect=lambda url: mocked_status_get(url, content, error)):
with patch(
"requests.get",
side_effect=lambda url: mocked_status_get(url, content, error),
):
check_experiment_status(wait=0.1, max_retries=1)
assert "check_experiment_status() timed out" == str(excinfo.value)
@ -101,9 +115,15 @@ def test_check_experiment_status_failed():
content = "some_failed_status"
error = "NNI_ERROR"
with pytest.raises(RuntimeError) as excinfo:
with patch("requests.get", side_effect=lambda url: mocked_status_get(url, content, error)):
with patch(
"requests.get",
side_effect=lambda url: mocked_status_get(url, content, error),
):
check_experiment_status(wait=0.1, max_retries=1)
assert "NNI experiment failed to complete with status some_failed_status - NNI_ERROR" == str(excinfo.value)
assert (
"NNI experiment failed to complete with status some_failed_status - NNI_ERROR"
== str(excinfo.value)
)
@pytest.mark.skipif(sys.platform == "win32", reason="nni not installable on windows")
@ -111,30 +131,35 @@ def test_check_stopped_timeout():
content = "some_status"
error = ""
with pytest.raises(TimeoutError) as excinfo:
with patch("requests.get", side_effect=lambda url: mocked_status_get(url, content, error)):
check_stopped(wait=.1, max_retries=1)
with patch(
"requests.get",
side_effect=lambda url: mocked_status_get(url, content, error),
):
check_stopped(wait=0.1, max_retries=1)
assert "check_stopped() timed out" == str(excinfo.value)
@pytest.mark.skipif(sys.platform == "win32", reason="nni not installable on windows")
def test_check_stopped():
with patch("requests.get", side_effect=mock_exception):
check_stopped(wait=.1, max_retries=1)
check_stopped(wait=0.1, max_retries=1)
@pytest.mark.skipif(sys.platform == "win32", reason="nni not installable on windows")
def test_check_metrics_written():
content = [{"finalMetricData": None}, {"finalMetricData": None}]
with patch("requests.get", side_effect=lambda url: mocked_trials_get(url, content)):
check_metrics_written(wait=.1, max_retries=1)
check_metrics_written(wait=0.1, max_retries=1)
@pytest.mark.skipif(sys.platform == "win32", reason="nni not installable on windows")
def test_check_metrics_written_timeout():
content = [{"logPath": "/p"}, {"logPath": "/q"}]
with pytest.raises(TimeoutError) as excinfo:
with patch("requests.get", side_effect=lambda url: mocked_trials_get(url, content)):
check_metrics_written(wait=.1, max_retries=1)
with patch(
"requests.get", side_effect=lambda url: mocked_trials_get(url, content)
):
check_metrics_written(wait=0.1, max_retries=1)
assert "check_metrics_written() timed out" == str(excinfo.value)
@ -142,31 +167,47 @@ def test_check_metrics_written_timeout():
def test_get_trials():
with TemporaryDirectory() as tmp_dir1, TemporaryDirectory() as tmp_dir2:
mock_trials = [
{"finalMetricData": [{"data": '{"rmse":0.8,"default":0.3}'}],
"logPath": "file://localhost:{}".format(tmp_dir1)},
{"finalMetricData": [{"data": '{"rmse":0.9,"default":0.2}'}],
"logPath": "file://localhost:{}".format(tmp_dir2)},
{
"finalMetricData": [{"data": '{"rmse":0.8,"default":0.3}'}],
"logPath": "file://localhost:{}".format(tmp_dir1),
},
{
"finalMetricData": [{"data": '{"rmse":0.9,"default":0.2}'}],
"logPath": "file://localhost:{}".format(tmp_dir2),
},
]
metrics1 = {"rmse": 0.8, "precision_at_k": 0.3}
with open(os.path.join(tmp_dir1, "metrics.json"), "w") as f:
json.dump(metrics1, f)
params1 = {"parameter_id": 1, "parameter_source": "algorithm",
"parameters": {"n_factors": 100, "reg": 0.1}}
params1 = {
"parameter_id": 1,
"parameter_source": "algorithm",
"parameters": {"n_factors": 100, "reg": 0.1},
}
with open(os.path.join(tmp_dir1, "parameter.cfg"), "w") as f:
json.dump(params1, f)
metrics2 = {"rmse": 0.9, "precision_at_k": 0.2}
with open(os.path.join(tmp_dir2, "metrics.json"), "w") as f:
json.dump(metrics2, f)
params2 = {"parameter_id": 2, "parameter_source": "algorithm",
"parameters": {"n_factors": 50, "reg": 0.02}}
params2 = {
"parameter_id": 2,
"parameter_source": "algorithm",
"parameters": {"n_factors": 50, "reg": 0.02},
}
with open(os.path.join(tmp_dir2, "parameter.cfg"), "w") as f:
json.dump(params2, f)
with patch("requests.get", side_effect=lambda url: mocked_trials_get(url, mock_trials)):
trials, best_metrics, best_params, best_trial_path = get_trials(optimize_mode="maximize")
with patch(
"requests.get", side_effect=lambda url: mocked_trials_get(url, mock_trials)
):
trials, best_metrics, best_params, best_trial_path = get_trials(
optimize_mode="maximize"
)
expected_trials = [({"rmse": 0.8, "default": 0.3}, tmp_dir1),
({"rmse": 0.9, "default": 0.2}, tmp_dir2)]
expected_trials = [
({"rmse": 0.8, "default": 0.3}, tmp_dir1),
({"rmse": 0.9, "default": 0.2}, tmp_dir2),
]
assert trials == expected_trials
assert best_metrics == metrics1
assert best_params == params1

Просмотреть файл

@ -17,9 +17,7 @@ def test_is_jupyter():
# Test on Jupyter notebook
path = os.path.join("tests", "unit", "test_notebook_utils.ipynb")
pm.execute_notebook(
path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME,
)
nb = pm.read_notebook(OUTPUT_NOTEBOOK)
df = nb.dataframe
@ -28,6 +26,7 @@ def test_is_jupyter():
result_is_databricks = df.loc[df["name"] == "is_databricks", "value"].values[0]
assert result_is_databricks is False
# @pytest.mark.notebooks
# def test_is_databricks():
# TODO Currently, we cannot pytest modules on Databricks

Просмотреть файл

@ -80,37 +80,31 @@ def test_wide_deep(notebooks, tmp):
model_dir = os.path.join(tmp, "wide_deep_0")
os.mkdir(model_dir)
params = {
'MOVIELENS_DATA_SIZE': '100k',
'STEPS': 1,
'EVALUATE_WHILE_TRAINING': False,
'MODEL_DIR': model_dir,
'EXPORT_DIR_BASE': model_dir,
'RATING_METRICS': ['rmse'],
'RANKING_METRICS': ['ndcg_at_k'],
"MOVIELENS_DATA_SIZE": "100k",
"STEPS": 1,
"EVALUATE_WHILE_TRAINING": False,
"MODEL_DIR": model_dir,
"EXPORT_DIR_BASE": model_dir,
"RATING_METRICS": ["rmse"],
"RANKING_METRICS": ["ndcg_at_k"],
}
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
parameters=params,
notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params,
)
# Test with different parameters
model_dir = os.path.join(tmp, "wide_deep_1")
os.mkdir(model_dir)
params = {
'MOVIELENS_DATA_SIZE': '100k',
'STEPS': 1,
'ITEM_FEAT_COL': None,
'EVALUATE_WHILE_TRAINING': True,
'MODEL_DIR': model_dir,
'EXPORT_DIR_BASE': model_dir,
'RATING_METRICS': ['rsquared'],
'RANKING_METRICS': ['map_at_k'],
"MOVIELENS_DATA_SIZE": "100k",
"STEPS": 1,
"ITEM_FEAT_COL": None,
"EVALUATE_WHILE_TRAINING": True,
"MODEL_DIR": model_dir,
"EXPORT_DIR_BASE": model_dir,
"RATING_METRICS": ["rsquared"],
"RANKING_METRICS": ["map_at_k"],
}
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
parameters=params,
notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params,
)

Просмотреть файл

@ -20,15 +20,13 @@ from reco_utils.dataset.pandas_df_utils import (
@pytest.fixture(scope="module")
def user_item_dataset():
"""Get users and items dataframe"""
user_df = pd.DataFrame({
'user_id': [1, 2, 3, 4, 5],
'user_age': [23, 24, 25, 26, 27]
})
user_df = pd.DataFrame(
{"user_id": [1, 2, 3, 4, 5], "user_age": [23, 24, 25, 26, 27]}
)
item_df = pd.DataFrame({
'item_id': [6, 7, 8],
'item_feat': [[0.1, 0.1], [0.2, 0.2], [0.3, 0.3]]
})
item_df = pd.DataFrame(
{"item_id": [6, 7, 8], "item_feat": [[0.1, 0.1], [0.2, 0.2], [0.3, 0.3]]}
)
return user_df, item_df
@ -39,14 +37,15 @@ def test_user_item_pairs(user_item_dataset):
user_item = user_item_pairs(
user_df=user_df,
item_df=item_df,
user_col='user_id',
item_col='item_id',
shuffle=False
user_col="user_id",
item_col="item_id",
shuffle=False,
)
# Validate cross-join
assert len(user_df) * len(item_df) == len(user_item)
assert user_item.loc[(user_item['user_id'] == 3) & (user_item['item_id'] == 7)].values.tolist()[0]\
== [3, 25, 7, [0.2, 0.2]]
assert user_item.loc[
(user_item["user_id"] == 3) & (user_item["item_id"] == 7)
].values.tolist()[0] == [3, 25, 7, [0.2, 0.2]]
# Check if result is deterministic
assert user_item.iloc[0].values.tolist() == [1, 23, 6, [0.1, 0.1]]
@ -55,124 +54,153 @@ def test_user_item_pairs(user_item_dataset):
user_item_shuffled = user_item_pairs(
user_df=user_df,
item_df=item_df,
user_col='user_id',
item_col='item_id',
shuffle=True
user_col="user_id",
item_col="item_id",
shuffle=True,
)
# Check shuffled result is still valid
assert len(user_df) * len(item_df) == len(user_item_shuffled)
row = user_item.loc[(user_item['user_id'] == 2) & (user_item['item_id'] == 6)]
assert row['user_age'].iloc[0] == 24
assert row['item_feat'].iloc[0] == [0.1, 0.1]
row = user_item.loc[(user_item["user_id"] == 2) & (user_item["item_id"] == 6)]
assert row["user_age"].iloc[0] == 24
assert row["item_feat"].iloc[0] == [0.1, 0.1]
# Check shuffled result is different from not-shuffled dataframe
assert [*user_item_shuffled['user_id'].values] != [*user_item['user_id'].values]
assert [*user_item_shuffled["user_id"].values] != [*user_item["user_id"].values]
# Check filter
seen_df = pd.DataFrame({
'user_id': [1, 9, 3, 5, 5, 1],
'item_id': [1, 6, 7, 6, 8, 9]
})
seen_df = pd.DataFrame(
{"user_id": [1, 9, 3, 5, 5, 1], "item_id": [1, 6, 7, 6, 8, 9]}
)
user_item_filtered = user_item_pairs(
user_df=user_df,
item_df=item_df,
user_col='user_id',
item_col='item_id',
user_col="user_id",
item_col="item_id",
user_item_filter_df=seen_df,
shuffle=False
shuffle=False,
)
# Check filtered out number
assert len(user_item_filtered) == len(user_item) - 3
# Check filtered out record
assert len(user_item_filtered.loc[(user_item['user_id'] == 3) & (user_item['item_id'] == 7)]) == 0
assert (
len(
user_item_filtered.loc[
(user_item["user_id"] == 3) & (user_item["item_id"] == 7)
]
)
== 0
)
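
The behaviour test_user_item_pairs verifies (a full cross-join followed by removal of already-seen pairs) can be reproduced with plain pandas as below; user_item_pairs in reco_utils is the real implementation, this is just an illustration.

# Cross-join users x items, then drop pairs present in a "seen" frame (illustration).
import pandas as pd

user_df = pd.DataFrame({"user_id": [1, 2, 3], "user_age": [23, 24, 25]})
item_df = pd.DataFrame({"item_id": [6, 7], "item_feat": ["a", "b"]})
seen_df = pd.DataFrame({"user_id": [1], "item_id": [7]})

# cross-join via a constant key
pairs = user_df.assign(_k=1).merge(item_df.assign(_k=1), on="_k").drop("_k", axis=1)

# anti-join against the seen pairs
merged = pairs.merge(seen_df, on=["user_id", "item_id"], how="left", indicator=True)
unseen = merged[merged["_merge"] == "left_only"].drop("_merge", axis=1)
print(len(pairs), len(unseen))   # 6 pairs before filtering, 5 after
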
def test_filter_by():
user_df = pd.DataFrame({
'user_id': [1, 9, 3, 5, 5, 1],
'item_id': [1, 6, 7, 6, 8, 9]
})
user_df = pd.DataFrame(
{"user_id": [1, 9, 3, 5, 5, 1], "item_id": [1, 6, 7, 6, 8, 9]}
)
seen_df = pd.DataFrame({
'user_id': [1, 2, 4],
})
seen_df = pd.DataFrame({"user_id": [1, 2, 4],})
filtered_df = filter_by(user_df, seen_df, ['user_id'])
filtered_df = filter_by(user_df, seen_df, ["user_id"])
# Check filtered out number
assert len(filtered_df) == len(user_df) - 2
# Check filtered out record
assert len(filtered_df.loc[(user_df['user_id'] == 1)]) == 0
assert len(filtered_df.loc[(user_df["user_id"] == 1)]) == 0
def test_csv_to_libffm():
df_feature = pd.DataFrame({
'rating': [1, 0, 0, 1, 1],
'field1': ['xxx1', 'xxx2', 'xxx4', 'xxx4', 'xxx4'],
'field2': [3, 4, 5, 6, 7],
'field3': [1.0, 2.0, 3.0, 4.0, 5.0],
'field4': ['1', '2', '3', '4', '5']
})
df_feature = pd.DataFrame(
{
"rating": [1, 0, 0, 1, 1],
"field1": ["xxx1", "xxx2", "xxx4", "xxx4", "xxx4"],
"field2": [3, 4, 5, 6, 7],
"field3": [1.0, 2.0, 3.0, 4.0, 5.0],
"field4": ["1", "2", "3", "4", "5"],
}
)
with TemporaryDirectory() as td:
filepath = os.path.join(td, "test")
converter = LibffmConverter(filepath=filepath).fit(df_feature)
df_feature_libffm = converter.transform(df_feature)
# Check the input column types. For example, a bool type is not allowed.
df_feature_wrong_type = df_feature.copy()
df_feature_wrong_type['field4'] = True
df_feature_wrong_type["field4"] = True
with pytest.raises(TypeError) as e:
LibffmConverter().fit(df_feature_wrong_type)
assert e.value == "Input columns should be only object and/or numeric types."
assert (
e.value == "Input columns should be only object and/or numeric types."
)
# Check if the dim is the same.
assert df_feature_libffm.shape == df_feature.shape
# Check if the columns are converted successfully.
assert df_feature_libffm.iloc[0, :].values.tolist() == [1, '1:1:1', '2:4:3', '3:5:1.0', '4:6:1']
assert df_feature_libffm.iloc[0, :].values.tolist() == [
1,
"1:1:1",
"2:4:3",
"3:5:1.0",
"4:6:1",
]
# Check if the duplicated column entries are indexed correctly.
# It should skip counting the duplicated features in a field column.
assert df_feature_libffm.iloc[-1, :].values.tolist() == [1, '1:3:1', '2:4:7', '3:5:5.0', '4:10:1']
assert df_feature_libffm.iloc[-1, :].values.tolist() == [
1,
"1:3:1",
"2:4:7",
"3:5:5.0",
"4:10:1",
]
# Check if the file is written successfully.
assert os.path.isfile(filepath)
with open(filepath, 'r') as f:
with open(filepath, "r") as f:
line = f.readline()
assert line == '1 1:1:1 2:4:3 3:5:1.0 4:6:1\n'
assert line == "1 1:1:1 2:4:3 3:5:1.0 4:6:1\n"
# Parameters in the transformation should be reported correctly.
params = converter.get_params()
assert params == {
'field count': 4,
'feature count': 10,
'file path': filepath
}
assert params == {"field count": 4, "feature count": 10, "file path": filepath}
# Dataset with the same columns should be transformable with a fitted converter.
df_feature_new = pd.DataFrame({
'rating': [1, 0, 0, 1, 1, 1],
'field1': ['xxx1', 'xxx2', 'xxx4', 'xxx4', 'xxx4', 'xxx3'],
'field2': [3, 4, 5, 6, 7, 8],
'field3': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
'field4': ['1', '2', '3', '4', '5', '6']
})
df_feature_new = pd.DataFrame(
{
"rating": [1, 0, 0, 1, 1, 1],
"field1": ["xxx1", "xxx2", "xxx4", "xxx4", "xxx4", "xxx3"],
"field2": [3, 4, 5, 6, 7, 8],
"field3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
"field4": ["1", "2", "3", "4", "5", "6"],
}
)
df_feature_new_libffm = converter.transform(df_feature_new)
assert df_feature_new_libffm.iloc[0, :].values.tolist() == [1, '1:1:1', '2:5:3', '3:6:1.0', '4:7:1']
assert df_feature_new_libffm.iloc[-1, :].values.tolist() == [1, '1:4:1', '2:5:8', '3:6:6.0', '4:12:1']
assert df_feature_new_libffm.iloc[0, :].values.tolist() == [
1,
"1:1:1",
"2:5:3",
"3:6:1.0",
"4:7:1",
]
assert df_feature_new_libffm.iloc[-1, :].values.tolist() == [
1,
"1:4:1",
"2:5:8",
"3:6:6.0",
"4:12:1",
]
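The expected strings follow the libffm convention "label field:feature:value": each object column gets one feature index per distinct value, each numeric column gets a single index carrying its raw value, and indices are assigned left to right across fields. A sketch consistent with the assertions above (illustrative only, not LibffmConverter itself):

def libffm_encode_sketch(df, label_col="rating"):
    fields = [c for c in df.columns if c != label_col]
    index, next_feature = {}, 1
    for col in fields:
        if df[col].dtype == object:
            for val in df[col]:  # first-appearance order
                if (col, val) not in index:
                    index[(col, val)] = next_feature
                    next_feature += 1
        else:
            index[(col, None)] = next_feature
            next_feature += 1
    lines = []
    for _, row in df.iterrows():
        parts = [str(row[label_col])]
        for field_id, col in enumerate(fields, start=1):
            if df[col].dtype == object:
                parts.append("{}:{}:1".format(field_id, index[(col, row[col])]))
            else:
                parts.append("{}:{}:{}".format(field_id, index[(col, None)], row[col]))
        lines.append(" ".join(parts))
    return lines

# libffm_encode_sketch(df_feature)[0] -> "1 1:1:1 2:4:3 3:5:1.0 4:6:1"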
def test_has_columns():
df_1 = pd.DataFrame(dict(a=[1, 2, 3]))
df_2 = pd.DataFrame(dict(b=[7, 8, 9], a=[1, 2, 3]))
assert has_columns(df_1, ['a'])
assert has_columns(df_2, ['a'])
assert has_columns(df_2, ['a', 'b'])
assert not has_columns(df_2, ['a', 'b', 'c'])
assert has_columns(df_1, ["a"])
assert has_columns(df_2, ["a"])
assert has_columns(df_2, ["a", "b"])
assert not has_columns(df_2, ["a", "b", "c"])
def test_has_same_base_dtype():
@@ -180,7 +208,7 @@ def test_has_same_base_dtype():
arr_int64 = np.array([1, 2, 3], dtype=np.int64)
arr_float32 = np.array([1, 2, 3], dtype=np.float32)
arr_float64 = np.array([1, 2, 3], dtype=np.float64)
arr_str = ['a', 'b', 'c']
arr_str = ["a", "b", "c"]
df_1 = pd.DataFrame(dict(a=arr_int32, b=arr_int64))
df_2 = pd.DataFrame(dict(a=arr_int64, b=arr_int32))
@@ -192,42 +220,60 @@ def test_has_same_base_dtype():
# all columns match
assert has_same_base_dtype(df_1, df_2)
# specific column matches
assert has_same_base_dtype(df_3, df_4, columns=['a'])
assert has_same_base_dtype(df_3, df_4, columns=["a"])
# some column types do not match
assert not has_same_base_dtype(df_3, df_4)
# column types do not match
assert not has_same_base_dtype(df_1, df_3, columns=['a'])
assert not has_same_base_dtype(df_1, df_3, columns=["a"])
# all columns are not shared
assert not has_same_base_dtype(df_4, df_5)
# column types do not match
assert not has_same_base_dtype(df_5, df_6, columns=['a'])
assert not has_same_base_dtype(df_5, df_6, columns=["a"])
# assert string columns match
assert has_same_base_dtype(df_6, df_6)
def test_lru_cache_df():
df1 = pd.DataFrame(dict(a=[1, 2, 3], b=['a', 'b', 'c']))
df2 = pd.DataFrame(dict(a=[1, 2, 3], c=['a', 'b', 'c']))
df3 = pd.DataFrame(dict(a=[1, 2, 3], b=['a', 'b', 'd']))
df1 = pd.DataFrame(dict(a=[1, 2, 3], b=["a", "b", "c"]))
df2 = pd.DataFrame(dict(a=[1, 2, 3], c=["a", "b", "c"]))
df3 = pd.DataFrame(dict(a=[1, 2, 3], b=["a", "b", "d"]))
@lru_cache_df(maxsize=2)
def cached_func(df):
pass
assert 'CacheInfo(hits=0, misses=0, maxsize=2, currsize=0)' == str(cached_func.cache_info())
assert "CacheInfo(hits=0, misses=0, maxsize=2, currsize=0)" == str(
cached_func.cache_info()
)
cached_func(df1)
assert 'CacheInfo(hits=0, misses=1, maxsize=2, currsize=1)' == str(cached_func.cache_info())
assert "CacheInfo(hits=0, misses=1, maxsize=2, currsize=1)" == str(
cached_func.cache_info()
)
cached_func(df1)
assert 'CacheInfo(hits=1, misses=1, maxsize=2, currsize=1)' == str(cached_func.cache_info())
assert "CacheInfo(hits=1, misses=1, maxsize=2, currsize=1)" == str(
cached_func.cache_info()
)
cached_func(df2)
assert 'CacheInfo(hits=1, misses=2, maxsize=2, currsize=2)' == str(cached_func.cache_info())
assert "CacheInfo(hits=1, misses=2, maxsize=2, currsize=2)" == str(
cached_func.cache_info()
)
cached_func(df2)
assert 'CacheInfo(hits=2, misses=2, maxsize=2, currsize=2)' == str(cached_func.cache_info())
assert "CacheInfo(hits=2, misses=2, maxsize=2, currsize=2)" == str(
cached_func.cache_info()
)
cached_func(df3)
assert 'CacheInfo(hits=2, misses=3, maxsize=2, currsize=2)' == str(cached_func.cache_info())
assert "CacheInfo(hits=2, misses=3, maxsize=2, currsize=2)" == str(
cached_func.cache_info()
)
cached_func(df1)
assert 'CacheInfo(hits=2, misses=4, maxsize=2, currsize=2)' == str(cached_func.cache_info())
assert "CacheInfo(hits=2, misses=4, maxsize=2, currsize=2)" == str(
cached_func.cache_info()
)
cached_func(df3)
assert 'CacheInfo(hits=3, misses=4, maxsize=2, currsize=2)' == str(cached_func.cache_info())
assert "CacheInfo(hits=3, misses=4, maxsize=2, currsize=2)" == str(
cached_func.cache_info()
)
cached_func.cache_clear()
assert 'CacheInfo(hits=0, misses=0, maxsize=2, currsize=0)' == str(cached_func.cache_info())
assert "CacheInfo(hits=0, misses=0, maxsize=2, currsize=0)" == str(
cached_func.cache_info()
)
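functools.lru_cache cannot key on a DataFrame directly, so a decorator with this behaviour has to hash the frame's contents first; one possible sketch that reproduces the hit/miss pattern above (an assumed approach, not necessarily the reco_utils implementation):

import functools
import pandas as pd

class _HashableDf:
    # Wrap a DataFrame so it can serve as an lru_cache key (content-based hash).
    def __init__(self, df):
        self.df = df
        self._hash = hash(
            (tuple(df.columns), tuple(pd.util.hash_pandas_object(df, index=True)))
        )

    def __hash__(self):
        return self._hash

    def __eq__(self, other):
        return isinstance(other, _HashableDf) and self.df.equals(other.df)

def lru_cache_df_sketch(maxsize=128):
    def decorator(func):
        @functools.lru_cache(maxsize=maxsize)
        def cached(key):
            return func(key.df)

        @functools.wraps(func)
        def wrapper(df):
            return cached(_HashableDf(df))

        wrapper.cache_info = cached.cache_info
        wrapper.cache_clear = cached.cache_clear
        return wrapper

    return decorator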

View file

@@ -11,15 +11,13 @@ def test_line_graph():
x_guides=[0, 1],
x_name="Epoch",
y_name="Accuracy",
legend_loc='best'
legend_loc="best",
)
plt.close()
# Single graph as a subplot
line_graph(
values=[1, 2, 3],
labels="Train",
subplot=(1, 1, 1),
values=[1, 2, 3], labels="Train", subplot=(1, 1, 1),
)
plt.close()

View file

@@ -393,14 +393,22 @@ def test_python_errors(rating_true, rating_pred):
rmse(rating_true, rating_true, col_user="not_user")
with pytest.raises(ValueError):
mae(rating_pred, rating_pred, col_rating=DEFAULT_PREDICTION_COL, col_user="not_user")
mae(
rating_pred,
rating_pred,
col_rating=DEFAULT_PREDICTION_COL,
col_user="not_user",
)
with pytest.raises(ValueError):
rsquared(rating_true, rating_pred, col_item="not_item")
with pytest.raises(ValueError):
exp_var(
rating_pred, rating_pred, col_rating=DEFAULT_PREDICTION_COL, col_item="not_item"
rating_pred,
rating_pred,
col_rating=DEFAULT_PREDICTION_COL,
col_item="not_item",
)
with pytest.raises(ValueError):
@@ -414,5 +422,8 @@ def test_python_errors(rating_true, rating_pred):
with pytest.raises(ValueError):
map_at_k(
rating_pred, rating_pred, col_rating=DEFAULT_PREDICTION_COL, col_user="not_user"
rating_pred,
rating_pred,
col_rating=DEFAULT_PREDICTION_COL,
col_user="not_user",
)
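Each of these metrics evidently validates its column arguments before computing anything; a minimal sketch of such a guard (hypothetical helper, not the reco_utils code):

def check_columns_sketch(df, columns):
    missing = [col for col in columns if col not in df.columns]
    if missing:
        raise ValueError("Columns {} not found in DataFrame".format(missing))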

View file

@@ -119,7 +119,7 @@ def test_min_rating_filter():
{
DEFAULT_USER_COL: [1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5],
DEFAULT_ITEM_COL: [5, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1],
DEFAULT_RATING_COL: np.random.randint(1, 6, 15)
DEFAULT_RATING_COL: np.random.randint(1, 6, 15),
}
)
@@ -198,10 +198,11 @@ def test_random_splitter(test_specs, python_dataset):
# check values sum to 1
splits = python_random_split(
python_dataset, ratio=[.7, .2, .1], seed=test_specs["seed"]
python_dataset, ratio=[0.7, 0.2, 0.1], seed=test_specs["seed"]
)
assert(len(splits)) == 3
assert (len(splits)) == 3
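A ratio-based split like the one asserted here can be sketched as a single shuffle followed by cuts at the cumulative ratios (illustrative; python_random_split itself may differ in detail):

import numpy as np

def random_split_sketch(df, ratio, seed=None):
    shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    bounds = np.r_[0, (np.cumsum(ratio) / np.sum(ratio) * len(df)).astype(int)]
    return [shuffled.iloc[bounds[i]:bounds[i + 1]] for i in range(len(ratio))]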
def test_chrono_splitter(test_specs, python_dataset):
splits = python_chrono_split(
@@ -435,4 +436,3 @@ def test_float_numpy_stratified_splitter(test_specs, python_float_dataset):
assert Xtst_rated / X_rated == pytest.approx(
(1 - test_specs["ratio"]), rel=test_specs["fluctuation"]
)

View file

@@ -17,37 +17,43 @@ TOL = 0.0001
@pytest.fixture
def target_matrices(scope="module"):
J1 = np.array([[1.0, 0.0, 0.5],
[0.0, 1.0, 0.33333],
[0.5, 0.33333, 1.0]])
J2 = np.array([[1.0, 0.0, 0.0, 0.2],
[0.0, 1.0, 0.0, 0.0],
[0.0, 0.0, 1.0, 0.5],
[0.2, 0.0, 0.5, 1.0]])
L1 = np.array([[1.0, 0.0, 0.5],
[0.0, 0.5, 0.25],
[0.5, 0.25, 0.5]])
L2 = np.array([[0.5, 0.0, 0.0, 0.125],
[0.0, 0.33333, 0.0, 0.0],
[0.0, 0.0, 0.5, 0.25],
[0.125, 0.0, 0.25, 0.25]])
J1 = np.array([[1.0, 0.0, 0.5], [0.0, 1.0, 0.33333], [0.5, 0.33333, 1.0]])
J2 = np.array(
[
[1.0, 0.0, 0.0, 0.2],
[0.0, 1.0, 0.0, 0.0],
[0.0, 0.0, 1.0, 0.5],
[0.2, 0.0, 0.5, 1.0],
]
)
L1 = np.array([[1.0, 0.0, 0.5], [0.0, 0.5, 0.25], [0.5, 0.25, 0.5]])
L2 = np.array(
[
[0.5, 0.0, 0.0, 0.125],
[0.0, 0.33333, 0.0, 0.0],
[0.0, 0.0, 0.5, 0.25],
[0.125, 0.0, 0.25, 0.25],
]
)
return {
"jaccard1": pytest.approx(J1, TOL),
"jaccard2": pytest.approx(J2, TOL),
"lift1": pytest.approx(L1, TOL),
"lift2": pytest.approx(L2, TOL)
"lift2": pytest.approx(L2, TOL),
}
@pytest.fixture(scope="module")
def python_data():
cooccurrence1 = np.array([[1.0, 0.0, 1.0],
[0.0, 2.0, 1.0],
[1.0, 1.0, 2.0]])
cooccurrence2 = np.array([[2.0, 0.0, 0.0, 1.0],
[0.0, 3.0, 0.0, 0.0],
[0.0, 0.0, 2.0, 2.0],
[1.0, 0.0, 2.0, 4.0]])
cooccurrence1 = np.array([[1.0, 0.0, 1.0], [0.0, 2.0, 1.0], [1.0, 1.0, 2.0]])
cooccurrence2 = np.array(
[
[2.0, 0.0, 0.0, 1.0],
[0.0, 3.0, 0.0, 0.0],
[0.0, 0.0, 2.0, 2.0],
[1.0, 0.0, 2.0, 4.0],
]
)
return cooccurrence1, cooccurrence2
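The target matrices above follow directly from these counts: with c[i, j] the item co-occurrence, jaccard(i, j) = c[i, j] / (c[i, i] + c[j, j] - c[i, j]) and lift(i, j) = c[i, j] / (c[i, i] * c[j, j]). A vectorized check (formulas inferred from the fixtures, not taken from the SAR code):

import numpy as np

def jaccard_sketch(cooccurrence):
    diag = np.diag(cooccurrence)
    return cooccurrence / (diag[:, None] + diag[None, :] - cooccurrence)

def lift_sketch(cooccurrence):
    diag = np.diag(cooccurrence)
    return cooccurrence / (diag[:, None] * diag[None, :])

# jaccard_sketch(cooccurrence1) reproduces J1; lift_sketch(cooccurrence2) reproduces L2.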
@@ -75,14 +81,16 @@ def test_python_lift(python_data, target_matrices):
def test_exponential_decay():
values = np.array([1, 2, 3, 4, 5, 6])
expected = np.array([0.25, 0.35355339, 0.5, 0.70710678, 1., 1.])
expected = np.array([0.25, 0.35355339, 0.5, 0.70710678, 1.0, 1.0])
actual = exponential_decay(value=values, max_val=5, half_life=2)
assert np.allclose(actual, expected, atol=TOL)
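The expected values correspond to a half-life decay clipped at 1, i.e. min(1, 2 ** ((value - max_val) / half_life)); a quick numeric check of the assumed formula:

import numpy as np

values = np.array([1, 2, 3, 4, 5, 6])
decay = np.minimum(1.0, np.power(2.0, (values - 5) / 2.0))  # max_val=5, half_life=2
# decay -> [0.25, 0.35355339, 0.5, 0.70710678, 1.0, 1.0]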
def test_get_top_k_scored_items():
scores = np.array([[1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [1, 5, 3, 4, 2]])
top_items, top_scores = get_top_k_scored_items(scores=scores, top_k=3, sort_top_k=True)
top_items, top_scores = get_top_k_scored_items(
scores=scores, top_k=3, sort_top_k=True
)
assert np.array_equal(top_items, np.array([[4, 3, 2], [0, 1, 2], [1, 3, 2]]))
assert np.array_equal(top_scores, np.array([[5, 4, 3], [5, 4, 3], [5, 4, 3]]))
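A typical way to obtain sorted per-row top-k, consistent with the arrays asserted above (a sketch; get_top_k_scored_items may be implemented differently):

import numpy as np

def top_k_sketch(scores, top_k, sort_top_k=True):
    # argpartition selects the top_k column indices per row without a full sort
    top_items = np.argpartition(scores, -top_k, axis=1)[:, -top_k:]
    top_scores = np.take_along_axis(scores, top_items, axis=1)
    if sort_top_k:
        order = np.argsort(-top_scores, axis=1)
        top_items = np.take_along_axis(top_items, order, axis=1)
        top_scores = np.take_along_axis(top_scores, order, axis=1)
    return top_items, top_scores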

View file

@@ -274,30 +274,42 @@ def test_get_popularity_based_topk(header):
def test_get_normalized_scores(header):
train = pd.DataFrame({header["col_user"]: [1, 1, 1, 1, 2, 2, 2, 2],
header["col_item"]: [1, 2, 3, 4, 1, 5, 6, 7],
header["col_rating"]: [3., 4., 5., 4., 3., 2., 1., 5.],
header["col_timestamp"]: [1, 20, 30, 400, 50, 60, 70, 800]})
test = pd.DataFrame({header["col_user"]: [1, 1, 1, 2, 2, 2],
header["col_item"]: [5, 6, 7, 2, 3, 4],
header["col_rating"]: [2., 1., 5., 3., 4., 5.]})
train = pd.DataFrame(
{
header["col_user"]: [1, 1, 1, 1, 2, 2, 2, 2],
header["col_item"]: [1, 2, 3, 4, 1, 5, 6, 7],
header["col_rating"]: [3.0, 4.0, 5.0, 4.0, 3.0, 2.0, 1.0, 5.0],
header["col_timestamp"]: [1, 20, 30, 400, 50, 60, 70, 800],
}
)
test = pd.DataFrame(
{
header["col_user"]: [1, 1, 1, 2, 2, 2],
header["col_item"]: [5, 6, 7, 2, 3, 4],
header["col_rating"]: [2.0, 1.0, 5.0, 3.0, 4.0, 5.0],
}
)
model = SARSingleNode(**header, timedecay_formula=True, normalize=True)
model.fit(train)
actual = model.score(test, remove_seen=True, normalize=True)
expected = np.array([
[-np.inf, -np.inf, -np.inf, -np.inf, 3., 3., 3.],
[-np.inf, 3., 3., 3., -np.inf, -np.inf, -np.inf],
])
expected = np.array(
[
[-np.inf, -np.inf, -np.inf, -np.inf, 3.0, 3.0, 3.0],
[-np.inf, 3.0, 3.0, 3.0, -np.inf, -np.inf, -np.inf],
]
)
assert actual.shape == (2, 7)
assert isinstance(actual, np.ndarray)
assert np.isclose(expected, actual).all()
actual = model.score(test, normalize=True)
expected = np.array([
[3.80000633, 4.14285448, 4.14285448, 4.14285448, 3., 3., 3.],
[2.8000859, 3., 3., 3., 2.71441353, 2.71441353, 2.71441353]
])
expected = np.array(
[
[3.80000633, 4.14285448, 4.14285448, 4.14285448, 3.0, 3.0, 3.0],
[2.8000859, 3.0, 3.0, 3.0, 2.71441353, 2.71441353, 2.71441353],
]
)
assert actual.shape == (2, 7)
assert isinstance(actual, np.ndarray)

View file

@@ -29,16 +29,23 @@ from reco_utils.common.constants import (
)
from reco_utils.evaluation.python_evaluation import rmse
ITEM_FEAT_COL = 'itemFeat'
ITEM_FEAT_COL = "itemFeat"
@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def pd_df():
df = pd.DataFrame(
{
DEFAULT_USER_COL: [1, 1, 1, 2, 2, 2],
DEFAULT_ITEM_COL: [1, 2, 3, 1, 4, 5],
ITEM_FEAT_COL: [[1, 1, 1], [2, 2, 2], [3, 3, 3], [1, 1, 1], [4, 4, 4], [5, 5, 5]],
ITEM_FEAT_COL: [
[1, 1, 1],
[2, 2, 2],
[3, 3, 3],
[1, 1, 1],
[4, 4, 4],
[5, 5, 5],
],
DEFAULT_RATING_COL: [5, 4, 3, 5, 5, 3],
}
)
@@ -56,10 +63,10 @@ def test_pandas_input_fn(pd_df):
batch = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
features = sess.run(batch)
# check the input function returns all the columns
assert len(features) == len(df.columns)
for k, v in features.items():
assert k in df.columns.values
# check if a list feature column converted correctly
@@ -67,7 +74,7 @@ def test_pandas_input_fn(pd_df):
assert np.array_equal(v, df[k].values)
elif len(v.shape) == 2:
assert v.shape[1] == len(df[k][0])
# check dataset with shuffles
dataset = pandas_input_fn(df, shuffle=True, seed=SEED)()
batch = dataset.make_one_shot_iterator().get_next()
@@ -76,7 +83,7 @@ def test_pandas_input_fn(pd_df):
print(features)
# check the input function returns all the columns
assert len(features) == len(df.columns)
for k, v in features.items():
assert k in df.columns.values
# check if a list feature column converted correctly
@@ -90,30 +97,32 @@ def test_pandas_input_fn(pd_df):
batch = dataset_with_label.make_one_shot_iterator().get_next()
with tf.Session() as sess:
features, label = sess.run(batch)
assert len(features) == len(df.columns) - 1 # label should not be in the features
assert (
len(features) == len(df.columns) - 1
) # label should not be in the features
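The behaviour checked above (a dict of feature arrays, plus a label tensor when y_col is given, list columns stacked into 2-D arrays) can be sketched with tf.data as follows (TF 1.x assumed; illustrative, not the reco_utils pandas_input_fn):

import numpy as np
import tensorflow as tf

def pandas_input_fn_sketch(df, y_col=None, batch_size=128, num_epochs=1,
                           shuffle=False, seed=None):
    features = {}
    for col in df.columns:
        if col == y_col:
            continue
        values = df[col].values
        # list-valued columns (e.g. item feature vectors) become 2-D arrays
        features[col] = (
            np.stack(values) if isinstance(values[0], (list, np.ndarray)) else values
        )

    def input_fn():
        data = (features, df[y_col].values) if y_col is not None else features
        dataset = tf.data.Dataset.from_tensor_slices(data)
        if shuffle:
            dataset = dataset.shuffle(buffer_size=len(df), seed=seed)
        return dataset.repeat(num_epochs).batch(batch_size)

    return input_fn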
@pytest.mark.gpu
def test_build_optimizer():
adadelta = build_optimizer('Adadelta')
adadelta = build_optimizer("Adadelta")
assert isinstance(adadelta, tf.train.AdadeltaOptimizer)
adagrad = build_optimizer('Adagrad')
adagrad = build_optimizer("Adagrad")
assert isinstance(adagrad, tf.train.AdagradOptimizer)
adam = build_optimizer('Adam')
adam = build_optimizer("Adam")
assert isinstance(adam, tf.train.AdamOptimizer)
ftrl = build_optimizer('Ftrl', **{'l1_regularization_strength': 0.001})
assert isinstance(ftrl, tf.train.FtrlOptimizer)
ftrl = build_optimizer("Ftrl", **{"l1_regularization_strength": 0.001})
assert isinstance(ftrl, tf.train.FtrlOptimizer)
momentum = build_optimizer('Momentum', **{'momentum': 0.5})
momentum = build_optimizer("Momentum", **{"momentum": 0.5})
assert isinstance(momentum, tf.train.MomentumOptimizer)
rmsprop = build_optimizer('RMSProp')
rmsprop = build_optimizer("RMSProp")
assert isinstance(rmsprop, tf.train.RMSPropOptimizer)
sgd = build_optimizer('SGD')
sgd = build_optimizer("SGD")
assert isinstance(sgd, tf.train.GradientDescentOptimizer)
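The mapping exercised here is essentially a name-to-class lookup over tf.train optimizers; a sketch (TF 1.x assumed; the real build_optimizer lives in reco_utils and may handle defaults differently):

import tensorflow as tf

_OPTIMIZERS = {
    "Adadelta": tf.train.AdadeltaOptimizer,
    "Adagrad": tf.train.AdagradOptimizer,
    "Adam": tf.train.AdamOptimizer,
    "Ftrl": tf.train.FtrlOptimizer,
    "Momentum": tf.train.MomentumOptimizer,
    "RMSProp": tf.train.RMSPropOptimizer,
    "SGD": tf.train.GradientDescentOptimizer,
}

def build_optimizer_sketch(name, learning_rate=0.001, **kwargs):
    # e.g. build_optimizer_sketch("Momentum", **{"momentum": 0.5})
    return _OPTIMIZERS[name](learning_rate=learning_rate, **kwargs)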
@@ -125,12 +134,12 @@ def test_evaluation_log_hook(pd_df, tmp):
hook_frequency = 10
train_steps = 10
_, deep_columns = build_feature_columns(users, items, model_type='deep')
_, deep_columns = build_feature_columns(users, items, model_type="deep")
model = build_model(
tmp,
deep_columns=deep_columns,
save_checkpoints_steps=train_steps//hook_frequency
save_checkpoints_steps=train_steps // hook_frequency,
)
evaluation_logger = MetricsLogger()
@@ -143,7 +152,7 @@ def test_evaluation_log_hook(pd_df, tmp):
true_df=data,
y_col=DEFAULT_RATING_COL,
eval_df=data.drop(DEFAULT_RATING_COL, axis=1),
every_n_iter=train_steps//hook_frequency,
every_n_iter=train_steps // hook_frequency,
model_dir=tmp,
eval_fns=[rmse],
)
@@ -154,12 +163,12 @@ def test_evaluation_log_hook(pd_df, tmp):
y_col=DEFAULT_RATING_COL,
batch_size=1,
num_epochs=None,
shuffle=True
shuffle=True,
),
hooks=hooks,
steps=train_steps
steps=train_steps,
)
# Check if hook logged the given metric
assert rmse.__name__ in evaluation_logger.get_log()
assert len(evaluation_logger.get_log()[rmse.__name__]) == hook_frequency
@@ -175,20 +184,13 @@ def test_pandas_input_fn_for_saved_model(pd_df, tmp):
data, users, items = pd_df
model_dir = os.path.join(tmp, "model")
export_dir = os.path.join(tmp, "export")
_, deep_columns = build_feature_columns(users, items, model_type='deep')
_, deep_columns = build_feature_columns(users, items, model_type="deep")
# Train a model
model = build_model(
model_dir,
deep_columns=deep_columns,
)
model = build_model(model_dir, deep_columns=deep_columns,)
train_fn = pandas_input_fn(
df=data,
y_col=DEFAULT_RATING_COL,
batch_size=1,
num_epochs=None,
shuffle=True
df=data, y_col=DEFAULT_RATING_COL, batch_size=1, num_epochs=None, shuffle=True
)
model.train(input_fn=train_fn, steps=1)
@@ -196,32 +198,31 @@ def test_pandas_input_fn_for_saved_model(pd_df, tmp):
exported_path = export_model(
model=model,
train_input_fn=train_fn,
eval_input_fn=pandas_input_fn(
df=data, y_col=DEFAULT_RATING_COL
),
eval_input_fn=pandas_input_fn(df=data, y_col=DEFAULT_RATING_COL),
tf_feat_cols=deep_columns,
base_dir=export_dir
base_dir=export_dir,
)
saved_model = tf.contrib.estimator.SavedModelEstimator(exported_path)
# Test pandas_input_fn_for_saved_model with the saved model
test = data.drop(DEFAULT_RATING_COL, axis=1)
test.reset_index(drop=True, inplace=True)
list(itertools.islice(
saved_model.predict(
pandas_input_fn_for_saved_model(
df=test,
feat_name_type={
DEFAULT_USER_COL: int,
DEFAULT_ITEM_COL: int,
ITEM_FEAT_COL: list
}
)
),
len(test)
))
list(
itertools.islice(
saved_model.predict(
pandas_input_fn_for_saved_model(
df=test,
feat_name_type={
DEFAULT_USER_COL: int,
DEFAULT_ITEM_COL: int,
ITEM_FEAT_COL: list,
},
)
),
len(test),
)
)
# Close the event file so that the model folder can be cleaned up.
summary_writer = tf.summary.FileWriterCache.get(model.model_dir)
summary_writer.close()

View file

@@ -132,4 +132,3 @@ def test_wide_deep_model(pd_df, tmp):
# Close the event file so that the model folder can be cleaned up.
summary_writer = tf.summary.FileWriterCache.get(model.model_dir)
summary_writer.close()