add lambda rank azureml example
This commit is contained in:
Родитель
02bb814189
Коммит
7b5b359fa6
|
@ -0,0 +1,10 @@
|
|||
# AzureML example
|
||||
|
||||
This document shows how to use the LightGBM transformation in Azure Machine Learning.
|
||||
The example builds a pipeline that trains a model with the lambdarank objective, runs inference, and evaluates the result, using the [Azure ML Component SDK](https://componentsdk.azurewebsites.net/overview.html).
|
||||
|
||||
## Steps
|
||||
- Run `pip install -r requirements.txt` to install the dependencies needed for submission.
|
||||
- Download the `config.json` of your Azure ML workspace and put it in the `azureml` folder.
|
||||
- Run command `python run.py`.
|
||||
- Click the experiment link that is shown and check the run in your workspace.
|
|
@ -0,0 +1,61 @@
|
|||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from itertools import islice
|
||||
|
||||
import numpy as np
|
||||
from sklearn.metrics import ndcg_score
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Evaluation component entry point.
    #
    # Reads the JSON prediction file written by the inference step (keys
    # 'pred_prob', 'true_label' and 'query'), computes one NDCG score per
    # query group plus their average, logs the report and writes it to the
    # evaluation result file.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--predict-result',
        required=True,  # otherwise os.path.isdir(None) fails with an opaque TypeError
        help='The predict result for the evaluating data.',
    )
    parser.add_argument(
        '--evaluation-result',
        required=True,
        help='The output evaluation result.',
    )
    # parse_known_args: unrecognised extra arguments are ignored, not rejected.
    args, _ = parser.parse_known_args()
    predict_result = args.predict_result
    evaluation_result = args.evaluation_result

    logger = logging.getLogger()
    logging.basicConfig(level=logging.DEBUG, format='%(message)s')

    logger.info('>>>>>Evaluation Module>>>>>')
    # Either path may be a directory; fall back to a default file name inside it.
    if os.path.isdir(predict_result):
        logger.info('[Evaluation] Directory "{}" is provided, use default file name "pred.result"'.format(predict_result))
        predict_result = os.path.join(predict_result, 'pred.result')

    if os.path.isdir(evaluation_result):
        logger.info('[Evaluation] Directory "{}" is provided, use default file name "evaluation_result"'.format(evaluation_result))
        evaluation_result = os.path.join(evaluation_result, 'evaluation_result')

    with open(predict_result, encoding='utf-8') as fin:
        predict_result_dict = json.load(fin)

    pred_prob = np.array(predict_result_dict['pred_prob'])
    query = np.array(predict_result_dict["query"]).astype(int)
    true_label = np.array(predict_result_dict["true_label"]).astype(int)

    # The group sizes must account for every prediction and label; otherwise
    # the slicing below would silently truncate or drop groups and report a
    # misleading NDCG.
    expected_rows = int(query.sum())
    if not (len(pred_prob) == len(true_label) == expected_rows):
        raise ValueError(
            'Inconsistent predict result: {} predictions, {} labels, '
            'but query group sizes sum to {}.'.format(
                len(pred_prob), len(true_label), expected_rows))

    # Walk both flat arrays in lockstep, consuming `group_size` consecutive
    # rows per query group.
    pred_prob_iter = iter(pred_prob)
    true_label_iter = iter(true_label)
    ndcg_scores = []
    for group_size in query:
        group_size = int(group_size)
        true_relevance = list(islice(true_label_iter, group_size))
        scores = list(islice(pred_prob_iter, group_size))
        # Each group is wrapped in an outer list so ndcg_score receives
        # 2-D input containing a single row.
        ndcg_scores.append(
            ndcg_score(np.asarray([true_relevance]), np.asarray([scores])))

    ndcg_scores = np.array(ndcg_scores)
    output_list = [
        f'[Evaluation] NDCG on evaluation data: {np.array2string(ndcg_scores)}',
        f'[Evaluation] Average NDCG on evaluation data: {np.average(ndcg_scores)}',
    ]
    output_str = '\n'.join(output_list)
    logger.info(output_str)
    with open(evaluation_result, 'w', encoding='utf-8') as fout:
        fout.write(output_str)

    logger.info('<<<<<Evaluation Module<<<<<')
|
|
@ -0,0 +1,32 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
$schema: http://azureml/sdk-2-0/CommandComponent.json
|
||||
name: microsoft.com.azureml.test.evaluation
|
||||
version: 0.0.1
|
||||
display_name: Evaluation
|
||||
type: CommandComponent
|
||||
description: Evaluation module
|
||||
tags: {}
|
||||
inputs:
|
||||
predict_result:
|
||||
type: path
|
||||
description: The predict result for the evaluating data.
|
||||
outputs:
|
||||
evaluation_result:
|
||||
type: path
|
||||
description: The output evaluation result.
|
||||
command: >-
|
||||
python evaluate.py --predict-result {inputs.predict_result} --evaluation-result {outputs.evaluation_result}
|
||||
environment:
|
||||
conda:
|
||||
conda_dependencies:
|
||||
name: project_environment
|
||||
channels:
|
||||
- defaults
|
||||
dependencies:
|
||||
- python=3.8
|
||||
- pip=20.0
|
||||
- pip:
|
||||
- scikit-learn==0.22.2
|
||||
os: Linux
|
|
@ -1,9 +1,11 @@
|
|||
import argparse
|
||||
import logging
|
||||
import lightgbm as lgb
|
||||
import numpy as np
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import lightgbm as lgb
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@ -27,15 +29,8 @@ if __name__ == '__main__':
|
|||
infer_result = args.infer_result
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG, format='%(message)s')
|
||||
|
||||
logger.info('>>>>>LightGBM Infer Module>>>>>')
|
||||
if os.path.isdir(infer_data_path):
|
||||
logger.info(
|
||||
'[LightGBM Infer] Directory "{}" is provided, use default file name "tp.data"'.format(infer_data_path))
|
||||
infer_data_path = os.path.join(infer_data_path, 'tp.data')
|
||||
|
||||
if os.path.isdir(model_path):
|
||||
logger.info(
|
||||
'[LightGBM Infer] Directory "{}" is provided, use default file name "lgbm.model"'.format(model_path))
|
||||
|
@ -47,7 +42,37 @@ if __name__ == '__main__':
|
|||
infer_result = os.path.join(infer_result, 'pred.result')
|
||||
|
||||
bst = lgb.Booster(model_file=model_path)
|
||||
pred_p = bst.predict(infer_data_path)
|
||||
pred_l = np.argmax(pred_p, axis=1)
|
||||
np.savetxt(infer_result, pred_p)
|
||||
pred_prob = bst.predict(infer_data_path)
|
||||
# Prepare the data needed for evaluation; unnecessary if evaluation is not required.
|
||||
parser_config_str = ""
|
||||
write_to_parser = False
|
||||
label_id = -1
|
||||
query_id = -1
|
||||
with open(model_path) as fin:
|
||||
for line in fin:
|
||||
if line.startswith("label_index="):
|
||||
label_id = int(line.strip().split("=")[1])
|
||||
if line.startswith("[group_column"):
|
||||
query_id = int(line.split(":")[1].strip().strip("]"))
|
||||
|
||||
if line.startswith("parser:"):
|
||||
write_to_parser = True
|
||||
continue
|
||||
elif line.startswith("end of parser"):
|
||||
write_to_parser = False
|
||||
|
||||
if write_to_parser:
|
||||
parser_config_str += line
|
||||
|
||||
parser_config_path = Path(tempfile.mkdtemp()) / "parser_config.json"
|
||||
with open(parser_config_path, 'w') as fout:
|
||||
fout.write(parser_config_str)
|
||||
infer_ds = lgb.Dataset(infer_data_path, params={"parser_config_file": parser_config_path,
|
||||
"label": label_id, "query": query_id})
|
||||
infer_ds.construct()
|
||||
true_label = infer_ds.get_label().astype(int)
|
||||
query = infer_ds.get_group().astype(int)
|
||||
infer_result_dict = {'pred_prob': pred_prob.tolist(), 'true_label': true_label.tolist(), 'query': query.tolist()}
|
||||
with open(infer_result, "w") as fout:
|
||||
json.dump(infer_result_dict, fout)
|
||||
logger.info('<<<<<LightGBM Infer Module<<<<<')
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
# Licensed under the MIT License.
|
||||
|
||||
$schema: http://azureml/sdk-2-0/CommandComponent.json
|
||||
name: microsoft.com.azureml.test.integrated_transformlib_lightGBM_infer
|
||||
name: microsoft.com.azureml.test.integrated_freeform2_lightGBM_infer
|
||||
version: 0.0.1
|
||||
display_name: Integrated FreeForm2 lightGBM Infer
|
||||
type: CommandComponent
|
||||
|
@ -23,10 +23,7 @@ command: >-
|
|||
python infer.py --infer-data-path {inputs.infer_data_path} --model-path {inputs.model_path} --infer-result {outputs.infer_result}
|
||||
environment:
|
||||
docker:
|
||||
image: transformprocessor.azurecr.io/lightgbm_transform_dependencies:20211109.v1
|
||||
registry:
|
||||
username: username
|
||||
password: password
|
||||
image: docker.io/krystal1130/lightgbm_transform_dependencies:20211109.v1
|
||||
conda:
|
||||
conda_dependencies:
|
||||
name: project_environment
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
import argparse
|
||||
import logging
|
||||
import lightgbm as lgb
|
||||
import numpy as np
|
||||
import os
|
||||
|
||||
import lightgbm as lgb
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
|
@ -13,7 +13,11 @@ if __name__ == '__main__':
|
|||
)
|
||||
parser.add_argument(
|
||||
'--parser-config-path',
|
||||
help='The file path for the feature spec.',
|
||||
help='The json path for parser config.',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--label',
|
||||
help='Label column name.',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--model-path',
|
||||
|
@ -22,6 +26,7 @@ if __name__ == '__main__':
|
|||
args, _ = parser.parse_known_args()
|
||||
train_data_path = args.train_data_path
|
||||
parser_config_path = args.parser_config_path
|
||||
label = args.label
|
||||
model_path = args.model_path
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
@ -31,23 +36,27 @@ if __name__ == '__main__':
|
|||
train_data = lgb.Dataset(train_data_path,
|
||||
params={
|
||||
"parser_config_file": parser_config_path,
|
||||
"label": 'name:m:Rating'
|
||||
"label": f'name:{label}'
|
||||
})
|
||||
train_data.construct()
|
||||
|
||||
params = {
|
||||
'learning_rate': 0.1,
|
||||
'lambda_l1': 0.1,
|
||||
'lambda_l2': 0.2,
|
||||
'max_depth': 4,
|
||||
'objective': 'multiclass',
|
||||
'num_class': np.max(train_data.get_label().astype(int)) + 1,
|
||||
'seed': 2021,
|
||||
'boosting': 'gbdt',
|
||||
'learning_rate': 0.22,
|
||||
'objective': 'lambdarank',
|
||||
'ndcg_eval_at': "1,3,5",
|
||||
'metric_freq': 1,
|
||||
'label_gain': ','.join([str(i) for i in range(350)]),
|
||||
'metric': 'ndcg',
|
||||
'num_trees': 800,
|
||||
'num_leaves': 300,
|
||||
'min_data_in_leaf': 50,
|
||||
'max_bin': 16,
|
||||
'query': 0,
|
||||
'feature_fraction': 0.15,
|
||||
# Ensure stable results given the same input
|
||||
'deterministic': 'true',
|
||||
'force_col_wise': 'true'
|
||||
}
|
||||
|
||||
bst = lgb.train(params, train_data)
|
||||
bst = lgb.train(params, train_data, valid_sets=[train_data])
|
||||
logger.info('Finished training.')
|
||||
if os.path.isdir(model_path):
|
||||
logger.info(
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
# Licensed under the MIT License.
|
||||
|
||||
$schema: http://azureml/sdk-2-0/CommandComponent.json
|
||||
name: microsoft.com.azureml.test.integrated_transformLib_lightGBM_train
|
||||
name: microsoft.com.azureml.test.integrated_freeform2_lightGBM_train
|
||||
version: 0.0.1
|
||||
display_name: Integrated FreeForm2 lightGBM Train
|
||||
type: CommandComponent
|
||||
|
@ -14,19 +14,19 @@ inputs:
|
|||
description: The file path for the training data.
|
||||
parser_config_path:
|
||||
type: path
|
||||
description: The file path for the feature spec.
|
||||
description: The json path for parser config.
|
||||
label:
|
||||
type: string
|
||||
description: label column name
|
||||
outputs:
|
||||
model_path:
|
||||
type: path
|
||||
description: The output file path for the trained LightGBM model.
|
||||
command: >-
|
||||
python train.py --train-data-path {inputs.train_data_path} --parser-config-path {inputs.parser_config_path} --model-path {outputs.model_path}
|
||||
python train.py --train-data-path {inputs.train_data_path} --parser-config-path {inputs.parser_config_path} --label {inputs.label} --model-path {outputs.model_path}
|
||||
environment:
|
||||
docker:
|
||||
image: transformprocessor.azurecr.io/lightgbm_transform_dependencies:20211109.v1
|
||||
registry:
|
||||
username: username
|
||||
password: password
|
||||
image: docker.io/krystal1130/lightgbm_transform_dependencies:20211109.v1
|
||||
conda:
|
||||
conda_dependencies:
|
||||
name: project_environment
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
azure-ml-component
|
||||
azureml-defaults
|
|
@ -1,12 +1,13 @@
|
|||
# Initialize the workspace
|
||||
from azureml.core import Workspace
|
||||
from azureml.core import Workspace, Dataset
|
||||
from azureml.core.compute import AmlCompute, ComputeTarget
|
||||
from azure.ml.component import Component
|
||||
from azure.ml.component import dsl
|
||||
|
||||
# Initialize the workspace
|
||||
config_path = "config.json"
|
||||
ws = Workspace.from_config(path=config_path)
|
||||
|
||||
# Retrieve or create the compute target
|
||||
from azureml.core.compute import AmlCompute, ComputeTarget
|
||||
|
||||
cluster_name = "cpu-cluster"
|
||||
if cluster_name not in ws.compute_targets:
|
||||
print('Creating a new compute target...')
|
||||
|
@ -14,39 +15,40 @@ if cluster_name not in ws.compute_targets:
|
|||
compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
|
||||
compute_target.wait_for_completion(show_output=True, timeout_in_minutes=20)
|
||||
|
||||
# Prepare dataset
|
||||
datastore = ws.get_default_datastore()
|
||||
data_dir = "data"
|
||||
training_data_name = 'train.tsv'
|
||||
inference_data_name = 'test.tsv'
|
||||
parser_config_name = 'parser_config_file.json'
|
||||
datastore.upload(src_dir=data_dir, target_path=data_dir, overwrite=True, show_progress=True)
|
||||
training_set = Dataset.File.from_files(path=[(datastore, f'{data_dir}/{training_data_name}')])
|
||||
inference_set = Dataset.File.from_files(path=[(datastore, f'{data_dir}/{inference_data_name}')])
|
||||
parser_config = Dataset.File.from_files(path=[(datastore, f'{data_dir}/{parser_config_name}')])
|
||||
|
||||
# Create components
|
||||
from azure.ml.component import Component
|
||||
train_component_func = Component.from_yaml(ws, yaml_file='components/train/train.yaml')
|
||||
infer_component_func = Component.from_yaml(ws, yaml_file='components/infer/infer.yaml')
|
||||
eval_component_func = Component.from_yaml(ws, yaml_file='components/evaluate/evaluate.yaml')
|
||||
|
||||
|
||||
# Create pipeline
|
||||
from azure.ml.component import dsl
|
||||
|
||||
# define a pipeline
|
||||
@dsl.pipeline(name='A_test_pipeline_use_lightGBM_transform',
|
||||
description='Test lightGBM feature transformation binding',
|
||||
default_compute_target=cluster_name)
|
||||
def test_pipeline(training_set, inference_set, parser_config):
|
||||
lgbm_train = train_component_func(train_data_path=training_set,
|
||||
parser_config_path=parser_config)
|
||||
lgbm_train.comment = "Use FreeForm2 parser to do built-in feature transformation in training."
|
||||
lgbm_infer = infer_component_func(infer_data_path=inference_set, model_path=lgbm_train.outputs.model_path)
|
||||
lgbm_infer.comment = "The model has saved parser info, will do the transform automatically, " \
|
||||
"no need to do separate data processing for test data."
|
||||
train = train_component_func(train_data_path=training_set, parser_config_path=parser_config, label="m:Rating")
|
||||
train.comment = "Use FreeForm2 parser to do built-in feature transformation in training."
|
||||
infer = infer_component_func(infer_data_path=inference_set, model_path=train.outputs.model_path)
|
||||
infer.comment = "The model has saved parser info, will do the transform automatically, " \
|
||||
"no need to do separate data processing for test data."
|
||||
eval = eval_component_func(predict_result=infer.outputs.infer_result)
|
||||
eval.comment = "Calculate NDCG for lambda rank."
|
||||
|
||||
|
||||
training_data_name = 'TransformProcessorTest413_data'
|
||||
inference_data_name = 'TransformProcessorTest413_data'
|
||||
parser_config_name = 'FreeForm2ParserConfig'
|
||||
|
||||
from azureml.core import Dataset
|
||||
|
||||
training_set = Dataset.get_by_name(ws, name=training_data_name)
|
||||
inference_set = Dataset.get_by_name(ws, name=inference_data_name)
|
||||
parser_config = Dataset.get_by_name(ws, name=parser_config_name)
|
||||
pipeline = test_pipeline(training_set=training_set, inference_set=inference_set, parser_config=parser_config) #, header_file=header_file)
|
||||
|
||||
# Online run
|
||||
pipeline = test_pipeline(training_set=training_set, inference_set=inference_set, parser_config=parser_config)
|
||||
run = pipeline.submit(experiment_name='model-comparison-integrated-transform')
|
||||
run
|
||||
# run.wait_for_completion()
|
||||
|
|
Загрузка…
Ссылка в новой задаче