add lambda rank azureml example

Jincheng Chen 2021-11-22 11:28:50 +00:00
Parent 02bb814189
Commit 7b5b359fa6
9 changed files with 203 additions and 65 deletions

10
azureml/README.md Normal file
View file

@@ -0,0 +1,10 @@
# AzureML example
This example shows how to use LightGBM transformation in Azure Machine Learning.
It builds a pipeline that trains with the lambdarank objective, then runs inference and evaluation, using the [Azure ML Component SDK](https://componentsdk.azurewebsites.net/overview.html).
## Steps
- Run `pip install -r requirements.txt` to install the dependencies needed for submission.
- Download the `config.json` of your Azure ML workspace and put it under the `azureml` folder (a sample is sketched below).
- Run `python run.py`.
- Click the experiment link that is printed and check the run in your workspace.
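For reference, `config.json` here is the standard Azure ML workspace configuration file, downloadable from the workspace overview page in the Azure portal. A minimal sketch with placeholder values, not real ids:

    {
        "subscription_id": "<subscription-id>",
        "resource_group": "<resource-group>",
        "workspace_name": "<workspace-name>"
    }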

61
azureml/components/evaluate/evaluate.py Normal file
View file

@@ -0,0 +1,61 @@
import argparse
import json
import logging
import os
from itertools import islice

import numpy as np
from sklearn.metrics import ndcg_score

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--predict-result',
        help='The prediction result for the evaluation data.',
    )
    parser.add_argument(
        '--evaluation-result',
        help='The output evaluation result.',
    )
    args, _ = parser.parse_known_args()
    predict_result = args.predict_result
    evaluation_result = args.evaluation_result
    logger = logging.getLogger()
    logging.basicConfig(level=logging.DEBUG, format='%(message)s')
    logger.info('>>>>>Evaluation Module>>>>>')
    if os.path.isdir(predict_result):
        logger.info('[Evaluation] Directory "{}" is provided, use default file name "pred.result"'.format(predict_result))
        predict_result = os.path.join(predict_result, 'pred.result')
    if os.path.isdir(evaluation_result):
        logger.info('[Evaluation] Directory "{}" is provided, use default file name "evaluation_result"'.format(evaluation_result))
        evaluation_result = os.path.join(evaluation_result, 'evaluation_result')
    with open(predict_result) as fin:
        predict_result_dict = json.load(fin)
    output_list = []
    pred_prob = np.array(predict_result_dict['pred_prob'])
    query = np.array(predict_result_dict['query']).astype(int)
    true_label = np.array(predict_result_dict['true_label']).astype(int)
    # Split the flat score/label arrays into per-query groups; query[i] holds
    # the number of documents belonging to the i-th query.
    pred_prob_iter = iter(pred_prob)
    pred_prob_res = [list(islice(pred_prob_iter, 0, q)) for q in query]
    true_label_iter = iter(true_label)
    true_label_res = [list(islice(true_label_iter, 0, q)) for q in query]
    # Compute NDCG per query and report both per-query scores and the average.
    ndcg_scores = []
    for true_relevance, scores in zip(true_label_res, pred_prob_res):
        true_relevance = np.asarray([true_relevance])
        scores = np.asarray([scores])
        ndcg_scores.append(ndcg_score(true_relevance, scores))
    ndcg_scores = np.array(ndcg_scores)
    output_list.append(f'[Evaluation] NDCG on evaluation data: {np.array2string(ndcg_scores)}')
    output_list.append(f'[Evaluation] Average NDCG on evaluation data: {np.average(ndcg_scores)}')
    output_str = '\n'.join(output_list)
    logger.info(output_str)
    with open(evaluation_result, 'w') as fout:
        fout.write(output_str)
    logger.info('<<<<<Evaluation Module<<<<<')
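As a quick sanity check of the grouping logic above (not part of the commit), the islice pattern can be exercised on a toy prediction result with two queries of sizes 2 and 3; all values here are made up:

    import numpy as np
    from itertools import islice
    from sklearn.metrics import ndcg_score

    pred_prob = np.array([0.9, 0.1, 0.3, 0.8, 0.5])  # flat scores for all documents
    true_label = np.array([1, 0, 0, 2, 1])           # flat relevance labels
    query = np.array([2, 3])                         # documents per query

    # Consume the flat arrays group by group, exactly as evaluate.py does.
    it = iter(pred_prob)
    pred_groups = [list(islice(it, 0, q)) for q in query]   # [[0.9, 0.1], [0.3, 0.8, 0.5]]
    it = iter(true_label)
    label_groups = [list(islice(it, 0, q)) for q in query]  # [[1, 0], [0, 2, 1]]
    scores = [ndcg_score([l], [p]) for l, p in zip(label_groups, pred_groups)]
    print(np.average(scores))  # 1.0 here, since both toy queries are perfectly ranked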

32
azureml/components/evaluate/evaluate.yaml Normal file
View file

@@ -0,0 +1,32 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
$schema: http://azureml/sdk-2-0/CommandComponent.json
name: microsoft.com.azureml.test.evaluation
version: 0.0.1
display_name: Evaluation
type: CommandComponent
description: Evaluation module
tags: {}
inputs:
  predict_result:
    type: path
    description: The prediction result for the evaluation data.
outputs:
  evaluation_result:
    type: path
    description: The output evaluation result.
command: >-
  python evaluate.py --predict-result {inputs.predict_result} --evaluation-result {outputs.evaluation_result}
environment:
  conda:
    conda_dependencies:
      name: project_environment
      channels:
        - defaults
      dependencies:
        - python=3.8
        - pip=20.0
        - pip:
          - scikit-learn==0.22.2
  os: Linux

azureml/components/infer/infer.py
View file

@@ -1,9 +1,11 @@
import argparse
-import logging
-import lightgbm as lgb
-import numpy as np
-import os
+import json
+import logging
+import os
+import tempfile
+from pathlib import Path
+import lightgbm as lgb
if __name__ == '__main__':
@@ -27,15 +29,8 @@ if __name__ == '__main__':
    infer_result = args.infer_result
    logger = logging.getLogger()
    logging.basicConfig(level=logging.DEBUG, format='%(message)s')
    logger.info('>>>>>LightGBM Infer Module>>>>>')
    if os.path.isdir(infer_data_path):
        logger.info(
            '[LightGBM Infer] Directory "{}" is provided, use default file name "tp.data"'.format(infer_data_path))
        infer_data_path = os.path.join(infer_data_path, 'tp.data')
    if os.path.isdir(model_path):
        logger.info(
            '[LightGBM Infer] Directory "{}" is provided, use default file name "lgbm.model"'.format(model_path))
@@ -47,7 +42,37 @@ if __name__ == '__main__':
        infer_result = os.path.join(infer_result, 'pred.result')
    bst = lgb.Booster(model_file=model_path)
-    pred_p = bst.predict(infer_data_path)
-    pred_l = np.argmax(pred_p, axis=1)
-    np.savetxt(infer_result, pred_p)
+    pred_prob = bst.predict(infer_data_path)
+    # Prepare inputs for evaluation; unnecessary if evaluation is not needed.
+    # Recover the parser config and the label/query column ids stored in the model file.
+    parser_config_str = ""
+    write_to_parser = False
+    label_id = -1
+    query_id = -1
+    with open(model_path) as fin:
+        for line in fin:
+            if line.startswith("label_index="):
+                label_id = int(line.strip().split("=")[1])
+            if line.startswith("[group_column"):
+                query_id = int(line.split(":")[1].strip().strip("]"))
+            if line.startswith("parser:"):
+                write_to_parser = True
+                continue
+            elif line.startswith("end of parser"):
+                write_to_parser = False
+            if write_to_parser:
+                parser_config_str += line
+    parser_config_path = Path(tempfile.mkdtemp()) / "parser_config.json"
+    with open(parser_config_path, 'w') as fout:
+        fout.write(parser_config_str)
+    infer_ds = lgb.Dataset(infer_data_path, params={"parser_config_file": parser_config_path,
+                                                    "label": label_id, "query": query_id})
+    infer_ds.construct()
+    true_label = infer_ds.get_label().astype(int)
+    query = infer_ds.get_group().astype(int)
+    infer_result_dict = {'pred_prob': pred_prob.tolist(), 'true_label': true_label.tolist(), 'query': query.tolist()}
+    with open(infer_result, "w") as fout:
+        json.dump(infer_result_dict, fout)
    logger.info('<<<<<LightGBM Infer Module<<<<<')
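The scan above keys off three markers in the saved model text: a label_index= line, a [group_column: ...] line, and a parser config block delimited by "parser:" and "end of parser". As an illustration only (a real transform-enabled model file contains many more sections), the parsing logic can be exercised on a toy string:

    # Toy model-file fragment (hypothetical contents; the markers match what infer.py reads).
    toy_model = "label_index=0\n[group_column: 1]\nparser:\n{}\nend of parser\n"
    for line in toy_model.splitlines(keepends=True):
        if line.startswith("label_index="):
            print(int(line.strip().split("=")[1]))             # -> 0
        if line.startswith("[group_column"):
            print(int(line.split(":")[1].strip().strip("]")))  # -> 1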

azureml/components/infer/infer.yaml
View file

@@ -2,7 +2,7 @@
# Licensed under the MIT License.
$schema: http://azureml/sdk-2-0/CommandComponent.json
-name: microsoft.com.azureml.test.integrated_transformlib_lightGBM_infer
+name: microsoft.com.azureml.test.integrated_freeform2_lightGBM_infer
version: 0.0.1
display_name: Integrated FreeForm2 lightGBM Infer
type: CommandComponent
@@ -23,10 +23,7 @@ command: >-
  python infer.py --infer-data-path {inputs.infer_data_path} --model-path {inputs.model_path} --infer-result {outputs.infer_result}
environment:
  docker:
-    image: transformprocessor.azurecr.io/lightgbm_transform_dependencies:20211109.v1
-    registry:
-      username: username
-      password: password
+    image: docker.io/krystal1130/lightgbm_transform_dependencies:20211109.v1
  conda:
    conda_dependencies:
      name: project_environment

azureml/components/train/train.py
View file

@@ -1,9 +1,9 @@
import argparse
import logging
-import lightgbm as lgb
import numpy as np
import os
+import lightgbm as lgb
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
@@ -13,7 +13,11 @@ if __name__ == '__main__':
    )
    parser.add_argument(
        '--parser-config-path',
-        help='The file path for the feature spec.',
+        help='The JSON file path for the parser config.',
    )
+    parser.add_argument(
+        '--label',
+        help='Label column name.',
+    )
    parser.add_argument(
        '--model-path',
@@ -22,6 +26,7 @@ if __name__ == '__main__':
    args, _ = parser.parse_known_args()
    train_data_path = args.train_data_path
    parser_config_path = args.parser_config_path
+    label = args.label
    model_path = args.model_path
    logger = logging.getLogger()
@@ -31,23 +36,27 @@ if __name__ == '__main__':
    train_data = lgb.Dataset(train_data_path,
                             params={
                                 "parser_config_file": parser_config_path,
-                                 "label": 'name:m:Rating'
+                                 "label": f'name:{label}'
                             })
    train_data.construct()
    params = {
-        'learning_rate': 0.1,
-        'lambda_l1': 0.1,
-        'lambda_l2': 0.2,
-        'max_depth': 4,
-        'objective': 'multiclass',
-        'num_class': np.max(train_data.get_label().astype(int)) + 1,
-        'seed': 2021,
-        'boosting': 'gbdt',
+        'learning_rate': 0.22,
+        'objective': 'lambdarank',
+        'ndcg_eval_at': "1,3,5",
+        'metric_freq': 1,
+        'label_gain': ','.join([str(i) for i in range(350)]),
+        'metric': 'ndcg',
+        'num_trees': 800,
+        'num_leaves': 300,
+        'min_data_in_leaf': 50,
+        'max_bin': 16,
+        'query': 0,
+        'feature_fraction': 0.15,
+        # Make sure the result is stable given the same input
+        'deterministic': 'true',
+        'force_col_wise': 'true'
    }
-    bst = lgb.train(params, train_data)
+    bst = lgb.train(params, train_data, valid_sets=[train_data])
    logger.info('Finished training.')
    if os.path.isdir(model_path):
        logger.info(

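For context (not part of the commit): the parameters above rely on the transform fork's parser integration, but the same lambdarank setup can be tried with stock LightGBM by passing features and per-query group sizes directly. A minimal sketch on synthetic data; every value below is illustrative:

    import numpy as np
    import lightgbm as lgb

    rng = np.random.default_rng(2021)
    X = rng.random((100, 10))           # 100 documents, 10 features
    y = rng.integers(0, 5, size=100)    # graded relevance labels 0..4
    group = [10] * 10                   # 10 queries with 10 documents each

    train_ds = lgb.Dataset(X, label=y, group=group)
    params = {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'ndcg_eval_at': '1,3,5',
        'learning_rate': 0.22,
        # LightGBM's default label_gain is 2^i - 1; train.py overrides it with
        # a linear gain of i for labels 0..349, which these toy labels fit into.
        'label_gain': ','.join(str(i) for i in range(350)),
        'deterministic': True,
        'force_col_wise': True,
    }
    bst = lgb.train(params, train_ds, num_boost_round=10, valid_sets=[train_ds])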
azureml/components/train/train.yaml
View file

@@ -2,7 +2,7 @@
# Licensed under the MIT License.
$schema: http://azureml/sdk-2-0/CommandComponent.json
-name: microsoft.com.azureml.test.integrated_transformLib_lightGBM_train
+name: microsoft.com.azureml.test.integrated_freeform2_lightGBM_train
version: 0.0.1
display_name: Integrated FreeForm2 lightGBM Train
type: CommandComponent
@@ -14,19 +14,19 @@ inputs:
    description: The file path for the training data.
  parser_config_path:
    type: path
-    description: The file path for the feature spec.
+    description: The JSON file path for the parser config.
+  label:
+    type: string
+    description: The label column name.
outputs:
  model_path:
    type: path
    description: The output file path for the trained LightGBM model.
command: >-
-  python train.py --train-data-path {inputs.train_data_path} --parser-config-path {inputs.parser_config_path} --model-path {outputs.model_path}
+  python train.py --train-data-path {inputs.train_data_path} --parser-config-path {inputs.parser_config_path} --label {inputs.label} --model-path {outputs.model_path}
environment:
  docker:
-    image: transformprocessor.azurecr.io/lightgbm_transform_dependencies:20211109.v1
-    registry:
-      username: username
-      password: password
+    image: docker.io/krystal1130/lightgbm_transform_dependencies:20211109.v1
  conda:
    conda_dependencies:
      name: project_environment

2
azureml/requirements.txt Normal file
View file

@@ -0,0 +1,2 @@
azure-ml-component
azureml-defaults

azureml/run.py
View file

@ -1,12 +1,13 @@
# Initialize the workspace
from azureml.core import Workspace
from azureml.core import Workspace, Dataset
from azureml.core.compute import AmlCompute, ComputeTarget
from azure.ml.component import Component
from azure.ml.component import dsl
# Initialize the workspace
config_path = "config.json"
ws = Workspace.from_config(path=config_path)
# Retrieve or create the computer target
from azureml.core.compute import AmlCompute, ComputeTarget
cluster_name = "cpu-cluster"
if cluster_name not in ws.compute_targets:
print('Creating a new compute target...')
@@ -14,39 +15,40 @@ if cluster_name not in ws.compute_targets:
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    compute_target.wait_for_completion(show_output=True, timeout_in_minutes=20)
+# Prepare dataset
+datastore = ws.get_default_datastore()
+data_dir = "data"
+training_data_name = 'train.tsv'
+inference_data_name = 'test.tsv'
+parser_config_name = 'parser_config_file.json'
+datastore.upload(src_dir=data_dir, target_path=data_dir, overwrite=True, show_progress=True)
+training_set = Dataset.File.from_files(path=[(datastore, f'{data_dir}/{training_data_name}')])
+inference_set = Dataset.File.from_files(path=[(datastore, f'{data_dir}/{inference_data_name}')])
+parser_config = Dataset.File.from_files(path=[(datastore, f'{data_dir}/{parser_config_name}')])
# Create components
-from azure.ml.component import Component
train_component_func = Component.from_yaml(ws, yaml_file='components/train/train.yaml')
infer_component_func = Component.from_yaml(ws, yaml_file='components/infer/infer.yaml')
eval_component_func = Component.from_yaml(ws, yaml_file='components/evaluate/evaluate.yaml')
# Create pipeline
-from azure.ml.component import dsl
# define a pipeline
@dsl.pipeline(name='A_test_pipeline_use_lightGBM_transform',
              description='Test lightGBM feature transformation binding',
              default_compute_target=cluster_name)
def test_pipeline(training_set, inference_set, parser_config):
-    lgbm_train = train_component_func(train_data_path=training_set,
-                                      parser_config_path=parser_config)
-    lgbm_train.comment = "Use FreeForm2 parser to do built-in feature transformation in training."
-    lgbm_infer = infer_component_func(infer_data_path=inference_set, model_path=lgbm_train.outputs.model_path)
-    lgbm_infer.comment = "The model has saved parser info, will do the transform automatically, " \
-                         "no need to do separate data processing for test data."
+    train = train_component_func(train_data_path=training_set, parser_config_path=parser_config, label="m:Rating")
+    train.comment = "Use FreeForm2 parser to do built-in feature transformation in training."
+    infer = infer_component_func(infer_data_path=inference_set, model_path=train.outputs.model_path)
+    infer.comment = "The model has saved parser info, will do the transform automatically, " \
+                    "no need to do separate data processing for test data."
+    eval = eval_component_func(predict_result=infer.outputs.infer_result)
+    eval.comment = "Calculate NDCG for lambdarank."
-training_data_name = 'TransformProcessorTest413_data'
-inference_data_name = 'TransformProcessorTest413_data'
-parser_config_name = 'FreeForm2ParserConfig'
-from azureml.core import Dataset
-training_set = Dataset.get_by_name(ws, name=training_data_name)
-inference_set = Dataset.get_by_name(ws, name=inference_data_name)
-parser_config = Dataset.get_by_name(ws, name=parser_config_name)
-pipeline = test_pipeline(training_set=training_set, inference_set=inference_set, parser_config=parser_config) #, header_file=header_file)
# Online run
+pipeline = test_pipeline(training_set=training_set, inference_set=inference_set, parser_config=parser_config)
run = pipeline.submit(experiment_name='model-comparison-integrated-transform')
run
# run.wait_for_completion()
# run.wait_for_completion()