add lambda rank azureml example

Jincheng Chen 2021-11-22 11:28:50 +00:00
Parent 02bb814189
Commit 7b5b359fa6
9 changed files with 203 additions and 65 deletions

10
azureml/README.md Normal file
View file

@@ -0,0 +1,10 @@
# AzureML example
This example shows how to use LightGBM transformation in Azure Machine Learning.
It builds a pipeline that trains with the lambdarank objective, then runs inference and evaluation, using the [Azure ML Component SDK](https://componentsdk.azurewebsites.net/overview.html).
## Steps
- Run `pip install -r requirements.txt` to install the dependencies needed for submission.
- Download the `config.json` of your Azure ML workspace and put it under the `azureml` folder (a sample is sketched below).
- Run `python run.py`.
- Click the experiment link that is printed and check the run in your workspace.
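For reference, `config.json` here is the standard Azure ML workspace configuration file, downloadable from the workspace overview page in the Azure portal. A minimal sketch with placeholder values, not real ids:

    {
        "subscription_id": "<subscription-id>",
        "resource_group": "<resource-group>",
        "workspace_name": "<workspace-name>"
    }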

61
azureml/components/evaluate/evaluate.py Normal file
View file

@@ -0,0 +1,61 @@
import argparse
import json
import logging
import os
from itertools import islice

import numpy as np
from sklearn.metrics import ndcg_score

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--predict-result',
        help='The prediction result for the evaluation data.',
    )
    parser.add_argument(
        '--evaluation-result',
        help='The output evaluation result.',
    )
    args, _ = parser.parse_known_args()
    predict_result = args.predict_result
    evaluation_result = args.evaluation_result
    logger = logging.getLogger()
    logging.basicConfig(level=logging.DEBUG, format='%(message)s')
    logger.info('>>>>>Evaluation Module>>>>>')
    if os.path.isdir(predict_result):
        logger.info('[Evaluation] Directory "{}" is provided, use default file name "pred.result"'.format(predict_result))
        predict_result = os.path.join(predict_result, 'pred.result')
    if os.path.isdir(evaluation_result):
        logger.info('[Evaluation] Directory "{}" is provided, use default file name "evaluation_result"'.format(evaluation_result))
        evaluation_result = os.path.join(evaluation_result, 'evaluation_result')
    with open(predict_result) as fin:
        predict_result_dict = json.load(fin)
    output_list = []
    pred_prob = np.array(predict_result_dict['pred_prob'])
    query = np.array(predict_result_dict['query']).astype(int)
    true_label = np.array(predict_result_dict['true_label']).astype(int)
    # Split the flat score/label arrays into per-query groups; query[i] holds
    # the number of documents belonging to the i-th query.
    pred_prob_iter = iter(pred_prob)
    pred_prob_res = [list(islice(pred_prob_iter, 0, q)) for q in query]
    true_label_iter = iter(true_label)
    true_label_res = [list(islice(true_label_iter, 0, q)) for q in query]
    # Compute NDCG per query and report both per-query scores and the average.
    ndcg_scores = []
    for true_relevance, scores in zip(true_label_res, pred_prob_res):
        true_relevance = np.asarray([true_relevance])
        scores = np.asarray([scores])
        ndcg_scores.append(ndcg_score(true_relevance, scores))
    ndcg_scores = np.array(ndcg_scores)
    output_list.append(f'[Evaluation] NDCG on evaluation data: {np.array2string(ndcg_scores)}')
    output_list.append(f'[Evaluation] Average NDCG on evaluation data: {np.average(ndcg_scores)}')
    output_str = '\n'.join(output_list)
    logger.info(output_str)
    with open(evaluation_result, 'w') as fout:
        fout.write(output_str)
    logger.info('<<<<<Evaluation Module<<<<<')
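As a quick sanity check of the grouping logic above (not part of the commit), the islice pattern can be exercised on a toy prediction result with two queries of sizes 2 and 3; all values here are made up:

    import numpy as np
    from itertools import islice
    from sklearn.metrics import ndcg_score

    pred_prob = np.array([0.9, 0.1, 0.3, 0.8, 0.5])  # flat scores for all documents
    true_label = np.array([1, 0, 0, 2, 1])           # flat relevance labels
    query = np.array([2, 3])                         # documents per query

    # Consume the flat arrays group by group, exactly as evaluate.py does.
    it = iter(pred_prob)
    pred_groups = [list(islice(it, 0, q)) for q in query]   # [[0.9, 0.1], [0.3, 0.8, 0.5]]
    it = iter(true_label)
    label_groups = [list(islice(it, 0, q)) for q in query]  # [[1, 0], [0, 2, 1]]
    scores = [ndcg_score([l], [p]) for l, p in zip(label_groups, pred_groups)]
    print(np.average(scores))  # 1.0 here, since both toy queries are perfectly ranked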

32
azureml/components/evaluate/evaluate.yaml Normal file
View file

@@ -0,0 +1,32 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
$schema: http://azureml/sdk-2-0/CommandComponent.json
name: microsoft.com.azureml.test.evaluation
version: 0.0.1
display_name: Evaluation
type: CommandComponent
description: Evaluation module
tags: {}
inputs:
  predict_result:
    type: path
    description: The prediction result for the evaluation data.
outputs:
  evaluation_result:
    type: path
    description: The output evaluation result.
command: >-
  python evaluate.py --predict-result {inputs.predict_result} --evaluation-result {outputs.evaluation_result}
environment:
  conda:
    conda_dependencies:
      name: project_environment
      channels:
        - defaults
      dependencies:
        - python=3.8
        - pip=20.0
        - pip:
          - scikit-learn==0.22.2
  os: Linux

azureml/components/infer/infer.py
View file

@@ -1,9 +1,11 @@
import argparse
-import logging
-import lightgbm as lgb
-import numpy as np
-import os
+import json
+import logging
+import os
+import tempfile
+from pathlib import Path
+import lightgbm as lgb
if __name__ == '__main__':
@@ -27,15 +29,8 @@ if __name__ == '__main__':
    infer_result = args.infer_result
    logger = logging.getLogger()
    logging.basicConfig(level=logging.DEBUG, format='%(message)s')
    logger.info('>>>>>LightGBM Infer Module>>>>>')
    if os.path.isdir(infer_data_path):
        logger.info(
            '[LightGBM Infer] Directory "{}" is provided, use default file name "tp.data"'.format(infer_data_path))
        infer_data_path = os.path.join(infer_data_path, 'tp.data')
    if os.path.isdir(model_path):
        logger.info(
            '[LightGBM Infer] Directory "{}" is provided, use default file name "lgbm.model"'.format(model_path))
@@ -47,7 +42,37 @@ if __name__ == '__main__':
        infer_result = os.path.join(infer_result, 'pred.result')
    bst = lgb.Booster(model_file=model_path)
-    pred_p = bst.predict(infer_data_path)
-    pred_l = np.argmax(pred_p, axis=1)
-    np.savetxt(infer_result, pred_p)
+    pred_prob = bst.predict(infer_data_path)
+    # Prepare inputs for evaluation; unnecessary if evaluation is not needed.
+    # Recover the parser config and the label/query column ids stored in the model file.
+    parser_config_str = ""
+    write_to_parser = False
+    label_id = -1
+    query_id = -1
+    with open(model_path) as fin:
+        for line in fin:
+            if line.startswith("label_index="):
+                label_id = int(line.strip().split("=")[1])
+            if line.startswith("[group_column"):
+                query_id = int(line.split(":")[1].strip().strip("]"))
+            if line.startswith("parser:"):
+                write_to_parser = True
+                continue
+            elif line.startswith("end of parser"):
+                write_to_parser = False
+            if write_to_parser:
+                parser_config_str += line
+    parser_config_path = Path(tempfile.mkdtemp()) / "parser_config.json"
+    with open(parser_config_path, 'w') as fout:
+        fout.write(parser_config_str)
+    infer_ds = lgb.Dataset(infer_data_path, params={"parser_config_file": parser_config_path,
+                                                    "label": label_id, "query": query_id})
+    infer_ds.construct()
+    true_label = infer_ds.get_label().astype(int)
+    query = infer_ds.get_group().astype(int)
+    infer_result_dict = {'pred_prob': pred_prob.tolist(), 'true_label': true_label.tolist(), 'query': query.tolist()}
+    with open(infer_result, "w") as fout:
+        json.dump(infer_result_dict, fout)
    logger.info('<<<<<LightGBM Infer Module<<<<<')
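The scan above keys off three markers in the saved model text: a label_index= line, a [group_column: ...] line, and a parser config block delimited by "parser:" and "end of parser". As an illustration only (a real transform-enabled model file contains many more sections), the parsing logic can be exercised on a toy string:

    # Toy model-file fragment (hypothetical contents; the markers match what infer.py reads).
    toy_model = "label_index=0\n[group_column: 1]\nparser:\n{}\nend of parser\n"
    for line in toy_model.splitlines(keepends=True):
        if line.startswith("label_index="):
            print(int(line.strip().split("=")[1]))             # -> 0
        if line.startswith("[group_column"):
            print(int(line.split(":")[1].strip().strip("]")))  # -> 1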

azureml/components/infer/infer.yaml
View file

@@ -2,7 +2,7 @@
# Licensed under the MIT License.
$schema: http://azureml/sdk-2-0/CommandComponent.json
-name: microsoft.com.azureml.test.integrated_transformlib_lightGBM_infer
+name: microsoft.com.azureml.test.integrated_freeform2_lightGBM_infer
version: 0.0.1
display_name: Integrated FreeForm2 lightGBM Infer
type: CommandComponent
@@ -23,10 +23,7 @@ command: >-
  python infer.py --infer-data-path {inputs.infer_data_path} --model-path {inputs.model_path} --infer-result {outputs.infer_result}
environment:
  docker:
-    image: transformprocessor.azurecr.io/lightgbm_transform_dependencies:20211109.v1
-    registry:
-      username: username
-      password: password
+    image: docker.io/krystal1130/lightgbm_transform_dependencies:20211109.v1
  conda:
    conda_dependencies:
      name: project_environment

azureml/components/train/train.py
View file

@@ -1,9 +1,9 @@
import argparse
import logging
-import lightgbm as lgb
import numpy as np
import os
+import lightgbm as lgb
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
@@ -13,7 +13,11 @@ if __name__ == '__main__':
    )
    parser.add_argument(
        '--parser-config-path',
-        help='The file path for the feature spec.',
+        help='The JSON file path for the parser config.',
    )
+    parser.add_argument(
+        '--label',
+        help='Label column name.',
+    )
    parser.add_argument(
        '--model-path',
@@ -22,6 +26,7 @@ if __name__ == '__main__':
    args, _ = parser.parse_known_args()
    train_data_path = args.train_data_path
    parser_config_path = args.parser_config_path
+    label = args.label
    model_path = args.model_path
    logger = logging.getLogger()
@@ -31,23 +36,27 @@ if __name__ == '__main__':
    train_data = lgb.Dataset(train_data_path,
                             params={
                                 "parser_config_file": parser_config_path,
-                                 "label": 'name:m:Rating'
+                                 "label": f'name:{label}'
                             })
    train_data.construct()
    params = {
-        'learning_rate': 0.1,
-        'lambda_l1': 0.1,
-        'lambda_l2': 0.2,
-        'max_depth': 4,
-        'objective': 'multiclass',
-        'num_class': np.max(train_data.get_label().astype(int)) + 1,
-        'seed': 2021,
-        'boosting': 'gbdt',
+        'learning_rate': 0.22,
+        'objective': 'lambdarank',
+        'ndcg_eval_at': "1,3,5",
+        'metric_freq': 1,
+        'label_gain': ','.join([str(i) for i in range(350)]),
+        'metric': 'ndcg',
+        'num_trees': 800,
+        'num_leaves': 300,
+        'min_data_in_leaf': 50,
+        'max_bin': 16,
+        'query': 0,
+        'feature_fraction': 0.15,
+        # Make sure the result is stable given the same input
+        'deterministic': 'true',
+        'force_col_wise': 'true'
    }
-    bst = lgb.train(params, train_data)
+    bst = lgb.train(params, train_data, valid_sets=[train_data])
    logger.info('Finished training.')
    if os.path.isdir(model_path):
        logger.info(

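For context (not part of the commit): the parameters above rely on the transform fork's parser integration, but the same lambdarank setup can be tried with stock LightGBM by passing features and per-query group sizes directly. A minimal sketch on synthetic data; every value below is illustrative:

    import numpy as np
    import lightgbm as lgb

    rng = np.random.default_rng(2021)
    X = rng.random((100, 10))           # 100 documents, 10 features
    y = rng.integers(0, 5, size=100)    # graded relevance labels 0..4
    group = [10] * 10                   # 10 queries with 10 documents each

    train_ds = lgb.Dataset(X, label=y, group=group)
    params = {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'ndcg_eval_at': '1,3,5',
        'learning_rate': 0.22,
        # LightGBM's default label_gain is 2^i - 1; train.py overrides it with
        # a linear gain of i for labels 0..349, which these toy labels fit into.
        'label_gain': ','.join(str(i) for i in range(350)),
        'deterministic': True,
        'force_col_wise': True,
    }
    bst = lgb.train(params, train_ds, num_boost_round=10, valid_sets=[train_ds])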
azureml/components/train/train.yaml
View file

@@ -2,7 +2,7 @@
# Licensed under the MIT License.
$schema: http://azureml/sdk-2-0/CommandComponent.json
-name: microsoft.com.azureml.test.integrated_transformLib_lightGBM_train
+name: microsoft.com.azureml.test.integrated_freeform2_lightGBM_train
version: 0.0.1
display_name: Integrated FreeForm2 lightGBM Train
type: CommandComponent
@@ -14,19 +14,19 @@ inputs:
    description: The file path for the training data.
  parser_config_path:
    type: path
-    description: The file path for the feature spec.
+    description: The JSON file path for the parser config.
+  label:
+    type: string
+    description: The label column name.
outputs:
  model_path:
    type: path
    description: The output file path for the trained LightGBM model.
command: >-
-  python train.py --train-data-path {inputs.train_data_path} --parser-config-path {inputs.parser_config_path} --model-path {outputs.model_path}
+  python train.py --train-data-path {inputs.train_data_path} --parser-config-path {inputs.parser_config_path} --label {inputs.label} --model-path {outputs.model_path}
environment:
  docker:
-    image: transformprocessor.azurecr.io/lightgbm_transform_dependencies:20211109.v1
-    registry:
-      username: username
-      password: password
+    image: docker.io/krystal1130/lightgbm_transform_dependencies:20211109.v1
  conda:
    conda_dependencies:
      name: project_environment

2
azureml/requirements.txt Normal file
View file

@@ -0,0 +1,2 @@
azure-ml-component
azureml-defaults

azureml/run.py
View file

@ -1,12 +1,13 @@
# Initialize the workspace
from azureml.core import Workspace
from azureml.core import Workspace, Dataset
from azureml.core.compute import AmlCompute, ComputeTarget
from azure.ml.component import Component
from azure.ml.component import dsl
# Initialize the workspace
config_path = "config.json"
ws = Workspace.from_config(path=config_path)
# Retrieve or create the computer target
from azureml.core.compute import AmlCompute, ComputeTarget
cluster_name = "cpu-cluster"
if cluster_name not in ws.compute_targets:
print('Creating a new compute target...')
@@ -14,39 +15,40 @@ if cluster_name not in ws.compute_targets:
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    compute_target.wait_for_completion(show_output=True, timeout_in_minutes=20)
+# Prepare dataset
+datastore = ws.get_default_datastore()
+data_dir = "data"
+training_data_name = 'train.tsv'
+inference_data_name = 'test.tsv'
+parser_config_name = 'parser_config_file.json'
+datastore.upload(src_dir=data_dir, target_path=data_dir, overwrite=True, show_progress=True)
+training_set = Dataset.File.from_files(path=[(datastore, f'{data_dir}/{training_data_name}')])
+inference_set = Dataset.File.from_files(path=[(datastore, f'{data_dir}/{inference_data_name}')])
+parser_config = Dataset.File.from_files(path=[(datastore, f'{data_dir}/{parser_config_name}')])
# Create components
-from azure.ml.component import Component
train_component_func = Component.from_yaml(ws, yaml_file='components/train/train.yaml')
infer_component_func = Component.from_yaml(ws, yaml_file='components/infer/infer.yaml')
eval_component_func = Component.from_yaml(ws, yaml_file='components/evaluate/evaluate.yaml')
# Create pipeline
-from azure.ml.component import dsl
# define a pipeline
@dsl.pipeline(name='A_test_pipeline_use_lightGBM_transform',
              description='Test lightGBM feature transformation binding',
              default_compute_target=cluster_name)
def test_pipeline(training_set, inference_set, parser_config):
-    lgbm_train = train_component_func(train_data_path=training_set,
-                                      parser_config_path=parser_config)
-    lgbm_train.comment = "Use FreeForm2 parser to do built-in feature transformation in training."
-    lgbm_infer = infer_component_func(infer_data_path=inference_set, model_path=lgbm_train.outputs.model_path)
-    lgbm_infer.comment = "The model has saved parser info, will do the transform automatically, " \
-                         "no need to do separate data processing for test data."
+    train = train_component_func(train_data_path=training_set, parser_config_path=parser_config, label="m:Rating")
+    train.comment = "Use FreeForm2 parser to do built-in feature transformation in training."
+    infer = infer_component_func(infer_data_path=inference_set, model_path=train.outputs.model_path)
+    infer.comment = "The model has saved parser info, will do the transform automatically, " \
+                    "no need to do separate data processing for test data."
+    eval = eval_component_func(predict_result=infer.outputs.infer_result)
+    eval.comment = "Calculate NDCG for lambdarank."
-training_data_name = 'TransformProcessorTest413_data'
-inference_data_name = 'TransformProcessorTest413_data'
-parser_config_name = 'FreeForm2ParserConfig'
-from azureml.core import Dataset
-training_set = Dataset.get_by_name(ws, name=training_data_name)
-inference_set = Dataset.get_by_name(ws, name=inference_data_name)
-parser_config = Dataset.get_by_name(ws, name=parser_config_name)
-pipeline = test_pipeline(training_set=training_set, inference_set=inference_set, parser_config=parser_config) #, header_file=header_file)
# Online run
+pipeline = test_pipeline(training_set=training_set, inference_set=inference_set, parser_config=parser_config)
run = pipeline.submit(experiment_name='model-comparison-integrated-transform')
run
# run.wait_for_completion()
# run.wait_for_completion()