Merge branch 'jfomhover/nlp-summarization-update' into nlp-summarization

Jeff Omhover 2022-07-14 11:13:34 -07:00
Parents 6ee3186f95 574075f5f0
Commit b06ca77852
9 changed files with 208 additions and 84 deletions

View file

@@ -0,0 +1,23 @@
# check release notes https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html
FROM nvcr.io/nvidia/pytorch:22.04-py3
# Install dependencies missing in this container
# NOTE: container already has matplotlib==3.5.1 tqdm==4.62.0
COPY requirements.txt ./
RUN pip install -r requirements.txt
# add ndv4-topo.xml
RUN mkdir /opt/microsoft/
ADD ./ndv4-topo.xml /opt/microsoft
# to use on A100, enable env var below in your job
# ENV NCCL_TOPO_FILE="/opt/microsoft/ndv4-topo.xml"
# adjusts the level of info from NCCL tests
ENV NCCL_DEBUG="INFO"
ENV NCCL_DEBUG_SUBSYS="GRAPH,INIT,ENV"
# Relaxed Ordering can greatly help the performance of InfiniBand networks in virtualized environments.
ENV NCCL_IB_PCI_RELAXED_ORDERING="1"
ENV CUDA_DEVICE_ORDER="PCI_BUS_ID"
ENV NCCL_SOCKET_IFNAME="eth0"
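
The comments above note that the topology file only takes effect when NCCL_TOPO_FILE is set on the job that uses this image. A minimal sketch of how that could look in an AzureML CLI v2 command job YAML, assuming the environment built from this Dockerfile is registered as nlp_summarization_train and a gpu-cluster compute exists (both names appear elsewhere in this commit; the command and the environment_variables values are illustrative only):

$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
code: ../../../data-science/src
command: >-
  python summarization/run.py --do_train --do_eval
environment: azureml:nlp_summarization_train@latest
compute: azureml:gpu-cluster
environment_variables:
  # enable the NDv4 topology shipped in the image (A100 nodes)
  NCCL_TOPO_FILE: "/opt/microsoft/ndv4-topo.xml"
  # optionally quiet NCCL down once the setup is validated
  NCCL_DEBUG: "WARN"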

View file

@@ -0,0 +1,35 @@
<!-- This topology file was copied from https://github.com/Azure/azhpc-images/blob/master/common/network-tuning.sh -->
<system version="1">
  <cpu numaid="0" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
    <pci busid="ffff:ff:01.0" class="0x060400" link_speed="16 GT/s" link_width="16">
      <pci busid="0001:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0101:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0002:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0102:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
    </pci>
  </cpu>
  <cpu numaid="1" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
    <pci busid="ffff:ff:02.0" class="0x060400" link_speed="16 GT/s" link_width="16">
      <pci busid="0003:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0103:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0004:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0104:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
    </pci>
  </cpu>
  <cpu numaid="2" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
    <pci busid="ffff:ff:03.0" class="0x060400" link_speed="16 GT/s" link_width="16">
      <pci busid="000b:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0105:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
      <pci busid="000c:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0106:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
    </pci>
  </cpu>
  <cpu numaid="3" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
    <pci busid="ffff:ff:04.0" class="0x060400" link_speed="16 GT/s" link_width="16">
      <pci busid="000d:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0107:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
      <pci busid="000e:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0108:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
    </pci>
  </cpu>
</system>

View file

@@ -0,0 +1,18 @@
# for local testing (cpu)
# torchvision==0.12.0
# torch==1.11.0
pytorch_lightning==1.6.4
transformers==4.18.0
datasets==2.0.0
rouge_score==0.0.4
sentencepiece==0.1.96
# for metrics reporting/plotting
mlflow==1.25.1
azureml-mlflow==1.41.0
# matplotlib==3.5.2
# tqdm==4.64.0
psutil==5.9.0
# for unit testing
pytest==7.1.2

View file

@@ -1,31 +0,0 @@
name: project_environment
channels:
  - defaults
  - pytorch
dependencies:
  - python=3.8
  - pip=20.0
  - pip:
    - numpy
    - boto3==1.15.0
    - requests
    - tqdm
    - scipy
    - sklearn
    - strenum==0.4.7
    - regex!=2019.12.17
    - sentencepiece==0.1.91
    - sacremoses
    - tensorboardX
    - transformers>=4.11
    - datasets==2.0.0
    - nltk==3.7
    - nvidia-ml-py3
    - rouge_score
    - joblib
    - pytorch_lightning==1.6.4
    - azureml-mlflow==1.42.0
    - 'inference-schema[numpy-support]==1.3.0'
    - applicationinsights==0.11.10
    - torch==1.9.0+cu111
    - -f https://download.pytorch.org/whl/torch_stable.html

View file

@@ -1,24 +0,0 @@
channels:
  - pytorch
  - anaconda
  - defaults
  - conda-forge
dependencies:
  - python=3.8
  - pip=21.2.4
  - pytorch=1.10.0
  - torchvision=0.11.1
  - torchaudio=0.10.0
  - cudatoolkit=11.1.1
  - nvidia-apex=0.1.0
  - gxx_linux-64=8.5.0
  - pip:
    - azureml-defaults==1.39.0
    - azureml-mlflow==1.39.0
    - azureml-telemetry==1.39.0
    - azureml-train-core==1.39.0
    - mlflow==1.24.0
    - transformers==4.17.0
    - 'inference-schema[numpy-support]==1.3.0'
    - applicationinsights==0.11.10
name: azureml_cc4492e420dd1d06c2877bacae431d92

View file

@@ -66,6 +66,17 @@ def main():
logger.info(f"Running with arguments: {args}")
# get tokenizer ready
tokenizer = AutoTokenizer.from_pretrained(args.model_checkpoint)
logger.info(f"tokenizer: {tokenizer}")
prefix = args.source_prefix if args.source_prefix is not None else ""
if args.source_prefix is None and "t5" in args.model_checkpoint.lower():
logger.warning(
"You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with "
"`--source_prefix 'summarize: ' `"
)
# Load dataset
raw_dataset = load_dataset(args.dataset_name, args.dataset_config)
@ -86,16 +97,6 @@ def main():
logger.info("sampled raw dataset:")
logger.info(raw_dataset.num_rows)
tokenizer = AutoTokenizer.from_pretrained(args.model_checkpoint)
logger.info(f"tokenizer: {tokenizer}")
prefix = args.source_prefix if args.source_prefix is not None else ""
if args.source_prefix is None and "t5" in args.model_checkpoint.lower():
logger.warning(
"You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with "
"`--source_prefix 'summarize: ' `"
)
def preprocess_function(examples):
# remove pairs where at least one record is None

View file

@@ -1,25 +1,121 @@
$schema: https://azuremlsdk2.blob.core.windows.net/preview/0.0.1/autoMLJob.schema.json
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
experiment_name: text-summarization-pubmed-experiment
description: Train a text summarization model using PubMed dataset for fine-tuning
type: automl
experiment_name: automl-text-classification-newsgroup-experiment
description: A text classification job using newsgroup dataset
# <inputs_and_outputs>
inputs:
  max_samples:
    value: 10000
  pretrained_model_name:
    value: "microsoft/ssr-base"
  num_train_epochs:
    value: 5
  batch_size:
    value: 8
  learning_rate:
    value: 0.00005
compute: azureml:gpu-cluster
outputs:
  prepared_data:
    type: uri_folder
  finetuned_model:
    type: uri_folder
  # predictions:
  # score_report:
  # deploy_flag:
# </inputs_and_outputs>
task: text_classification
primary_metric: accuracy
log_verbosity: debug
settings:
  default_datastore: azureml:workspaceblobstore
  default_compute: azureml:cpu-cluster-lg
  continue_on_step_failure: false
limits:
  timeout_minutes: 60
jobs:
  prep_finetuning_dataset:
    name: prep_finetuning_dataset
    display_name: "Prepare dataset for training"
    code: ../../../data-science/src
    command: >-
      python summarization/data_prep.py
      --dataset_name ${{inputs.dataset_name}}
      --dataset_config ${{inputs.dataset_config}}
      --text_column ${{inputs.text_column}}
      --summary_column ${{inputs.summary_column}}
      --max_samples ${{inputs.max_samples}}
      --model_checkpoint ${{inputs.pretrained_model_name}}
      --max_input_length ${{inputs.max_input_length}}
      --max_target_length ${{inputs.max_target_length}}
      --padding ${{inputs.padding}}
      --encodings ${{outputs.encodings}}
    environment: azureml:nlp_summarization_train@latest
    compute: azureml:cpu-cluster-lg
    inputs:
      dataset_name: "ccdv/pubmed-summarization"
      dataset_config: "section"
      text_column: "article"
      summary_column: "abstract"
      max_samples: ${{parent.inputs.max_samples}}
      max_input_length: 512
      max_target_length: 40
      padding: "max_length"
      pretrained_model_name: ${{parent.inputs.pretrained_model_name}}
    outputs:
      encodings: ${{parent.outputs.prepared_data}}
target_column_name: 'y'
training_data:
  type: mltable
  path: azureml:training_data@latest
validation_data:
  type: mltable
  path: azureml:validation_data@latest
  finetune_model:
    name: finetune_model
    display_name: Fine-tune summarization model
    code: ../../../data-science/src
    command: >-
      python summarization/run.py
      --preprocessed_datasets ${{inputs.preprocessed_datasets}}
      --learning_rate ${{inputs.learning_rate}}
      --per_device_train_batch_size ${{inputs.per_device_train_batch_size}}
      --per_device_eval_batch_size ${{inputs.per_device_eval_batch_size}}
      --max_samples ${{inputs.max_samples}}
      --model_name ${{inputs.pretrained_model_name}}
      --registered_model_name ${{inputs.registered_model_name}}
      --output_dir outputs
      --num_train_epochs ${{inputs.num_train_epochs}}
      --do_train --do_eval
      --trained_model_path ${{outputs.trained_model}}
    environment: azureml:nlp_summarization_train@latest
    inputs:
      preprocessed_datasets: ${{parent.jobs.prep_finetuning_dataset.outputs.encodings}}
      registered_model_name: "microsoft-ssr-base-cnn-pubmed"
      #model_path: ${{parent.jobs.train_step.outputs.trained_model_path
      pretrained_model_name: ${{parent.inputs.pretrained_model_name}}
      max_samples: ${{parent.inputs.max_samples}}
      learning_rate: ${{parent.inputs.learning_rate}}
      num_train_epochs: ${{parent.inputs.num_train_epochs}}
      per_device_train_batch_size: ${{parent.inputs.batch_size}}
      per_device_eval_batch_size: ${{parent.inputs.batch_size}}
    outputs:
      trained_model: ${{parent.outputs.finetuned_model}}
    compute: azureml:gpu-cluster
    distribution:
      type: pytorch
      process_count_per_instance: 8 # number of gpus
    resources:
      instance_count: 1 # number of nodes
featurization:
  dataset_language: "eng"
  evaluate_model:
    name: evaluate_model
    display_name: Evaluate model
    code: ../../../data-science/src
    command: >-
      python summarization/run.py
      --preprocessed_datasets ${{inputs.preprocessed_datasets}}
      --max_samples ${{inputs.max_samples}}
      --output_dir outputs
      --model_path ${{inputs.model_path}}
      --trained_model_path ""
      --do_eval
    environment: azureml:nlp_summarization_train@latest
    compute: azureml:gpu-cluster
    inputs:
      preprocessed_datasets: ${{parent.jobs.prep_finetuning_dataset.outputs.encodings}}
      model_path: ${{parent.jobs.finetune_model.outputs.trained_model}}
      max_samples: ${{parent.inputs.max_samples}}
      registered_model_name: "microsoft/ssr-base-cnn-pubmed"
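
For reference, the finetune_model step above requests single-node distributed training: distribution.process_count_per_instance launches one process per GPU and resources.instance_count sets the node count. Scaling out only touches those two blocks; a hedged sketch, with two nodes assumed purely for illustration:

    distribution:
      type: pytorch
      process_count_per_instance: 8 # GPUs per node
    resources:
      instance_count: 2 # nodes; total processes = 2 x 8 = 16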

View file

@@ -0,0 +1,6 @@
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
name: nlp_summarization_train
version: dev3
build:
  path: ../../../data-science/environment/
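
The pipeline above consumes this environment as azureml:nlp_summarization_train@latest; a job can also pin the exact version declared here, which may be preferable once dev3 is validated. A one-line sketch of such a reference inside a job definition:

environment: azureml:nlp_summarization_train:dev3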