Merge branch 'jfomhover/nlp-summarization-update' into nlp-summarization

Jeff Omhover 2022-07-14 11:13:34 -07:00
Parents 6ee3186f95 574075f5f0
Commit b06ca77852
9 changed files with 208 additions and 84 deletions

View file

@@ -0,0 +1,23 @@
# check release notes https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html
FROM nvcr.io/nvidia/pytorch:22.04-py3
# Install dependencies missing in this container
# NOTE: container already has matplotlib==3.5.1 tqdm==4.62.0
COPY requirements.txt ./
RUN pip install -r requirements.txt
# add ndv4-topo.xml
RUN mkdir /opt/microsoft/
ADD ./ndv4-topo.xml /opt/microsoft
# to use on A100, enable env var below in your job
# ENV NCCL_TOPO_FILE="/opt/microsoft/ndv4-topo.xml"
# adjusts the level of info from NCCL tests
ENV NCCL_DEBUG="INFO"
ENV NCCL_DEBUG_SUBSYS="GRAPH,INIT,ENV"
# Relaxed Ordering can greatly help the performance of InfiniBand networks in virtualized environments.
ENV NCCL_IB_PCI_RELAXED_ORDERING="1"
ENV CUDA_DEVICE_ORDER="PCI_BUS_ID"
ENV NCCL_SOCKET_IFNAME="eth0"
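
The comments above note that the topology file only takes effect when NCCL_TOPO_FILE is set on the job that uses this image. A minimal sketch of how that could look in an AzureML CLI v2 command job YAML, assuming the environment built from this Dockerfile is registered as nlp_summarization_train and a gpu-cluster compute exists (both names appear elsewhere in this commit; the command and the environment_variables values are illustrative only):

$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
code: ../../../data-science/src
command: >-
  python summarization/run.py --do_train --do_eval
environment: azureml:nlp_summarization_train@latest
compute: azureml:gpu-cluster
environment_variables:
  # enable the NDv4 topology shipped in the image (A100 nodes)
  NCCL_TOPO_FILE: "/opt/microsoft/ndv4-topo.xml"
  # optionally quiet NCCL down once the setup is validated
  NCCL_DEBUG: "WARN"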

View file

@@ -0,0 +1,35 @@
<!-- This topology file was copied from https://github.com/Azure/azhpc-images/blob/master/common/network-tuning.sh -->
<system version="1">
  <cpu numaid="0" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
    <pci busid="ffff:ff:01.0" class="0x060400" link_speed="16 GT/s" link_width="16">
      <pci busid="0001:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0101:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0002:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0102:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
    </pci>
  </cpu>
  <cpu numaid="1" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
    <pci busid="ffff:ff:02.0" class="0x060400" link_speed="16 GT/s" link_width="16">
      <pci busid="0003:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0103:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0004:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0104:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
    </pci>
  </cpu>
  <cpu numaid="2" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
    <pci busid="ffff:ff:03.0" class="0x060400" link_speed="16 GT/s" link_width="16">
      <pci busid="000b:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0105:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
      <pci busid="000c:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0106:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
    </pci>
  </cpu>
  <cpu numaid="3" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
    <pci busid="ffff:ff:04.0" class="0x060400" link_speed="16 GT/s" link_width="16">
      <pci busid="000d:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0107:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
      <pci busid="000e:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
      <pci busid="0108:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
    </pci>
  </cpu>
</system>

View file

@@ -0,0 +1,18 @@
# for local testing (cpu)
# torchvision==0.12.0
# torch==1.11.0
pytorch_lightning==1.6.4
transformers==4.18.0
datasets==2.0.0
rouge_score==0.0.4
sentencepiece==0.1.96
# for metrics reporting/plotting
mlflow==1.25.1
azureml-mlflow==1.41.0
# matplotlib==3.5.2
# tqdm==4.64.0
psutil==5.9.0
# for unit testing
pytest==7.1.2

View file

@@ -1,31 +0,0 @@
name: project_environment
channels:
  - defaults
  - pytorch
dependencies:
  - python=3.8
  - pip=20.0
  - pip:
    - numpy
    - boto3==1.15.0
    - requests
    - tqdm
    - scipy
    - sklearn
    - strenum==0.4.7
    - regex!=2019.12.17
    - sentencepiece==0.1.91
    - sacremoses
    - tensorboardX
    - transformers>=4.11
    - datasets==2.0.0
    - nltk==3.7
    - nvidia-ml-py3
    - rouge_score
    - joblib
    - pytorch_lightning==1.6.4
    - azureml-mlflow==1.42.0
    - 'inference-schema[numpy-support]==1.3.0'
    - applicationinsights==0.11.10
    - torch==1.9.0+cu111
    - -f https://download.pytorch.org/whl/torch_stable.html

View file

@@ -1,24 +0,0 @@
channels:
  - pytorch
  - anaconda
  - defaults
  - conda-forge
dependencies:
  - python=3.8
  - pip=21.2.4
  - pytorch=1.10.0
  - torchvision=0.11.1
  - torchaudio=0.10.0
  - cudatoolkit=11.1.1
  - nvidia-apex=0.1.0
  - gxx_linux-64=8.5.0
  - pip:
    - azureml-defaults==1.39.0
    - azureml-mlflow==1.39.0
    - azureml-telemetry==1.39.0
    - azureml-train-core==1.39.0
    - mlflow==1.24.0
    - transformers==4.17.0
    - 'inference-schema[numpy-support]==1.3.0'
    - applicationinsights==0.11.10
name: azureml_cc4492e420dd1d06c2877bacae431d92

View file

@@ -66,6 +66,17 @@ def main():
logger.info(f"Running with arguments: {args}")
# get tokenizer ready
tokenizer = AutoTokenizer.from_pretrained(args.model_checkpoint)
logger.info(f"tokenizer: {tokenizer}")
prefix = args.source_prefix if args.source_prefix is not None else ""
if args.source_prefix is None and "t5" in args.model_checkpoint.lower():
logger.warning(
"You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with "
"`--source_prefix 'summarize: ' `"
)
# Load dataset
raw_dataset = load_dataset(args.dataset_name, args.dataset_config)
@ -86,16 +97,6 @@ def main():
logger.info("sampled raw dataset:")
logger.info(raw_dataset.num_rows)
tokenizer = AutoTokenizer.from_pretrained(args.model_checkpoint)
logger.info(f"tokenizer: {tokenizer}")
prefix = args.source_prefix if args.source_prefix is not None else ""
if args.source_prefix is None and "t5" in args.model_checkpoint.lower():
logger.warning(
"You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with "
"`--source_prefix 'summarize: ' `"
)
def preprocess_function(examples):
# remove pairs where at least one record is None

View file

@@ -1,25 +1,121 @@
$schema: https://azuremlsdk2.blob.core.windows.net/preview/0.0.1/autoMLJob.schema.json
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
experiment_name: text-summarization-pubmed-experiment
description: Train a text summarization model using PubMed dataset for fine-tuning
type: automl
experiment_name: automl-text-classification-newsgroup-experiment
description: A text classification job using newsgroup dataset
# <inputs_and_outputs>
inputs:
  max_samples:
    value: 10000
  pretrained_model_name:
    value: "microsoft/ssr-base"
  num_train_epochs:
    value: 5
  batch_size:
    value: 8
  learning_rate:
    value: 0.00005
compute: azureml:gpu-cluster
outputs:
  prepared_data:
    type: uri_folder
  finetuned_model:
    type: uri_folder
  # predictions:
  # score_report:
  # deploy_flag:
# </inputs_and_outputs>
task: text_classification
primary_metric: accuracy
log_verbosity: debug
settings:
  default_datastore: azureml:workspaceblobstore
  default_compute: azureml:cpu-cluster-lg
  continue_on_step_failure: false
limits:
  timeout_minutes: 60
jobs:
  prep_finetuning_dataset:
    name: prep_finetuning_dataset
    display_name: "Prepare dataset for training"
    code: ../../../data-science/src
    command: >-
      python summarization/data_prep.py
      --dataset_name ${{inputs.dataset_name}}
      --dataset_config ${{inputs.dataset_config}}
      --text_column ${{inputs.text_column}}
      --summary_column ${{inputs.summary_column}}
      --max_samples ${{inputs.max_samples}}
      --model_checkpoint ${{inputs.pretrained_model_name}}
      --max_input_length ${{inputs.max_input_length}}
      --max_target_length ${{inputs.max_target_length}}
      --padding ${{inputs.padding}}
      --encodings ${{outputs.encodings}}
    environment: azureml:nlp_summarization_train@latest
    compute: azureml:cpu-cluster-lg
    inputs:
      dataset_name: "ccdv/pubmed-summarization"
      dataset_config: "section"
      text_column: "article"
      summary_column: "abstract"
      max_samples: ${{parent.inputs.max_samples}}
      max_input_length: 512
      max_target_length: 40
      padding: "max_length"
      pretrained_model_name: ${{parent.inputs.pretrained_model_name}}
    outputs:
      encodings: ${{parent.outputs.prepared_data}}
target_column_name: 'y'
training_data:
  type: mltable
  path: azureml:training_data@latest
validation_data:
  type: mltable
  path: azureml:validation_data@latest
  finetune_model:
    name: finetune_model
    display_name: Fine-tune summarization model
    code: ../../../data-science/src
    command: >-
      python summarization/run.py
      --preprocessed_datasets ${{inputs.preprocessed_datasets}}
      --learning_rate ${{inputs.learning_rate}}
      --per_device_train_batch_size ${{inputs.per_device_train_batch_size}}
      --per_device_eval_batch_size ${{inputs.per_device_eval_batch_size}}
      --max_samples ${{inputs.max_samples}}
      --model_name ${{inputs.pretrained_model_name}}
      --registered_model_name ${{inputs.registered_model_name}}
      --output_dir outputs
      --num_train_epochs ${{inputs.num_train_epochs}}
      --do_train --do_eval
      --trained_model_path ${{outputs.trained_model}}
    environment: azureml:nlp_summarization_train@latest
    inputs:
      preprocessed_datasets: ${{parent.jobs.prep_finetuning_dataset.outputs.encodings}}
      registered_model_name: "microsoft-ssr-base-cnn-pubmed"
      #model_path: ${{parent.jobs.train_step.outputs.trained_model_path
      pretrained_model_name: ${{parent.inputs.pretrained_model_name}}
      max_samples: ${{parent.inputs.max_samples}}
      learning_rate: ${{parent.inputs.learning_rate}}
      num_train_epochs: ${{parent.inputs.num_train_epochs}}
      per_device_train_batch_size: ${{parent.inputs.batch_size}}
      per_device_eval_batch_size: ${{parent.inputs.batch_size}}
    outputs:
      trained_model: ${{parent.outputs.finetuned_model}}
    compute: azureml:gpu-cluster
    distribution:
      type: pytorch
      process_count_per_instance: 8 # number of gpus
    resources:
      instance_count: 1 # number of nodes
featurization:
  dataset_language: "eng"
  evaluate_model:
    name: evaluate_model
    display_name: Evaluate model
    code: ../../../data-science/src
    command: >-
      python summarization/run.py
      --preprocessed_datasets ${{inputs.preprocessed_datasets}}
      --max_samples ${{inputs.max_samples}}
      --output_dir outputs
      --model_path ${{inputs.model_path}}
      --trained_model_path ""
      --do_eval
    environment: azureml:nlp_summarization_train@latest
    compute: azureml:gpu-cluster
    inputs:
      preprocessed_datasets: ${{parent.jobs.prep_finetuning_dataset.outputs.encodings}}
      model_path: ${{parent.jobs.finetune_model.outputs.trained_model}}
      max_samples: ${{parent.inputs.max_samples}}
      registered_model_name: "microsoft/ssr-base-cnn-pubmed"
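
For reference, the finetune_model step above requests single-node distributed training: distribution.process_count_per_instance launches one process per GPU and resources.instance_count sets the node count. Scaling out only touches those two blocks; a hedged sketch, with two nodes assumed purely for illustration:

    distribution:
      type: pytorch
      process_count_per_instance: 8 # GPUs per node
    resources:
      instance_count: 2 # nodes; total processes = 2 x 8 = 16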

View file

@@ -0,0 +1,6 @@
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
name: nlp_summarization_train
version: dev3
build:
  path: ../../../data-science/environment/
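
The pipeline above consumes this environment as azureml:nlp_summarization_train@latest; a job can also pin the exact version declared here, which may be preferable once dev3 is validated. A one-line sketch of such a reference inside a job definition:

environment: azureml:nlp_summarization_train:dev3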