Merge branch 'jfomhover/nlp-summarization-update' into nlp-summarization
This commit is contained in:
Коммит
b06ca77852
|
@ -0,0 +1,23 @@
|
|||
# check release notes https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html
|
||||
FROM nvcr.io/nvidia/pytorch:22.04-py3
|
||||
|
||||
# Install dependencies missing in this container
|
||||
# NOTE: container already has matplotlib==3.5.1 tqdm==4.62.0
|
||||
COPY requirements.txt ./
|
||||
# --no-cache-dir keeps pip's download cache out of the image layer
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# add ndv4-topo.xml
|
||||
# -p: do not fail the build if the directory already exists in the base image
RUN mkdir -p /opt/microsoft/
|
||||
# COPY (not ADD) for a plain local file; the trailing slash makes the destination a directory
COPY ./ndv4-topo.xml /opt/microsoft/
|
||||
|
||||
# to use on A100, enable env var below in your job
|
||||
# ENV NCCL_TOPO_FILE="/opt/microsoft/ndv4-topo.xml"
|
||||
|
||||
# Adjust the verbosity and the subsystems of NCCL debug logging
|
||||
ENV NCCL_DEBUG="INFO"
|
||||
ENV NCCL_DEBUG_SUBSYS="GRAPH,INIT,ENV"
|
||||
|
||||
# Relaxed Ordering can greatly help the performance of Infiniband networks in virtualized environments.
|
||||
ENV NCCL_IB_PCI_RELAXED_ORDERING="1"
|
||||
ENV CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||
ENV NCCL_SOCKET_IFNAME="eth0"
|
|
@ -0,0 +1,35 @@
|
|||
<!-- This topology file was copied from https://github.com/Azure/azhpc-images/blob/master/common/network-tuning.sh -->
|
||||
<system version="1">
|
||||
<cpu numaid="0" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
|
||||
<pci busid="ffff:ff:01.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0001:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0101:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0002:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0102:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="1" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
|
||||
<pci busid="ffff:ff:02.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0003:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0103:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0004:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0104:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="2" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
|
||||
<pci busid="ffff:ff:03.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="000b:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0105:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="000c:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0106:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="3" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
|
||||
<pci busid="ffff:ff:04.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="000d:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0107:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="000e:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0108:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
|
||||
</pci>
|
||||
</cpu>
|
||||
</system>
|
|
@ -0,0 +1,18 @@
|
|||
# for local testing (cpu)
|
||||
# torchvision==0.12.0
|
||||
# torch==1.11.0
|
||||
pytorch_lightning==1.6.4
|
||||
transformers==4.18.0
|
||||
datasets==2.0.0
|
||||
rouge_score==0.0.4
|
||||
sentencepiece==0.1.96
|
||||
|
||||
# for metrics reporting/plotting
|
||||
mlflow==1.25.1
|
||||
azureml-mlflow==1.41.0
|
||||
# matplotlib==3.5.2
|
||||
# tqdm==4.64.0
|
||||
psutil==5.9.0
|
||||
|
||||
# for unit testing
|
||||
pytest==7.1.2
|
|
@ -1,31 +0,0 @@
|
|||
name: project_environment
|
||||
channels:
|
||||
- defaults
|
||||
- pytorch
|
||||
dependencies:
|
||||
- python=3.8
|
||||
- pip=20.0
|
||||
- pip:
|
||||
- numpy
|
||||
- boto3==1.15.0
|
||||
- requests
|
||||
- tqdm
|
||||
- scipy
|
||||
- sklearn
|
||||
- strenum==0.4.7
|
||||
- regex!=2019.12.17
|
||||
- sentencepiece==0.1.91
|
||||
- sacremoses
|
||||
- tensorboardX
|
||||
- transformers>=4.11
|
||||
- datasets==2.0.0
|
||||
- nltk==3.7
|
||||
- nvidia-ml-py3
|
||||
- rouge_score
|
||||
- joblib
|
||||
- pytorch_lightning==1.6.4
|
||||
- azureml-mlflow==1.42.0
|
||||
- 'inference-schema[numpy-support]==1.3.0'
|
||||
- applicationinsights==0.11.10
|
||||
- torch==1.9.0+cu111
|
||||
- -f https://download.pytorch.org/whl/torch_stable.html
|
|
@ -1,24 +0,0 @@
|
|||
channels:
|
||||
- pytorch
|
||||
- anaconda
|
||||
- defaults
|
||||
- conda-forge
|
||||
dependencies:
|
||||
- python=3.8
|
||||
- pip=21.2.4
|
||||
- pytorch=1.10.0
|
||||
- torchvision=0.11.1
|
||||
- torchaudio=0.10.0
|
||||
- cudatoolkit=11.1.1
|
||||
- nvidia-apex=0.1.0
|
||||
- gxx_linux-64=8.5.0
|
||||
- pip:
|
||||
- azureml-defaults==1.39.0
|
||||
- azureml-mlflow==1.39.0
|
||||
- azureml-telemetry==1.39.0
|
||||
- azureml-train-core==1.39.0
|
||||
- mlflow==1.24.0
|
||||
- transformers==4.17.0
|
||||
- 'inference-schema[numpy-support]==1.3.0'
|
||||
- applicationinsights==0.11.10
|
||||
name: azureml_cc4492e420dd1d06c2877bacae431d92
|
|
@ -66,6 +66,17 @@ def main():
|
|||
|
||||
logger.info(f"Running with arguments: {args}")
|
||||
|
||||
# get tokenizer ready
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.model_checkpoint)
|
||||
logger.info(f"tokenizer: {tokenizer}")
|
||||
|
||||
prefix = args.source_prefix if args.source_prefix is not None else ""
|
||||
if args.source_prefix is None and "t5" in args.model_checkpoint.lower():
|
||||
logger.warning(
|
||||
"You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with "
|
||||
"`--source_prefix 'summarize: ' `"
|
||||
)
|
||||
|
||||
# Load dataset
|
||||
raw_dataset = load_dataset(args.dataset_name, args.dataset_config)
|
||||
|
||||
|
@ -86,16 +97,6 @@ def main():
|
|||
logger.info("sampled raw dataset:")
|
||||
logger.info(raw_dataset.num_rows)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.model_checkpoint)
|
||||
logger.info(f"tokenizer: {tokenizer}")
|
||||
|
||||
prefix = args.source_prefix if args.source_prefix is not None else ""
|
||||
if args.source_prefix is None and "t5" in args.model_checkpoint.lower():
|
||||
logger.warning(
|
||||
"You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with "
|
||||
"`--source_prefix 'summarize: ' `"
|
||||
)
|
||||
|
||||
def preprocess_function(examples):
|
||||
# remove pairs where at least one record is None
|
||||
|
||||
|
|
|
@ -1,25 +1,121 @@
|
|||
$schema: https://azuremlsdk2.blob.core.windows.net/preview/0.0.1/autoMLJob.schema.json
|
||||
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
|
||||
type: pipeline
|
||||
experiment_name: text-summarization-pubmed-experiment
|
||||
description: Train a text summarization model using PubMed dataset for fine-tuning
|
||||
|
||||
type: automl
|
||||
experiment_name: automl-text-classification-newsgroup-experiment
|
||||
description: A text classification job using newsgroup dataset
|
||||
# <inputs_and_outputs>
|
||||
inputs:
|
||||
max_samples:
|
||||
value: 10000
|
||||
pretrained_model_name:
|
||||
value: "microsoft/ssr-base"
|
||||
num_train_epochs:
|
||||
value: 5
|
||||
batch_size:
|
||||
value: 8
|
||||
learning_rate:
|
||||
value: 0.00005
|
||||
|
||||
compute: azureml:gpu-cluster
|
||||
outputs:
|
||||
prepared_data:
|
||||
type: uri_folder
|
||||
finetuned_model:
|
||||
type: uri_folder
|
||||
# predictions:
|
||||
# score_report:
|
||||
# deploy_flag:
|
||||
# </inputs_and_outputs>
|
||||
|
||||
task: text_classification
|
||||
primary_metric: accuracy
|
||||
log_verbosity: debug
|
||||
settings:
|
||||
default_datastore: azureml:workspaceblobstore
|
||||
default_compute: azureml:cpu-cluster-lg
|
||||
continue_on_step_failure: false
|
||||
|
||||
limits:
|
||||
timeout_minutes: 60
|
||||
jobs:
|
||||
prep_finetuning_dataset:
|
||||
name: prep_finetuning_dataset
|
||||
display_name: "Prepare dataset for training"
|
||||
code: ../../../data-science/src
|
||||
command: >-
|
||||
python summarization/data_prep.py
|
||||
--dataset_name ${{inputs.dataset_name}}
|
||||
--dataset_config ${{inputs.dataset_config}}
|
||||
--text_column ${{inputs.text_column}}
|
||||
--summary_column ${{inputs.summary_column}}
|
||||
--max_samples ${{inputs.max_samples}}
|
||||
--model_checkpoint ${{inputs.pretrained_model_name}}
|
||||
--max_input_length ${{inputs.max_input_length}}
|
||||
--max_target_length ${{inputs.max_target_length}}
|
||||
--padding ${{inputs.padding}}
|
||||
--encodings ${{outputs.encodings}}
|
||||
environment: azureml:nlp_summarization_train@latest
|
||||
compute: azureml:cpu-cluster-lg
|
||||
inputs:
|
||||
dataset_name: "ccdv/pubmed-summarization"
|
||||
dataset_config: "section"
|
||||
text_column: "article"
|
||||
summary_column: "abstract"
|
||||
max_samples: ${{parent.inputs.max_samples}}
|
||||
max_input_length: 512
|
||||
max_target_length: 40
|
||||
padding: "max_length"
|
||||
pretrained_model_name: ${{parent.inputs.pretrained_model_name}}
|
||||
outputs:
|
||||
encodings: ${{parent.outputs.prepared_data}}
|
||||
|
||||
target_column_name: 'y'
|
||||
training_data:
|
||||
type: mltable
|
||||
path: azureml:training_data@latest
|
||||
validation_data:
|
||||
type: mltable
|
||||
path: azureml:validation_data@latest
|
||||
finetune_model:
|
||||
name: finetune_model
|
||||
display_name: Fine-tune summarization model
|
||||
code: ../../../data-science/src
|
||||
command: >-
|
||||
python summarization/run.py
|
||||
--preprocessed_datasets ${{inputs.preprocessed_datasets}}
|
||||
--learning_rate ${{inputs.learning_rate}}
|
||||
--per_device_train_batch_size ${{inputs.per_device_train_batch_size}}
|
||||
--per_device_eval_batch_size ${{inputs.per_device_eval_batch_size}}
|
||||
--max_samples ${{inputs.max_samples}}
|
||||
--model_name ${{inputs.pretrained_model_name}}
|
||||
--registered_model_name ${{inputs.registered_model_name}}
|
||||
--output_dir outputs
|
||||
--num_train_epochs ${{inputs.num_train_epochs}}
|
||||
--do_train --do_eval
|
||||
--trained_model_path ${{outputs.trained_model}}
|
||||
environment: azureml:nlp_summarization_train@latest
|
||||
inputs:
|
||||
preprocessed_datasets: ${{parent.jobs.prep_finetuning_dataset.outputs.encodings}}
|
||||
registered_model_name: "microsoft-ssr-base-cnn-pubmed"
|
||||
#model_path: ${{parent.jobs.train_step.outputs.trained_model_path
|
||||
pretrained_model_name: ${{parent.inputs.pretrained_model_name}}
|
||||
max_samples: ${{parent.inputs.max_samples}}
|
||||
learning_rate: ${{parent.inputs.learning_rate}}
|
||||
num_train_epochs: ${{parent.inputs.num_train_epochs}}
|
||||
per_device_train_batch_size: ${{parent.inputs.batch_size}}
|
||||
per_device_eval_batch_size: ${{parent.inputs.batch_size}}
|
||||
outputs:
|
||||
trained_model: ${{parent.outputs.finetuned_model}}
|
||||
compute: azureml:gpu-cluster
|
||||
distribution:
|
||||
type: pytorch
|
||||
process_count_per_instance: 8 # number of gpus
|
||||
resources:
|
||||
instance_count: 1 # number of nodes
|
||||
|
||||
featurization:
|
||||
dataset_language: "eng"
|
||||
evaluate_model:
|
||||
name: evaluate_model
|
||||
display_name: Evaluate model
|
||||
code: ../../../data-science/src
|
||||
command: >-
|
||||
python summarization/run.py
|
||||
--preprocessed_datasets ${{inputs.preprocessed_datasets}}
|
||||
--max_samples ${{inputs.max_samples}}
|
||||
--output_dir outputs
|
||||
--model_path ${{inputs.model_path}}
|
||||
--trained_model_path ""
|
||||
--do_eval
|
||||
environment: azureml:nlp_summarization_train@latest
|
||||
compute: azureml:gpu-cluster
|
||||
inputs:
|
||||
preprocessed_datasets: ${{parent.jobs.prep_finetuning_dataset.outputs.encodings}}
|
||||
model_path: ${{parent.jobs.finetune_model.outputs.trained_model}}
|
||||
max_samples: ${{parent.inputs.max_samples}}
|
||||
registered_model_name: "microsoft-ssr-base-cnn-pubmed"  # same name as used by the finetune_model job
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
|
||||
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
|
||||
name: nlp_summarization_train
|
||||
version: dev3
|
||||
build:
|
||||
path: ../../../data-science/environment/
|
Загрузка…
Ссылка в новой задаче