Jeff Omhover 2022-07-12 14:12:29 -07:00
Parent 0be7a12233
Commit 2caba04f7a
1 changed file with 128 additions and 19 deletions

View file

@@ -1,25 +1,134 @@
$schema: https://azuremlsdk2.blob.core.windows.net/preview/0.0.1/autoMLJob.schema.json
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
experiment_name: text-summarization-pubmed-experiment
description: A text summarization pipeline using the pubmed dataset for fine-tuning
type: automl
experiment_name: automl-text-classification-newsgroup-experiment
description: A text classification job using newsgroup dataset
# compute: azureml:gpu-cluster
compute: azureml:gpu-cluster
# task: text_classification
# primary_metric: accuracy
# log_verbosity: debug
task: text_classification
primary_metric: accuracy
log_verbosity: debug
# limits:
# timeout_minutes: 60
limits:
timeout_minutes: 60
target_column_name: 'y'
training_data:
type: mltable
path: azureml:training_data@latest
validation_data:
type: mltable
path: azureml:validation_data@latest
# <inputs_and_outputs>
inputs:
max_samples:
value: 10000
pretrained_model_name:
value: "t5-small"
num_train_epochs:
value: 5
batch_size:
value: 8
learning_rate:
value: 0.00005
featurization:
dataset_language: "eng"
outputs:
  large_dataset_preprocessed:
trained_model:
predictions:
score_report:
deploy_flag:
# </inputs_and_outputs>
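# The pipeline-level inputs above are consumed by individual steps through
# ${{parent.inputs.<name>}} bindings, and the outputs declared here are
# populated by steps that write to ${{parent.outputs.<name>}}.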
settings:
default_datastore: azureml:workspaceblobstore
default_compute: azureml:cpu-cluster
continue_on_step_failure: false
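# default_compute applies to any step below that does not set its own
# compute; only the finetune_model step overrides it, pointing at the GPU
# cluster.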
jobs:
prep_finetuning_dataset:
name: prep_finetuning_dataset
display_name: "Prepare our domain-specific dataset for training"
code: ../../../data-science/src
command: >-
python data_prep.py
--dataset_name ${{inputs.dataset_name}}
--dataset_config ${{inputs.dataset_config}}
--text_column ${{inputs.text_column}}
--summary_column ${{inputs.summary_column}}
--max_samples ${{inputs.max_samples}}
--model_checkpoint ${{inputs.pretrained_model_name}}
--max_input_length ${{inputs.max_input_length}}
--max_target_length ${{inputs.max_target_length}}
--padding ${{inputs.padding}}
--source_prefix ${{inputs.source_prefix}}
--encodings ${{outputs.encodings}}
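    # Each ${{inputs.<name>}} / ${{outputs.<name>}} placeholder above is
    # expanded by Azure ML at runtime: inputs resolve to mounted data or
    # literal values, and outputs resolve to a writable path whose contents
    # are captured when the step finishes.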
environment: azureml:nlp_summarization_train@latest
inputs:
dataset_name: "ccdv/pubmed-summarization"
dataset_config: "section"
text_column: "article"
summary_column: "abstract"
max_samples: ${{parent.inputs.max_samples}}
max_input_length: 512
max_target_length: 40
padding: "max_length"
pretrained_model_name: ${{parent.inputs.pretrained_model_name}}
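      # T5-style checkpoints expect a task prefix on each input; the
      # "summarize: " prefix below selects the summarization task (standard
      # T5 usage, not specific to this pipeline).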
source_prefix: "summarize: "
outputs:
encodings: ${{parent.outputs.large_dataset_preprocessed}}
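  # The step below consumes the encodings produced above through the
  # ${{parent.jobs.prep_finetuning_dataset.outputs.encodings}} binding, which
  # is how data flows between the steps of a pipeline job.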
finetune_model:
name: finetune_model
display_name: fine-tune a summarization model
code: ../../../data-science/src
command: >-
python run_summarization.py
--preprocessed_datasets ${{inputs.preprocessed_datasets}}
--learning_rate ${{inputs.learning_rate}}
--per_device_train_batch_size ${{inputs.per_device_train_batch_size}}
--per_device_eval_batch_size ${{inputs.per_device_eval_batch_size}}
--max_samples ${{inputs.max_samples}}
--model_name ${{inputs.pretrained_model_name}}
--registered_model_name ${{inputs.registered_model_name}}
--output_dir outputs
--num_train_epochs ${{inputs.num_train_epochs}}
--trained_model_path ${{outputs.trained_model}}
--disable_tqdm True
--do_train --do_eval
--source_prefix ${{inputs.source_prefix}}
environment: azureml:nlp_summarization_train@latest
inputs:
preprocessed_datasets: ${{parent.jobs.prep_finetuning_dataset.outputs.encodings}}
registered_model_name: "t5-small-cnn-pubmed"
      # model_path: ${{parent.jobs.train_step.outputs.trained_model_path}}
pretrained_model_name: ${{parent.inputs.pretrained_model_name}}
max_samples: ${{parent.inputs.max_samples}}
learning_rate: ${{parent.inputs.learning_rate}}
num_train_epochs: ${{parent.inputs.num_train_epochs}}
per_device_train_batch_size: ${{parent.inputs.batch_size}}
per_device_eval_batch_size: ${{parent.inputs.batch_size}}
source_prefix: "summarize: "
outputs:
trained_model: ${{parent.outputs.trained_model}}
compute: azureml:gpu-cluster
distribution:
type: pytorch
      process_count_per_instance: 1 # number of GPUs per node
resources:
instance_count: 1 # number of nodes
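    # With a pytorch distribution, Azure ML launches
    # process_count_per_instance worker processes on each of instance_count
    # nodes; scaling to a multi-GPU node usually means raising
    # process_count_per_instance to the number of GPUs per node (a sketch of
    # the knobs, not tuned for any particular cluster).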
evaluate_model:
name: evaluate_model
display_name: evaluate-model
code: ../../../data-science/src
command: >-
python run_summarization.py
--preprocessed_datasets ${{inputs.preprocessed_datasets}}
--max_samples ${{inputs.max_samples}}
--output_dir outputs
--model_path ${{inputs.model_path}}
--trained_model_path ${{outputs.trained_model_path}}
--do_eval
environment: azureml:nlp_summarization_train@latest
inputs:
preprocessed_datasets: ${{parent.jobs.prep_finetuning_dataset.outputs.encodings}}
registered_model_name: "t5-small-cnn-pubmed"
      model_path: ${{parent.jobs.finetune_model.outputs.trained_model}}
max_samples: ${{parent.inputs.max_samples}}
outputs:
trained_model_path:
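# A minimal sketch of submitting this pipeline, assuming the Azure ML CLI v2
# (`az extension add -n ml`) and an existing workspace; the resource group
# and workspace names are placeholders:
#
#   az ml job create --file pipeline.yml \
#     --resource-group <resource-group> --workspace-name <workspace-name>
#
# Individual inputs can usually be overridden at submission time with --set
# (the exact dotted path may vary with the schema version), e.g.:
#
#   az ml job create --file pipeline.yml --set inputs.max_samples.value=500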