submittable pipeline
This commit is contained in:
Родитель
0be7a12233
Коммит
2caba04f7a
|
@ -1,25 +1,134 @@
|
|||
$schema: https://azuremlsdk2.blob.core.windows.net/preview/0.0.1/autoMLJob.schema.json
|
||||
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
|
||||
type: pipeline
|
||||
experiment_name: text-summarization-pubmed-experiment
|
||||
description: A text summarization using pubmed dataset for fine-tuning
|
||||
|
||||
type: automl
|
||||
experiment_name: automl-text-classification-newsgroup-experiment
|
||||
description: A text classification job using newsgroup dataset
|
||||
# compute: azureml:gpu-cluster
|
||||
|
||||
compute: azureml:gpu-cluster
|
||||
# task: text_classification
|
||||
# primary_metric: accuracy
|
||||
# log_verbosity: debug
|
||||
|
||||
task: text_classification
|
||||
primary_metric: accuracy
|
||||
log_verbosity: debug
|
||||
# limits:
|
||||
# timeout_minutes: 60
|
||||
|
||||
limits:
|
||||
timeout_minutes: 60
|
||||
|
||||
target_column_name: 'y'
|
||||
training_data:
|
||||
type: mltable
|
||||
path: azureml:training_data@latest
|
||||
validation_data:
|
||||
type: mltable
|
||||
path: azureml:validation_data@latest
|
||||
# <inputs_and_outputs>
|
||||
inputs:
|
||||
max_samples:
|
||||
value: 10000
|
||||
pretrained_model_name:
|
||||
value: "t5-small"
|
||||
num_train_epochs:
|
||||
value: 5
|
||||
batch_size:
|
||||
value: 8
|
||||
learning_rate:
|
||||
value: 0.00005
|
||||
|
||||
featurization:
|
||||
dataset_language: "eng"
|
||||
outputs:
|
||||
generic_data_prepared:
|
||||
trained_model:
|
||||
predictions:
|
||||
score_report:
|
||||
deploy_flag:
|
||||
# </inputs_and_outputs>
|
||||
|
||||
settings:
|
||||
default_datastore: azureml:workspaceblobstore
|
||||
default_compute: azureml:cpu-cluster
|
||||
continue_on_step_failure: false
|
||||
|
||||
jobs:
|
||||
prep_finetuning_dataset:
|
||||
name: prep_finetuning_dataset
|
||||
display_name: "Prepare our domain-specific dataset for training"
|
||||
code: ../../../data-science/src
|
||||
command: >-
|
||||
python data_prep.py
|
||||
--dataset_name ${{inputs.dataset_name}}
|
||||
--dataset_config ${{inputs.dataset_config}}
|
||||
--text_column ${{inputs.text_column}}
|
||||
--summary_column ${{inputs.summary_column}}
|
||||
--max_samples ${{inputs.max_samples}}
|
||||
--model_checkpoint ${{inputs.pretrained_model_name}}
|
||||
--max_input_length ${{inputs.max_input_length}}
|
||||
--max_target_length ${{inputs.max_target_length}}
|
||||
--padding ${{inputs.padding}}
|
||||
--source_prefix ${{inputs.source_prefix}}
|
||||
--encodings ${{outputs.encodings}}
|
||||
environment: azureml:nlp_summarization_train@latest
|
||||
inputs:
|
||||
dataset_name: "ccdv/pubmed-summarization"
|
||||
dataset_config: "section"
|
||||
text_column: "article"
|
||||
summary_column: "abstract"
|
||||
max_samples: ${{parent.inputs.max_samples}}
|
||||
max_input_length: 512
|
||||
max_target_length: 40
|
||||
padding: "max_length"
|
||||
pretrained_model_name: ${{parent.inputs.pretrained_model_name}}
|
||||
source_prefix: "summarize: "
|
||||
outputs:
|
||||
# Bind the prep step's encoded-dataset output to the pipeline-level output
# declared under `outputs:` (`generic_data_prepared`). The previous name
# `large_dataset_preprocessed` is not a declared pipeline output, which
# fails schema validation when the pipeline is submitted.
encodings: ${{parent.outputs.generic_data_prepared}}
|
||||
|
||||
finetune_model:
|
||||
name: finetune_model
|
||||
display_name: fine-tune a summarization model
|
||||
code: ../../../data-science/src
|
||||
command: >-
|
||||
python run_summarization.py
|
||||
--preprocessed_datasets ${{inputs.preprocessed_datasets}}
|
||||
--learning_rate ${{inputs.learning_rate}}
|
||||
--per_device_train_batch_size ${{inputs.per_device_train_batch_size}}
|
||||
--per_device_eval_batch_size ${{inputs.per_device_eval_batch_size}}
|
||||
--max_samples ${{inputs.max_samples}}
|
||||
--model_name ${{inputs.pretrained_model_name}}
|
||||
--registered_model_name ${{inputs.registered_model_name}}
|
||||
--output_dir outputs
|
||||
--num_train_epochs ${{inputs.num_train_epochs}}
|
||||
--trained_model_path ${{outputs.trained_model}}
|
||||
--disable_tqdm True
|
||||
--do_train --do_eval
|
||||
--source_prefix ${{inputs.source_prefix}}
|
||||
environment: azureml:nlp_summarization_train@latest
|
||||
inputs:
|
||||
preprocessed_datasets: ${{parent.jobs.prep_finetuning_dataset.outputs.encodings}}
|
||||
registered_model_name: "t5-small-cnn-pubmed"
|
||||
#model_path: ${{parent.jobs.train_step.outputs.trained_model_path
|
||||
pretrained_model_name: ${{parent.inputs.pretrained_model_name}}
|
||||
max_samples: ${{parent.inputs.max_samples}}
|
||||
learning_rate: ${{parent.inputs.learning_rate}}
|
||||
num_train_epochs: ${{parent.inputs.num_train_epochs}}
|
||||
per_device_train_batch_size: ${{parent.inputs.batch_size}}
|
||||
per_device_eval_batch_size: ${{parent.inputs.batch_size}}
|
||||
source_prefix: "summarize: "
|
||||
outputs:
|
||||
trained_model: ${{parent.outputs.trained_model}}
|
||||
compute: azureml:gpu-cluster
|
||||
distribution:
|
||||
type: pytorch
|
||||
process_count_per_instance: 1 # number of gpus
|
||||
resources:
|
||||
instance_count: 1 # number of nodes
|
||||
|
||||
evaluate_model:
|
||||
name: evaluate_model
|
||||
display_name: evaluate-model
|
||||
code: ../../../data-science/src
|
||||
command: >-
|
||||
python run_summarization.py
|
||||
--preprocessed_datasets ${{inputs.preprocessed_datasets}}
|
||||
--max_samples ${{inputs.max_samples}}
|
||||
--output_dir outputs
|
||||
--model_path ${{inputs.model_path}}
|
||||
--trained_model_path ${{outputs.trained_model_path}}
|
||||
--do_eval
|
||||
environment: azureml:nlp_summarization_train@latest
|
||||
inputs:
|
||||
preprocessed_datasets: ${{parent.jobs.prep_finetuning_dataset.outputs.encodings}}
|
||||
registered_model_name: "t5-small-cnn-pubmed"
|
||||
# Feed the fine-tuned model from the finetune_model step into evaluation.
# The expression was missing its closing `}}`, which breaks parsing of the job.
model_path: ${{parent.jobs.finetune_model.outputs.trained_model}}
|
||||
max_samples: ${{parent.inputs.max_samples}}
|
||||
outputs:
|
||||
trained_model_path:
|
||||
|
|
Загрузка…
Ссылка в новой задаче