working up to training
Parent: 2caba04f7a
Commit: 7ab393d2f5
@@ -4,7 +4,7 @@ torch==1.11.0
 pytorch_lightning==1.6.4
 transformers==4.18.0
 datasets==2.0.0
-rouge_score==
+rouge_score==0.0.4
 
 # for metrics reporting/plotting
 mlflow==1.25.1
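The new pin completes a requirement line that had been left unfinished (`rouge_score==`); the package is presumably what the script uses to report ROUGE metrics. A minimal sketch of the rouge_score scoring API as of 0.0.4 (the example strings are illustrative):

```python
from rouge_score import rouge_scorer

# Build a scorer once and reuse it; use_stemmer=True matches common
# summarization-benchmark settings.
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

scores = scorer.score(
    target="the cat sat on the mat",           # reference summary
    prediction="a cat was sitting on the mat",  # model output
)
print(scores["rougeL"].fmeasure)  # F1 of the longest-common-subsequence match
```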
@@ -66,6 +66,17 @@ def main():
     logger.info(f"Running with arguments: {args}")
 
+    # get tokenizer ready
+    tokenizer = AutoTokenizer.from_pretrained(args.model_checkpoint)
+    logger.info(f"tokenizer: {tokenizer}")
+
+    prefix = args.source_prefix if args.source_prefix is not None else ""
+    if args.source_prefix is None and "t5" in args.model_checkpoint.lower():
+        logger.warning(
+            "You're running a t5 model but didn't provide a source prefix, which is expected, e.g. with "
+            "`--source_prefix 'summarize: '`"
+        )
+
     # Load dataset
     raw_dataset = load_dataset(args.dataset_name, args.dataset_config)
@@ -86,16 +97,6 @@ def main():
     logger.info("sampled raw dataset:")
     logger.info(raw_dataset.num_rows)
 
-    tokenizer = AutoTokenizer.from_pretrained(args.model_checkpoint)
-    logger.info(f"tokenizer: {tokenizer}")
-
-    prefix = args.source_prefix if args.source_prefix is not None else ""
-    if args.source_prefix is None and "t5" in args.model_checkpoint.lower():
-        logger.warning(
-            "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with "
-            "`--source_prefix 'summarize: ' `"
-        )
-
     def preprocess_function(examples):
         # remove pairs where at least one record is None
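Note the tokenizer/prefix block is not new logic: this hunk and the previous one together move it above the dataset load. The `prefix` is then consumed inside `preprocess_function`; a minimal sketch of that standard pattern for transformers 4.18, with hypothetical column names and max lengths (the file's actual function body is not shown in this diff):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")  # stand-in for args.model_checkpoint
prefix = "summarize: "

def preprocess_function(examples):
    # Prepend the task prefix t5 expects: "summarize: <document>".
    inputs = [prefix + doc for doc in examples["article"]]  # hypothetical text column
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # transformers 4.18 tokenizes targets under as_target_tokenizer();
    # newer releases use the text_target= argument instead.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["abstract"], max_length=128, truncation=True)  # hypothetical summary column

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
```

Such a function is typically applied with `raw_dataset.map(preprocess_function, batched=True)`.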
@@ -1,17 +1,7 @@
 $schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
 type: pipeline
 experiment_name: text-summarization-pubmed-experiment
-description: A text summarization using pubmed dataset for fine-tuning
-
-# compute: azureml:gpu-cluster
-
-# task: text_classification
-# primary_metric: accuracy
-# log_verbosity: debug
-
-# limits:
-#   timeout_minutes: 60
-
+description: Train a text summarization model using PubMed dataset for fine-tuning
 
 # <inputs_and_outputs>
 inputs:
@@ -27,25 +17,25 @@ inputs:
     value: 0.00005
 
 outputs:
-  generic_data_prepared:
-  trained_model:
-  predictions:
-  score_report:
-  deploy_flag:
+  prepared_data:
+  finetuned_model:
+  # predictions:
+  # score_report:
+  # deploy_flag:
 # </inputs_and_outputs>
 
 settings:
   default_datastore: azureml:workspaceblobstore
-  default_compute: azureml:cpu-cluster
+  default_compute: azureml:cpu-cluster-lg
   continue_on_step_failure: false
 
 jobs:
   prep_finetuning_dataset:
     name: prep_finetuning_dataset
-    display_name: "Prepare our domain-specific dataset for training"
+    display_name: "Prepare dataset for training"
     code: ../../../data-science/src
     command: >-
-      python data_prep.py
+      python summarization/data_prep.py
       --dataset_name ${{inputs.dataset_name}}
       --dataset_config ${{inputs.dataset_config}}
      --text_column ${{inputs.text_column}}
@@ -70,14 +60,14 @@ jobs:
       pretrained_model_name: ${{parent.inputs.pretrained_model_name}}
       source_prefix: "summarize: "
     outputs:
-      encodings: ${{parent.outputs.large_dataset_preprocessed}}
+      encodings: ${{parent.outputs.prepared_data}}
 
   finetune_model:
     name: finetune_model
-    display_name: fine-tune a summarization model
+    display_name: Fine-tune summarization model
     code: ../../../data-science/src
     command: >-
-      python run_summarization.py
+      python summarization/run.py
      --preprocessed_datasets ${{inputs.preprocessed_datasets}}
      --learning_rate ${{inputs.learning_rate}}
      --per_device_train_batch_size ${{inputs.per_device_train_batch_size}}
@@ -104,7 +94,7 @@ jobs:
       per_device_eval_batch_size: ${{parent.inputs.batch_size}}
       source_prefix: "summarize: "
     outputs:
-      trained_model: ${{parent.outputs.trained_model}}
+      trained_model: ${{parent.outputs.finetuned_model}}
     compute: azureml:gpu-cluster
     distribution:
       type: pytorch
@@ -114,21 +104,19 @@ jobs:
 
   evaluate_model:
     name: evaluate_model
-    display_name: evaluate-model
+    display_name: Evaluate model
     code: ../../../data-science/src
     command: >-
-      python run_summarization.py
+      python summarization/run.py
      --preprocessed_datasets ${{inputs.preprocessed_datasets}}
      --max_samples ${{inputs.max_samples}}
      --output_dir outputs
      --model_path ${{inputs.model_path}}
-      --trained_model_path ${{outputs.trained_model_path}}
+      --trained_model_path ""
      --do_eval
    environment: azureml:nlp_summarization_train@latest
    inputs:
      preprocessed_datasets: ${{parent.jobs.prep_finetuning_dataset.outputs.encodings}}
-      registered_model_name: "t5-small-cnn-pubmed"
-      model_path: ${{parent.jobs.finetune_model.outputs.trained_model
+      model_path: ${{parent.jobs.finetune_model.outputs.trained_model}}
       max_samples: ${{parent.inputs.max_samples}}
-    outputs:
-      trained_model_path:
+      registered_model_name: "t5-small-cnn-pubmed"
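For reference, a pipeline spec like this one can be submitted with `az ml job create --file pipeline.yml`, or from Python with the Azure ML SDK v2; a minimal sketch, assuming a `config.json` workspace file and that the spec is saved as `pipeline.yml` (both names are assumptions):

```python
from azure.ai.ml import MLClient, load_job
from azure.identity import DefaultAzureCredential

# Connect to the workspace described by ./config.json
# (subscription, resource group, workspace name).
ml_client = MLClient.from_config(credential=DefaultAzureCredential())

# Load the YAML pipeline spec and submit it as a pipeline job.
pipeline_job = load_job(source="pipeline.yml")  # hypothetical path
run = ml_client.jobs.create_or_update(pipeline_job)
print(run.studio_url)  # link to monitor the run in Azure ML studio
```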
@@ -1,6 +1,6 @@
 
 $schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
 name: nlp_summarization_train
-version: dev1
+version: dev2
 build:
   path: ../../../data-science/environment/
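Bumping the environment version (dev1 → dev2) registers a new image built from the same build context, which jobs referencing `azureml:nlp_summarization_train@latest` should pick up automatically. A sketch of registering it from Python, assuming the spec is saved as `train-environment.yml` (a hypothetical name):

```python
from azure.ai.ml import MLClient, load_environment
from azure.identity import DefaultAzureCredential

ml_client = MLClient.from_config(credential=DefaultAzureCredential())

# Register version dev2 of the training environment; the image is
# built from ../../../data-science/environment/ on first use.
env = load_environment(source="train-environment.yml")  # hypothetical path
ml_client.environments.create_or_update(env)
```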