msrflute/core/server.py

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
'''
In this file, we define the classes that live inside 'worker 0', the worker
responsible for orchestration and aggregation. The main class is the
OptimizationServer, which dispatches clients to the other workers for
processing and combines the resulting models.
'''
import json
import logging
import os
import random
import shutil
import time
from collections import defaultdict
import numpy as np
import torch
# Internal imports
from core.globals import TRAINING_FRAMEWORK_TYPE
if TRAINING_FRAMEWORK_TYPE == 'mpi':
import core.federated as federated
else:
raise NotImplementedError('{} is not supported'.format(TRAINING_FRAMEWORK_TYPE))
from core.evaluation import Evaluation
from core.client import Client
from .strategies import select_strategy
from .trainer import (
ModelUpdater,
Trainer,
set_component_wise_lr,
)
from utils import (
get_lr,
print_rank,
update_json_log,
)
# For profiling
import cProfile
import pstats
# AzureML-related libs
from azureml.core import Run
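# Handle to the current AzureML run context; `run.log(...)` is used throughout to
# log metrics (outside AzureML this falls back to an offline run object).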
run = Run.get_context()
class OptimizationServer(federated.Server):
def __init__(self, num_clients, model, optimizer, ss_scheduler, data_path, model_path, train_dataloader,
val_dataloader, test_dataloader, config, config_server):
        '''Implement the server's orchestration and aggregation.

        This is the main server class: it implements orchestration and
        aggregation, inheriting from `federated.Server`, which deals with
        communication only. The `train` method is central in FLUTE, as it
        defines a good part of what happens during training.
Args:
num_clients (int): total available clients.
model (torch.nn.Module): neural network model.
optimizer (torch.optim.Optimizer): optimizer.
ss_scheduler: scheduled sampling scheduler.
data_path (str): points to where data is.
model_path (str): points to where pretrained model is.
            train_dataloader (torch.utils.data.DataLoader): dataloader for training.
            val_dataloader (torch.utils.data.DataLoader): dataloader for validation.
            test_dataloader (torch.utils.data.DataLoader): dataloader for testing; can be None.
            config (dict): JSON-style configuration parameters.
config_server: deprecated, kept for API compatibility only.
'''
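        # Illustrative fragment of the 'server_config' block consumed below; this is
        # only a sketch of the keys read in this constructor, with hypothetical
        # values, not a complete or authoritative schema:
        #
        #   "server_config": {
        #       "max_iteration": 1000,
        #       "num_clients_per_iteration": 10,    # or "2,10" to sample a random count per round
        #       "val_freq": 5,
        #       "rec_freq": 25,
        #       "aggregate_median": "...",          # aggregation weighting scheme
        #       "initial_lr_client": 0.01,
        #       "lr_decay_factor": 1.0,
        #       "best_model_criterion": "loss",
        #       "fall_back_to_best_model": false,
        #       "annealing_config": {...},
        #       "data_config": {"train": {...}, "test": {...}}
        #   }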
super().__init__()
# Initialize all attributes from arguments
self.client_idx_list = list(range(num_clients))
self.config = config
server_config = config['server_config']
decoder_config = config.get('decoder_config', None)
self.max_iteration = server_config['max_iteration']
self.do_clustering = server_config.get('clustering', False)
self.num_clients_per_iteration = [int(x) for x in server_config['num_clients_per_iteration'].split(',')] \
if isinstance(server_config['num_clients_per_iteration'], str) \
else [server_config['num_clients_per_iteration']]
self.val_freq = server_config['val_freq']
self.req_freq = server_config['rec_freq']
self.evaluation = Evaluation(config, model_path, self.process_testvalidate, val_dataloader, test_dataloader)
self.metrics = dict()
self.model_backup_freq = server_config.get('model_backup_freq', 100)
self.worker_trainer_config = server_config.get('trainer_config', {})
self.aggregate_median = server_config['aggregate_median']
self.initial_lr_client = server_config.get('initial_lr_client', -1.0)
self.lr_decay_factor = server_config.get('lr_decay_factor', 1.0)
self.model_type = config['model_config']['model_type']
self.quant_thresh = config['client_config'].get('quant_thresh', None)
self.quant_bits = config['client_config'].get('quant_bits', 10)
self.list_of_train_data = config['client_config']['data_config']['train']['list_of_train_data']
self.data_path = data_path
# Get max grad norm from data config
if 'train' in server_config['data_config']:
max_grad_norm = server_config['data_config']['train'].get('max_grad_norm', None)
else:
max_grad_norm = None
# Creating an instance to update the model with stats aggregated from workers
self.worker_trainer = ModelUpdater(
model=model,
optimizer=optimizer,
ss_scheduler=ss_scheduler,
train_dataloader=train_dataloader if train_dataloader is not None else val_dataloader,
val_dataloader=val_dataloader,
max_grad_norm=max_grad_norm,
anneal_config=server_config['annealing_config'],
model_type=self.model_type,
decoder_config=decoder_config
)
self.metrics['worker_trainer'] = self.worker_trainer
# Creating an instance for the server-side trainer (runs mini-batch SGD)
self.server_replay_iterations = None
self.server_trainer = None
if train_dataloader is not None:
assert 'server_replay_config' in server_config, 'server_replay_config is not set'
assert 'optimizer_config' in server_config[
'server_replay_config'], 'server-side replay training optimizer is not set'
self.server_optimizer_config = server_config['server_replay_config']['optimizer_config']
self.server_trainer_config = server_config['server_replay_config'].get('trainer_config', {})
self.server_replay_iterations = server_config['server_replay_config']['server_iterations']
self.server_trainer = Trainer(
model=model,
optimizer=None,
ss_scheduler=ss_scheduler,
train_dataloader=train_dataloader,
server_replay_config=server_config['server_replay_config'],
val_dataloader=None,
max_grad_norm=server_config['server_replay_config']\
.get('max_grad_norm',server_config['data_config']['train']\
.get('max_grad_norm',None)),
anneal_config=server_config['server_replay_config'].get('annealing_config', None)
)
self.skip_model_update = False # will not update the model if True
self.train_loss = 0.0
self.model_path = model_path
self.best_model_criterion = server_config['best_model_criterion']
self.fall_back_to_best_model = server_config['fall_back_to_best_model']
self.last_model_path = os.path.join(self.model_path, 'latest_model.tar')
self.best_model_path = os.path.join(self.model_path,
'best_val_{}_model.tar'.format(self.best_model_criterion))
self.log_path = os.path.join(self.model_path, 'status_log.json')
        self.cur_iter_no = 0  # keep the iteration number for TensorBoard plotting
self.lr_weight = 1.0
self.losses = []
        self.no_label_updates = 0  # number of label updates
        # Update the parameters above if resuming from a checkpoint on disk
if server_config.get('resume_from_checkpoint', False):
self.load_saved_status()
# Decoding config
self.decoder_config = decoder_config
self.spm_model = server_config['data_config']['test'].get('spm_model', None)
self.do_profiling = server_config.get('do_profiling', False)
# Parallel processing
self.clients_in_parallel = config['client_config'].get('clients_in_parallel', None)
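        # Aggregation strategy: defines how individual client payloads are processed
        # and how they are combined into a new global model each round (the concrete
        # class, e.g. FedAvg- or DGA-style aggregation, is picked from config['strategy']).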
StrategyClass = select_strategy(config['strategy'])
self.strategy = StrategyClass('server', self.config, self.model_path)
print_rank(f'Server successfully instantiated strategy {self.strategy}', loglevel=logging.DEBUG)
def load_saved_status(self):
'''Load checkpoint from disk'''
# Check if model is on disk, if so loads it onto trainer
if os.path.exists(self.last_model_path):
print_rank('Resuming from checkpoint model {}'.format(self.last_model_path))
self.worker_trainer.load(self.last_model_path, update_lr_scheduler=True, update_ss_scheduler=True)
if self.server_trainer is not None:
self.server_trainer.model = self.worker_trainer.model # make sure that the models are in sync
# Check if log is on disk, if so loads it onto current stats
if os.path.exists(self.log_path):
with open(self.log_path, 'r') as logfp: # loading the iteration no., best loss and CER
elems = json.load(logfp)
self.cur_iter_no = elems.get('i', 0)
self.metrics['best_val_loss'] = elems.get('best_val_loss', float('inf'))
self.metrics['best_val_acc'] = elems.get('best_val_acc', float('inf'))
self.metrics['best_test_loss'] = elems.get('best_test_loss', float('inf'))
self.metrics['best_test_acc'] = elems.get('best_test_acc', float('inf'))
self.lr_weight = elems.get('weight', 1.0)
self.no_label_updates = elems.get('num_label_updates', 0)
print_rank(f'Resuming from status_log: cur_iter: {self.cur_iter_no}')
def run(self):
'''Trigger training.
This is a simple wrapper to the `train` method.
'''
print_rank('server started')
self.train()
print_rank('server terminated')
def train(self):
        '''Main method for training.

        Runs the federated loop: sample a cohort of clients each round, dispatch
        them to the workers, aggregate the returned pseudo-gradients via the
        configured strategy, optionally run server-side replay training, evaluate
        on val/test at the configured frequencies, and checkpoint the model.
        '''
self.run_stats = {
'secsPerClientRound': [],
'secsPerClient': [],
'secsPerClientTraining': [],
'secsPerClientSetup': [],
'secsPerClientFull': [],
'secsPerRoundHousekeeping': [],
'secsPerRoundTotal': [],
'mpiCosts': []
}
run.log('Max iterations', self.max_iteration)
try:
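            # Move the global model to GPU when one is available; otherwise keep it on CPU.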
self.worker_trainer.model.cuda() if torch.cuda.is_available() else None
# Do an initial validation round to understand the pretrained model's validation accuracy
# Skip if we resumed from a checkpoint (cur_iter_no > 0)
eval_list = []
if self.cur_iter_no == 0:
if self.config['server_config']['initial_rec']:
eval_list.append('test')
if self.config['server_config']['initial_val']:
eval_list.append('val')
run.log('LR for agg. opt.', get_lr(self.worker_trainer.optimizer))
print_rank("Running {} at itr={}".format(eval_list, self.cur_iter_no))
self.metrics = self.evaluation.run(eval_list, self.metrics, metric_logger=run.log)
eval_list=[] # some cleanup
# Dump all the information in aggregate_metric
print_rank('Saving Model Before Starting Training', loglevel=logging.INFO)
for token in ['best_val_loss', 'best_val_acc', 'best_test_acc', 'latest']:
self.worker_trainer.save(
model_path=self.model_path,
token=token,
config=self.config['server_config']
)
# Training loop
self.worker_trainer.model.train()
for i in range(self.cur_iter_no, self.max_iteration):
begin = time.time()
metrics_payload = {}
def log_metric(k, v):
metrics_payload[k] = v
print_rank('==== iteration {}'.format(i))
log_metric('Current iteration', i)
# Initial value for the learning rate of the worker
initial_lr = self.initial_lr_client * self.lr_weight
print_rank('Client learning rate {}'.format(initial_lr))
# Run training on clients
self.worker_trainer.model.zero_grad()
self.train_loss = []
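                # Payload broadcast to every sampled client this round: the client
                # learning rate and a CPU copy of the current global model parameters.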
server_data = (
initial_lr,
[p.data.to(torch.device('cpu')) for p in self.worker_trainer.model.parameters()]
)
# Random number of clients per iteration
if len(self.num_clients_per_iteration) > 1:
num_clients_curr_iter = random.randint(
self.num_clients_per_iteration[0],
self.num_clients_per_iteration[1]
)
else:
num_clients_curr_iter = self.num_clients_per_iteration[0]
log_metric('Clients for round', num_clients_curr_iter)
# Perform annealing in quantization threshold
if self.quant_thresh is not None:
self.config['client_config']['quant_thresh'] *= self.config['client_config'].get('quant_anneal', 1.0)
self.quant_thresh = self.config['client_config']['quant_thresh']
log_metric('Quantization Thresh.', self.config['client_config']['quant_thresh'])
# Create the pool of clients -- sample from this pool to assign to workers
sampled_idx_clients = random.sample(self.client_idx_list,
num_clients_curr_iter) if num_clients_curr_iter > 0 else self.client_idx_list
sampled_clients = [
Client(
client_id,
self.config,
self.config['client_config']['type'] == 'optimization',
None
) for client_id in sampled_idx_clients
]
# Initialize stats
clients_begin = time.time()
client_losses = []
client_mag_grads = []
client_mean_grads = []
client_var_grads = []
client_norm_grads = []
self.run_stats['secsPerClient'].append([])
self.run_stats['secsPerClientFull'].append([])
self.run_stats['secsPerClientTraining'].append([])
self.run_stats['secsPerClientSetup'].append([])
self.run_stats['mpiCosts'].append([])
# Check if we want privacy metrics
apply_privacy_metrics = self.config.get('privacy_metrics_config', None) and \
self.config['privacy_metrics_config']['apply_metrics']
adaptive_leakage = apply_privacy_metrics and \
self.config['privacy_metrics_config'].get('adaptive_leakage_threshold', None)
if apply_privacy_metrics:
privacy_metrics_stats = defaultdict(list)
# Initialize profiler
profiler = None
if self.do_profiling:
profiler = cProfile.Profile()
profiler.enable()
# Reset gradient for the model before assigning the new gradients
self.worker_trainer.model.zero_grad()
for client_output in self.process_clients(sampled_clients, server_data, self.clients_in_parallel):
# Process client output
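                    # The worker returns a dict with short keys: 'ts' timestamp, 'cs' client
                    # timing stats, 'tl' training loss, 'mg'/'ng'/'vg'/'rg' gradient magnitude/
                    # mean/variance/norm, 'pl' model payload (and 'ps' privacy stats, if enabled).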
client_timestamp = client_output['ts']
client_stats = client_output['cs']
client_loss = client_output['tl']
client_mag_grad = client_output['mg']
client_mean_grad = client_output['ng']
client_var_grad = client_output['vg']
client_norm_grad = client_output['rg']
client_payload = client_output['pl']
if apply_privacy_metrics:
privacy_stats = client_output['ps']
for metric, value in privacy_stats.items():
privacy_metrics_stats[metric].append(value)
self.run_stats['mpiCosts'][-1].append(time.time() - client_timestamp)
# Get actual pseudo-gradients for aggregation
payload_processed = self.strategy.process_individual_payload(self.worker_trainer, client_payload)
if not payload_processed:
print_rank('Dropping client', loglevel=logging.DEBUG)
num_clients_curr_iter -= 1
continue
# Aggregate stats
self.train_loss.append(client_loss)
client_losses.append(client_loss)
client_mag_grads.append(client_mag_grad.item())
client_mean_grads.append(client_mean_grad.item())
client_var_grads.append(client_var_grad.item())
client_norm_grads.append(client_norm_grad.item())
# Mark the end of client processing
client_end = time.time()
self.run_stats['secsPerClientFull'][-1].append(client_stats['full cost'])
self.run_stats['secsPerClientTraining'][-1].append(client_stats['training'])
self.run_stats['secsPerClientSetup'][-1].append(client_stats['setup'])
self.run_stats['secsPerClient'][-1].append(client_end - clients_begin)
# Tear down profiler
if self.do_profiling:
profiler.disable()
stats = pstats.Stats(profiler)
stats.sort_stats('cumulative').print_stats()
# Prepare output
client_mag_grads = np.array(client_mag_grads)
client_mean_grads = np.array(client_mean_grads)
client_var_grads = np.array(client_var_grads)
client_norm_grads = np.array(client_norm_grads)
client_stats = (client_mag_grads, client_mean_grads, client_var_grads)
dump_norm_stats = self.config.get('dump_norm_stats', False)
if dump_norm_stats:
with open(os.path.join(self.model_path, 'norm_stats.txt'), 'a', encoding='utf-8') as outF:
outF.write('{}\n'.format(json.dumps(list(client_norm_grads))))
# Print the privacy metrics
if apply_privacy_metrics:
for metric, values in privacy_metrics_stats.items():
if metric == 'Dropped clients':
log_metric(metric, sum(values))
else:
log_metric(metric, max(values))
if type(adaptive_leakage) is float:
values = privacy_metrics_stats['Practical epsilon (Max leakage)']
new_threshold = list(sorted(values))[int(adaptive_leakage*len(values))]
print_rank('Updating leakage threshold to {}'.format(new_threshold))
self.config['privacy_metrics_config']['max_allowed_leakage'] = new_threshold
# Mark that all clients have been processed
end = time.time()
self.run_stats['secsPerClientRound'].append(end - begin)
begin = end
# Log the training loss to tensorboard/AML
log_metric('Training loss', sum(self.train_loss))
# Combine payloads
self.losses = self.strategy.combine_payloads(
worker_trainer=self.worker_trainer,
curr_iter=i,
num_clients_curr_iter=num_clients_curr_iter,
client_stats=client_stats,
logger=log_metric,
)
# Run a couple of iterations of training data on the server
if self.server_trainer is not None:
print_rank('Running replay iterations on server')
if 'updatable_names' in self.server_trainer_config:
set_component_wise_lr(
self.worker_trainer.model,
self.server_optimizer_config,
self.server_trainer_config['updatable_names']
)
self.server_trainer.prepare_iteration(self.worker_trainer.model)
self.server_trainer.train_desired_samples(self.server_replay_iterations)
self.worker_trainer.model.load_state_dict(self.server_trainer.model.state_dict())
torch.cuda.empty_cache()
# Update a sampling scheduler
print_rank('Run ss scheduler')
self.worker_trainer.run_ss_scheduler()
# Run inference and score on val/test depending on the iter. number
if ((i+1) % self.val_freq) == 0:
eval_list.append("val")
                if ((i+1) % self.req_freq) == 0:
eval_list.append("test")
                if len(eval_list) > 0:
print_rank('Running {} at itr={}'.format(eval_list,i+1))
self.metrics['worker_trainer'] = self.worker_trainer
self.metrics = self.evaluation.run(eval_list, self.metrics, metric_logger=run.log)
self.losses = self.evaluation.losses
eval_list = []
# Create a schedule for the initial_lr (for the worker)
if 'val' in eval_list:
run.log('LR for agg. opt.', get_lr(self.worker_trainer.optimizer))
if not (self.losses[0] < self.metrics['best_val_loss']):
self.lr_weight *= self.lr_decay_factor
print_rank('LOG: Client weight of learning rate {}..'.format(self.lr_weight))
# Backup the current best models
self.backup_models(i)
# Fall back to the best model if the option is enabled
self.fall_back_to_prev_best_status()
# Logging the latest best values
update_json_log(
self.log_path,
{
'i': i + 1,
'best_val_loss': float(self.metrics['best_val_loss']),
'best_val_acc': float(self.metrics['best_val_acc']),
'best_test_loss': float(self.metrics['best_test_loss']),
'best_test_acc': float(self.metrics['best_test_acc']),
'weight': float(self.lr_weight),
'num_label_updates': int(self.no_label_updates)
},
)
end = time.time()
# Aggregate stats
self.run_stats['secsPerRoundHousekeeping'].append(end - begin)
self.run_stats['secsPerRoundTotal'].append(self.run_stats['secsPerClientRound'][-1] + \
self.run_stats['secsPerRoundHousekeeping'][-1])
log_metric('secsPerRoundTotal', self.run_stats['secsPerRoundTotal'][-1])
if self.do_profiling:
log_metric('secsPerClientRound', self.run_stats['secsPerClientRound'][-1])
log_metric('secsPerRoundHousekeeping', self.run_stats['secsPerRoundHousekeeping'][-1])
metrics_for_stats = [
'secsPerClient',
'secsPerClientTraining',
'secsPerClientFull',
'secsPerClientSetup',
'mpiCosts',
]
for metric in metrics_for_stats:
log_metric(f'{metric}Mean', np.mean(self.run_stats[metric][-1]))
log_metric(f'{metric}Median', np.median(self.run_stats[metric][-1]))
log_metric(f'{metric}Max', max(self.run_stats[metric][-1]))
for k in self.run_stats:
if k in metrics_for_stats:
print_rank('{}: {}'.format(k, max(self.run_stats[k][-1])), loglevel=logging.DEBUG)
else:
print_rank('{}: {}'.format(k, self.run_stats[k][-1]), loglevel=logging.DEBUG)
# Log all the metrics
for k in metrics_payload:
run.log(k, metrics_payload[k])
finally: # perform cleanup even if error was raised above
self.terminate_workers(terminate=(not self.do_clustering))
def backup_models(self, i):
        '''Save the latest model and periodically back up the best models.

        The latest model is saved on every call; every `model_backup_freq`
        iterations an epoch snapshot is also saved and the current best models
        (best val. accuracy, best val. loss, best test accuracy) are copied
        alongside it.

        Args:
            i (int): current iteration number.
        '''
# Always save the latest model
self.worker_trainer.save(
model_path=self.model_path,
token='latest',
config=self.config['server_config'],
)
if (i % self.model_backup_freq) == 0: # save the current best models
self.worker_trainer.save(
model_path=self.model_path,
token='epoch{}'.format(i),
config=self.config['server_config']
)
for bodyname in ['best_val_acc', 'best_val_loss', 'best_test_acc']:
src_model_path = os.path.join(self.model_path, '{}_model.tar'.format(bodyname))
if os.path.exists(src_model_path):
dst_model_path = os.path.join(self.model_path, 'epoch{}_{}_model.tar'.format(i, bodyname))
shutil.copyfile(src_model_path, dst_model_path)
print_rank('Saved {}'.format(dst_model_path))
def fall_back_to_prev_best_status(self):
        '''Fall back to the most recent best model, if the option is enabled.

        Restores the weights from `best_model_path` while keeping the current
        learning rate on the optimizer.
        '''
if self.fall_back_to_best_model:
print_rank('falling back to model {}'.format(self.best_model_path))
# Save current learning rate
tmp_lr = get_lr(self.worker_trainer.optimizer)
# Load previous best model
self.worker_trainer.load(self.best_model_path, update_lr_scheduler=False, update_ss_scheduler=False)
# Update previous learning rate on optimizer
for g in self.worker_trainer.optimizer.param_groups:
g['lr'] = tmp_lr
if self.server_trainer is not None:
self.server_trainer.model = self.worker_trainer.model # make sure that the models are in sync
def select_server(server_type, config):
'''Select a server type using different possible strings.
Right now this just returns `OptimizationServer`, but this
function could be useful when there are multiple choices of
server.
Args:
server_type (str): indicates server choice.
config (dict): config parsed from YAML, passed so that
parameters can be used to select a given server.
'''
return OptimizationServer
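# Illustrative usage of `select_server` (a sketch; the config access pattern and
# argument values are hypothetical, and `config_server` is deprecated so None is passed):
#
#   ServerClass = select_server(config['server_config'].get('type', 'optimization'), config)
#   server = ServerClass(num_clients, model, optimizer, ss_scheduler, data_path, model_path,
#                        train_dataloader, val_dataloader, test_dataloader, config, None)
#   server.run()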