Mirror of https://github.com/microsoft/archai.git
The pipeline runs through nominally! But lots of work to be done still.
Parent: ca9c0cfecf
Commit: 027fdeac5a
@@ -88,6 +88,10 @@ class EvolutionParetoSearchSegmentation(EvolutionParetoSearch):
    def _evaluate(self, arch:ArchWithMetaData)->float:
+       # DEBUG: simulate architecture evaluation
+       f1 = random.random()
+       return f1
+
        # see if we have visited this arch before
        if arch.metadata['archid'] in self.eval_cache:
            logger.info(f"{arch.metadata['archid']} is in cache! Returning from cache.")
@@ -95,31 +99,37 @@ class EvolutionParetoSearchSegmentation(EvolutionParetoSearch):
        # if not in cache actually evaluate it
        # -------------------------------------
        logger.pushd(f"regular_training_{arch.metadata['archid']}")

        # train
        # TODO: how do we set the number of epochs it will train for?
        dataset_dir = os.path.join(self.dataroot, 'face_synthetics')
-       trainer = SegmentationTrainer(arch.arch, dataset_dir=dataset_dir, val_size=2000, gpus=1)
+       # TODO: most of these should come from conf
+       # TODO: batch size 16 has lr 2e-4. can we increase batch size? what lr?
+       trainer = SegmentationTrainer(arch.arch,
+                                     dataset_dir=dataset_dir,
+                                     max_steps=100,
+                                     val_size=2000,
+                                     img_size=256,
+                                     augmentation='none',
+                                     batch_size=64,
+                                     lr=8e-4,
+                                     criterion_name='ce',
+                                     gpus=1,
+                                     seed=42)
        trainer.fit(run_path=utils.full_path(get_expdir()))

        # validate
        val_dl = trainer.val_dataloader
        outputs = []

        with torch.no_grad():
            for bi, b in enumerate(tqdm(val_dl)):
                b['image'] = b['image'].to('cuda')
                b['mask'] = b['mask'].to('cuda')
                trainer.model.to('cuda')
                outputs.append(trainer.model.validation_step(b, bi))

        results = trainer.model.shared_epoch_end(outputs, stage='validation')

        logger.popd()

        # # DEBUG: simulate architecture evaluation
        # f1 = random.random()

        f1 = results['validation_overall_f1']
        return f1
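For orientation, the cache check that _evaluate performs first ("see if we have visited this arch before") amounts to memoizing the expensive train-and-validate step by archid. The following is only a simplified sketch of that pattern, assuming eval_cache is a plain dict keyed by archid; train_and_validate is a hypothetical stand-in for the SegmentationTrainer fit and validation loop shown above:

    # Sketch only, not the repository's implementation: memoize evaluations by archid.
    def evaluate_with_cache(arch, eval_cache, train_and_validate):
        archid = arch.metadata['archid']
        if archid in eval_cache:
            # cache hit: skip training entirely, mirroring the logger.info branch above
            return eval_cache[archid]
        f1 = train_and_validate(arch)  # expensive: fit + manual validation loop
        eval_cache[archid] = f1
        return f1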
@@ -119,7 +119,8 @@ class LightningModelWrapper(pl.LightningModule):
        results = get_custom_overall_metrics(tp, fp, fn, tn, stage=stage)
        results[f'{stage}_loss'] = avg_loss

-       self.log_dict(results, sync_dist=True)
+       # TODO: enabling this causes error in lightning
+       # self.log_dict(results, sync_dist=True)
        return results

    def configure_optimizers(self):
@@ -173,7 +174,8 @@ class LightningModelWrapper(pl.LightningModule):
class SegmentationTrainer():

    def __init__(self, model: SegmentationNasModel, dataset_dir: str,
-                max_steps: int = 12_000, val_size: int = 2000, img_size: int = 256,
+                max_steps: int = 12000, val_size: int = 2000,
+                val_interval: int = 1000, img_size: int = 256,
                 augmentation: str = 'none', batch_size: int = 16,
                 lr: float = 2e-4, criterion_name: str = 'ce', gpus: int = 1,
                 seed: int = 1):
@@ -195,6 +197,7 @@ class SegmentationTrainer():
            exponential_decay_lr=True, img_size=img_size)
        self.img_size = img_size
        self.gpus = gpus
+       self.val_interval = val_interval

    def get_training_callbacks(self, run_dir: Path) -> List[pl.callbacks.Callback]:
        return [pl.callbacks.ModelCheckpoint(
@@ -211,7 +214,7 @@ class SegmentationTrainer():
            max_steps=self.max_steps,
            default_root_dir=run_path,
            gpus=self.gpus,
-           val_check_interval=1_200,
+           val_check_interval=self.val_interval,
            callbacks=self.get_training_callbacks(run_path)
        )
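For context on the val_check_interval change: in PyTorch Lightning, an integer val_check_interval runs validation every that many training batches, while a float is interpreted as a fraction of an epoch. A minimal sketch with illustrative values (not the project's configuration):

    import pytorch_lightning as pl

    # Validate every 1000 training batches instead of the previously hardcoded 1_200.
    trainer = pl.Trainer(
        max_steps=12000,
        gpus=1,
        val_check_interval=1000,  # int -> number of training batches between validation runs
    )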
@@ -4,6 +4,8 @@ from overrides.overrides import overrides
import copy
+import uuid

+import torch

from archai.nas.arch_meta import ArchWithMetaData
from archai.nas.discrete_search_space import DiscreteSearchSpace
@@ -50,7 +52,7 @@ class DiscreteSearchSpaceSegmentation(DiscreteSearchSpace):
        # and change its operator at random
        # and its input sources
        # WARNING: this can result in some nodes left hanging
-       chosen_node_idx = random.randint(1, len(graph))
+       chosen_node_idx = random.randint(1, len(graph)-1)
        node = graph[chosen_node_idx]
        node['op'] = random.choice(self.operations)
        # choose up to k inputs from previous nodes
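The -1 matters because Python's random.randint(a, b) is inclusive on both ends, so the previous upper bound of len(graph) could produce an index one past the end of the list. A quick illustration:

    import random

    graph = ['input', 'node1', 'node2', 'output']
    # random.randint(1, len(graph)) can return 4, and graph[4] raises IndexError.
    # random.randint(1, len(graph) - 1) only returns valid indices 1..3.
    idx = random.randint(1, len(graph) - 1)
    node = graph[idx]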
@@ -61,7 +63,7 @@ class DiscreteSearchSpaceSegmentation(DiscreteSearchSpace):

        # now go through every node in the graph (except output node)
        # and make sure it is being used as input in some node after it
-       for i, node in enumerate(graph):
+       for i, node in enumerate(graph[:-1]):
            this_name = node['name']
            orphan = True
            # test whether not orphan
@@ -70,17 +72,22 @@ class DiscreteSearchSpaceSegmentation(DiscreteSearchSpace):
                    orphan = False
            if orphan:
                # choose a forward node to connect it with
-               chosen_forward_idx = random.randint(i+1, len(graph))
+               chosen_forward_idx = random.randint(i+1, len(graph)-1)
                graph[chosen_forward_idx]['inputs'].append(this_name)

        # compile the model
        model = SegmentationNasModel.from_config(graph, channels_per_scale)
        # TODO: these should come from config or elsewhere
        # such that they are not hardcoded in here
        out_shape = model.validate_forward(torch.randn(1, 3, 256, 256)).shape
        assert out_shape == torch.Size([1, 19, 256, 256])
        extradata = {
            'datasetname': self.datasetname,
            'graph': graph,
            'channels_per_scale': channels_per_scale,
            'archid': uuid.uuid4(), #TODO: need to replace with a string of the graph
        }

        arch_meta = ArchWithMetaData(model, extradata)
        return [arch_meta]
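One possible way to address the archid TODO (replacing uuid.uuid4() with a string derived from the graph) is to hash a canonical serialization of the graph and channel settings, so that identical architectures always get identical ids. This is only an illustrative sketch under that assumption, not what the repository does:

    import hashlib
    import json

    def graph_to_archid(graph, channels_per_scale):
        # Canonical, order-stable serialization so identical graphs map to identical ids.
        payload = json.dumps({'graph': graph, 'channels_per_scale': channels_per_scale},
                             sort_keys=True, default=str)
        return hashlib.sha256(payload.encode('utf-8')).hexdigest()[:16]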
@@ -44,7 +44,7 @@ dataset: {} # default dataset settings comes from __include__ on the top

nas:
  search:
-   init_num_models: 20 # initial random models to seed the search
+   init_num_models: 3 # initial random models to seed the search
    num_iters: 3 # number of pareto frontier search iterations
    num_random_mix: 20 # how many random models to add to the parent mixture
    use_benchmark: True