The pipeline runs through nominally! But there is still lots of work to be done.

Debadeepta Dey 2022-04-21 17:00:57 -07:00 committed by Gustavo Rosa
Parent ca9c0cfecf
Commit 027fdeac5a
4 changed files with 35 additions and 15 deletions

View File

@@ -88,6 +88,10 @@ class EvolutionParetoSearchSegmentation(EvolutionParetoSearch):
def _evaluate(self, arch:ArchWithMetaData)->float:
# DEBUG: simulate architecture evaluation
f1 = random.random()
return f1
# see if we have visited this arch before
if arch.metadata['archid'] in self.eval_cache:
logger.info(f"{arch.metadata['archid']} is in cache! Returning from cache.")
@@ -95,31 +99,37 @@ class EvolutionParetoSearchSegmentation(EvolutionParetoSearch):
# if not in cache actually evaluate it
# -------------------------------------
logger.pushd(f"regular_training_{arch.metadata['archid']}")
# train
# TODO: how do we set the number of epochs it will train for?
dataset_dir = os.path.join(self.dataroot, 'face_synthetics')
trainer = SegmentationTrainer(arch.arch, dataset_dir=dataset_dir, val_size=2000, gpus=1)
# TODO: most of these should come from conf
# TODO: batch size 16 has lr 2e-4. can we increase batch size? what lr?
trainer = SegmentationTrainer(arch.arch,
dataset_dir=dataset_dir,
max_steps=100,
val_size=2000,
img_size=256,
augmentation='none',
batch_size=64,
lr=8e-4,
criterion_name='ce',
gpus=1,
seed=42)
trainer.fit(run_path=utils.full_path(get_expdir()))
# validate
val_dl = trainer.val_dataloader
outputs = []
with torch.no_grad():
for bi, b in enumerate(tqdm(val_dl)):
b['image'] = b['image'].to('cuda')
b['mask'] = b['mask'].to('cuda')
trainer.model.to('cuda')
outputs.append(trainer.model.validation_step(b, bi))
results = trainer.model.shared_epoch_end(outputs, stage='validation')
logger.popd()
# # DEBUG: simulate architecture evaluation
# f1 = random.random()
f1 = results['validation_overall_f1']
return f1
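
The hunk above shows the cache lookup but not the write-back after training. A minimal sketch of the memoization pattern it implies, assuming eval_cache is a plain dict keyed by archid (eval_cache and arch.metadata['archid'] appear in the diff; the helper passed in is a hypothetical stand-in for the trainer/validation code shown above):

# Sketch only: train_and_validate is a hypothetical stand-in for the
# SegmentationTrainer + validation-loop code in the hunk above.
def evaluate_with_cache(arch, eval_cache, train_and_validate):
    archid = arch.metadata['archid']
    if archid in eval_cache:
        # this architecture was already trained once; reuse its validation f1
        return eval_cache[archid]
    f1 = train_and_validate(arch)
    eval_cache[archid] = f1  # write back so duplicate/mutated copies are never retrained
    return f1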

View File

@@ -119,7 +119,8 @@ class LightningModelWrapper(pl.LightningModule):
results = get_custom_overall_metrics(tp, fp, fn, tn, stage=stage)
results[f'{stage}_loss'] = avg_loss
self.log_dict(results, sync_dist=True)
# TODO: enabling this causes error in lightning
# self.log_dict(results, sync_dist=True)
return results
def configure_optimizers(self):
@@ -173,7 +174,8 @@ class LightningModelWrapper(pl.LightningModule):
class SegmentationTrainer():
def __init__(self, model: SegmentationNasModel, dataset_dir: str,
max_steps: int = 12_000, val_size: int = 2000, img_size: int = 256,
max_steps: int = 12000, val_size: int = 2000,
val_interval: int = 1000, img_size: int = 256,
augmentation: str = 'none', batch_size: int = 16,
lr: float = 2e-4, criterion_name: str = 'ce', gpus: int = 1,
seed: int = 1):
@@ -195,6 +197,7 @@ class SegmentationTrainer():
exponential_decay_lr=True, img_size=img_size)
self.img_size = img_size
self.gpus = gpus
self.val_interval = val_interval
def get_training_callbacks(self, run_dir: Path) -> List[pl.callbacks.Callback]:
return [pl.callbacks.ModelCheckpoint(
@@ -211,7 +214,7 @@ class SegmentationTrainer():
max_steps=self.max_steps,
default_root_dir=run_path,
gpus=self.gpus,
val_check_interval=1_200,
val_check_interval=self.val_interval,
callbacks=self.get_training_callbacks(run_path)
)
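
A usage sketch of the constructor after this change, where validation cadence is controlled by the new val_interval argument instead of the previously hardcoded val_check_interval=1_200. The dataset path, run directory, and argument values below are illustrative only, not the project's defaults:

# Illustrative values only; paths are placeholders.
trainer = SegmentationTrainer(model,
                              dataset_dir='/data/face_synthetics',
                              max_steps=2000,
                              val_interval=500,   # pl.Trainer will now validate every 500 steps
                              val_size=2000,
                              img_size=256,
                              augmentation='none',
                              batch_size=16,
                              lr=2e-4,
                              criterion_name='ce',
                              gpus=1,
                              seed=1)
trainer.fit(run_path='/tmp/segmentation_run')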

View File

@@ -4,6 +4,8 @@ from overrides.overrides import overrides
import copy
import uuid
import torch
from archai.nas.arch_meta import ArchWithMetaData
from archai.nas.discrete_search_space import DiscreteSearchSpace
@@ -50,7 +52,7 @@ class DiscreteSearchSpaceSegmentation(DiscreteSearchSpace):
# and change its operator at random
# and its input sources
# WARNING: this can result in some nodes left hanging
chosen_node_idx = random.randint(1, len(graph))
chosen_node_idx = random.randint(1, len(graph)-1)
node = graph[chosen_node_idx]
node['op'] = random.choice(self.operations)
# choose up to k inputs from previous nodes
@@ -61,7 +63,7 @@ class DiscreteSearchSpaceSegmentation(DiscreteSearchSpace):
# now go through every node in the graph (except output node)
# and make sure it is being used as input in some node after it
for i, node in enumerate(graph):
for i, node in enumerate(graph[:-1]):
this_name = node['name']
orphan = True
# test whether not orphan
@@ -70,17 +72,22 @@ class DiscreteSearchSpaceSegmentation(DiscreteSearchSpace):
orphan = False
if orphan:
# choose a forward node to connect it with
chosen_forward_idx = random.randint(i+1, len(graph))
chosen_forward_idx = random.randint(i+1, len(graph)-1)
graph[chosen_forward_idx]['inputs'].append(this_name)
# compile the model
model = SegmentationNasModel.from_config(graph, channels_per_scale)
# TODO: these should come from config or elsewhere
# such that they are not hardcoded in here
out_shape = model.validate_forward(torch.randn(1, 3, 256, 256)).shape
assert out_shape == torch.Size([1, 19, 256, 256])
extradata = {
'datasetname': self.datasetname,
'graph': graph,
'channels_per_scale': channels_per_scale,
'archid': uuid.uuid4(), #TODO: need to replace with a string of the graph
}
arch_meta = ArchWithMetaData(model, extradata)
return [arch_meta]
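
The archid TODO above asks for a string derived from the graph instead of a random uuid. One way to do that is to hash a canonical serialization of the graph, assuming each node is a dict with 'name', 'op' and 'inputs' keys as used in the mutation code above; the helper name and hashing choice below are assumptions, not the project's API:

import hashlib
import json

def graph_to_archid(graph, channels_per_scale):
    # Serialize deterministically (sorted keys, stable node order) so that
    # structurally identical architectures always map to the same id.
    canonical = json.dumps({'graph': graph,
                            'channels_per_scale': channels_per_scale},
                           sort_keys=True, default=str)
    return hashlib.sha1(canonical.encode('utf-8')).hexdigest()

# e.g. extradata['archid'] = graph_to_archid(graph, channels_per_scale)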

View File

@@ -44,7 +44,7 @@ dataset: {} # default dataset settings comes from __include__ on the top
nas:
search:
init_num_models: 20 # initial random models to seed the search
init_num_models: 3 # initial random models to seed the search
num_iters: 3 # number of pareto frontier search iterations
num_random_mix: 20 # how many random models to add to the parent mixture
use_benchmark: True
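
For orientation, a schematic of how these three knobs drive the evolutionary pareto search: seed the pool with init_num_models random architectures, then for num_iters iterations keep the pareto frontier, mutate its members, and mix in num_random_mix fresh random models before re-evaluating. This is a sketch of the loop shape only, not archai's implementation; every helper passed in is a stand-in:

# Schematic only; the helper functions are stand-ins, not archai's API.
def run_search(sample_random_arch, mutate_arch, evaluate_arch, pareto_frontier,
               init_num_models=3, num_iters=3, num_random_mix=20):
    population = [sample_random_arch() for _ in range(init_num_models)]  # init_num_models
    frontier = []
    for _ in range(num_iters):                                           # num_iters
        scored = [(arch, evaluate_arch(arch)) for arch in population]
        frontier = pareto_frontier(scored)            # keep the non-dominated architectures
        mutated = [mutate_arch(arch) for arch, _ in frontier]
        randoms = [sample_random_arch() for _ in range(num_random_mix)]  # num_random_mix
        population = [arch for arch, _ in frontier] + mutated + randoms
    return frontier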