Mirror of https://github.com/microsoft/archai.git
Added script to analyse DARTS space regular evaluation.
This commit is contained in:
Parent: cc1f417171
Commit: 1affe0cdc8

@@ -698,7 +698,17 @@
             "request": "launch",
             "program": "${cwd}/scripts/reports/fear_analysis/analysis_freeze_darts_space.py",
             "console": "integratedTerminal",
-            "args": ["--results-dir", "F:\\archaiphilly\\phillytools\\ft_dt_fb96_ftlr0.025_fte10_ct96_ftt0.6",
+            "args": ["--results-dir", "F:\\archaiphilly\\phillytools\\ft_dt_fb96_ftlr0.025_fte10_ct96_ftt0.6_ftonly",
+            "--out-dir", "F:\\archai_experiment_reports", "--reg-evals-file",
+            "F:\\archai_experiment_reports\\ft_dt_fb96_ftlr0.025_fte10_ct96_ftt0.6\\darts_benchmark.yaml"]
+        },
+        {
+            "name": "Analysis Regular Darts Space",
+            "type": "python",
+            "request": "launch",
+            "program": "${cwd}/scripts/reports/fear_analysis/analysis_regular_darts_space.py",
+            "console": "integratedTerminal",
+            "args": ["--results-dir", "F:\\archaiphilly\\phillytools\\dt_reg_b96_e20",
             "--out-dir", "F:\\archai_experiment_reports", "--reg-evals-file",
             "F:\\archai_experiment_reports\\ft_dt_fb96_ftlr0.025_fte10_ct96_ftt0.6\\darts_benchmark.yaml"]
         },

scripts/reports/fear_analysis/analysis_regular_darts_space.py
@@ -0,0 +1,282 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import argparse
from typing import Dict, List, Type, Iterator, Tuple
import glob
import os
import pathlib
from collections import OrderedDict, defaultdict
import yaml
from inspect import getsourcefile
import re
from tqdm import tqdm
import math as ma

from scipy.stats import kendalltau, spearmanr, sem

from runstats import Statistics

#import matplotlib
#matplotlib.use('TkAgg')
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import multiprocessing
from multiprocessing import Pool

from archai.common import utils
from archai.common.ordereddict_logger import OrderedDictLogger
from archai.common.analysis_utils import epoch_nodes, parse_a_job, fix_yaml, remove_seed_part, group_multi_runs, collect_epoch_nodes, EpochStats, FoldStats, stat2str, get_epoch_stats, get_summary_text, get_details_text, plot_epochs, write_report

def main():
    parser = argparse.ArgumentParser(description='Report creator')
    parser.add_argument('--results-dir', '-d', type=str,
                        default=r'~/logdir/proxynas_test_0001',
                        help='folder with experiment results from pt')
    parser.add_argument('--out-dir', '-o', type=str, default=r'~/logdir/reports',
                        help='folder to output reports')
    parser.add_argument('--reg-evals-file', '-r', type=str, default=None,
                        help='optional yaml file which contains full evaluation \
                        of architectures on new datasets not part of the benchmark')
    args, extra_args = parser.parse_known_args()
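
    # Example invocation (paths are illustrative, not prescribed by the repo;
    # see the "Analysis Regular Darts Space" launch config above for real ones):
    #   python scripts/reports/fear_analysis/analysis_regular_darts_space.py \
    #       --results-dir <experiment folder> \
    #       --out-dir <reports folder> \
    #       --reg-evals-file <darts_benchmark.yaml>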

    # root dir where all results are stored
    results_dir = pathlib.Path(utils.full_path(args.results_dir))
    print(f'results_dir: {results_dir}')

    # extract experiment name which is top level directory
    exp_name = results_dir.parts[-1]

    # create results dir for experiment
    out_dir = utils.full_path(os.path.join(args.out_dir, exp_name))
    print(f'out_dir: {out_dir}')
    os.makedirs(out_dir, exist_ok=True)

    # if optional regular evaluation lookup file is provided
    if args.reg_evals_file:
        with open(args.reg_evals_file, 'r') as f:
            reg_evals_data = yaml.load(f, Loader=yaml.Loader)
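
    # based on the lookups below, reg_evals_data is expected to be a flat
    # mapping of architecture index -> final test top1 on the benchmark,
    # e.g. {1234: 0.9431, ...} (values illustrative); note that it is
    # dereferenced unconditionally later, so this file is effectively required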

    # get list of all structured logs for each job
    logs = {}
    confs = {}
    job_dirs = list(results_dir.iterdir())

    # # test single job parsing for debugging
    # # WARNING: very slow, just use for debugging
    # for job_dir in job_dirs:
    #     a = parse_a_job(job_dir)

    # parallel parsing of yaml logs
    num_workers = 6
    with Pool(num_workers) as p:
        a = p.map(parse_a_job, job_dirs)
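
    # parse_a_job returns a dict keyed by job name; each value is a
    # (structured log, config) tuple which we merge into flat lookups below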
    for storage in a:
        for key, val in storage.items():
            logs[key] = val[0]
            confs[key] = val[1]

    # examples of accessing logs
    # best_test = logs[key]['eval_arch']['eval_train']['best_test']['top1']
    # best_train = logs[key]['eval_arch']['eval_train']['best_train']['top1']

    # remove all search jobs
    for key in list(logs.keys()):
        if 'search' in key:
            logs.pop(key)

    # sometimes a job has not even written logs to yaml
    for key in list(logs.keys()):
        if not logs[key]:
            print(f'arch id {key} did not finish. removing from calculations.')
            logs.pop(key)

    # remove all arch_ids which did not finish
    for key in list(logs.keys()):
        to_delete = False

        # it might have died early
        if 'regular_evaluate' not in list(logs[key].keys()):
            to_delete = True

        if to_delete:
            print(f'arch id {key} did not finish. removing from calculations.')
            logs.pop(key)
            continue

        if 'best_train' not in list(logs[key]['regular_evaluate']['eval_arch']['eval_train'].keys()):
            print(f'arch id {key} did not finish. removing from calculations.')
            logs.pop(key)
            continue

    all_arch_ids = []
    all_reg_evals = []
    all_short_reg_evals = []
    all_short_reg_time = []

    for key in logs.keys():
        if 'eval' in key:
            try:
                # regular evaluation
                # important to get this first since if it is not
                # available in the created darts benchmark we need to
                # remove it from consideration
                # --------------------

                # lookup from the provided darts benchmark file
                arch_id = confs[key]['nas']['eval']['dartsspace']['arch_index']

                if arch_id not in list(reg_evals_data.keys()):
                    # if the dataset used is not part of the standard benchmark some of the architectures
                    # may not have full evaluation accuracies available. Remove them from consideration.
                    continue
                reg_eval_top1 = reg_evals_data[arch_id]
                all_reg_evals.append(reg_eval_top1)

                best_train = logs[key]['regular_evaluate']['eval_arch']['eval_train']['best_train']['top1']
                all_short_reg_evals.append(best_train)

                # collect duration
                duration = 0.0
                for epoch_key in logs[key]['regular_evaluate']['eval_arch']['eval_train']['epochs']:
                    duration += logs[key]['regular_evaluate']['eval_arch']['eval_train']['epochs'][epoch_key]['train']['duration']

                all_short_reg_time.append(duration)

                # record the arch id
                # --------------------
                all_arch_ids.append(arch_id)

            except KeyError as err:
                print(f'KeyError {err} not in {key}!')

    # Store some key numbers in results.txt
    results_savename = os.path.join(out_dir, 'results.txt')

    # Sanity check
    assert len(all_reg_evals) == len(all_short_reg_evals)
    assert len(all_reg_evals) == len(all_short_reg_time)

    # Shortened training results
    short_reg_tau, short_reg_p_value = kendalltau(all_reg_evals, all_short_reg_evals)
    short_reg_spe, short_reg_sp_value = spearmanr(all_reg_evals, all_short_reg_evals)
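    # both are rank correlations between full evaluation and shortened training:
    # 1.0 means the shortened run ranks architectures identically to full
    # training, 0.0 means no agreement, -1.0 a perfectly inverted ranking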
    print(f'Short reg Kendall Tau score: {short_reg_tau:3.03f}, p_value {short_reg_p_value:3.03f}')
    print(f'Short reg Spearman corr: {short_reg_spe:3.03f}, p_value {short_reg_sp_value:3.03f}')
    print(f'Valid archs: {len(all_reg_evals)}')
    with open(results_savename, 'w') as f:
        f.write(f'Short reg Kendall Tau score: {short_reg_tau:3.03f}, p_value {short_reg_p_value:3.03f} \n')
        f.write(f'Short reg Spearman corr: {short_reg_spe:3.03f}, p_value {short_reg_sp_value:3.03f} \n')

    plt.clf()
    sns.scatterplot(x=all_reg_evals, y=all_short_reg_evals)
    plt.xlabel('Test top1 at darts benchmark full training')
    plt.ylabel('Regular training with fewer epochs')
    plt.grid()
    savename = os.path.join(out_dir, 'shortened_training.png')
    plt.savefig(savename, dpi=plt.gcf().dpi, bbox_inches='tight')

    # Rank correlations at top n percent of architectures
    reg_shortreg_evals = [(all_reg_evals[i], all_short_reg_evals[i], all_short_reg_time[i]) for i in range(len(all_reg_evals))]

    # sort in descending order of accuracy of regular evaluation
    reg_shortreg_evals.sort(key=lambda x: x[0], reverse=True)

    top_percent_shortreg_times_avg = []
    top_percent_shortreg_times_std = []
    top_percent_shortreg_times_stderr = []

    spe_shortreg_top_percents = []
    top_percents = []
    top_percent_range = range(2, 101, 2)
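
    # sweep the top 2%, 4%, ..., 100% of architectures by full-evaluation
    # accuracy and recompute Spearman correlation within each slice: this shows
    # whether shortened training still ranks the best architectures reliably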
    for top_percent in top_percent_range:
        top_percents.append(top_percent)
        num_to_keep = int(ma.floor(len(reg_shortreg_evals) * top_percent * 0.01))
        top_percent_evals = reg_shortreg_evals[:num_to_keep]
        top_percent_reg = [x[0] for x in top_percent_evals]
        top_percent_shortreg = [x[1] for x in top_percent_evals]
        top_percent_shortreg_times = [x[2] for x in top_percent_evals]

        top_percent_shortreg_times_avg.append(np.mean(np.array(top_percent_shortreg_times)))
        top_percent_shortreg_times_std.append(np.std(np.array(top_percent_shortreg_times)))
        top_percent_shortreg_times_stderr.append(sem(np.array(top_percent_shortreg_times)))

        spe_shortreg, _ = spearmanr(top_percent_reg, top_percent_shortreg)
        spe_shortreg_top_percents.append(spe_shortreg)

    plt.clf()
    sns.scatterplot(x=top_percents, y=spe_shortreg_top_percents)
    plt.legend(labels=['Shortened Regular Training'])
    plt.ylim((-1.0, 1.0))
    plt.xlim((0, 100))
    plt.xlabel('Top percent of architectures')
    plt.ylabel('Spearman Correlation')
    plt.grid()
    savename = os.path.join(out_dir, 'spe_top_archs.png')
    plt.savefig(savename, dpi=plt.gcf().dpi, bbox_inches='tight')

    plt.clf()
    plt.errorbar(top_percents, top_percent_shortreg_times_avg, yerr=np.array(top_percent_shortreg_times_std)/2, marker='s', mfc='red', ms=10, mew=4)
    plt.xlabel('Top percent of architectures')
    plt.ylabel('Avg. time (s)')
    plt.yticks(np.arange(0, 600, step=50))
    plt.grid()
    savename = os.path.join(out_dir, 'shortreg_train_duration_top_archs.png')
    plt.savefig(savename, dpi=plt.gcf().dpi, bbox_inches='tight')

    # time taken
    avg_shortreg_runtime = np.mean(np.array(all_short_reg_time))
    stderr_shortreg_runtime = np.std(np.array(all_short_reg_time)) / np.sqrt(len(all_short_reg_time))
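    # std / sqrt(n) is the standard error of the mean; scipy's sem would give
    # nearly the same value (it defaults to ddof=1 while np.std uses ddof=0)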

    with open(results_savename, 'a') as f:
        f.write(f'Avg. Shortened Training Runtime: {avg_shortreg_runtime:.03f}, stderr {stderr_shortreg_runtime:.03f} \n')

    # how much overlap in top x% of architectures between method and groundtruth
    # ----------------------------------------------------------------------------
    arch_id_reg_evals = [(arch_id, reg_eval) for arch_id, reg_eval in zip(all_arch_ids, all_reg_evals)]
    arch_id_shortreg_evals = [(arch_id, shortreg_eval) for arch_id, shortreg_eval in zip(all_arch_ids, all_short_reg_evals)]

    arch_id_reg_evals.sort(key=lambda x: x[1], reverse=True)
    arch_id_shortreg_evals.sort(key=lambda x: x[1], reverse=True)

    assert len(arch_id_reg_evals) == len(arch_id_shortreg_evals)

    top_percents = []
    shortreg_ratio_common = []
    for top_percent in top_percent_range:
        top_percents.append(top_percent)
        num_to_keep = int(ma.floor(len(arch_id_reg_evals) * top_percent * 0.01))
        top_percent_arch_id_reg_evals = arch_id_reg_evals[:num_to_keep]
        top_percent_arch_id_shortreg_evals = arch_id_shortreg_evals[:num_to_keep]

        # take the set of arch_ids in each method and find overlap with top archs
        set_reg = set([x[0] for x in top_percent_arch_id_reg_evals])
        set_ft = set([x[0] for x in top_percent_arch_id_shortreg_evals])
        ft_num_common = len(set_reg.intersection(set_ft))
        shortreg_ratio_common.append(ft_num_common/num_to_keep)
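
    # this ratio is 1.0 when shortened training recovers exactly the same
    # top-k set as full evaluation, and is ~top_percent/100 in expectation
    # for a random ranking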

    # save raw data for other aggregate plots over experiments
    raw_data_dict = {}
    raw_data_dict['top_percents'] = top_percents
    raw_data_dict['spe_shortreg'] = spe_shortreg_top_percents
    raw_data_dict['shortreg_times_avg'] = top_percent_shortreg_times_avg
    raw_data_dict['shortreg_times_std'] = top_percent_shortreg_times_std
    raw_data_dict['shortreg_times_stderr'] = top_percent_shortreg_times_stderr
    raw_data_dict['shortreg_ratio_common'] = shortreg_ratio_common

    savename = os.path.join(out_dir, 'raw_data.yaml')
    with open(savename, 'w') as f:
        yaml.dump(raw_data_dict, f)


if __name__ == '__main__':
    main()