# EDDI/active_learning_functions.py

from __future__ import division
from p_vae import *
from codings import *
import numpy as np
import tensorflow as tf
from scipy.stats import bernoulli
import argparse
import os
import random
from random import sample
#### parser configurations
parser = argparse.ArgumentParser(
    description='EDDI')
parser.add_argument(
    '--epochs',
    type=int,
    default=3000,
    metavar='N_eps',
    help='number of epochs to train (default: 3000)')
parser.add_argument(
    '--latent_dim',
    type=int,
    default=10,
    metavar='LD',
    help='latent dimension (default: 10)')
parser.add_argument(
    '--p',
    type=float,
    default=0.7,
    metavar='probability',
    help='dropout probability of artificial missingness during training')
parser.add_argument(
    '--iteration',
    type=int,
    default=-1,
    metavar='it',
    help='iterations per epoch. Set to -1 to run the full epoch.')
parser.add_argument(
    '--batch_size',
    type=int,
    default=100,
    metavar='batch',
    help='mini-batch size per epoch')
parser.add_argument(
    '--K',
    type=int,
    default=20,
    metavar='K',
    help='dimension of the PNP feature map')
parser.add_argument(
    '--M',
    type=int,
    default=50,
    metavar='M',
    help='number of MC samples used when imputing')
parser.add_argument(
    '--eval',
    type=str,
    default='rmse',
    metavar='eval',
    help="evaluation metric of active learning. 'rmse': RMSE; 'nllh': negative log likelihood")
parser.add_argument(
    '--repeat',
    type=int,
    default=5,
    metavar='repeat',
    help='number of repeats of the active learning experiment')
parser.add_argument(
    '--output_dir',
    type=str,
    default=os.getenv('PT_OUTPUT_DIR', '/tmp'))
parser.add_argument(
    '--data_dir',
    type=str,
    default=os.getenv('PT_DATA_DIR', 'data'),
    help='directory where the UCI dataset is stored')
args = parser.parse_args()
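# Example (hypothetical) command line for an entry script that imports this module;
# the flag names match the parser above, the script name is illustrative only:
#   python run_eddi.py --epochs 3000 --latent_dim 10 --p 0.7 --batch_size 100 \
#       --K 20 --M 50 --eval rmse --repeat 5 --data_dir data --output_dir /tmp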
#### Set directories
UCI = args.data_dir
ENCODER_WEIGHTS = os.path.join(args.output_dir, 'encoder.tensorflow')
FINETUNED_DECODER_WEIGHTS = os.path.join(args.output_dir, 'generator.tensorflow')
rs = 42 # random seed

def p_vae_active_learning(Data_train, mask_train, Data_test, mask_test, epochs,
                          latent_dim, batch_size, p, K, M, eval, Repeat,
                          estimation_method=0):
    '''
    This function trains a partial VAE and performs active variable selection with three
    strategies: random ordering, a single global ordering (SING), and the EDDI chain-rule
    approximation (CHAI).
    Note that we assume the last column of x is the target variable of interest.
    :param Data_train: training data matrix
    :param mask_train: mask matrix that indicates the missingness of training data. 1 = observed, 0 = missing
    :param Data_test: test data matrix
    :param mask_test: mask matrix that indicates the missingness of test data. 1 = observed, 0 = missing
    :param epochs: number of epochs used to train the partial VAE
    :param latent_dim: latent dimension of the partial VAE
    :param batch_size: mini-batch size used during training
    :param p: dropout probability of artificial missingness during training
    :param K: dimension of the feature map of the PNP encoder
    :param M: number of samples used for MC sampling
    :param eval: evaluation metric of active learning. 'rmse': RMSE; 'nllh': negative log likelihood
    :param Repeat: number of repeats
    :param estimation_method: method used for single-ordering information reward estimation.
        To calculate the single best ordering, we need to marginalize (average) the
        information reward over the data set (in this case, the test set).
        We provide two methods of marginalization:
        - estimation_method = 0: information reward marginalized using the model distribution p_{vae_model}(x_o).
        - estimation_method = 1: information reward marginalized using the data distribution p_{data}(x_o).
    :return: None (active learning results are saved to args.output_dir)
    '''
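    # Strategy indices used in the loop below:
    #   0 = random ordering, 1 = single global ordering (SING), 2 = EDDI chain-rule approximation (CHAI)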
    for r in range(Repeat):
        ## train partial VAE
        tf.reset_default_graph()
        vae = train_p_vae(Data_train, mask_train, epochs, latent_dim, batch_size, p, K, 10)
        n_test = Data_test.shape[0]
        n_train = Data_train.shape[0]
        OBS_DIM = Data_test.shape[1]
        # kwargs = {
        #     'K': K,
        #     'obs_distrib': "Gaussian",
        #     'latent_dim': latent_dim,
        #     'encoder': PNP_fc_uci_encoder,
        #     'decoder': fc_uci_decoder,
        #     'obs_dim': OBS_DIM,
        #     'load_model': 1,
        #     'decoder_path': FINETUNED_DECODER_WEIGHTS,
        #     'encoder_path': ENCODER_WEIGHTS,
        # }
        # vae = PN_Plus_VAE(**kwargs)
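        # The commented-out block above shows the alternative of loading a pretrained
        # partial VAE (load_model=1) from ENCODER_WEIGHTS / FINETUNED_DECODER_WEIGHTS
        # instead of retraining it in every repeat.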
        ## create arrays to store results
        if r == 0:
            # information curves
            information_curve_RAND = np.zeros(
                (Repeat, n_test, OBS_DIM - 1 + 1))
            information_curve_SING = np.zeros(
                (Repeat, n_test, OBS_DIM - 1 + 1))
            information_curve_CHAI = np.zeros(
                (Repeat, n_test, OBS_DIM - 1 + 1))
            # history of optimal actions
            action_SING = np.zeros((Repeat, n_test, OBS_DIM - 1))
            action_CHAI = np.zeros((Repeat, n_test, OBS_DIM - 1))
            # history of information reward values
            R_hist_SING = np.zeros(
                (Repeat, OBS_DIM - 1, n_test, OBS_DIM - 1))
            R_hist_CHAI = np.zeros(
                (Repeat, OBS_DIM - 1, n_test, OBS_DIM - 1))
            # history of posterior samples of partial inference
            im_SING = np.zeros((Repeat, OBS_DIM - 1, M, n_test, OBS_DIM))
            im_CHAI = np.zeros((Repeat, OBS_DIM - 1, M, n_test, OBS_DIM))
        ## Perform active variable selection with random ordering, SING (single ordering) and CHAI (EDDI)
        for strategy in range(3):
            if strategy == 0:  ### random strategy
                ## create arrays to store data and missingness
                x = Data_test[:, :]
                x = np.reshape(x, [n_test, OBS_DIM])
                mask = np.zeros((n_test, OBS_DIM))
                mask[:, -1] = 0  # we will never observe the target value
                ## initialize array that stores optimal actions (i_optimal)
                i_optimal = [nums for nums in range(OBS_DIM - 1)]
                i_optimal = np.tile(i_optimal, [n_test, 1])
                for c in i_optimal:  # shuffle the feature ordering independently for each test point
                    random.shuffle(c)
                ## evaluate likelihood at initial stage (no observation)
                negative_predictive_llh, uncertainty = vae.predictive_loss(
                    x, mask, eval, M)
                information_curve_RAND[r, :, 0] = negative_predictive_llh
                for t in range(OBS_DIM - 1):
                    print("Repeat = {:.1f}".format(r))
                    print("Strategy = {:.1f}".format(strategy))
                    print("Step = {:.1f}".format(t))
                    io = np.eye(OBS_DIM)[i_optimal[:, t]]
                    mask = mask + io
                    negative_predictive_llh, uncertainty = vae.predictive_loss(
                        x, mask, eval, M)
                    information_curve_RAND[r, :, t + 1] = negative_predictive_llh
            elif strategy == 1:  ### single ordering strategy "SING"
                # SING is obtained by maximizing the mean information reward over the test set at each step,
                # to be consistent with the description in the paper.
                # We could also obtain this ordering from a subset of the training set and then apply it to the test set.
                x = Data_test[:, :]
                x = np.reshape(x, [n_test, OBS_DIM])
                mask = np.zeros((n_test, OBS_DIM))  # mask of missingness (from both test-data missingness and unselected features during active learning)
                mask2 = np.zeros((n_test, OBS_DIM))  # mask indicating which features have been selected for each data point
                mask[:, -1] = 0  # no matter how the mask is initialized, the target variable (last column) is always kept unobserved
                negative_predictive_llh, uncertainty = vae.predictive_loss(
                    x, mask, eval, M)
                information_curve_SING[r, :, 0] = negative_predictive_llh
                for t in range(OBS_DIM - 1):  # t indexes the step
                    print("Repeat = {:.1f}".format(r))
                    print("Strategy = {:.1f}".format(strategy))
                    print("Step = {:.1f}".format(t))
                    ## note that for the single ordering there are two rewards.
                    # The first one (R) is calculated based on no observations.
                    # This is used for the active learning phase, since a single ordering should not depend on observations.
                    # The second one (R_eval) is calculated in the same way as the chain rule approximation. This is only used for visualization.
                    R = -1e4 * np.ones((n_test, OBS_DIM - 1))
                    im_0 = completion(x, mask * 0, M, vae)  # sample from the model prior
                    im = completion(x, mask, M, vae)  # sample conditioned on observations
                    im_SING[r, t, :, :, :] = im
                    for u in range(OBS_DIM - 1):  # u indexes the features; calculate the reward for each candidate feature
                        loc = np.where(mask2[:, u] == 0)[0]
                        if estimation_method == 0:
                            R[loc, u] = R_lindley_chain(u, x, mask, M, vae, im_0, loc)
                        else:
                            R[loc, u] = R_lindley_chain(u, x, mask, M, vae, im, loc)
                    R_hist_SING[r, t, :, :] = R
                    i_optimal = (R.mean(axis=0)).argmax()  # optimal decision based on the reward averaged over all data points
                    i_optimal = np.tile(i_optimal, [n_test])
                    io = np.eye(OBS_DIM)[i_optimal]
                    action_SING[r, :, t] = i_optimal
                    mask = mask + io * mask_test  # this mask accounts for both data missingness and missingness of unselected features
                    negative_predictive_llh, uncertainty = vae.predictive_loss(
                        x, mask, eval, M)
                    mask2 = mask2 + io  # this mask only stores missingness of unselected features, i.e., which features have been selected for each data point
                    information_curve_SING[r, :, t + 1] = negative_predictive_llh
            elif strategy == 2:  ### EDDI strategy (chain rule approximation)
                # personalized active feature selection strategy
                ## create arrays to store data and missingness
                x = Data_test[:, :]
                x = np.reshape(x, [n_test, OBS_DIM])
                mask = np.zeros((n_test, OBS_DIM))  # mask of missingness (from both test-data missingness and unselected features during active learning)
                mask2 = np.zeros((n_test, OBS_DIM))  # mask indicating which features have been selected for each data point
                mask[:, -1] = 0  # no matter how the mask is initialized, the target variable (last column) is always kept unobserved
                ## evaluate likelihood at initial stage (no observation)
                negative_predictive_llh, uncertainty = vae.predictive_loss(
                    x, mask, eval, M)
                information_curve_CHAI[r, :, 0] = negative_predictive_llh
                for t in range(OBS_DIM - 1):  # t indexes the step
                    print("Repeat = {:.1f}".format(r))
                    print("Strategy = {:.1f}".format(strategy))
                    print("Step = {:.1f}".format(t))
                    R = -1e4 * np.ones((n_test, OBS_DIM - 1))
                    im = completion(x, mask, M, vae)
                    im_CHAI[r, t, :, :, :] = im
                    for u in range(OBS_DIM - 1):  # u indexes the features; calculate the reward for each candidate feature
                        loc = np.where(mask[:, u] == 0)[0]
                        R[loc, u] = R_lindley_chain(u, x, mask, M, vae, im, loc)
                    R_hist_CHAI[r, t, :, :] = R
                    i_optimal = R.argmax(axis=1)  # per-data-point optimal decision
                    io = np.eye(OBS_DIM)[i_optimal]
                    action_CHAI[r, :, t] = i_optimal
                    mask = mask + io  # this mask accounts for both data missingness and missingness of unselected features
                    negative_predictive_llh, uncertainty = vae.predictive_loss(
                        x, mask, eval, M)
                    mask2 = mask2 + io  # this mask only stores missingness of unselected features, i.e., which features have been selected for each data point
                    print(mask2[0:5, :])
                    information_curve_CHAI[r, :, t + 1] = negative_predictive_llh
        # Save results
        np.savez(
            os.path.join(args.output_dir, 'UCI_information_curve_RAND.npz'),
            information_curve=information_curve_RAND)
        np.savez(
            os.path.join(args.output_dir, 'UCI_information_curve_SING.npz'),
            information_curve=information_curve_SING)
        np.savez(
            os.path.join(args.output_dir, 'UCI_information_curve_CHAI.npz'),
            information_curve=information_curve_CHAI)
        np.savez(
            os.path.join(args.output_dir, 'UCI_action_SING.npz'),
            action=action_SING)
        np.savez(
            os.path.join(args.output_dir, 'UCI_action_CHAI.npz'),
            action=action_CHAI)
        np.savez(
            os.path.join(args.output_dir, 'UCI_R_hist_SING.npz'),
            R_hist=R_hist_SING)
        np.savez(
            os.path.join(args.output_dir, 'UCI_R_hist_CHAI.npz'),
            R_hist=R_hist_CHAI)
        np.savez(
            os.path.join(args.output_dir, 'UCI_im_SING.npz'),
            im=im_SING)
        np.savez(
            os.path.join(args.output_dir, 'UCI_im_CHAI.npz'),
            im=im_CHAI)
    return None
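
# Usage sketch for p_vae_active_learning. The data loading below is hypothetical
# (the UCI loading code lives elsewhere in the repo); only the argument order is
# taken from the function signature and the parser flags above.
# Data_train, mask_train, Data_test, mask_test = load_uci_split(UCI)  # hypothetical helper
# p_vae_active_learning(Data_train, mask_train, Data_test, mask_test,
#                       args.epochs, args.latent_dim, args.batch_size, args.p,
#                       args.K, args.M, args.eval, args.repeat)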

def train_p_vae(Data_train, mask_train, epochs, latent_dim, batch_size, p, K, iteration):
    '''
    This function trains the partial VAE.
    :param Data_train: training data matrix, N by D
    :param mask_train: mask matrix that indicates the missingness. 1 = observed, 0 = missing
    :param epochs: number of epochs of training
    :param latent_dim: latent dimension of the partial VAE model
    :param batch_size: mini-batch size
    :param p: dropout rate for creating additional missingness during training
    :param K: dimension of the feature map of the PNP encoder
    :param iteration: how many mini-batches are used per epoch. Set to -1 to run the full epoch.
    :return: the trained partial VAE
    '''
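    # Artificial missingness: on top of the data missingness in mask_train, each observed entry
    # in a mini-batch is additionally dropped out (Bernoulli, with a per-entry probability capped at p),
    # so the encoder learns to handle arbitrary observation patterns.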
    obs_dim = Data_train.shape[1]
    n_train = Data_train.shape[0]
    list_train = np.arange(n_train)
    ####### construct
    kwargs = {
        'K': K,
        'obs_distrib': "Gaussian",
        'latent_dim': latent_dim,
        'batch_size': batch_size,
        'encoder': PNP_fc_uci_encoder,
        'decoder': fc_uci_decoder,
        'obs_dim': obs_dim,
        'load_model': 0,
        'decoder_path': FINETUNED_DECODER_WEIGHTS,
        'encoder_path': ENCODER_WEIGHTS,
    }
    vae = PN_Plus_VAE(**kwargs)
    if iteration == -1:
        n_it = int(np.ceil(n_train / float(kwargs['batch_size'])))
    else:
        n_it = iteration
    for epoch in range(epochs):
        training_loss_full = 0.
        # test_loss, test_kl, test_recon = vae.full_batch_loss(Data_test, mask_test)
        # test_loss = test_loss
        # test_kl = test_kl / n_test
        # test_recon = test_recon / n_test
        # iterate through batches
        # np.random.shuffle(list_train)
        for it in range(n_it):
            if iteration == -1:
                # keep the final (possibly smaller) batch instead of dropping the last example
                batch_indices = list_train[it * kwargs['batch_size']:min(
                    (it + 1) * kwargs['batch_size'], n_train)]
            else:
                batch_indices = sample(range(n_train), kwargs['batch_size'])
            x = Data_train[batch_indices, :]
            mask_train_batch = mask_train[batch_indices, :]
            # DROPOUT_TRAIN = np.minimum(np.random.rand(1), p)
            # while True:
            #     mask_drop = np.array([bernoulli.rvs(1 - DROPOUT_TRAIN, size=obs_dim)] *
            #                          kwargs['batch_size'])
            #     if np.sum(mask_drop > 0):
            #         break
            DROPOUT_TRAIN = np.minimum(np.random.rand(mask_train_batch.shape[0], obs_dim), p)
            while True:
                # mask_drop = np.array([bernoulli.rvs(1 - DROPOUT_TRAIN)])
                mask_drop = bernoulli.rvs(1 - DROPOUT_TRAIN)
                if np.sum(mask_drop > 0):
                    break
            mask_drop = mask_drop.reshape([mask_train_batch.shape[0], obs_dim])
            _ = vae.update(x, mask_drop * mask_train_batch)
            loss_full, _, _ = vae.full_batch_loss(x, mask_drop * mask_train_batch)
            training_loss_full += loss_full
        # average loss over most recent epoch
        training_loss_full /= n_it
        print('Epoch: {} \tnegative training ELBO per observed feature: {:.2f}'
              .format(epoch, training_loss_full))
    vae.save_generator(FINETUNED_DECODER_WEIGHTS)
    vae.save_encoder(ENCODER_WEIGHTS)
    return vae
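
# Training-only usage sketch (assumes Data_train / mask_train are already loaded as N-by-D numpy arrays):
# vae = train_p_vae(Data_train, mask_train, args.epochs, args.latent_dim,
#                   args.batch_size, args.p, args.K, args.iteration)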