Torchmodel (#12)
* working on gpu model
* initial GPU implementation
* added teardown method
* more progress
* GPU draft done
* everything works besides layering
* test cases passing
* gpu timing code ... no time spent in IO
* added newline
* Moved comments around
Parent: fc253f1175
Commit: 434f0d7e07
.gitignore
@@ -3,6 +3,9 @@
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore

CGData.mat
*Profile.txt

src/keys.json
src/metadata.json
src/metadata_other.json
CountingGridModel.py
@@ -48,10 +48,20 @@ class CountingGridModel():
        T, Z = data.shape
        pi_la = np.zeros([self.extent[0], self.extent[1], Z, L])
        h_la = np.zeros([self.extent[0], self.extent[1], Z, L])

        # Uses self variable from cg_layers namespace
        def compute_h(pi, W):
            PI = np.pad(pi, [(0, W[0]), (0, W[1]), (0, 0)],
                        'wrap').cumsum(axis=0).cumsum(axis=1)
            PI = np.pad(PI, [(1, 0), (1, 0), (0, 0)], 'constant')
            w0 = W[0]
            w1 = W[1]
            cumsum_output = self.compute_h_noLoopFull(PI, w0, w1)
            return np.moveaxis(np.moveaxis(cumsum_output[:-1, :-1, :], 2, 0)/np.sum(cumsum_output[:-1, :-1, :], axis=2), 0, -1)

        # Modifies: h_la
        def layer_compute_h(pi_la, h_la):
-            h = self.compute_h(pi_la[:, :, :, l], self.window)
+            h = compute_h(pi_la[:, :, :, l], self.window)
            h_la[:, :, :, l] = np.transpose(
                np.transpose(h) / np.transpose(np.sum(h, axis=2))
            )
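For reference, the nested compute_h above relies on the wrap-padded cumulative-sum ("integral image") trick used throughout the model: pad pi circularly, take 2-D cumulative sums, then read every window sum off four corners. A minimal standalone NumPy sketch of the same idea, not part of this commit (the helper name window_sums is illustrative):

import numpy as np

def window_sums(pi, W):
    # Wrap-pad so windows that cross the right/bottom edge wrap around the grid.
    PI = np.pad(pi, [(0, W[0]), (0, W[1]), (0, 0)], 'wrap').cumsum(axis=0).cumsum(axis=1)
    PI = np.pad(PI, [(1, 0), (1, 0), (0, 0)], 'constant')
    # Four-corner inclusion-exclusion on the cumulative sums gives every
    # W[0] x W[1] window sum without an explicit loop over positions.
    S = PI[W[0]:, W[1]:, :] - PI[:-W[0], W[1]:, :] - PI[W[0]:, :-W[1], :] + PI[:-W[0], :-W[1], :]
    return S[:-1, :-1, :]

pi = np.random.random((8, 8, 3))
h = window_sums(pi, (2, 2))  # shape (8, 8, 3)
assert np.allclose(h[0, 0], pi[0:2, 0:2].sum(axis=(0, 1)))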
@@ -257,7 +267,8 @@ class CountingGridModel():

    def fit(
        self, data, max_iter=100, returnSumSquareDifferencesOfPi=False,
-        noise=.000001, learn_pi=True, pi=None, layers=1, output_directory="./", heartBeaters=None
+        noise=.000001, learn_pi=True, pi=None, layers=1, output_directory="./",
+        heartBeaters=None, writeOutput=True
    ):
        """
        Implements variational expectation maximization for the Counting Grid model
@@ -273,6 +284,7 @@ class CountingGridModel():
        def SSD(pi, piHat):
            A = np.abs(pi - piHat)
            return np.sum(A * A)

        alpha = 1e-10
        SSDPi = []
        data = data.astype(np.float64)
@@ -280,6 +292,7 @@ class CountingGridModel():
            self.initializePi(data)
        else:
            self.pi = pi

        self.h = self.compute_h(self.pi, self.window)
        self.check_model()
        extentProduct = np.prod(self.extent)
@@ -310,14 +323,15 @@ class CountingGridModel():
            i = i + 1
            [(h.makeProgress(int(100*i/max_iter)) if h is not None else False)
             for h in heartBeaters] if heartBeaters is not None else False

        if layers > 1:
            self.layercgdata = self.cg_layers(data, L=layers, noise=noise)
-            scipy.io.savemat(str(output_directory) +
-                             "/CountingGridDataMatrices.mat", self.layercgdata)
-        else:
-            scipy.io.savemat(str(output_directory) +
-                             "/CGData.mat", {"pi": self.pi, "q": self.q})
+
+        if writeOutput:
+            if layers > 1:
+                scipy.io.savemat(str(output_directory) + "/CountingGridDataMatrices.mat", self.layercgdata)
+            else:
+                scipy.io.savemat(str(output_directory) + "/CGData.mat", {"pi": self.pi, "q": self.q})
        return self.pi

    # assumptions that we need for the model to be valid
@@ -350,7 +364,7 @@ class CountingGridModel():
    # How to initialize pi
    # Note that we don't want pi to be 0, since our update equations depend on a multiplication by pi
    def initializePi(self, data, technique="uniform"):
-        if technique is "uniform":
+        if technique == "uniform":
            size = [x for x in self.extent]
            size.append(data.shape[1])
            self.pi = np.random.random(size=tuple(size)).astype(np.float64)
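The change from 'is' to '==' in initializePi is a correctness fix: 'is' tests object identity rather than string equality, so technique is "uniform" only happened to work through CPython string interning, while '==' compares the actual values.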
CountingGridModelWithGPU.py (new file)
@@ -0,0 +1,165 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import torch
import scipy
import os
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
from CountingGridsPy.models import CountingGridModel


class CountingGridModelWithGPU(CountingGridModel):
    def __init__(self, extent, window):
        '''
        Assumes:
            extent is a 1-D numpy array of size D.
            window is a 1-D numpy array of size D.

            D is often 2, since it makes the model easily visualizable.
        '''
        self.extent = np.array(extent)
        self.window = np.array(window)

    def compute_h_noLoopFull(self, PI, w0, w1):
        '''
        Critical method for computing the histogram from the pi parameters.

        Notes:
            The code has the same syntax in the CPU and GPU implementations because PyTorch tensors and numpy ndarrays have similar indexing APIs.
            This method could be deleted, since CountingGridModelWithGPU inherits the same implementation from CountingGridModel.

        Potential optimization:
            * remove this method to avoid an extra stack frame.
        '''
        return PI[w0:, w1:, :] - PI[:-w0, w1:, :] - PI[w0:, :-w1, :] + PI[:-w0, :-w1, :]
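    # The expression above is the standard summed-area-table identity: with PI the
    # zero-padded 2-D cumulative sum of pi, the sum of pi over the w0 x w1 window
    # ending at (i, j) is PI[i, j] - PI[i-w0, j] - PI[i, j-w1] + PI[i-w0, j-w1],
    # evaluated here for all window positions at once via slicing.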

    def compute_h(self, pi, W):
        '''
        Compute the histogram.
        '''
        # optimization is to do this without moving any data back to the cpu to do the padding
        PI = np.pad(pi.cpu().numpy(), [(0, W[0]), (0, W[1]), (0, 0)], 'wrap')
        PI = torch.from_numpy(np.pad(PI, [(1, 0), (1, 0), (0, 0)], 'constant')).cumsum(0).cumsum(1).cuda()
        cumsum_output = self.compute_h_noLoopFull(PI, W[0], W[1])
        return (
            (cumsum_output[:-1, :-1, :]).permute((2, 0, 1)) / cumsum_output[:-1, :-1, :].sum(dim=2)
        ).permute((1, 2, 0))

    def q_update(self, data):
        '''
        Updates belief of where each document should be mapped.
        '''
        L = np.prod(self.extent)
        reshapedHistogram = torch.log(self.h).reshape((L, data.shape[1]))
        transposedDataMatrix = torch.transpose(data, 1, 0)

        lql = torch.matmul(reshapedHistogram, transposedDataMatrix)
        lqlmax = torch.max(lql, 0)[0]
        min_prob = 1.0 / (10 * L)
        Lq = (
            (lql - lqlmax) - torch.log(torch.sum(torch.exp(lql - lqlmax), 0))
        ).reshape(tuple(list(self.extent) + [data.shape[0]]))
        q = torch.exp(Lq)
        q[q < min_prob] = min_prob
        q = q / torch.sum(torch.sum(q, 0), 0)

        return q.permute([2, 0, 1])
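    # In q_update above, subtracting lqlmax before exponentiating is the usual
    # log-sum-exp stabilization: softmax(x) == exp(x - max(x)) / sum(exp(x - max(x))),
    # which avoids overflow when the unnormalized log-probabilities lql are large.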

    def pi_update(self, data, pseudocounts, alpha):
        T, Z = data.shape
        W = self.window
        device = torch.device("cuda:0")
        # QdotC is called nrm in matlab engine, but padding is done beforehand in matlab

        # permute([1, 2, 0])
        # [x, y, z] => [y, z, x]
        L = np.prod(self.extent)
        QdotC = torch.matmul(
            self.q.permute([1, 2, 0]).reshape((L, T)),
            data
        ).reshape(self.extent[0], self.extent[1], Z)

        # PyTorch only implements circular padding for 1 dimension at a time.
        # We will pass the data back to the CPU, do the padding, and bring it back to the GPU.
        QH = np.pad(
            (QdotC / (self.h + np.prod(self.window) * alpha)).cpu().numpy(),
            [(W[0], 0), (W[1], 0), (0, 0)],
            'wrap'
        ).cumsum(axis=0).cumsum(axis=1)
        QH = torch.tensor(QH, device=device, dtype=torch.double)

        w0 = W[0]; w1 = W[1]
        QH = self.compute_h_noLoopFull(QH, w0, w1)
        QH[QH < 0] = 0

        un_pi = pseudocounts + QH * (self.pi + alpha)
        mask = (torch.sum(un_pi, 2) != 0).double()
        not_mask = (torch.sum(un_pi, 2) == 0).double()
        denom = torch.sum(un_pi, 2)

        updated_pi = torch.transpose((torch.transpose(mask, 0, 1) * torch.transpose(un_pi, 0, 2)) / torch.transpose(denom, 0, 1), 0, 2) + \
            (1.0 / Z) * torch.transpose(torch.ones([Z, self.extent[1], self.extent[0]], device=device, dtype=torch.double) * torch.transpose(not_mask, 0, 1), 0, 2)
        return updated_pi
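    # In pi_update above, grid cells whose unnormalized pi sums to zero (not_mask)
    # are reset to the uniform distribution 1/Z over features; all other cells
    # (mask) are normalized so that pi sums to one over the feature dimension.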

    def fit(self, data_cpu, max_iter=100, noise=.000001, learn_pi=True, pi=None, layers=1, output_directory="./", heartBeaters=None, writeOutput=True):
        '''
        Fits the model, using GPU.

        Assumes:
            1. pi is a torch tensor on the GPU
        '''

        if not os.path.exists(str(output_directory)):
            raise Exception("output_directory does not exist for counting grids trainer.")

        if not torch.cuda.is_available():
            raise Exception("No GPU available for training.")
        device = torch.device("cuda:0")
        alpha = 1e-10
        data = torch.tensor(data_cpu, device=device, dtype=torch.double)

        if pi is None:
            self.initializePi(data)  # potentially optimize by initializing data in GPU
            self.pi = torch.tensor(self.pi, device=device, dtype=torch.double)
        else:
            self.pi = pi

        self.h = self.compute_h(self.pi, self.window)
        P = np.prod(self.extent)
        T, Z = data.size()

        pseudocounts = torch.mean(data.sum(1) / P) / 2.5
        # q is an m x dim(extent) structure
        qshape = [len(data)]
        for v in self.extent:
            qshape.append(v)
        self.q = torch.zeros(tuple(qshape))

        for i in tqdm(range(max_iter)):
            # E-Step
            self.q = self.q_update(data)
            # M-Step
            if learn_pi:
                self.pi = self.pi_update(data, pseudocounts, alpha)

            self.h = self.compute_h(self.pi, self.window)
            [(h.makeProgress(int(100*i/max_iter)) if h is not None else False)
             for h in heartBeaters] if heartBeaters is not None else False

        if layers > 1:
            self.pi = self.pi.cpu().numpy()
            self.q = self.q.cpu().numpy()
            data = data.cpu().numpy()
            self.layercgdata = self.cg_layers(data, L=layers, noise=noise)
            self.pi = torch.tensor(self.pi, device=device, dtype=torch.double)
            self.q = torch.tensor(self.q, device=device, dtype=torch.double)

        if writeOutput:
            if layers > 1:
                scipy.io.savemat(str(output_directory) + "/CountingGridDataMatrices.mat", self.layercgdata)
            else:
                scipy.io.savemat(str(output_directory) + "/CGData.mat", {"pi": self.pi, "q": self.q})
        return self.pi
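Typical usage of the new GPU model mirrors the CPU model and the tests further below; a minimal sketch, not part of this commit (data shapes and variable names are illustrative):

import numpy as np
from CountingGridsPy.models import CountingGridModelWithGPU

# Illustrative shapes: 1000 documents, 500 features, a 40x40 grid with a 5x5 window.
data = np.round(np.random.random((1000, 500)) * 10)
model = CountingGridModelWithGPU(np.array([40, 40]), np.array([5, 5]))

# Requires a CUDA device; fit() raises an exception otherwise.
pi = model.fit(data, max_iter=20, layers=1, writeOutput=False)
# model.pi and model.q are torch tensors on the GPU after fitting.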
__init__.py
@@ -1,3 +1,4 @@
from .CountingGridModel import CountingGridModel
+from .CountingGridModelWithGPU import CountingGridModelWithGPU

-__all__ = ['CountingGridModel']
+__all__ = ['CountingGridModel', 'CountingGridModelWithGPU']
@@ -0,0 +1,134 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import unittest
import numpy as np
import torch
import os
from CountingGridsPy.models import CountingGridModel, CountingGridModelWithGPU


class TestGPUvsCPU(unittest.TestCase):
    def setUp(self):
        SEED = "03071994"
        np.random.seed(int(SEED))
        M, N = [1000, 500]
        extentSize = 40
        self.data = np.round(np.random.random((M, N)) * 10)
        self.extent = np.array([extentSize, extentSize])
        self.window = np.array([5, 5])
        self.pi_init = np.random.random([extentSize] * 2 + [N]) * 20
        self.pi_init[0:5, 0:5, 0:5] += 30
        self.cpuModel = CountingGridModel(self.extent, self.window)
        self.gpuModel = CountingGridModelWithGPU(self.extent, self.window)

    def tearDown(self):
        potentialFilesGenerated = [
            "CountingGridDataMatrices.mat",
            "CGData.mat"
        ]

        for fileName in potentialFilesGenerated:
            if os.path.exists(fileName):
                os.remove(fileName)

    def test_q(self):
        numIters = 20
        device = torch.device("cuda:0")
        self.gpuModel.fit(
            self.data,
            max_iter=numIters,
            pi=torch.tensor(self.pi_init, device=device, dtype=torch.double),
            layers=1,
            writeOutput=False,
            learn_pi=False
        )

        self.cpuModel.fit(
            self.data,
            max_iter=numIters,
            returnSumSquareDifferencesOfPi=False,
            pi=np.copy(self.pi_init),
            layers=1,
            writeOutput=False,
            learn_pi=False
        )

        doWordMappingsMatchValue = np.isclose(self.cpuModel.pi, self.gpuModel.pi.cpu().numpy()).flatten()
        assert(
            all(doWordMappingsMatchValue)
        )

        doDocumentMappingsMatchValue = np.isclose(self.cpuModel.q, self.gpuModel.q.cpu().numpy()).flatten()
        assert(
            all(doDocumentMappingsMatchValue)
        )

    def test_fitted_model_no_layers(self):
        numIters = 100

        device = torch.device("cuda:0")
        self.gpuModel.fit(
            self.data,
            max_iter=numIters,
            pi=torch.tensor(self.pi_init, device=device, dtype=torch.double),
            layers=1,
            writeOutput=False
        )

        self.cpuModel.fit(
            self.data,
            max_iter=numIters,
            returnSumSquareDifferencesOfPi=False,
            pi=np.copy(self.pi_init),
            layers=1,
            writeOutput=False
        )
        doDocumentMappingsMatchValue = np.isclose(self.cpuModel.q, self.gpuModel.q.cpu().numpy()).flatten()
        assert(
            all(doDocumentMappingsMatchValue)
        )

        doWordMappingsMatchValue = np.isclose(self.cpuModel.pi, self.gpuModel.pi.cpu().numpy()).flatten()
        assert(
            all(doWordMappingsMatchValue)
        )

    def test_fitted_model_with_layers(self):
        numIters = 50
        layers = 2
        self.cpuModel.fit(
            self.data,
            max_iter=numIters,
            returnSumSquareDifferencesOfPi=False,
            pi=np.copy(self.pi_init),
            layers=layers,
            writeOutput=False
        )

        device = torch.device("cuda:0")
        self.gpuModel.fit(
            self.data,
            max_iter=numIters,
            pi=torch.tensor(self.pi_init, device=device, dtype=torch.double),
            layers=layers,
            writeOutput=False
        )

        assert(
            all(
                np.isclose(
                    self.cpuModel.q,
                    self.gpuModel.q.cpu().numpy()
                ).flatten()
            )
        )

        assert(
            all(
                np.isclose(
                    self.cpuModel.pi,
                    self.gpuModel.pi.cpu().numpy()
                ).flatten()
            )
        )
@@ -15,9 +15,10 @@ class TestCorrectnessOfNontrivialDesignMatrix(unittest.TestCase):
            [1]*7+[0, 1, 0, 1, 0, 1, 0]+list(range(1, 8))
        ).reshape((M, N))
        # note: after one iteration h distribution is the same regardless of position on matrix or window size
-        self.extent = np.array([5, 5])
+        extentSize = 5
+        self.extent = np.array([extentSize, extentSize])
        window = np.array([2, 3])
-        self.pi_init = np.ones([5]*2+[N])/1000
+        self.pi_init = np.ones([extentSize]*2+[N])/N
        self.model = CountingGridModel(self.extent, window)

    def test_correct_data(self):

@@ -27,3 +28,4 @@ class TestCorrectnessOfNontrivialDesignMatrix(unittest.TestCase):
        self.model.fit(self.data, max_iter=1,
                       returnSumSquareDifferencesOfPi=False, pi=np.copy(self.pi_init))
        assert(np.all(np.isclose(self.model.q, .04)))
@@ -0,0 +1,82 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import unittest
import numpy as np
import torch
import os
import cProfile
from CountingGridsPy.models import CountingGridModel, CountingGridModelWithGPU


class TimeGPUvsCPU(object):
    def __init__(self):
        SEED = "03071994"
        np.random.seed(int(SEED))
        M, N = [5000, 1000]
        extentSize = 40
        self.data = np.round(np.random.random((M, N)) * 10)
        self.extent = np.array([extentSize, extentSize])
        self.window = np.array([5, 5])
        self.pi_init = np.ones([extentSize] * 2 + [N]) / N
        self.cpuModel = CountingGridModel(self.extent, self.window)
        self.gpuModel = CountingGridModelWithGPU(self.extent, self.window)

    def run_nolayers(self):
        numIters = 50

        device = torch.device("cuda:0")
        outfileForGPU = "gpuProfile.txt"
        gpuJob = '''self.gpuModel.fit(
            self.data,
            max_iter=numIters,
            pi=torch.tensor(self.pi_init, device=device, dtype=torch.double),
            layers=1
        )
        '''
        cProfile.runctx(gpuJob, globals(), locals(), outfileForGPU)

        outfileForCPU = "cpuProfile.txt"
        cpuJob = '''self.cpuModel.fit(
            self.data,
            max_iter=numIters,
            returnSumSquareDifferencesOfPi=False,
            pi=np.copy(self.pi_init),
            layers=1
        )
        '''
        cProfile.runctx(cpuJob, globals(), locals(), outfileForCPU)

    def run_withlayers(self):
        numIters = 50

        device = torch.device("cuda:0")
        outfileForGPU = "gpu2LayersProfile.txt"
        gpuJob = '''self.gpuModel.fit(
            self.data,
            max_iter=numIters,
            pi=torch.tensor(self.pi_init, device=device, dtype=torch.double),
            layers=2,
            writeOutput=False
        )
        '''
        cProfile.runctx(gpuJob, globals(), locals(), outfileForGPU)

        outfileForCPU = "cpu2LayersProfile.txt"
        cpuJob = '''self.cpuModel.fit(
            self.data,
            max_iter=numIters,
            returnSumSquareDifferencesOfPi=False,
            pi=np.copy(self.pi_init),
            layers=2,
            writeOutput=False
        )
        '''
        cProfile.runctx(cpuJob, globals(), locals(), outfileForCPU)


if __name__ == "__main__":
    o = TimeGPUvsCPU()
    o.run_withlayers()
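The commit message notes that the GPU timing run spends no time in I/O. The *Profile.txt files written by cProfile.runctx above are binary profile dumps despite the extension; one way to inspect them is with the standard-library pstats module (a sketch, not part of this commit):

import pstats

# Sort by cumulative time and print the 20 most expensive entries of the GPU run.
pstats.Stats("gpu2LayersProfile.txt").sort_stats("cumulative").print_stats(20)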