* working on gpu model

* initial GPU implementation

* added teardown method

* more progress

* GPU draft done

* everything works besides layering

* test cases passing

* gpu timing code ... no time spent in IO

* added newline

* Moved comments around
This commit is contained in:
Spencer Buja 2021-01-06 16:19:54 -08:00 коммит произвёл GitHub
Родитель fc253f1175
Коммит 434f0d7e07
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
7 изменённых файлов: 413 добавлений и 12 удалений

3
CountingGridsPy/.gitignore поставляемый
Просмотреть файл

@ -3,6 +3,9 @@
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
CGData.mat
*Profile.txt
src/keys.json
src/metadata.json
src/metadata_other.json

Просмотреть файл

@ -48,10 +48,20 @@ class CountingGridModel():
T, Z = data.shape
pi_la = np.zeros([self.extent[0], self.extent[1], Z, L])
h_la = np.zeros([self.extent[0], self.extent[1], Z, L])
# Uses self variable from cg_layers namespace
def compute_h(pi, W):
    '''
    Compute the normalized window histogram for a single layer's pi.

    pi is wrap-padded by the window size along its first two axes, turned
    into a 2-D running sum with a leading row/column of zeros, and handed to
    self.compute_h_noLoopFull (self comes from the enclosing scope). The
    result, minus its last row and column, is divided by its sum over the
    last axis.
    '''
    w0, w1 = W[0], W[1]
    wrapped = np.pad(pi, [(0, w0), (0, w1), (0, 0)], 'wrap')
    running_sum = wrapped.cumsum(axis=0).cumsum(axis=1)
    # The zero border lets the window differences be taken without a
    # special case at index 0.
    running_sum = np.pad(running_sum, [(1, 0), (1, 0), (0, 0)], 'constant')
    window_sums = self.compute_h_noLoopFull(running_sum, w0, w1)[:-1, :-1, :]
    totals = np.sum(window_sums, axis=2)
    # Move the last axis to the front so the division broadcasts against the
    # 2-D totals, then move it back.
    return np.moveaxis(np.moveaxis(window_sums, 2, 0) / totals, 0, -1)
# Modifies: h_la
def layer_compute_h(pi_la, h_la):
h = self.compute_h(pi_la[:, :, :, l], self.window)
h = compute_h(pi_la[:, :, :, l], self.window)
h_la[:, :, :, l] = np.transpose(
np.transpose(h) / np.transpose(np.sum(h, axis=2))
)
@ -257,7 +267,8 @@ class CountingGridModel():
def fit(
self, data, max_iter=100, returnSumSquareDifferencesOfPi=False,
noise=.000001, learn_pi=True, pi=None, layers=1, output_directory="./", heartBeaters=None
noise=.000001, learn_pi=True, pi=None, layers=1, output_directory="./",
heartBeaters=None, writeOutput=True
):
"""
Implements variational expectation maximization for the Counting Grid model
@ -273,6 +284,7 @@ class CountingGridModel():
def SSD(pi, piHat):
    '''Return the sum of squared element-wise differences between pi and piHat.'''
    residual = np.abs(pi - piHat)
    return np.sum(np.square(residual))
alpha = 1e-10
SSDPi = []
data = data.astype(np.float64)
@ -280,6 +292,7 @@ class CountingGridModel():
self.initializePi(data)
else:
self.pi = pi
self.h = self.compute_h(self.pi, self.window)
self.check_model()
extentProduct = np.prod(self.extent)
@ -310,14 +323,15 @@ class CountingGridModel():
i = i + 1
[(h.makeProgress(int(100*i/max_iter)) if h is not None else False)
for h in heartBeaters] if heartBeaters is not None else False
if layers > 1:
self.layercgdata = self.cg_layers(data, L=layers, noise=noise)
scipy.io.savemat(str(output_directory) +
"/CountingGridDataMatrices.mat", self.layercgdata)
else:
scipy.io.savemat(str(output_directory) +
"/CGData.mat", {"pi": self.pi, "q": self.q})
if writeOutput:
if layers > 1:
scipy.io.savemat(str(output_directory) + "/CountingGridDataMatrices.mat", self.layercgdata)
else:
scipy.io.savemat(str(output_directory) + "/CGData.mat", {"pi": self.pi, "q": self.q})
return self.pi
# assumptions that we need for the model to be valid
@ -350,7 +364,7 @@ class CountingGridModel():
# How to initialize pi
# Note that we don't want pi to be 0, since our update equations depend on a multiplication by pi
def initializePi(self, data, technique="uniform"):
if technique is "uniform":
if technique == "uniform":
size = [x for x in self.extent]
size.append(data.shape[1])
self.pi = np.random.random(size=tuple(size)).astype(np.float64)

Просмотреть файл

@ -0,0 +1,165 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import torch
import scipy
import os
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
from CountingGridsPy.models import CountingGridModel
class CountingGridModelWithGPU(CountingGridModel):
    '''
    Counting Grid model whose E-step and M-step run on a CUDA device via PyTorch.

    The parameters (self.pi, self.h, self.q) are held as torch tensors on the
    GPU while fitting; anything not overridden here is inherited from
    CountingGridModel.
    '''

    def __init__(self, extent, window):
        '''
        Assumes:
        extent is a 1-D numpy array of size D.
        window is a 1-D numpy array of size D.
        D is often 2, since it makes the model easily visualizable.
        '''
        self.extent = np.array(extent)
        self.window = np.array(window)

    def compute_h_noLoopFull(self, PI, w0, w1):
        '''
        Critical method for computing the histogram using the pi parameters.

        Given a 2-D running sum PI (along the first two axes), returns the sum
        over every w0 x w1 window by combining four shifted views.

        Notes:
        The code has the same syntax between CPU and GPU implementation due to similar APIs for indexing PyTorch tensors and numpy ndarrays.
        This function can be deleted because CountingGridModelWithGPU inherits from CountingGridModel
        Potential optimization:
        * remove this function to reduce an extra stack frame.
        '''
        return PI[w0:, w1:, :] - PI[:-w0, w1:, :] - PI[w0:, :-w1, :] + PI[:-w0, :-w1, :]

    def compute_h(self, pi, W):
        '''
        Compute the histogram.

        pi is a 3-D torch tensor; W holds the window size along the first two
        axes. Returns a CUDA tensor with the same shape as pi, normalized to
        sum to one along the last axis.
        '''
        # optimization is to do this without moving any data back to the cpu to do the padding
        PI = np.pad(pi.cpu().numpy(), [(0, W[0]), (0, W[1]), (0, 0)], 'wrap')
        # A leading row/column of zeros does not change the running sums but
        # lets the window differences be taken without a special case at 0.
        PI = torch.from_numpy(
            np.pad(PI, [(1, 0), (1, 0), (0, 0)], 'constant')
        ).cumsum(0).cumsum(1).cuda()
        cumsum_output = self.compute_h_noLoopFull(PI, W[0], W[1])
        # Put the last axis first so the division broadcasts against the 2-D
        # per-cell totals, then restore the original axis order.
        return (
            (cumsum_output[:-1, :-1, :]).permute((2, 0, 1)) / cumsum_output[:-1, :-1, :].sum(dim=2)
        ).permute((1, 2, 0))

    def q_update(self, data):
        '''
        Updates belief of where document should be mapped.

        data is a 2-D tensor (one row per document). Returns a tensor of shape
        (number of documents, extent[0], extent[1]) that is normalized over
        the two grid axes for every document.
        '''
        L = np.prod(self.extent)
        reshapedHistogram = torch.log(self.h).reshape((L, data.shape[1]))
        transposedDataMatrix = torch.transpose(data, 1, 0)
        lql = torch.matmul(reshapedHistogram, transposedDataMatrix)
        # Subtract the per-document maximum before exponentiating so the
        # log-sum-exp normalization stays numerically stable.
        lqlmax = torch.max(lql, 0)[0]
        min_prob = 1.0/(10 * L)
        Lq = (
            (lql-lqlmax) - torch.log(torch.sum(torch.exp(lql-lqlmax), 0))
        ).reshape(tuple(list(self.extent) + [data.shape[0]]))
        q = torch.exp(Lq)
        # Floor tiny probabilities, then renormalize over the grid.
        q[q < min_prob] = min_prob
        q = q / torch.sum(torch.sum(q, 0), 0)
        return q.permute([2, 0, 1])

    def pi_update(self, data, pseudocounts, alpha):
        '''
        M-step: re-estimate pi from the current q, h and pi.

        data is the 2-D document tensor, pseudocounts is added to every
        unnormalized entry, and alpha is a small constant that keeps the
        division by h and the multiplication by pi away from zero.
        Returns the updated pi, normalized along its last axis. Grid cells
        whose unnormalized mass is exactly zero receive a uniform 1/Z
        distribution.
        '''
        T, Z = data.shape
        W = self.window
        device = torch.device("cuda:0")
        # QdotC is called nrm in matlab engine, but padding is done beforehand in matlab
        # permute([1,2,0])
        # [x,y,z] => [y,z,x]
        L = np.prod(self.extent)
        QdotC = torch.matmul(
            self.q.permute([1, 2, 0]).reshape((L, T)),
            data
        ).reshape(self.extent[0], self.extent[1], Z)
        # PyTorch only implements circular padding for 1 dimension at a time.
        # We will pass the data back to the CPU, do the padding, and bring it back to the GPU.
        QH = np.pad(
            (QdotC/(self.h + np.prod(self.window)*alpha)).cpu().numpy(),
            [(W[0], 0), (W[1], 0), (0, 0)],
            'wrap'
        ).cumsum(axis=0).cumsum(axis=1)
        QH = torch.tensor(QH, device=device, dtype=torch.double)
        w0 = W[0]
        w1 = W[1]
        QH = self.compute_h_noLoopFull(QH, w0, w1)
        # Differences of running sums can dip slightly below zero through
        # rounding; clamp them.
        QH[QH < 0] = 0
        un_pi = pseudocounts + QH * (self.pi + alpha)

        denom = torch.sum(un_pi, 2)
        is_empty = denom == 0
        mask = (~is_empty).double()
        not_mask = is_empty.double()
        # Divide empty cells by 1 instead of 0: 0 * 0 / 0 would be NaN and
        # would poison the uniform fallback added below.
        safe_denom = torch.where(is_empty, torch.ones_like(denom), denom)
        updated_pi = (mask.unsqueeze(2) * un_pi) / safe_denom.unsqueeze(2) + \
            (1.0 / Z) * not_mask.unsqueeze(2)
        return updated_pi

    def fit(self, data_cpu, max_iter=100, noise=.000001, learn_pi=True, pi=None, layers=1, output_directory="./", heartBeaters=None, writeOutput=True):
        '''
        Fits the model, using GPU.

        Assumes:
        1. pi is a torch tensor on the GPU

        Parameters:
        data_cpu: 2-D array-like of counts, one row per document.
        max_iter: number of EM iterations.
        noise: forwarded to cg_layers when layers > 1.
        learn_pi: when False only the E-step runs and pi stays fixed.
        pi: optional initial pi; initializePi is used when None.
        layers: when > 1, cg_layers is run on the CPU after EM finishes.
        output_directory: existing directory that receives the .mat output.
        heartBeaters: optional iterable of objects exposing makeProgress(percent); None entries are ignored.
        writeOutput: when False nothing is written to disk.

        Returns self.pi (a torch tensor on the GPU).

        Raises:
        Exception if output_directory does not exist or no CUDA device is available.
        '''
        # `import scipy` alone does not guarantee that the io submodule is loaded.
        import scipy.io

        if not os.path.exists(str(output_directory)):
            raise Exception("output_directory does not exist for counting grids trainer.")
        if not torch.cuda.is_available():
            raise Exception("No GPU available for training.")

        device = torch.device("cuda:0")
        alpha = 1e-10
        data = torch.tensor(data_cpu, device=device, dtype=torch.double)
        if pi is None:
            self.initializePi(data)  # potentially optimize by initializing data in GPU
            self.pi = torch.tensor(self.pi, device=device, dtype=torch.double)
        else:
            self.pi = pi
        self.h = self.compute_h(self.pi, self.window)

        P = np.prod(self.extent)
        T, Z = data.size()
        pseudocounts = torch.mean(data.sum(1) / P) / 2.5

        # q is an m x dim(extent) structure. Allocate it on the device in
        # double precision so it matches what q_update returns even when
        # max_iter is 0.
        qshape = [T] + [int(v) for v in self.extent]
        self.q = torch.zeros(tuple(qshape), device=device, dtype=torch.double)

        for i in tqdm(range(max_iter)):
            # E-Step
            self.q = self.q_update(data)
            # M-Step
            if learn_pi:
                self.pi = self.pi_update(data, pseudocounts, alpha)
                self.h = self.compute_h(self.pi, self.window)
            if heartBeaters is not None:
                # Report completed iterations (i + 1) so the last iteration
                # reaches 100%, matching the CPU fit.
                progress = int(100 * (i + 1) / max_iter)
                for beater in heartBeaters:
                    if beater is not None:
                        beater.makeProgress(progress)

        if layers > 1:
            # cg_layers is the inherited numpy implementation, so hand it
            # numpy arrays and move the parameters back to the GPU afterwards.
            self.pi = self.pi.cpu().numpy()
            self.q = self.q.cpu().numpy()
            data = data.cpu().numpy()
            self.layercgdata = self.cg_layers(data, L=layers, noise=noise)
            self.pi = torch.tensor(self.pi, device=device, dtype=torch.double)
            self.q = torch.tensor(self.q, device=device, dtype=torch.double)

        if writeOutput:
            if layers > 1:
                scipy.io.savemat(str(output_directory) + "/CountingGridDataMatrices.mat", self.layercgdata)
            else:
                # savemat needs host arrays; CUDA tensors cannot be converted
                # to numpy implicitly.
                scipy.io.savemat(
                    str(output_directory) + "/CGData.mat",
                    {"pi": self.pi.cpu().numpy(), "q": self.q.cpu().numpy()}
                )
        return self.pi

Просмотреть файл

@ -1,3 +1,4 @@
from .CountingGridModel import CountingGridModel
from .CountingGridModelWithGPU import CountingGridModelWithGPU
__all__ = ['CountingGridModel']
__all__ = ['CountingGridModel', 'CountingGridModelWithGPU']

Просмотреть файл

@ -0,0 +1,134 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import unittest
import numpy as np
import torch
import os
from CountingGridsPy.models import CountingGridModel, CountingGridModelWithGPU
@unittest.skipUnless(torch.cuda.is_available(), "CountingGridModelWithGPU requires a CUDA device")
class TestGPUvsCPU(unittest.TestCase):
    '''
    Checks that the GPU model and the CPU model produce matching pi and q
    when fitted on the same data from the same initial pi.

    The whole class is skipped when no CUDA device is available, since the
    GPU model cannot be fitted without one.
    '''

    def setUp(self):
        SEED = "03071994"
        np.random.seed(int(SEED))
        M, N = [1000, 500]
        extentSize = 40
        self.data = np.round(np.random.random((M, N)) * 10)
        self.extent = np.array([extentSize, extentSize])
        self.window = np.array([5, 5])
        self.pi_init = np.random.random([extentSize] * 2 + [N]) * 20
        # Boost one corner so the initial pi is not statistically uniform.
        self.pi_init[0:5, 0:5, 0:5] += 30
        self.cpuModel = CountingGridModel(self.extent, self.window)
        self.gpuModel = CountingGridModelWithGPU(self.extent, self.window)

    def tearDown(self):
        # Remove any output a fit may have written into the working directory.
        potentialFilesGenerated = [
            "CountingGridDataMatrices.mat",
            "CGData.mat"
        ]
        for fileName in potentialFilesGenerated:
            if os.path.exists(fileName):
                os.remove(fileName)

    def _assert_models_match(self):
        '''
        Assert that pi and q agree between the CPU and GPU models.

        np.allclose uses the same default tolerances as np.isclose but
        reduces inside numpy instead of iterating over every element in
        Python.
        '''
        self.assertTrue(
            np.allclose(self.cpuModel.pi, self.gpuModel.pi.cpu().numpy()),
            "pi differs between the CPU and GPU models"
        )
        self.assertTrue(
            np.allclose(self.cpuModel.q, self.gpuModel.q.cpu().numpy()),
            "q differs between the CPU and GPU models"
        )

    def test_q(self):
        # With learn_pi=False only the E-step runs, so this isolates q_update.
        numIters = 20
        device = torch.device("cuda:0")
        self.gpuModel.fit(
            self.data,
            max_iter=numIters,
            pi=torch.tensor(self.pi_init, device=device, dtype=torch.double),
            layers=1,
            writeOutput=False,
            learn_pi=False
        )
        self.cpuModel.fit(
            self.data,
            max_iter=numIters,
            returnSumSquareDifferencesOfPi=False,
            pi=np.copy(self.pi_init),
            layers=1,
            writeOutput=False,
            learn_pi=False
        )
        self._assert_models_match()

    def test_fitted_model_no_layers(self):
        numIters = 100
        device = torch.device("cuda:0")
        self.gpuModel.fit(
            self.data,
            max_iter=numIters,
            pi=torch.tensor(self.pi_init, device=device, dtype=torch.double),
            layers=1,
            writeOutput=False
        )
        self.cpuModel.fit(
            self.data,
            max_iter=numIters,
            returnSumSquareDifferencesOfPi=False,
            pi=np.copy(self.pi_init),
            layers=1,
            writeOutput=False
        )
        self._assert_models_match()

    def test_fitted_model_with_layers(self):
        numIters = 50
        layers = 2
        self.cpuModel.fit(
            self.data,
            max_iter=numIters,
            returnSumSquareDifferencesOfPi=False,
            pi=np.copy(self.pi_init),
            layers=layers,
            writeOutput=False
        )
        device = torch.device("cuda:0")
        self.gpuModel.fit(
            self.data,
            max_iter=numIters,
            pi=torch.tensor(self.pi_init, device=device, dtype=torch.double),
            layers=layers,
            writeOutput=False
        )
        self._assert_models_match()

Просмотреть файл

@ -15,9 +15,10 @@ class TestCorrectnessOfNontrivialDesignMatrix(unittest.TestCase):
[1]*7+[0, 1, 0, 1, 0, 1, 0]+list(range(1, 8))
).reshape((M, N))
# note: after one iteration h distribution is the same regardless of position on matrix or window size
self.extent = np.array([5, 5])
extentSize = 5
self.extent = np.array([extentSize, extentSize])
window = np.array([2, 3])
self.pi_init = np.ones([5]*2+[N])/1000
self.pi_init = np.ones([extentSize]*2+[N])/N
self.model = CountingGridModel(self.extent, window)
def test_correct_data(self):
@ -27,3 +28,4 @@ class TestCorrectnessOfNontrivialDesignMatrix(unittest.TestCase):
self.model.fit(self.data, max_iter=1,
returnSumSquareDifferencesOfPi=False, pi=np.copy(self.pi_init))
assert(np.all(np.isclose(self.model.q, .04)))

Просмотреть файл

@ -0,0 +1,82 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import unittest
import numpy as np
import torch
import os
import cProfile
from CountingGridsPy.models import CountingGridModel, CountingGridModelWithGPU
class TimeGPUvsCPU(object):
    '''
    Profiles CountingGridModel.fit against CountingGridModelWithGPU.fit on
    the same synthetic data and writes one cProfile dump per run.
    '''

    def __init__(self):
        SEED = "03071994"
        np.random.seed(int(SEED))
        M, N = [5000, 1000]
        extentSize = 40
        self.data = np.round(np.random.random((M, N)) * 10)
        self.extent = np.array([extentSize, extentSize])
        self.window = np.array([5, 5])
        self.pi_init = np.ones([extentSize] * 2 + [N]) / N
        self.cpuModel = CountingGridModel(self.extent, self.window)
        self.gpuModel = CountingGridModelWithGPU(self.extent, self.window)

    def _profile_both(self, layers, outfileForGPU, outfileForCPU):
        '''
        Profile the GPU fit, then the CPU fit, for the given layer count.

        Both runs pass writeOutput=False so that no time is spent on file
        output. The jobs are source strings because cProfile.runctx executes
        them against the names captured by globals() and locals() below.
        '''
        numIters = 50
        device = torch.device("cuda:0")
        gpuJob = '''self.gpuModel.fit(
    self.data,
    max_iter=numIters,
    pi=torch.tensor(self.pi_init, device=device, dtype=torch.double),
    layers=layers,
    writeOutput=False
)
'''
        cProfile.runctx(gpuJob, globals(), locals(), outfileForGPU)
        cpuJob = '''self.cpuModel.fit(
    self.data,
    max_iter=numIters,
    returnSumSquareDifferencesOfPi=False,
    pi=np.copy(self.pi_init),
    layers=layers,
    writeOutput=False
)
'''
        cProfile.runctx(cpuJob, globals(), locals(), outfileForCPU)

    def run_nolayers(self):
        '''Profile a single-layer fit into gpuProfile.txt and cpuProfile.txt.'''
        self._profile_both(1, "gpuProfile.txt", "cpuProfile.txt")

    def run_withlayers(self):
        '''Profile a two-layer fit into gpu2LayersProfile.txt and cpu2LayersProfile.txt.'''
        self._profile_both(2, "gpu2LayersProfile.txt", "cpu2LayersProfile.txt")
if __name__ == "__main__":
    # Run as a script: profile only the layered configuration.
    TimeGPUvsCPU().run_withlayers()