Torchmodel (#12)

* working on gpu model
* initial GPU implementation
* added teardown method
* more progress
* GPU draft done
* everything works besides layering
* test cases passing
* gpu timing code ... no time spent in IO
* added newline
* Moved comments around

Parent: fc253f1175
Commit: 434f0d7e07
@@ -3,6 +3,9 @@
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore

CGData.mat
*Profile.txt

src/keys.json
src/metadata.json
src/metadata_other.json
@@ -49,9 +49,19 @@ class CountingGridModel():
        pi_la = np.zeros([self.extent[0], self.extent[1], Z, L])
        h_la = np.zeros([self.extent[0], self.extent[1], Z, L])

        # Uses self variable from cg_layers namespace
        def compute_h(pi, W):
            PI = np.pad(pi, [(0, W[0]), (0, W[1]), (0, 0)],
                        'wrap').cumsum(axis=0).cumsum(axis=1)
            PI = np.pad(PI, [(1, 0), (1, 0), (0, 0)], 'constant')
            w0 = W[0]
            w1 = W[1]
            cumsum_output = self.compute_h_noLoopFull(PI, w0, w1)
            return np.moveaxis(np.moveaxis(cumsum_output[:-1, :-1, :], 2, 0)/np.sum(cumsum_output[:-1, :-1, :], axis=2), 0, -1)

        # Modifies: h_la
        def layer_compute_h(pi_la, h_la):
-           h = self.compute_h(pi_la[:, :, :, l], self.window)
+           h = compute_h(pi_la[:, :, :, l], self.window)
            h_la[:, :, :, l] = np.transpose(
                np.transpose(h) / np.transpose(np.sum(h, axis=2))
            )
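The nested compute_h above is the summed-area-table trick: wrap-pad pi, take a cumulative sum along both grid axes, and then the call to self.compute_h_noLoopFull takes four corner differences, which yields the window sum of pi at every grid position in one vectorized step. A minimal numpy sketch of that identity, checked against a brute-force window sum (all sizes here are illustrative, not from the commit):

# Sketch: cumulative-sum ("integral image") window sums vs. a brute-force loop.
import numpy as np

E0, E1, Z = 6, 7, 3          # grid extent and vocabulary size (hypothetical)
w0, w1 = 2, 3                # window size (hypothetical)
pi = np.random.random((E0, E1, Z))

# Wrap-pad, cumulative-sum along both grid axes, then prepend a zero row/column.
PI = np.pad(pi, [(0, w0), (0, w1), (0, 0)], 'wrap').cumsum(axis=0).cumsum(axis=1)
PI = np.pad(PI, [(1, 0), (1, 0), (0, 0)], 'constant')

# Four-corner difference: the W0 x W1 window sum starting at every position.
window_sums = PI[w0:, w1:, :] - PI[:-w0, w1:, :] - PI[w0:, :-w1, :] + PI[:-w0, :-w1, :]
window_sums = window_sums[:-1, :-1, :]

# Brute force for comparison, one window at a time (with wrap-around).
padded = np.pad(pi, [(0, w0), (0, w1), (0, 0)], 'wrap')
brute = np.zeros_like(pi)
for i in range(E0):
    for j in range(E1):
        brute[i, j, :] = padded[i:i + w0, j:j + w1, :].sum(axis=(0, 1))

assert np.allclose(window_sums, brute)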
@@ -257,7 +267,8 @@ class CountingGridModel():

    def fit(
        self, data, max_iter=100, returnSumSquareDifferencesOfPi=False,
-       noise=.000001, learn_pi=True, pi=None, layers=1, output_directory="./", heartBeaters=None
+       noise=.000001, learn_pi=True, pi=None, layers=1, output_directory="./",
+       heartBeaters=None, writeOutput=True
    ):
        """
        Implements variational expectation maximization for the Counting Grid model
@@ -273,6 +284,7 @@ class CountingGridModel():
        def SSD(pi, piHat):
            A = np.abs(pi - piHat)
            return np.sum(A * A)

        alpha = 1e-10
        SSDPi = []
        data = data.astype(np.float64)

@@ -280,6 +292,7 @@ class CountingGridModel():
            self.initializePi(data)
        else:
            self.pi = pi

        self.h = self.compute_h(self.pi, self.window)
        self.check_model()
        extentProduct = np.prod(self.extent)
@@ -313,11 +326,12 @@ class CountingGridModel():

        if layers > 1:
            self.layercgdata = self.cg_layers(data, L=layers, noise=noise)
-           scipy.io.savemat(str(output_directory) +
-                            "/CountingGridDataMatrices.mat", self.layercgdata)
-       else:
-           scipy.io.savemat(str(output_directory) +
-                            "/CGData.mat", {"pi": self.pi, "q": self.q})

+       if writeOutput:
+           if layers > 1:
+               scipy.io.savemat(str(output_directory) + "/CountingGridDataMatrices.mat", self.layercgdata)
+           else:
+               scipy.io.savemat(str(output_directory) + "/CGData.mat", {"pi": self.pi, "q": self.q})
        return self.pi

    # assumptions that we need for the model to be valid
@@ -350,7 +364,7 @@ class CountingGridModel():
    # How to initialize pi
    # Note that we don't want pi to be 0, since our update equations depend on a multiplication by pi
    def initializePi(self, data, technique="uniform"):
-       if technique is "uniform":
+       if technique == "uniform":
            size = [x for x in self.extent]
            size.append(data.shape[1])
            self.pi = np.random.random(size=tuple(size)).astype(np.float64)
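The `is` to `==` change above is a real bug fix, not cosmetics: `is` compares object identity, and equal string values are not guaranteed to be the same object, so the identity check can silently fail (CPython 3.8+ also emits a SyntaxWarning for `is` with a literal). A tiny illustration:

# `is` compares identity, `==` compares values; only `==` is reliable for strings.
technique = "".join(["uni", "form"])   # value "uniform", but a distinct object
print(technique == "uniform")          # True
print(technique is "uniform")          # False (plus a SyntaxWarning on CPython >= 3.8)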
@@ -0,0 +1,165 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import torch
import scipy
import os
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
from CountingGridsPy.models import CountingGridModel


class CountingGridModelWithGPU(CountingGridModel):
    def __init__(self, extent, window):
        '''
        Assumes:
            extent is a 1-D numpy array of size D.
            window is a 1-D numpy array of size D.

            D is often 2, since it makes the model easily visualizable.
        '''
        self.extent = np.array(extent)
        self.window = np.array(window)

    def compute_h_noLoopFull(self, PI, w0, w1):
        '''
        Critical method for computing the histogram using the pi parameters.

        Notes:
            The code has the same syntax between the CPU and GPU implementations due to similar APIs for indexing PyTorch tensors and numpy ndarrays.
            This function can be deleted because CountingGridModelWithGPU inherits from CountingGridModel.

        Potential optimization:
            * remove this function to reduce an extra stack frame.
        '''
        return PI[w0:,w1:,:] - PI[:-w0,w1:,:] - PI[w0:,:-w1,:] + PI[:-w0,:-w1,:]

    def compute_h(self, pi, W):
        '''
        Compute the histogram.
        '''
        # optimization is to do this without moving any data back to the cpu to do the padding
        PI = np.pad(pi.cpu().numpy(), [(0,W[0]),(0,W[1]),(0,0)], 'wrap')
        PI = torch.from_numpy(np.pad(PI,[(1,0),(1,0),(0,0)],'constant')).cumsum(0).cumsum(1).cuda()
        cumsum_output = self.compute_h_noLoopFull(PI,W[0],W[1])
        return (
            (cumsum_output[:-1,:-1,:]).permute((2,0,1)) / cumsum_output[:-1,:-1,:].sum(dim=2)
        ).permute((1, 2, 0))

    def q_update(self, data):
        '''
        Updates belief of where document should be mapped.
        '''
        L = np.prod(self.extent)
        reshapedHistogram = torch.log(self.h).reshape((L, data.shape[1]))
        transposedDataMatrix = torch.transpose(data, 1, 0)

        lql = torch.matmul(reshapedHistogram,transposedDataMatrix)
        lqlmax = torch.max(lql, 0)[0]
        min_prob = 1.0/(10 * L)
        Lq = (
            (lql-lqlmax) - torch.log(torch.sum(torch.exp(lql-lqlmax), 0))
        ).reshape(tuple(list(self.extent) + [data.shape[0]]))
        q = torch.exp(Lq)
        q[q < min_prob] = min_prob
        q = q / torch.sum(torch.sum(q, 0), 0)

        return q.permute([2,0,1])
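For reference, the lqlmax subtraction in q_update above is the standard log-sum-exp stabilization: subtracting the per-document maximum before exponentiating keeps torch.exp from overflowing while leaving the normalized posterior unchanged. A minimal sketch of the identity (the numbers are illustrative, not from the commit):

# Naive log-normalization vs. the max-subtraction used in q_update.
import torch

lql = torch.tensor([[1000.0, 999.0], [998.0, 1001.0]])  # large log-scores (illustrative)
naive = lql - torch.log(torch.sum(torch.exp(lql), dim=0))             # exp(1000.) is inf, result is useless
m = torch.max(lql, dim=0)[0]
stable = (lql - m) - torch.log(torch.sum(torch.exp(lql - m), dim=0))  # finite log-probabilities
print(torch.exp(stable).sum(dim=0))  # each column sums to 1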
    def pi_update(self, data, pseudocounts, alpha):
        T, Z = data.shape
        W = self.window
        device = torch.device("cuda:0")
        # QdotC is called nrm in matlab engine, but padding is done beforehand in matlab

        # permute([1,2,0])
        # [x,y,z] => [y,z,x]
        L = np.prod(self.extent)
        QdotC = torch.matmul(
            self.q.permute([1, 2, 0]).reshape((L, T)),
            data
        ).reshape(self.extent[0], self.extent[1], Z)

        # PyTorch only implements circular padding for 1 dimension at a time.
        # We will pass the data back to the CPU, do the padding, and bring it back to the GPU.
        QH = np.pad(
            (QdotC/(self.h + np.prod(self.window)*alpha)).cpu().numpy(),
            [(W[0], 0), (W[1], 0), (0, 0)],
            'wrap'
        ).cumsum(axis=0).cumsum(axis=1)
        QH = torch.tensor(QH, device=device, dtype=torch.double)

        w0 = W[0]; w1 = W[1]
        QH = self.compute_h_noLoopFull(QH, w0, w1)
        QH[QH < 0] = 0

        un_pi = pseudocounts + QH * (self.pi + alpha)
        mask = (torch.sum(un_pi, 2) != 0).double()
        not_mask = (torch.sum(un_pi, 2) == 0).double()
        denom = torch.sum(un_pi, 2)

        updated_pi = torch.transpose((torch.transpose(mask, 0, 1) * torch.transpose(un_pi, 0, 2)) / torch.transpose(denom, 0, 1), 0, 2) + \
            (1.0 / Z) * torch.transpose(torch.ones([Z, self.extent[1], self.extent[0]], device=device, dtype=torch.double) * torch.transpose(not_mask, 0, 1), 0, 2)
        return updated_pi
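The CPU round-trip in pi_update (and in compute_h) exists only because of the circular padding, which the comments flag as a potential optimization. One way to express the same wrap padding directly on the GPU is to concatenate the trailing rows and columns with torch.cat; a sketch of that idea, not part of the commit, with a hypothetical helper name and illustrative sizes:

# Wrap ("circular") padding on the GPU via concatenation, avoiding the CPU round-trip.
import numpy as np
import torch

def wrap_pad_front(x, w0, w1):
    # Same effect as np.pad(..., [(w0, 0), (w1, 0), (0, 0)], 'wrap'):
    # prepend the last w0 rows and the last w1 columns.
    x = torch.cat([x[-w0:, :, :], x], dim=0)
    x = torch.cat([x[:, -w1:, :], x], dim=1)
    return x

x = torch.arange(24, dtype=torch.double).reshape(4, 3, 2)
expected = torch.from_numpy(np.pad(x.numpy(), [(2, 0), (1, 0), (0, 0)], 'wrap'))
assert torch.equal(wrap_pad_front(x, 2, 1), expected)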
    def fit(self, data_cpu, max_iter=100, noise=.000001, learn_pi=True, pi=None, layers=1, output_directory="./", heartBeaters=None, writeOutput=True):
        '''
        Fits the model, using GPU.

        Assumes:
            1. pi is a torch tensor on the GPU
        '''

        if not os.path.exists(str(output_directory)):
            raise Exception("output_directory does not exist for counting grids trainer.")

        if not torch.cuda.is_available():
            raise Exception("No GPU available for training.")
        device = torch.device("cuda:0")
        alpha = 1e-10
        data = torch.tensor(data_cpu, device=device, dtype=torch.double)

        if pi is None:
            self.initializePi(data)  # potentially optimize by initializing data in GPU
            self.pi = torch.tensor(self.pi, device=device, dtype=torch.double)
        else:
            self.pi = pi

        self.h = self.compute_h(self.pi, self.window)
        P = np.prod(self.extent)
        T, Z = data.size()

        pseudocounts = torch.mean(data.sum(1) / P) / 2.5
        # q is an m x dim(extent) structure
        qshape = [len(data)]
        for v in self.extent:
            qshape.append(v)
        self.q = torch.zeros(tuple(qshape))

        for i in tqdm(range(max_iter)):
            # E-Step
            self.q = self.q_update(data)
            # M-Step
            if learn_pi:
                self.pi = self.pi_update(data, pseudocounts, alpha)

            self.h = self.compute_h(self.pi, self.window)
            [(h.makeProgress(int(100*i/max_iter)) if h is not None else False)
             for h in heartBeaters] if heartBeaters is not None else False

        if layers > 1:
            self.pi = self.pi.cpu().numpy()
            self.q = self.q.cpu().numpy()
            data = data.cpu().numpy()
            self.layercgdata = self.cg_layers(data, L=layers, noise=noise)
            self.pi = torch.tensor(self.pi, device=device, dtype=torch.double)
            self.q = torch.tensor(self.q, device=device, dtype=torch.double)

        if writeOutput:
            if layers > 1:
                scipy.io.savemat(str(output_directory) + "/CountingGridDataMatrices.mat", self.layercgdata)
            else:
                scipy.io.savemat(str(output_directory) + "/CGData.mat", {"pi": self.pi, "q": self.q})
        return self.pi
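End-to-end, the GPU model is driven the same way as the CPU model, as the tests later in this commit exercise. A minimal usage sketch on a machine with a CUDA GPU (array sizes and the initial pi are illustrative):

# Minimal usage sketch for CountingGridModelWithGPU, mirroring the tests below.
import numpy as np
import torch
from CountingGridsPy.models import CountingGridModelWithGPU

data = np.round(np.random.random((100, 50)) * 10)   # documents x vocabulary (illustrative)
extent, window = np.array([10, 10]), np.array([3, 3])
pi_init = np.random.random([10, 10, 50])

model = CountingGridModelWithGPU(extent, window)
device = torch.device("cuda:0")
model.fit(
    data,
    max_iter=10,
    pi=torch.tensor(pi_init, device=device, dtype=torch.double),
    writeOutput=False
)
q = model.q.cpu().numpy()    # document-to-grid posterior
pi = model.pi.cpu().numpy()  # per-position word distributions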
@@ -1,3 +1,4 @@
from .CountingGridModel import CountingGridModel
+from .CountingGridModelWithGPU import CountingGridModelWithGPU

-__all__ = ['CountingGridModel']
+__all__ = ['CountingGridModel', 'CountingGridModelWithGPU']
@@ -0,0 +1,134 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import unittest
import numpy as np
import torch
import os
from CountingGridsPy.models import CountingGridModel, CountingGridModelWithGPU


class TestGPUvsCPU(unittest.TestCase):
    def setUp(self):
        SEED = "03071994"
        np.random.seed(int(SEED))
        M, N = [1000, 500]
        extentSize = 40
        self.data = np.round(np.random.random((M, N)) * 10)
        self.extent = np.array([extentSize, extentSize])
        self.window = np.array([5, 5])
        self.pi_init = np.random.random([extentSize] * 2 + [N]) * 20
        self.pi_init[0:5,0:5,0:5] += 30
        self.cpuModel = CountingGridModel(self.extent, self.window)
        self.gpuModel = CountingGridModelWithGPU(self.extent, self.window)

    def tearDown(self):
        potentialFilesGenerated = [
            "CountingGridDataMatrices.mat",
            "CGData.mat"
        ]

        for fileName in potentialFilesGenerated:
            if os.path.exists(fileName):
                os.remove(fileName)

    def test_q(self):
        numIters = 20
        device = torch.device("cuda:0")
        self.gpuModel.fit(
            self.data,
            max_iter=numIters,
            pi=torch.tensor(self.pi_init, device=device, dtype=torch.double),
            layers=1,
            writeOutput=False,
            learn_pi=False
        )

        self.cpuModel.fit(
            self.data,
            max_iter=numIters,
            returnSumSquareDifferencesOfPi=False,
            pi=np.copy(self.pi_init),
            layers=1,
            writeOutput=False,
            learn_pi=False
        )

        doWordMappingsMatchValue = np.isclose(self.cpuModel.pi, self.gpuModel.pi.cpu().numpy()).flatten()
        assert(
            all(doWordMappingsMatchValue)
        )

        doDocumentMappingsMatchValue = np.isclose(self.cpuModel.q, self.gpuModel.q.cpu().numpy()).flatten()
        assert(
            all(doDocumentMappingsMatchValue)
        )

    def test_fitted_model_no_layers(self):
        numIters = 100

        device = torch.device("cuda:0")
        self.gpuModel.fit(
            self.data,
            max_iter=numIters,
            pi=torch.tensor(self.pi_init, device=device, dtype=torch.double),
            layers=1,
            writeOutput=False
        )

        self.cpuModel.fit(
            self.data,
            max_iter=numIters,
            returnSumSquareDifferencesOfPi=False,
            pi=np.copy(self.pi_init),
            layers=1,
            writeOutput=False
        )
        doDocumentMappingsMatchValue = np.isclose(self.cpuModel.q, self.gpuModel.q.cpu().numpy()).flatten()
        assert(
            all(doDocumentMappingsMatchValue)
        )

        doWordMappingsMatchValue = np.isclose(self.cpuModel.pi, self.gpuModel.pi.cpu().numpy()).flatten()
        assert(
            all(doWordMappingsMatchValue)
        )

    def test_fitted_model_with_layers(self):
        numIters = 50
        layers = 2
        self.cpuModel.fit(
            self.data,
            max_iter=numIters,
            returnSumSquareDifferencesOfPi=False,
            pi=np.copy(self.pi_init),
            layers=layers,
            writeOutput=False
        )

        device = torch.device("cuda:0")
        self.gpuModel.fit(
            self.data,
            max_iter=numIters,
            pi=torch.tensor(self.pi_init, device=device, dtype=torch.double),
            layers=layers,
            writeOutput=False
        )

        assert(
            all(
                np.isclose(
                    self.cpuModel.q,
                    self.gpuModel.q.cpu().numpy()
                ).flatten()
            )
        )

        assert(
            all(
                np.isclose(
                    self.cpuModel.pi,
                    self.gpuModel.pi.cpu().numpy()
                ).flatten()
            )
        )
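These parity tests require a CUDA-capable GPU, since the GPU fit raises when torch.cuda.is_available() is false. They can be run with the standard unittest runner; the discovery pattern below is illustrative, since the test file name is not shown in the diff:

# Discover and run the GPU-vs-CPU tests (start_dir and pattern are illustrative).
import unittest

suite = unittest.defaultTestLoader.discover(start_dir=".", pattern="*GPU*.py")
unittest.TextTestRunner(verbosity=2).run(suite)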
@@ -15,9 +15,10 @@ class TestCorrectnessOfNontrivialDesignMatrix(unittest.TestCase):
            [1]*7+[0, 1, 0, 1, 0, 1, 0]+list(range(1, 8))
        ).reshape((M, N))
        # note: after one iteration h distribution is the same regardless of position on matrix or window size
-       self.extent = np.array([5, 5])
+       extentSize = 5
+       self.extent = np.array([extentSize, extentSize])
        window = np.array([2, 3])
-       self.pi_init = np.ones([5]*2+[N])/1000
+       self.pi_init = np.ones([extentSize]*2+[N])/N
        self.model = CountingGridModel(self.extent, window)

    def test_correct_data(self):

@@ -27,3 +28,4 @@ class TestCorrectnessOfNontrivialDesignMatrix(unittest.TestCase):
        self.model.fit(self.data, max_iter=1,
                       returnSumSquareDifferencesOfPi=False, pi=np.copy(self.pi_init))
        assert(np.all(np.isclose(self.model.q, .04)))
@@ -0,0 +1,82 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import unittest
import numpy as np
import torch
import os
import cProfile
from CountingGridsPy.models import CountingGridModel, CountingGridModelWithGPU


class TimeGPUvsCPU(object):
    def __init__(self):
        SEED = "03071994"
        np.random.seed(int(SEED))
        M, N = [5000, 1000]
        extentSize = 40
        self.data = np.round(np.random.random((M, N)) * 10)
        self.extent = np.array([extentSize, extentSize])
        self.window = np.array([5, 5])
        self.pi_init = np.ones([extentSize] * 2 + [N]) / N
        self.cpuModel = CountingGridModel(self.extent, self.window)
        self.gpuModel = CountingGridModelWithGPU(self.extent, self.window)

    def run_nolayers(self):
        numIters = 50

        device = torch.device("cuda:0")
        outfileForGPU = "gpuProfile.txt"
        gpuJob = '''self.gpuModel.fit(
            self.data,
            max_iter=numIters,
            pi=torch.tensor(self.pi_init, device=device, dtype=torch.double),
            layers=1
        )
        '''
        cProfile.runctx(gpuJob, globals(), locals(), outfileForGPU)

        outfileForCPU = "cpuProfile.txt"
        cpuJob = '''self.cpuModel.fit(
            self.data,
            max_iter=numIters,
            returnSumSquareDifferencesOfPi=False,
            pi=np.copy(self.pi_init),
            layers=1
        )
        '''
        cProfile.runctx(cpuJob, globals(), locals(), outfileForCPU)

    def run_withlayers(self):
        numIters = 50

        device = torch.device("cuda:0")
        outfileForGPU = "gpu2LayersProfile.txt"
        gpuJob = '''self.gpuModel.fit(
            self.data,
            max_iter=numIters,
            pi=torch.tensor(self.pi_init, device=device, dtype=torch.double),
            layers=2,
            writeOutput=False
        )
        '''
        cProfile.runctx(gpuJob, globals(), locals(), outfileForGPU)

        outfileForCPU = "cpu2LayersProfile.txt"
        cpuJob = '''self.cpuModel.fit(
            self.data,
            max_iter=numIters,
            returnSumSquareDifferencesOfPi=False,
            pi=np.copy(self.pi_init),
            layers=2,
            writeOutput=False
        )
        '''
        cProfile.runctx(cpuJob, globals(), locals(), outfileForCPU)


if __name__ == "__main__":
    o = TimeGPUvsCPU()
    o.run_withlayers()
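When cProfile.runctx is given a filename, it writes a binary stats dump rather than text, despite the .txt extensions used above; the dumps are meant to be inspected with pstats. A short sketch of reading one of the files produced by this script:

# Inspect a profile dump written by cProfile.runctx (file name from the script above).
import pstats

stats = pstats.Stats("gpuProfile.txt")
stats.sort_stats("cumulative").print_stats(20)  # top 20 entries by cumulative time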