Adding int8 support for NDArrayView

Jaliya Ekanayake 2018-05-01 11:40:05 -07:00
Parent bbf20c0c78
Commit c0d5386502
22 changed files with 409 additions and 23 deletions
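In short: this commit adds a DataType::Int8 element type to the public CNTK API and threads it through NDArrayView (scalar fill, typed buffer access, dense and sparse-CSC storage, random initialization, protobuf serialization, and NumPy int8 interop), mapping int8_t at the API boundary onto the existing Matrix<char>/TensorView<char> instantiations of the V1 math library. A minimal sketch of the new surface (editor's illustration, not part of the commit; assumes the public CNTKLibrary.h header and a build containing this change):

#include "CNTKLibrary.h" // public CNTK C++ API header (include path is an assumption)
#include <cstdint>

using namespace CNTK;

int main()
{
    // Dense int8 view on the CPU; SetValue(int8_t) is the new scalar-fill overload.
    auto view = MakeSharedObject<NDArrayView>(DataType::Int8, NDShape({ 16, 4 }),
                                              DeviceDescriptor::CPUDevice());
    view->SetValue((int8_t)42);

    // The new int8_t specializations of the typed buffer accessors.
    const int8_t* data = view->DataBuffer<int8_t>();
    int8_t* writable = view->WritableDataBuffer<int8_t>();
    writable[0] = (int8_t)(data[1] - 1);

    // Random initialization and scalar extraction are also instantiated for int8_t.
    auto r = NDArrayView::RandomUniform<int8_t>(NDShape({ 1 }), 0.0, 127.0,
                                                /*seed=*/1, DeviceDescriptor::CPUDevice());
    return (int)r->AsScalar<int8_t>();
}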

View file

@ -86,6 +86,7 @@ namespace CNTK
Double = 2,
UChar = 3, // So far only used internally in deserializers.
Float16 = 4,
Int8 = 5,
/* TODO:
Bit,
@ -115,6 +116,10 @@ namespace CNTK
return DataType::Double;
else if (std::is_same<ElementType, float16>())
return DataType::Float16;
else if (std::is_same<ElementType, int8_t>())
return DataType::Int8;
else if (std::is_same<ElementType, char>())
return DataType::Int8;
else
NOT_IMPLEMENTED;
}
@ -127,6 +132,8 @@ namespace CNTK
return "Double";
else if (dataType == DataType::Float16)
return "Float16";
else if (dataType == DataType::Int8)
return "Int8";
else
LogicError("Unknown DataType.");
}
@ -139,6 +146,8 @@ namespace CNTK
return sizeof(double);
else if (dataType == DataType::Float16)
return sizeof(float16);
else if (dataType == DataType::Int8)
return sizeof(int8_t);
else
LogicError("Unknown DataType.");
}
@ -765,6 +774,9 @@ namespace CNTK
case DataType::Float16:
SetValue(float16::create(value));
break;
case DataType::Int8:
SetValue((int8_t)value);
break;
default:
LogicError("Unsupported DataType %s.", DataTypeName(m_dataType));
break;
@ -864,6 +876,11 @@ namespace CNTK
///
CNTK_API void SetValue(float16 value);
///
/// Fill 'this' NDArrayView with the specified value. The underlying DataType of 'this' view should be DataType::Int8.
///
CNTK_API void SetValue(int8_t value);
///
/// Creates a new NDArrayView with newly allocated storage on the specified device and copies 'this' view's contents into the newly allocated view.
///

View file

@ -347,7 +347,7 @@ namespace CNTK
{
auto firstValue = data1[i];
auto secondValue = data2[i];
ElementType allowedTolerance = (std::max<ElementType>)((ElementType)std::abs((ElementType)absoluteTolerance), (ElementType)std::abs(((ElementType)relativeTolerance) * firstValue));
if (std::abs(firstValue - secondValue) > allowedTolerance)
return false;
}
@ -369,7 +369,7 @@ namespace CNTK
{
auto firstValue = values1[i];
auto secondValue = values2[i];
ElementType allowedTolerance = (std::max<ElementType>)((ElementType)std::abs((ElementType)absoluteTolerance), (ElementType)std::abs(((ElementType)relativeTolerance) * firstValue));
if (std::abs(firstValue - secondValue) > allowedTolerance)
return false;
}
@ -473,6 +473,9 @@ namespace CNTK
if (view1.GetDataType() == DataType::Double)
return AreEqual<double>(view1, view2, relativeTolerance, absoluteTolerance);
if (view1.GetDataType() == DataType::Int8)
return AreEqual<int8_t>(view1, view2, relativeTolerance, absoluteTolerance);
LogicError("AreEqual(NDArrayView): Unknown DataType.");
}

View file

@ -29,6 +29,18 @@ namespace CNTK
return float16(std::numeric_limits<float>::quiet_NaN());
}
template <>
inline char quiet_NaN<char>()
{
return char(std::numeric_limits<int>::quiet_NaN());
}
template <>
inline int8_t quiet_NaN<int8_t>()
{
return int8_t(std::numeric_limits<int>::quiet_NaN());
}
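// Editor's note: integral types have no NaN representation (std::numeric_limits<int>::has_quiet_NaN
// is false), so quiet_NaN() here is well-defined but returns 0 rather than a NaN marker.
// A quick standalone check of that standard-library behavior (independent of CNTK):
//   static_assert(!std::numeric_limits<int>::has_quiet_NaN, "int has no quiet NaN");
//   assert(std::numeric_limits<int>::quiet_NaN() == 0); // needs <cassert> and <limits>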
template <typename V1ElemType>
static TensorView<V1ElemType>* AllocateTensorView(const NDShape& viewShape,
const DeviceDescriptor& device,
@ -61,6 +73,8 @@ namespace CNTK
return AllocateTensorView<double>(viewShape, device, dataBuffer, bufferSizeInBytes);
case DataType::Float16:
return AllocateTensorView<half>(viewShape, device, dataBuffer, bufferSizeInBytes);
case DataType::Int8:
return AllocateTensorView<char>(viewShape, device, dataBuffer, bufferSizeInBytes);
default:
LogicError("Unsupported DataType %s", DataTypeName(dataType));
break;
@ -97,6 +111,8 @@ namespace CNTK
return AllocateTensorView<double>(viewShape, storageType, device, numNonZeroValues);
case DataType::Float16:
return AllocateTensorView<half>(viewShape, storageType, device, numNonZeroValues);
case DataType::Int8:
return AllocateTensorView<char>(viewShape, storageType, device, numNonZeroValues);
default:
LogicError("Unsupported DataType %s", DataTypeName(dataType));
break;
@ -135,6 +151,13 @@ namespace CNTK
sparseMatrix->SetMatrixFromCSCFormat(colStarts, rowIndices, (const half*)nonZeroValues, numNonZeroValues, sparseMatrix->GetNumRows(), sparseMatrix->GetNumCols());
break;
}
case DataType::Int8:
{
auto sparseMatrix = GetWritableMatrix<char>(1);
sparseMatrix->SetMatrixFromCSCFormat(colStarts, rowIndices, (const char*)nonZeroValues, numNonZeroValues,
sparseMatrix->GetNumRows(), sparseMatrix->GetNumCols());
break;
}
default:
LogicError("Unsupported DataType %s", DataTypeName(dataType));
break;
@ -157,6 +180,9 @@ namespace CNTK
case DataType::Float16:
delete GetTensorView<half>();
break;
case DataType::Int8:
delete GetTensorView<char>();
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -194,6 +220,14 @@ namespace CNTK
GetWritableMatrix<double>()->SetValue(value);
}
void NDArrayView::SetValue(int8_t value)
{
if (IsSparse())
LogicError("NDArrayView::SetValue: Setting a NDArrayView contents to a scalar is only allowed for objects with dense storage format.");
GetWritableMatrix<char>()->SetValue(value);
}
bool NDArrayView::IsSliceView()
{
switch (m_dataType)
@ -213,6 +247,11 @@ namespace CNTK
auto currentMatrix = GetMatrix<half>();
return currentMatrix->IsView();
}
case DataType::Int8:
{
auto currentMatrix = GetMatrix<char>();
return currentMatrix->IsView();
}
}
return false;
}
@ -289,6 +328,8 @@ namespace CNTK
return GetMatrixImpl<double>(GetTensorView<double>(), rowColSplitPoint);
case DataType::Float16:
return GetMatrixImpl<half>(GetTensorView<half>(), rowColSplitPoint);
case DataType::Int8:
return GetMatrixImpl<char>(GetTensorView<char>(), rowColSplitPoint);
default:
LogicError("Unknown m_dataType %d", (int)m_dataType);
}
@ -305,6 +346,8 @@ namespace CNTK
return GetMatrixImpl<double>(GetWritableTensorView<double>(), rowColSplitPoint);
case DataType::Float16:
return GetMatrixImpl<half>(GetWritableTensorView<half>(), rowColSplitPoint);
case DataType::Int8:
return GetMatrixImpl<char>(GetWritableTensorView<char>(), rowColSplitPoint);
default:
LogicError("Unknown m_dataType %d", (int)m_dataType);
}
@ -355,6 +398,13 @@ namespace CNTK
newMatrix->AssignValuesOf(*thisMatrix);
break;
}
case DataType::Int8:
{
auto newMatrix = newView->GetWritableMatrix<char>();
auto thisMatrix = GetMatrix<char>();
newMatrix->AssignValuesOf(*thisMatrix);
break;
}
default:
LogicError("NDArrayView::DeepClone: Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -396,6 +446,13 @@ namespace CNTK
destMatrix->AssignValuesOf(*sourceMatrix);
break;
}
case DataType::Int8:
{
auto sourceMatrix = source.GetMatrix<char>();
auto destMatrix = GetWritableMatrix<char>();
destMatrix->AssignValuesOf(*sourceMatrix);
break;
}
default:
LogicError("NDArrayView::CopyFrom: Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -416,6 +473,9 @@ namespace CNTK
case DataType::Float16:
tensorView = new TensorView<half>(*(GetTensorView<half>()));
break;
case DataType::Int8:
tensorView = new TensorView<char>(*(GetTensorView<char>()));
break;
default:
LogicError("NDArrayView::Alias: Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -501,6 +561,22 @@ namespace CNTK
tensorView = new TensorView<half>(slicedMatrixView, AsTensorViewShape(sliceViewShape));
break;
}
case DataType::Int8:
{
auto currentMatrix = GetMatrix<char>();
std::pair<size_t, size_t> currentMatrixDims = { currentMatrix->GetNumRows(), currentMatrix->GetNumCols() };
std::shared_ptr<Matrix<char>> slicedMatrixView;
if (sliceViewMatrixDims.first != currentMatrixDims.first)
slicedMatrixView =
make_shared<Matrix<char>>(currentMatrix->Reshaped(1, currentMatrix->GetNumElements())
.ColumnSlice(flatBufferOffset, sliceViewShape.TotalSize()));
else
slicedMatrixView = make_shared<Matrix<char>>(
currentMatrix->ColumnSlice(sliceMatrixColumnOffset, sliceViewMatrixDims.second));
tensorView = new TensorView<char>(slicedMatrixView, AsTensorViewShape(sliceViewShape));
break;
}
default:
LogicError("NDArrayView::SliceView: Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -531,6 +607,9 @@ namespace CNTK
case DataType::Float16:
tensorView = new TensorView<half>(*(GetTensorView<half>()), newTensorShape);
break;
case DataType::Int8:
tensorView = new TensorView<char>(*(GetTensorView<char>()), newTensorShape);
break;
default:
LogicError("NDArrayView::AsShape: Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -539,16 +618,6 @@ namespace CNTK
return MakeSharedObject<NDArrayView>(GetDataType(), Device(), GetStorageFormat(), newShape, IsReadOnly(), tensorView);
}
template <typename ElementType>
const ElementType* NDArrayView::DataBuffer() const
{
@ -561,6 +630,12 @@ namespace CNTK
return const_cast<float16*>(_DataBuffer<float16, half>());
}
template<>
const int8_t* NDArrayView::DataBuffer<int8_t>() const
{
return const_cast<int8_t*>(_DataBuffer<int8_t, char>());
}
// TODO: This could actually be strided?
template <typename ElementType, typename V1ElemType>
const ElementType* NDArrayView::_DataBuffer() const
@ -577,6 +652,25 @@ namespace CNTK
return reinterpret_cast<ElementType*>(matrix->Data());
}
// TODO: This could actually be strided?
template <typename ElementType>
ElementType* NDArrayView::WritableDataBuffer()
{
if (IsReadOnly())
InvalidArgument("NDArrayView::WritableDataBuffer: Cannot get writable data buffer from a read-only NDArrayView.");
return const_cast<ElementType*>(DataBuffer<ElementType>());
}
template <>
int8_t* NDArrayView::WritableDataBuffer()
{
if (IsReadOnly())
InvalidArgument("NDArrayView::WritableDataBuffer: Cannot get writable data buffer from a read-only NDArrayView.");
return const_cast<int8_t*>(DataBuffer<int8_t>());
}
template <typename ElementType>
std::tuple<const ElementType *, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers() const
{
@ -589,6 +683,12 @@ namespace CNTK
return _SparseCSCDataBuffers<float16, half>();
}
template <>
std::tuple<const int8_t *, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers<int8_t>() const
{
return _SparseCSCDataBuffers<int8_t, char>();
}
template <typename ElementType, typename V1ElemType>
std::tuple<const ElementType *, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::_SparseCSCDataBuffers() const
{
@ -656,6 +756,12 @@ namespace CNTK
return _SparseBlockColumnDataBuffers<float16, half>();
}
template <>
std::tuple<const void *, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers<int8_t>() const
{
return _SparseBlockColumnDataBuffers<int8_t, char>();
}
template <typename ElementType, typename V1ElemType>
std::tuple<const void *, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::_SparseBlockColumnDataBuffers() const
{
@ -713,6 +819,12 @@ namespace CNTK
matrix->AdjustSparseBlockColumn(cpuCol2BlockId, numBlocks, useBlockId2Col);
break;
}
case DataType::Int8:
{
auto matrix = GetWritableMatrix<char>();
matrix->AdjustSparseBlockColumn(cpuCol2BlockId, numBlocks, useBlockId2Col);
break;
}
default:
LogicError("NDArrayView::AdjustSparseBlockColumn: Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -747,6 +859,13 @@ namespace CNTK
matrix->CollapseDataLocation();
break;
}
case DataType::Int8:
{
auto matrix = GetMatrix<char>();
matrix->TransferFromDeviceToDevice(matrix->GetDeviceId(), AsCNTKImplDeviceId(device), /*isBeingMoved = */ true, /*emptyTransfer =*/ false, /*updatePreferredDevice =*/ true);
matrix->CollapseDataLocation();
break;
}
default:
LogicError("NDArrayView::ChangeDevice: Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -767,6 +886,12 @@ namespace CNTK
return NDArrayView::_RandomNormal<float16, half>(shape, mean, stdDev, seed, device);
}
template <>
/*static*/ NDArrayViewPtr NDArrayView::RandomNormal<int8_t>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device)
{
return NDArrayView::_RandomNormal<int8_t, char>(shape, mean, stdDev, seed, device);
}
template <typename ElementType, typename V1ElemType>
/*static*/ NDArrayViewPtr NDArrayView::_RandomNormal(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device /*= DeviceDescriptor::UseDefaultDevice()*/)
{
@ -789,6 +914,12 @@ namespace CNTK
return NDArrayView::_RandomUniform<float16, half>(shape, rangeBegin, rangeEnd, seed, device);
}
template <>
/*static*/ NDArrayViewPtr NDArrayView::RandomUniform<int8_t>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device)
{
return NDArrayView::_RandomUniform<int8_t, char>(shape, rangeBegin, rangeEnd, seed, device);
}
template <typename ElementType, typename V1ElemType>
/*static*/ NDArrayViewPtr NDArrayView::_RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/)
{
@ -811,6 +942,12 @@ namespace CNTK
return _AsScalar<float16, half>();
}
template <>
int8_t NDArrayView::AsScalar<int8_t>() const
{
return _AsScalar<int8_t, char>();
}
template <typename ElementType, typename V1ElemType>
ElementType NDArrayView::_AsScalar() const
{
@ -830,11 +967,13 @@ namespace CNTK
}
if (scalarData->GetDataType() == DataType::Float)
scalar = static_cast<ElementType>(*(cpuData->DataBuffer<float>()));
else if (scalarData->GetDataType() == DataType::Double)
scalar = static_cast<ElementType>(*(cpuData->DataBuffer<double>()));
else if (scalarData->GetDataType() == DataType::Float16)
scalar = static_cast<ElementType>(*(cpuData->DataBuffer<float16>()));
else if (scalarData->GetDataType() == DataType::Int8)
scalar = static_cast<ElementType>(*(cpuData->DataBuffer<char>()));
else
LogicError("NDArrayView::AsScalar: Unsupported DataType");
@ -853,43 +992,54 @@ namespace CNTK
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<float>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<double>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<float16>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<int8_t>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<float>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<double>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<float16>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<int8_t>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API const float* NDArrayView::DataBuffer<float>() const;
template CNTK_API const double* NDArrayView::DataBuffer<double>() const;
template CNTK_API const float16* NDArrayView::DataBuffer<float16>() const;
template CNTK_API const int8_t* NDArrayView::DataBuffer<int8_t>() const;
template CNTK_API const TensorView<float>* NDArrayView::GetTensorView<float>() const;
template CNTK_API const TensorView<double>* NDArrayView::GetTensorView<double>() const;
template CNTK_API const TensorView<half>* NDArrayView::GetTensorView<half>() const;
template CNTK_API const TensorView<char>* NDArrayView::GetTensorView<char>() const;
template CNTK_API std::tuple<const float*, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers<float>() const;
template CNTK_API std::tuple<const double*, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers<double>() const;
template CNTK_API std::tuple<const float16*, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers<float16>() const;
template CNTK_API std::tuple<const int8_t*, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers<int8_t>() const;
template CNTK_API std::tuple<const void*, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers<float>() const;
template CNTK_API std::tuple<const void*, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers<double>() const;
template CNTK_API std::tuple<const void*, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers<float16>() const;
template CNTK_API std::tuple<const void*, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers<int8_t>() const;
template CNTK_API float* NDArrayView::WritableDataBuffer<float>();
template CNTK_API double* NDArrayView::WritableDataBuffer<double>();
template CNTK_API float16* NDArrayView::WritableDataBuffer<float16>();
template CNTK_API int8_t* NDArrayView::WritableDataBuffer<int8_t>();
template std::shared_ptr<const Matrix<float>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<const Matrix<double>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<const Matrix<half>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<const Matrix<char>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<Matrix<float>> NDArrayView::GetWritableMatrix<float>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<double>> NDArrayView::GetWritableMatrix<double>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<half>> NDArrayView::GetWritableMatrix<half>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<char>> NDArrayView::GetWritableMatrix<char>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template TensorView<float>* NDArrayView::GetWritableTensorView<float>();
template TensorView<double>* NDArrayView::GetWritableTensorView<double>();
template TensorView<half>* NDArrayView::GetWritableTensorView<half>();
template TensorView<char>* NDArrayView::GetWritableTensorView<char>();
template float NDArrayView::AsScalar<float>() const;
template double NDArrayView::AsScalar<double>() const;
template float16 NDArrayView::AsScalar<float16>() const;
template int8_t NDArrayView::AsScalar<int8_t>() const;
}

View file

@ -244,6 +244,17 @@ namespace CNTK
dst->mutable_data()[i] = (DstT)buffer[i];
}
static void WriteInt8Data(const NDArrayView& src, io::CodedOutputStream& output)
{
auto size = src.Shape().TotalSize();
const int8_t* buffer = src.DataBuffer<int8_t>();
for (size_t i = 0; i < size; i++)
{
auto value = buffer[i];
output.WriteVarint32SignExtended(Encode<int8_t, int8_t>(value));
}
}
template <typename T>
static void WriteData(const NDArrayView& src, io::CodedOutputStream& output)
{
@ -294,8 +305,6 @@ namespace CNTK
}
}
UsingUTF8 m_locale;
Arena m_arena;
Message* m_proto;
@ -332,6 +341,10 @@ namespace CNTK
{
CopyData<float16, float>(src, dst->mutable_float_values()->mutable_value());
}
else if (src.GetDataType() == DataType::Int8)
{
CopyData<int8_t, int32>(src, dst->mutable_sint32_values()->mutable_value());
}
}
}
@ -352,6 +365,10 @@ namespace CNTK
{
WriteData<float16>(src, output);
}
else if (src.GetDataType() == DataType::Int8)
{
WriteInt8Data(src, output);
}
}
}
@ -379,6 +396,11 @@ namespace CNTK
if (!ReadData<float, float16>(wrapper, dst))
return false;
}
else if (dst.GetDataType() == DataType::Int8)
{
if (!ReadData<int8_t, int8_t>(wrapper, dst))
return false;
}
}
return true;
}
@ -482,6 +504,13 @@ namespace CNTK
else
m_arrayViews.push_back({ dst, nullptr });
}
else if (dataType == DataType::Int8)
{
if (src.sint32_values().value().size() == shape->TotalSize())
CopyData<int32, int8_t>(src.sint32_values().value(), dst);
else
m_arrayViews.push_back({ dst, nullptr });
}
return dst;
}

View file

@ -380,12 +380,16 @@ namespace CNTK
m_rootGradientValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(m_aggregatedLossFunction->Output().GetDataType(), m_prevMinibatchAggregateTrainingLossValue->Shape(), computeDevice), outputs.at(m_aggregatedLossFunction)->Mask());
}
DataType aggregateDataType = m_aggregatedLossFunction->Output().GetDataType();
if (aggregateDataType == DataType::Float)
m_rootGradientValue->Data()->SetValue(1.0f);
else if (aggregateDataType == DataType::Double)
m_rootGradientValue->Data()->SetValue(1.0);
else if (aggregateDataType == DataType::Float16)
m_rootGradientValue->Data()->SetValue(float16(1.0));
else
RuntimeError("DataType %s is not supported for root gradients", DataTypeName(aggregateDataType));
for (const auto& parameter : m_learnerParameters)
parameterGradients[parameter] = nullptr;

View file

@ -23,7 +23,8 @@ message NDArrayView {
Unknown = 0;
Float = 1;
Double = 2;
Float16 = 4;
Int8 = 5;
}
enum StorageFormat {
@ -44,12 +45,17 @@ message NDArrayView {
repeated double value = 1 [packed = true];
}
message IntValues {
repeated sint32 value = 1 [packed = true];
}
oneof values {
FloatValues float_values = 4;
DoubleValues double_values = 5;
IntValues sint32_values = 6;
}
// TODO: bool read_only = 7;
}
message Vector {

View file

@ -348,6 +348,14 @@ void LearnableParameter<ElemType>::InitBilinear(Matrix<ElemType>& valueMatrix, c
valueMatrix.TransferToDeviceIfNotThere(deviceId, true);
}
// Initialize with bilinear interpolation coefficients (useful for deconvolution layer).
template<>
void LearnableParameter<char>::InitBilinear(Matrix<char>& valueMatrix, const TensorShape& sampleShape, size_t kernelWidth, size_t kernelHeight, DEVICEID_TYPE deviceId)
{
RuntimeError("Unsupported template argument(char) in InitBilinear");
}
// initialize by reading a matrix from a text file
template <class ElemType>
void LearnableParameter<ElemType>::InitFromFile(const wstring& initFromFilePath)

View file

@ -7269,6 +7269,7 @@ template CPUMatrix<char>& CPUMatrix<char>::operator=(CPUMatrix<char>&&);
template void CPUMatrix<char>::SetValue(const char);
template void CPUMatrix<char>::SetValue(const size_t numRows, const size_t numCols, char* pArray, size_t matrixFlags);
template void CPUMatrix<char>::SetValue(CPUMatrix<char> const&);
template bool CPUMatrix<char>::IsEqualTo(const CPUMatrix<char>& a, const char threshold) const;
//template void CPUMatrix<char>::SetValue(GPUMatrix<char> const&);
//template void CPUMatrix<char>::SetValue(CPUSparseMatrix<char> const&);
//template void CPUMatrix<char>::SetValue(GPUSparseMatrix<char> const&);
@ -7277,6 +7278,9 @@ template void CPUMatrix<char>::Resize(const size_t numRows, const size_t numCols
template char* CPUMatrix<char>::CopyToArray(void) const;
template void CPUMatrix<char>::CopySection(size_t numRows, size_t numCols, char* dst, size_t colStride) const;
template void CPUMatrix<char>::Reshape(const size_t, const size_t);
template void CPUMatrix<char>::SetUniformRandomValue(const char low, const char high, unsigned long seed);
template void CPUMatrix<char>::SetUniformRandomValue(RNGHandle& rngHandle, const char low, const char high);
template void CPUMatrix<char>::SetGaussianRandomValue(const char mean, const char sigma, unsigned long seed);
// Support <short>
template CPUMatrix<short>::CPUMatrix(const size_t numRows, const size_t numCols);

View file

@ -1849,6 +1849,8 @@ template CPUMatrix<char> CPUSparseMatrix<char>::CopyColumnSliceToDense(size_t st
template void CPUSparseMatrix<char>::AssignColumnSliceToDense(CPUMatrix<char>&, size_t startColumn, size_t numCols) const;
template CPUSparseMatrix<char>& CPUSparseMatrix<char>::operator=(const CPUSparseMatrix<char>& deepCopyFrom);
template void CPUSparseMatrix<char>::ScaleAndAdd(char, class Microsoft::MSR::CNTK::CPUSparseMatrix<char> const &, class Microsoft::MSR::CNTK::CPUMatrix<char> &);
template void CPUSparseMatrix<char>::SetMatrixFromCSCFormat(const CPUSPARSE_INDEX_TYPE* h_CSCCol, const CPUSPARSE_INDEX_TYPE* h_Row, const char* h_Val,
const size_t nz, const size_t numRows, const size_t numCols);
// Support <short>
template CPUSparseMatrix<short>::CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size);

View file

@ -5018,6 +5018,10 @@ template void GPUMatrix<char>::CopySection(size_t numRows, size_t numCols, char*
template void GPUMatrix<char>::Reshape(const size_t, const size_t);
template GPUMatrix<char>& GPUMatrix<char>::operator*=(char);
template DEVICEID_TYPE GPUMatrix<char>::PrepareDevice(DEVICEID_TYPE deviceId) const;
template void GPUMatrix<char>::SetUniformRandomValue(const char low, const char high, unsigned long seed);
template void GPUMatrix<char>::SetUniformRandomValue(RNGHandle& rngHandle, const char low, const char high);
template void GPUMatrix<char>::SetGaussianRandomValue(const char mean, const char sigma, unsigned long seed);
template void GPUMatrix<char>::SetGaussianRandomValue(RNGHandle& rngHandle, const char mean, const char stdev);
// Support <short>
template GPUMatrix<short>::GPUMatrix(const size_t numRows, const size_t numCols, int deviceId);

View file

@ -3044,6 +3044,9 @@ template GPUSparseMatrix<char>& GPUSparseMatrix<char>::operator=(GPUSparseMatrix
template void GPUSparseMatrix<char>::Reshape(const size_t, const size_t);
template void GPUSparseMatrix<char>::ScaleAndAdd(char, GPUSparseMatrix<char> const &, GPUMatrix<char> &);
template void GPUSparseMatrix<char>::ColumnwiseScaleAndWeightedAdd(char, const GPUSparseMatrix<char>&, const GPUMatrix<char>&, char, GPUMatrix<char>&);
template void GPUSparseMatrix<char>::AdjustCol2BlockId(const GPUSPARSE_INDEX_TYPE* cpuCol2BlockId, size_t numBlocks, bool useBlockId2Col);
template void GPUSparseMatrix<char>::SetMatrixFromCSCFormat(const CPUSPARSE_INDEX_TYPE*, const CPUSPARSE_INDEX_TYPE*, const char*,
const size_t, const size_t, const size_t, const bool, const DEVICEID_TYPE, DataTransferer*);
// Support <short>
template GPUSparseMatrix<short>::GPUSparseMatrix(DEVICEID_TYPE, const MatrixFormat);

View file

@ -6245,6 +6245,7 @@ void Matrix<ElemType>::TensorArgOp(const Matrix<ElemType>& a, ElementWiseOperato
template class Matrix<float>;
template class Matrix<double>;
template class Matrix<half>;
//template class Matrix<char>;
// instantiate some templated methods
template MATH_API void Matrix<float>::AdaDeltaUpdate(Matrix<float>& gradients, Matrix<float>& functionvalues, float learningRatePerSample, float rho, float epsilon, int* timestamps, int currentTimestamp);
@ -6286,6 +6287,14 @@ template void Matrix<char>::Resize(const size_t numRows, const size_t numCols, c
template void Matrix<char>::Reshape(const size_t, const size_t);
template char* Matrix<char>::CopyToArray(void) const;
template bool Matrix<char>::IsView() const;
template Matrix<char> Matrix<char>::RandomUniform(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId, const char low, const char high, unsigned long seed);
template void Matrix<char>::SetUniformRandomValue(const char low, const char high, unsigned long seed);
template void Matrix<char>::SetUniformRandomValue(RNGHandle& rngHandle, const char low, const char high);
template Matrix<char> Matrix<char>::RandomGaussian(const size_t rows, const size_t cols, DEVICEID_TYPE deviceId, const char mean, const char sigma, unsigned long seed);
template void Matrix<char>::SetMatrixFromCSCFormat(const CPUSPARSE_INDEX_TYPE* h_CSCCol, const CPUSPARSE_INDEX_TYPE* h_Row, const char* h_Val,
const size_t nz, const size_t numRows, const size_t numCols, DataTransferer* transferer);
template void Matrix<char>::AdjustSparseBlockColumn(const GPUSPARSE_INDEX_TYPE* cpuCol2BlockId, size_t numBlocks, bool useBlockId2Col);
template void Matrix<char>::TransferFromDeviceToDevice(int from_id, int to_id, bool isBeingMoved, bool emptyTransfer/* = false*/, bool updatePreferredDevice/* = true*/) const;
// Matrix<short> methods
template Matrix<short>::Matrix(DEVICEID_TYPE);

View file

@ -2482,6 +2482,9 @@ template void MatrixQuantizerGPU<double>::QuantizeAsync(const Matrix<double>&, c
template void GPUMatrix<char>::CastAssignValuesOf<float>(const GPUMatrix<float>* other);
template void GPUMatrix<char>::CastAssignValuesOf<double>(const GPUMatrix<double>* other);
template void GPUMatrix<char>::CastAssignValuesOf<half>(const GPUMatrix<half>* other);
template void GPUMatrix<char>::SetUniformRandomValue(const char low, const char high, unsigned long seed);
template void GPUMatrix<char>::SetUniformRandomValue(RNGHandle& rngHandle, const char low, const char high);
template void GPUMatrix<char>::SetGaussianRandomValue(const char mean, const char sigma, unsigned long seed);
template void GPUMatrix<short>::CastAssignValuesOf<float>(const GPUMatrix<float>* other);
template void GPUMatrix<short>::CastAssignValuesOf<double>(const GPUMatrix<double>* other);
template void GPUMatrix<short>::CastAssignValuesOf<half>(const GPUMatrix<half>* other);

View file

@ -409,4 +409,11 @@ template class TensorView<float>;
template class TensorView<double>;
template class TensorView<half>;
template Microsoft::MSR::CNTK::TensorView<char>::TensorView(const MatrixBasePtr& sob, const TensorShape& shape);
template Microsoft::MSR::CNTK::TensorView<char>::TensorView(const TensorView<char>& other, const TensorShape& shape);
template Microsoft::MSR::CNTK::TensorView<char>::TensorView(const TensorView<char>& other);
template Microsoft::MSR::CNTK::TensorView<char> Microsoft::MSR::CNTK::TensorView<char>::Reshaped(const TensorShape& shape) const;
template shared_ptr<Matrix<char>> Microsoft::MSR::CNTK::TensorView<char>::AsMatrix() const;
template const TensorShape& Microsoft::MSR::CNTK::TensorView<char>::GetShape() const;
}}}

View file

@ -402,6 +402,11 @@ inline curandStatus_t curandGenerateUniformHelper(curandGenerator_t, half *outpu
return (curandStatus_t) 0;
}
inline curandStatus_t curandGenerateUniformHelper(curandGenerator_t, char *, size_t)
{
RuntimeError("Unsupported template argument(half) in GPUSparseMatrix");
}
inline curandStatus_t curandGenerateNormalHelper(curandGenerator_t generator, float *outputPtr, size_t n, float mean, float stddev)
{
return curandGenerateNormal(generator, outputPtr, n, mean, stddev);
@ -423,6 +428,10 @@ inline curandStatus_t curandGenerateNormalHelper(curandGenerator_t, half *output
return (curandStatus_t) 0;
}
inline curandStatus_t curandGenerateNormalHelper(curandGenerator_t, char*, size_t, char, char)
{
RuntimeError("Unsupported template argument(half) in GPUSparseMatrix");
}
// cusparse
inline cusparseStatus_t cusparsecsr2denseHelper(cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA, const int *csrColIndA, float *A, int lda)

View file

@ -37,6 +37,26 @@ BOOST_FIXTURE_TEST_CASE(CPUMatrixConstructorNoFlags, RandomSeedFixture)
BOOST_CHECK(m1.IsEqualTo(m));
}
BOOST_FIXTURE_TEST_CASE(CPUMatrixCharConstructorNoFlags, RandomSeedFixture)
{
CPUMatrix<char> m;
BOOST_CHECK(m.IsEmpty());
m.Resize(2, 3);
BOOST_CHECK(!m.IsEmpty());
BOOST_CHECK_EQUAL(m.GetNumRows(), 2);
BOOST_CHECK_EQUAL(m.GetNumCols(), 3);
BOOST_CHECK_EQUAL(m.GetNumElements(), 6);
m(0, 0) = 1;
m(1, 2) = 2;
BOOST_CHECK_EQUAL(m(0, 0), 1);
BOOST_CHECK_EQUAL(m(1, 2), 2);
CPUMatrix<char> m1(m);
BOOST_CHECK(m1.IsEqualTo(m));
}
BOOST_FIXTURE_TEST_CASE(CPUMatrixConstructorFlagNormal, RandomSeedFixture)
{
std::array<float, 6> array = {1, 2, 3, 4, 5, 6};

View file

@ -110,6 +110,7 @@
<Compile Include="SwigProxyClasses\StreamInformation.cs" />
<Compile Include="SwigProxyClasses\StringVector.cs" />
<Compile Include="SwigProxyClasses\SWIGTYPE_p_void.cs" />
<Compile Include="SwigProxyClasses\SWIGTYPE_p_int8_t.cs" />
<Compile Include="SwigProxyClasses\Trainer.cs" />
<Compile Include="SwigProxyClasses\TrainingParameterScheduleDouble.cs" />
<Compile Include="SwigProxyClasses\UnorderedMapParameterNDArrayViewPtr.cs" />

View file

@ -78,6 +78,11 @@
return new CNTK::NDArrayView(CNTK::DataType::Double, viewShape, colStarts, rowIndices, nonZeroValues, numNonZeroValues, device, readOnly);
}
NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const int8_t* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly = false)
{
return new CNTK::NDArrayView(CNTK::DataType::Int8, viewShape, colStarts, rowIndices, nonZeroValues, numNonZeroValues, device, readOnly);
}
static NDArrayViewPtr CNTK::NDArrayView::RandomNormalFloat(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device)
{
return CNTK::NDArrayView::RandomNormal<float>(shape, mean, stdDev, seed, device);

View file

@ -279,6 +279,11 @@ def dynamic_axes(self):
numpy_type = NPY_HALF;
buffer = (void*)cpuView->DataBuffer<float16>();
}
else if (cntk_type == CNTK::DataType::Int8)
{
numpy_type = NPY_INT8;
buffer = (void*)cpuView->DataBuffer<int8_t>();
}
else
{
throw std::invalid_argument("unknown CNTK data type");
@ -1859,9 +1864,22 @@ extern "C" CNTKPYTHON_API bool CreateDeserializer(DataDeserializerPtr& deseriali
view->CopyFrom(tmp);
}
}
else if (typecode == NPY_INT8)
{
if (borrow)
{
view = new NDArrayView(DataType::Int8, NDShape(shape), PyArray_DATA(array), num_elements * DataTypeSize(DataType::Int8), DeviceDescriptor::CPUDevice(), readOnly);
}
else
{
NDArrayView tmp(DataType::Int8, NDShape(shape), PyArray_DATA(array), num_elements * DataTypeSize(DataType::Int8), DeviceDescriptor::CPUDevice(), readOnly);
view = new NDArrayView(DataType::Int8, tmp.Shape(), device);
view->CopyFrom(tmp);
}
}
else
{
throw std::logic_error("NumPy array of type float16, float32 or float64 expected");
throw std::logic_error("NumPy array of type int8, float16, float32 or float64 expected");
}
return view;
@ -1964,9 +1982,30 @@ extern "C" CNTKPYTHON_API bool CreateDeserializer(DataDeserializerPtr& deseriali
view->CopyFrom(tmp);
}
}
else if (typecode == NPY_INT8)
{
if (borrow)
{
view = new NDArrayView(DataType::Int8, shape,
(CNTK::SparseIndexType*)PyArray_DATA(indices),
(CNTK::SparseIndexType*)PyArray_DATA(indptr),
PyArray_DATA(data), numNonZeroValues,
DeviceDescriptor::CPUDevice(), readOnly);
}
else
{
NDArrayView tmp(DataType::Int8, shape,
(CNTK::SparseIndexType*)PyArray_DATA(indices),
(CNTK::SparseIndexType*)PyArray_DATA(indptr),
PyArray_DATA(data), numNonZeroValues,
DeviceDescriptor::CPUDevice(), readOnly);
view = new NDArrayView(DataType::Int8, StorageFormat::SparseCSC, tmp.Shape(), device);
view->CopyFrom(tmp);
}
}
else
{
throw std::logic_error("NumPy array of type float16, float32 or float64 expected");
throw std::logic_error("NumPy array of type int8, float16, float32 or float64 expected");
}
return view;

View file

@ -61,6 +61,8 @@ def sanitize_precision(precision):
return np.float64
elif precision in [cntk_py.DataType_Float16, 'float16', np.float16]:
return np.float16
elif precision in [cntk_py.DataType_Int8, 'int8_t', 'int8', np.int8]:
return np.int8
elif precision in [cntk_py.DataType_Unknown]:
return None
else:
@ -444,6 +446,8 @@ def data_type_to_dtype(data_type):
return np.float64
elif data_type == cntk_py.DataType_Float16:
return np.float16
elif data_type == cntk_py.DataType_Int8:
return np.int8
elif data_type == cntk_py.DataType_Unknown:
return object
else:
@ -464,12 +468,15 @@ def sanitize_dtype_numpy(dtype):
elif is_type and dtype in (float, np.float16) or \
is_str and dtype in ('float16'):
return np.float16
elif is_type and dtype in (int, np.int8) or \
is_str and dtype in ('int8',):
return np.int8
else:
raise ValueError('data type "%s" is not supported' % dtype)
def sanitize_dtype_cntk(dtype):
if isinstance(dtype, int) and dtype in (cntk_py.DataType_Float, cntk_py.DataType_Double, cntk_py.DataType_Float16, cntk_py.DataType_Int8, cntk_py.DataType_Unknown):
return dtype
if dtype is None:
return cntk_py.DataType_Unknown
@ -481,6 +488,8 @@ def sanitize_dtype_cntk(dtype):
return cntk_py.DataType_Double
elif dtype == np.float16:
return cntk_py.DataType_Float16
elif dtype == np.int8:
return cntk_py.DataType_Int8
elif dtype == object:
return cntk_py.DataType_Unknown
else:

View file

@ -0,0 +1,50 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
import os
import numpy as np
import pytest
import cntk as C
def delete_if_file_exists(file):
try:
os.remove(file)
except OSError:
pass
def test_saving_int8_ndarray(tmpdir):
protobuf_file = str(tmpdir/'dictionary_val.bin')
delete_if_file_exists(protobuf_file)
data = np.arange(0,64, dtype=np.int8).reshape(16,4)
dict_val = C._to_cntk_dict_value(data)
dict_val.save(protobuf_file)
assert(os.path.getsize(protobuf_file) == 82)
a = dict_val.load(protobuf_file)
assert(a==dict_val)
def test_saving_and_loading_int8_ndarray_as_attribute(tmpdir):
model_file = str(tmpdir/'test_model.bin')
delete_if_file_exists(model_file)
data = np.arange(0,64, dtype=np.int8).reshape(16,4)
dict_val = C._to_cntk_dict_value(data)
W = C.Parameter((C.InferredDimension, 42), init=C.glorot_uniform(), dtype=np.float)
x = C.input_variable(12, dtype=np.float)
y = C.times(x, W)
y.custom_attributes = {'int8_nd':dict_val}
y.save(model_file)
assert(os.path.isfile(model_file))
z = C.load_model(model_file)
int8_data = z.custom_attributes['int8_nd']
assert(int8_data.shape == (16,4))
assert (np.array_equal(int8_data, data))

View file

@ -23,12 +23,16 @@ def test_sanitize_dtype_numpy():
assert sanitize_dtype_numpy(dtype) == np.float32, dtype
for dtype in [float, 'float64', np.float64]:
assert sanitize_dtype_numpy(dtype) == np.float64, dtype
for dtype in ['int8', np.int8]:
assert sanitize_dtype_numpy(dtype) == np.int8, dtype
def test_sanitize_dtype_cntk():
for dtype in ['float', 'float32', np.float32, int]:
assert sanitize_dtype_cntk(dtype) == C.cntk_py.DataType_Float, dtype
for dtype in [float, 'float64', np.float64]:
assert sanitize_dtype_cntk(dtype) == C.cntk_py.DataType_Double, dtype
for dtype in ['int8', np.int8]:
assert sanitize_dtype_cntk(dtype) == C.cntk_py.DataType_Int8, dtype
@pytest.mark.parametrize("data, dtype", [
([1], np.float32),