Mirror of https://github.com/microsoft/pSDCA.git

adding source files to new repo

Parent: c6147e27e2
Commit: 0933d4dc27
CONTRIBUTING.md (new file)
@@ -0,0 +1,14 @@
# Contributing

This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.

When you submit a pull request, a CLA bot will automatically determine whether you need to provide
a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
provided by the bot. You will only need to do this once across all repos using our CLA.

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
README.md
@@ -1,14 +1,4 @@
 # pSDCA
 
-# Contributing
+A variant of the SDCA (Stochastic Dual Coordinate Ascent) method with a primal gradient update, instead of using the proximal mapping of the conjugate loss functions, which may be hard to compute.
-
-This project welcomes contributions and suggestions. Most contributions require you to agree to a
-Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
-the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
-
-When you submit a pull request, a CLA bot will automatically determine whether you need to provide
-a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
-provided by the bot. You will only need to do this once across all repos using our CLA.
-
-This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
-For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
-contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
LBFGS.cpp (new file)
@@ -0,0 +1,229 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <iostream>
#include <string>
#include <sstream>
#include <fstream>
#include <iomanip>
#include <mpi.h>

#include "floattype.h"
#include "utils.h"
#include "randalgms.h"
#include "lbfgs_omp.h"

using namespace functions;
using namespace distropt;

int main(int argc, char* argv[])
{
    // In our implementation, only the main thread calls MPI, so we use MPI_THREAD_FUNNELED
    int mpi_thread_provided;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &mpi_thread_provided);

    MPI_Comm comm_world = MPI_COMM_WORLD;
    int mpi_size, mpi_rank;
    MPI_Comm_size(comm_world, &mpi_size);
    MPI_Comm_rank(comm_world, &mpi_rank);

    //---------------------------------------------------------------------------------------------
    // parse the input arguments
    std::string data_file;
    std::string init_mthd = "none";
    char loss_type = 0, regu_type = 0;
    double lambda = 0, l1weight = 0;    // default l1 weight set to zero
    double rownorm = 1;                 // default to normalize rows of X
    lbfgs_params params;

    int must_count = 0;
    for (int i = 1; i + 1 < argc; i += 2) {    // options come in (flag, value) pairs
        std::string optstr(argv[i]);
        std::string valstr(argv[i + 1]);
        if (optstr == "-data") {
            data_file = valstr;
            must_count++;
        }
        else if (optstr == "-loss") {
            loss_type = valstr[0];
            must_count++;
        }
        else if (optstr == "-lambda") {
            lambda = std::stof(valstr);
            must_count++;
        }
        else if (optstr == "-regu") {
            regu_type = valstr[0];
            must_count++;
        }
        else if (optstr == "-init") {
            init_mthd = valstr;
        }
        else if (optstr == "-maxitrs") {
            params.max_itrs = std::stoi(valstr);
        }
        else if (optstr == "-epsgrad") {
            params.eps_grad = std::stof(valstr);
        }
        else if (optstr == "-memory") {
            params.m_memory = std::stoi(valstr);
        }
        else if (optstr == "-lsrho") {
            params.btls_rho = std::stof(valstr);
        }
        else if (optstr == "-lsdec") {
            params.btls_dec = std::stof(valstr);
        }
        else if (optstr == "-lsmax") {
            params.btls_max = std::stoi(valstr);
        }
        else if (optstr == "-lsada") {
            params.btls_ada = (std::stoi(valstr) > 0);
        }
        else if (optstr == "-display") {
            params.display = (std::stoi(valstr) > 0);
        }
        else {
            std::cout << "LBFGS: Invalid arguments, please try again." << std::endl;
            std::exit(0);
        }
    }
    if (must_count < 4) {
        std::cout << "LBFGS arguments: -data <string> -loss <char> -lambda <double> -regu <char>" << std::endl;
        std::cout << "                 -init <string> -maxitrs <int> -epsgrad <double> -memory <int>" << std::endl;
        std::cout << "                 -lsrho <double> -lsdec <double> -lsmax <int> -lsada <bool>" << std::endl;
        std::cout << "                 -display <int>" << std::endl;
        std::cout << "The first 4 options must be given, others can use default values." << std::endl;
        std::exit(0);
    }

    // generate the output file name reflecting the dataset, algorithm, loss+regu, and lambda value
    std::string data_name = data_file.substr(data_file.find_last_of("/\\") + 1);
    std::stringstream sslambda;
    sslambda << std::scientific << std::setprecision(1) << lambda;
    std::string slambda = sslambda.str();
    slambda.erase(slambda.find_first_of("."), 2);
    std::string lsada = params.btls_ada ? "ada_" : "_";
    params.filename = data_name + "_" + std::string(1, loss_type) + std::string(1, regu_type) + "_lbfgs"
        + std::to_string(params.m_memory) + lsada
        + std::to_string(mpi_size) + "_" + slambda + "_" + init_mthd;

    //---------------------------------------------------------------------------------------------
    // read the local data file
    std::string myfile = data_file + '_' + std::to_string(mpi_rank + 1);    // file labels start with 1

    std::vector<spmatT> labels;
    std::vector<spmatT> weights;
    std::vector<spmatT> values;
    std::vector<size_t> colidx;
    std::vector<size_t> rowptr;

    //std::cout << "Loading training data ... " << std::endl;
    double time_start = MPI_Wtime();
    size_t n_examples = load_datafile(myfile, labels, weights, values, colidx, rowptr, false, false, true);

    SparseMatrixCSR X(values, colidx, rowptr, false);
    Vector y(labels);
    if (rownorm > 0) { X.normalize_rows(rownorm); }

    MPI_Barrier(comm_world);

    if (mpi_rank == 0) {
        std::cout << "Loading files took " << MPI_Wtime() - time_start << " sec" << std::endl;
    }

    // use collective communications to get nTotalSamples and nAllFeatures
    size_t nSamples = X.nrows();
    size_t nFeatures = X.ncols();
    size_t nnzs = X.nnzs();
    size_t N, D, NZ;

    time_start = MPI_Wtime();
    MPI_Allreduce(&nSamples, &N, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, comm_world);
    MPI_Allreduce(&nFeatures, &D, 1, MPI_UNSIGNED_LONG_LONG, MPI_MAX, comm_world);
    MPI_Allreduce(&nnzs, &NZ, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, comm_world);

    // make sure every process has the same feature dimension
    X.reset_ncols(D);

    if (mpi_rank == 0) {
        std::cout << "MPI_Allreduce took " << MPI_Wtime() - time_start << " sec" << std::endl;
        std::cout << "N = " << N << std::endl;
        std::cout << "D = " << D << std::endl;
        std::cout << "NZ = " << NZ << std::endl;
    }

    //---------------------------------------------------------------------------------------------
    // construct the loss function and regularization

    RegularizedLoss reguloss(X, y, loss_type, lambda, '2');

    auto localfval = [&reguloss](const Vector &w) { return reguloss.regu_loss(w); };
    auto localgrad = [&reguloss](const Vector &w, Vector &g) { return reguloss.regu_grad(w, g); };

    auto fval = [&reguloss, &comm_world, N, lambda](const Vector &w) {
        double sumloss = reguloss.sum_loss(w);
        MPI_Allreduce(MPI_IN_PLACE, &sumloss, 1, MPI_DOUBLE, MPI_SUM, comm_world);
        return sumloss / N + (lambda / 2) * w.dot(w);
    };

    auto grad = [&reguloss, &comm_world, N, lambda](const Vector &w, Vector &g) {
        double sumloss = reguloss.sum_loss(w);
        MPI_Allreduce(MPI_IN_PLACE, &sumloss, 1, MPI_DOUBLE, MPI_SUM, comm_world);
        reguloss.sum_grad(w, g);
        MPI_Allreduce(MPI_IN_PLACE, g.data(), (int)g.length(), MPI_VECTOR_TYPE, MPI_SUM, comm_world);
        g.scale(1.0 / N);
        g.axpy(lambda, w);
        return sumloss / N + (lambda / 2) * w.dot(w);
    };

    //---------------------------------------------------------------------------------------------
    // compute the initial condition using different methods
    Vector w(D), a(X.nrows());
    if (init_mthd == "sdca") {
        ISmoothLoss *f = nullptr;
        switch (loss_type)
        {
        case 'L':
        case 'l':
            f = new LogisticLoss();
            break;
        case 'S':
        case 's':
            f = new SmoothedHinge();
            break;
        case '2':
            f = new SquareLoss();
            break;
        default:
            throw std::runtime_error("Loss function type " + std::string(1, loss_type) + " not defined.");
        }
        SquaredL2Norm g;

        randalgms::sdca(X, y, *f, lambda, g, 20, 1e-8, w, a, 'p', 'd', params.display);
        delete f;
    }
    else if (init_mthd == "lbfgs") {
        params.display = false;
        lbfgs_omp(localfval, localgrad, w, w, params);
    }
    else {}

    // Allreduce to compute the sum, and then the average, of the local solutions
    int w_len = int(w.length());
    MPI_Allreduce(MPI_IN_PLACE, w.data(), w_len, MPI_VECTOR_TYPE, MPI_SUM, comm_world);
    // need to compute the average as the starting point for all machines
    w.scale(1.0 / mpi_size);

    // L-BFGS algorithm ------------------------------------------------------------------------------
    params.display = (mpi_rank == 0);
    params.fileoutput = (mpi_rank == 0);

    MPI_Barrier(comm_world);
    lbfgs_omp(fval, grad, w, w, params);

    // always call MPI_Finalize()
    MPI_Finalize();
    return 0;
}
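For reference, the `fval` and `grad` callbacks above evaluate the global regularized ERM objective that the distributed L-BFGS minimizes. Writing $\phi$ for the loss selected by `-loss` and $\{(x_i, y_i)\}_{i=1}^N$ for the examples partitioned across the MPI processes,

$$ f(w) \;=\; \frac{1}{N}\sum_{i=1}^{N} \phi(w^\top x_i,\, y_i) \;+\; \frac{\lambda}{2}\,\lVert w\rVert_2^2, $$

where each `MPI_Allreduce` sums the per-process loss and gradient contributions, after which every rank applies the $1/N$ scaling and adds the $\lambda w$ regularization term locally.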
LBFGS.vcxproj (new file)
@@ -0,0 +1,165 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
      <Configuration>Debug</Configuration>
      <Platform>Win32</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|Win32">
      <Configuration>Release</Configuration>
      <Platform>Win32</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Debug|x64">
      <Configuration>Debug</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|x64">
      <Configuration>Release</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
  </ItemGroup>
  <PropertyGroup Label="Globals">
    <VCProjectVersion>15.0</VCProjectVersion>
    <ProjectGuid>{2AF44FB9-F85D-48D0-AC4F-EE06CFFB398E}</ProjectGuid>
    <Keyword>Win32Proj</Keyword>
    <RootNamespace>LBFGS</RootNamespace>
    <WindowsTargetPlatformVersion>10.0.16299.0</WindowsTargetPlatformVersion>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
    <ConfigurationType>StaticLibrary</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
    <ConfigurationType>StaticLibrary</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
  </ImportGroup>
  <ImportGroup Label="Shared">
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <ClCompile>
      <PrecompiledHeader>NotUsing</PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <SDLCheck>true</SDLCheck>
      <AdditionalIncludeDirectories>../Numerics;../Utils;../SDCA;$(MSMPI_INC);$(MSMPI_INC)\x64</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <AdditionalLibraryDirectories>$(OutDir);C:\Program Files (x86)\Microsoft SDKs\MPI\Lib\x64</AdditionalLibraryDirectories>
      <AdditionalDependencies>Numerics.lib;Utils.lib;msmpi.lib;Arica.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
      <PrecompiledHeader>Use</PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <SDLCheck>true</SDLCheck>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <ClCompile>
      <PrecompiledHeader>NotUsing</PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <SDLCheck>true</SDLCheck>
      <AdditionalIncludeDirectories>../Numerics;../Utils;../SDCA;$(MSMPI_INC);$(MSMPI_INC)\x64</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <AdditionalLibraryDirectories>$(OutDir);C:\Program Files (x86)\Microsoft SDKs\MPI\Lib\x64</AdditionalLibraryDirectories>
      <AdditionalDependencies>Numerics.lib;Utils.lib;msmpi.lib;Arica.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <ClCompile>
      <PrecompiledHeader>Use</PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <SDLCheck>true</SDLCheck>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <GenerateDebugInformation>true</GenerateDebugInformation>
    </Link>
  </ItemDefinitionGroup>
  <ItemGroup>
    <Text Include="ReadMe.txt" />
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="lbfgs_omp.h" />
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="LBFGS.cpp" />
    <ClCompile Include="lbfgs_omp.cpp" />
    <ClCompile Include="test_lbfgs.cpp" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
</Project>
LBFGS.vcxproj.filters (new file)
@@ -0,0 +1,36 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup>
    <Filter Include="Source Files">
      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
      <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
    </Filter>
    <Filter Include="Header Files">
      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
      <Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
    </Filter>
    <Filter Include="Resource Files">
      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
    </Filter>
  </ItemGroup>
  <ItemGroup>
    <Text Include="ReadMe.txt" />
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="lbfgs_omp.h">
      <Filter>Header Files</Filter>
    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="LBFGS.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
    <ClCompile Include="lbfgs_omp.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
    <ClCompile Include="test_lbfgs.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
  </ItemGroup>
</Project>
ReadMe.txt (new file)
@@ -0,0 +1,37 @@
========================================================================
    CONSOLE APPLICATION : [!output PROJECT_NAME] Project Overview
========================================================================

AppWizard has created this [!output PROJECT_NAME] application for you.

This file contains a summary of what you will find in each of the files that
make up your [!output PROJECT_NAME] application.


This is the main project file for VC++ projects generated using an Application Wizard.
It contains information about the version of Visual C++ that generated the file, and
information about the platforms, configurations, and project features selected with the
Application Wizard.

This is the filters file for VC++ projects generated using an Application Wizard.
It contains information about the association between the files in your project
and the filters. This association is used in the IDE to show grouping of files with
similar extensions under a specific node (for e.g. ".cpp" files are associated with the
"Source Files" filter).

This is the main application source file.

/////////////////////////////////////////////////////////////////////////////
Other standard files:

StdAfx.h, StdAfx.cpp
    These files are used to build a precompiled header (PCH) file
    named [!output PROJECT_NAME].pch and a precompiled types file named StdAfx.obj.

/////////////////////////////////////////////////////////////////////////////
Other notes:

AppWizard uses "TODO:" comments to indicate parts of the source code you
should add to or customize.

/////////////////////////////////////////////////////////////////////////////
lbfgs_omp.cpp (new file)
@@ -0,0 +1,167 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <cassert>
#include <deque>
#include <vector>
#include <string>
#include <iostream>
#include <fstream>
#include <iomanip>

#include "timer.h"
#include "lbfgs_omp.h"

namespace distropt
{
    // formatted output: "iter  stepsize  ls  f(x)  |grad(x)|  time"
    void formatted_output(std::ostream &ofs, const int iters, const double stepsize, const int lscnt, const double f, const double gnorm, const double t)
    {
        ofs << std::setw(3) << iters
            << std::scientific << std::setprecision(3)
            << std::setw(12) << stepsize
            << std::setw(4) << lscnt
            << std::fixed << std::setprecision(12)
            << std::setw(17) << f
            << std::scientific << std::setprecision(3)
            << std::setw(12) << gnorm
            << std::fixed << std::setprecision(3)
            << std::setw(9) << t
            << std::endl;
    }

    int lbfgs_omp(std::function<double(const Vector&)> fval, std::function<double(const Vector&, Vector&)> grad,
        const Vector &x0, Vector &x, const lbfgs_params &params)
    {
        // would need a more careful check, including the function signatures? may not be possible
        assert(x0.length() == x.length());

        size_t d = x0.length();
        int m = params.m_memory;

        // construct variables and working space
        Vector g(d), p(d), x_try(d);                // gradient, search direction, and next trial point
        Vector x_pre(d), g_pre(d), dx(d), dg(d);    // vectors for the L-BFGS memory
        std::deque<Vector> s, y;
        std::deque<double> rho;
        std::vector<double> alpha(m);

        // timing and recording
        double t_elapsed;
        HighResTimer timer;
        std::vector<double> stepsizes, objvals, gradnorms, time;
        std::vector<int> nls;

        if (params.display) {
            std::cout << std::endl << "iter stepsize ls f(x) |grad(x)| time" << std::endl;
        }

        // L-BFGS iterations
        x.copy(x0);
        int k = 0, lscnt = 0;
        double stepsize = 1.0;

        // compute gradient at x
        double fx = grad(x, g);
        double g_norm = g.norm2();
        t_elapsed = timer.seconds_from_start();

        // record progress
        stepsizes.push_back(1.0);
        objvals.push_back(fx);
        gradnorms.push_back(g_norm);
        time.push_back(t_elapsed);
        nls.push_back(0);

        if (params.display)
        {
            formatted_output(std::cout, k, stepsize, lscnt, fx, g_norm, t_elapsed);
        }

        for (k = 1; k <= params.max_itrs; k++)
        {
            // update previous vectors
            x_pre.copy(x);
            g_pre.copy(g);

            // compute the limited-memory matrix-vector product (two-loop recursion)
            p.copy(g);
            for (int i = 0; i < (int)s.size(); i++)
            {
                alpha[i] = rho[i] * s[i].dot(p);
                p.axpy(-alpha[i], y[i]);
            }
            double gamma = s.size() > 0 ? s[0].dot(y[0]) / (y[0].dot(y[0])) : 1.0;
            p.scale(gamma);
            for (int i = (int)s.size() - 1; i > -1; i--)
            {
                double beta = rho[i] * y[i].dot(p);
                p.axpy(alpha[i] - beta, s[i]);
            }

            // backtracking line search (for nonconvex functions, the Wolfe conditions would be needed)
            stepsize = params.btls_ada ? stepsize / params.btls_rho : 1.0;
            double gdp = g.dot(p);
            for (lscnt = 0; lscnt < params.btls_max; lscnt++)
            {
                x_try.copy(x);
                x_try.axpy(-stepsize, p);
                if (fval(x_try) < fx - params.btls_dec * stepsize * gdp) {
                    break;
                }
                stepsize *= params.btls_rho;
            }
            x.copy(x_try);

            // compute function value and gradient at the new point
            fx = grad(x, g);
            g_norm = g.norm2();
            t_elapsed = timer.seconds_from_start();

            // record progress
            stepsizes.push_back(stepsize);
            objvals.push_back(fx);
            gradnorms.push_back(g_norm);
            time.push_back(t_elapsed);
            nls.push_back(lscnt);

            if (params.display)
            {
                formatted_output(std::cout, k, stepsize, lscnt, fx, g_norm, t_elapsed);
            }

            // stopping criterion: norm of the gradient
            if (g_norm < params.eps_grad)
            {
                break;
            }

            // remove old items from the queues if k > m
            if (k > m)
            {
                s.pop_back();
                y.pop_back();
                rho.pop_back();
            }

            // add the new s and y to the queues
            Vector::elem_subtract(x, x_pre, dx);
            Vector::elem_subtract(g, g_pre, dg);
            s.push_front(dx);
            y.push_front(dg);
            rho.push_front(1.0 / dx.dot(dg));
        }

        //---------------------------------------------------------------------
        // write the progress record to a file
        if (params.fileoutput) {
            std::ofstream ofs(params.filename, std::ofstream::out);
            ofs << "iter stepsize ls f(x) |grad(x)| time" << std::endl;
            for (size_t i = 0; i < objvals.size(); i++) {
                formatted_output(ofs, (int)i, stepsizes[i], nls[i], objvals[i], gradnorms[i], time[i]);
            }
            ofs.close();
        }

        // return the number of iterations performed
        return k;
    }
}
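The "limited-memory matrix-vector product" computed by the two loops above is the standard L-BFGS two-loop recursion (Nocedal & Wright). With the stored pairs $s_j = x_{j+1} - x_j$, $y_j = \nabla f(x_{j+1}) - \nabla f(x_j)$, $\rho_j = 1/(s_j^\top y_j)$, and $q$ initialized to the current gradient:

$$
\begin{aligned}
&\text{(newest pair to oldest)} && \alpha_j = \rho_j\, s_j^\top q, \qquad q \leftarrow q - \alpha_j\, y_j, \\
&\text{(scaling)} && q \leftarrow \gamma\, q, \qquad \gamma = \frac{s^\top y}{y^\top y} \ \text{for the most recent pair, or } 1 \text{ if the memory is empty}, \\
&\text{(oldest pair to newest)} && \beta = \rho_j\, y_j^\top q, \qquad q \leftarrow q + (\alpha_j - \beta)\, s_j.
\end{aligned}
$$

The result $p = q$ approximates the Newton direction $(\nabla^2 f(x_k))^{-1} \nabla f(x_k)$, which the backtracking line search then applies as $x \leftarrow x - t\,p$.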
lbfgs_omp.h (new file)
@@ -0,0 +1,32 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#pragma once

#include <functional>
#include <string>

#include "regu_loss.h"

using namespace functions;

namespace distropt {

    struct lbfgs_params {
        int max_itrs = 100;        // maximum number of L-BFGS iterations
        int m_memory = 20;         // number of (s, y) pairs kept in memory
        double eps_grad = 1e-8;    // stopping tolerance on the gradient norm
        double btls_rho = 0.5;     // backtracking line-search shrinking factor
        double btls_dec = 1e-4;    // sufficient-decrease constant
        int btls_max = 10;         // maximum number of backtracking steps
        bool btls_ada = false;     // adapt the initial stepsize of each line search
        bool display = true;
        bool fileoutput = false;
        std::string filename = "temp.txt";

        // is this a good place to put it? probably not; better to leave it with the application
        //void parse(int argc, char* argv[]);
    };

    int lbfgs_omp(std::function<double(const Vector&)> fval, std::function<double(const Vector&, Vector&)> grad,
        const Vector &x0, Vector &x, const lbfgs_params &params);
}
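A minimal usage sketch of this interface, assuming only what the header above declares (the repo's Vector type, lbfgs_params, and lbfgs_omp); the quadratic test function and the dimension are illustrative, not part of the commit. It minimizes $f(x) = \tfrac{1}{2}\lVert x - c\rVert^2$, whose unique minimizer is $x = c$:

// Hypothetical sanity check for lbfgs_omp (not part of the commit):
// minimize f(x) = 0.5*||x - c||^2, so grad f(x) = x - c and the minimizer is c.
#include <iostream>
#include "lbfgs_omp.h"

using namespace distropt;

int main()
{
    const size_t d = 100;
    Vector c(d), x0(d), x(d);    // the Vector constructor zero-initializes, so x0 = 0
    c.fill(1.0);                 // target point c = (1, ..., 1)

    // f(x) = 0.5 * ||x - c||^2, returned as a double per the fval signature
    auto fval = [&c](const Vector &v) {
        Vector r(v.length());
        Vector::elem_subtract(v, c, r);
        return 0.5 * r.dot(r);
    };
    // writes grad f into g and returns f(x), matching the grad(x, g) convention
    auto grad = [&c](const Vector &v, Vector &g) {
        Vector::elem_subtract(v, c, g);
        return 0.5 * g.dot(g);
    };

    lbfgs_params params;         // defaults from lbfgs_omp.h
    params.eps_grad = 1e-10;

    int iters = lbfgs_omp(fval, grad, x0, x, params);
    std::cout << "lbfgs_omp stopped after " << iters << " iterations" << std::endl;
    return 0;
}

Note that both callbacks return the function value: lbfgs_omp reuses the value returned by grad as f(x), so the two must be consistent.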
test_lbfgs.cpp (new file)
@@ -0,0 +1,73 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <iostream>
//#include <functional>

#include "utils.h"
#include "randalgms.h"
#include "lbfgs_omp.h"

using namespace std;
using namespace distropt;

int main_test_lbfgs(int argc, char* argv[])
//int main(int argc, char* argv[])
{
    if (argc < 2) {
        std::cout << "Need to specify training data file." << std::endl;
        std::exit(0);
    }
    string data_file = argv[1];

    // read training and testing files
    vector<spmatT> labels;
    vector<spmatT> weights;
    vector<spmatT> values;
    vector<size_t> colidx;
    vector<size_t> rowptr;

    std::cout << "Loading training data ... " << std::endl;
    // news20 actually has column indices starting from 0; rcv1 and most others start with 1. Need to revise code!
    //size_t n_examples = load_datafile(data_file, labels, weights, values, colidx, rowptr, false, true, true, 0);
    size_t n_examples = load_datafile(data_file, labels, weights, values, colidx, rowptr, false, true, true);
    SparseMatrixCSR X(values, colidx, rowptr, false);
    Vector y(labels);

    X.normalize_rows();

    // -------------------------------------------------------------------------------------
    lbfgs_params params;
    params.max_itrs = 100;
    params.eps_grad = 1e-8;
    params.m_memory = 20;
    params.btls_rho = 0.5;
    params.btls_dec = 1e-4;
    params.btls_max = 20;
    params.btls_ada = false;

    double lambda = 1.0e-5;

    size_t dim = X.ncols();
    Vector w0(dim), w(dim);

    // -------------------------------------------------------------------------------------
    RegularizedLoss reguloss(X, y, 'l', lambda, '2');

    // Replaced the std::bind solution with lambdas
    //using namespace std::placeholders;
    //auto fval = std::bind(&RegularizedLoss::regu_loss, &reguloss, _1);
    //auto grad = std::bind(&RegularizedLoss::regu_grad, &reguloss, _1, _2);

    auto fval = [&reguloss](const Vector &x) { return reguloss.regu_loss(x); };
    auto grad = [&reguloss](const Vector &x, Vector &g) { return reguloss.regu_grad(x, g); };

    lbfgs_omp(fval, grad, w0, w, params);

    // -------------------------------------------------------------------------------------
    float train_err = binary_error_rate(X, y, w);
    std::cout << "Training error rate = " << train_err * 100 << " %" << std::endl;

    return 0;
}
test_numerics.cpp (new file)
@@ -0,0 +1,52 @@
// test_numerics.cpp : Defines the entry point for the console application.

#include <string>
#include <iostream>
#include <stdexcept>
#include <ctime>
#include <math.h>

#include "spmatrix.h"
#include "utils.h"

using namespace std;
using namespace Numerics;

//int main_test_sparsematrix(int argc, char* argv[])
int main(int argc, char* argv[])
{
    //string filename = "C:/Data/LIBSVMdata/splice1percent100_1";
    string filename = "C:/Data/LIBSVMdata/ads1percent100_1";
    //string filename = "C:/Data/LIBSVMdata/rcv1_test.binary";
    //string filename = "C:/Data/LIBSVMdata/rcv1_train.binary";
    //string filename = "C:/Data/LIBSVMdata/news20.binary";
    //string filename = "C:/Data/LIBSVMdata/covtype.libsvm.binary.scale";

    vector<float> vals, labl;
    vector<size_t> cidx;
    vector<size_t> rptr;

    clock_t time;
    time = clock();
    Load_LIBSVM_file(filename, vals, cidx, rptr, labl);
    time = clock() - time;
    std::cout << "Loading the file took " << ((float)time) / CLOCKS_PER_SEC << " seconds." << std::endl;

    SparseMatrixCSR A(vals, cidx, rptr);
    std::cout << A.Rows() << ' ' << A.Cols() << ' ' << A.NNZs() << std::endl;

    Vector y(labl);
    Vector x(A.Cols());

    time = clock();
    A.aAxby(1, x, 0, y);
    time = clock() - time;
    std::cout << "Matrix-vector multiplication took " << ((float)time) / CLOCKS_PER_SEC << " seconds." << std::endl;

    time = clock();
    A.aATxby(1, y, 0, x);
    time = clock() - time;
    std::cout << "Matrix.T-vector multiplication took " << ((float)time) / CLOCKS_PER_SEC << " seconds." << std::endl;

    return 0;
}
Numerics.vcxproj (new file)
@@ -0,0 +1,142 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
      <Configuration>Debug</Configuration>
      <Platform>Win32</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|Win32">
      <Configuration>Release</Configuration>
      <Platform>Win32</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Debug|x64">
      <Configuration>Debug</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|x64">
      <Configuration>Release</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
  </ItemGroup>
  <PropertyGroup Label="Globals">
    <ProjectGuid>{3BB4DA69-B2E0-46D0-86DD-1757C5922506}</ProjectGuid>
    <RootNamespace>Numerics</RootNamespace>
    <WindowsTargetPlatformVersion>10.0.16299.0</WindowsTargetPlatformVersion>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <CharacterSet>MultiByte</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>MultiByte</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
    <ConfigurationType>StaticLibrary</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <CharacterSet>MultiByte</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
    <ConfigurationType>StaticLibrary</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>MultiByte</CharacterSet>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
  </ImportGroup>
  <ImportGroup Label="Shared">
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup />
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <SDLCheck>true</SDLCheck>
    </ClCompile>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <SDLCheck>true</SDLCheck>
      <AdditionalIncludeDirectories>..\Utils</AdditionalIncludeDirectories>
      <OpenMPSupport>false</OpenMPSupport>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <AdditionalDependencies>Utils.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <SDLCheck>true</SDLCheck>
    </ClCompile>
    <Link>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <SDLCheck>true</SDLCheck>
      <AdditionalIncludeDirectories>..\Utils</AdditionalIncludeDirectories>
      <OpenMPSupport>true</OpenMPSupport>
      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    </ClCompile>
    <Link>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
      <AdditionalDependencies>Utils.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <SubSystem>Console</SubSystem>
    </Link>
  </ItemDefinitionGroup>
  <ItemGroup>
    <ClInclude Include="dnvector.h" />
    <ClInclude Include="floattype.h" />
    <ClInclude Include="functions.h" />
    <ClInclude Include="regu_loss.h" />
    <ClInclude Include="spmatrix.h" />
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="dnvector.cpp" />
    <ClCompile Include="functions.cpp" />
    <ClCompile Include="regu_loss.cpp" />
    <ClCompile Include="spmatrix.cpp" />
    <ClCompile Include="test_numerics.cpp" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
</Project>
Numerics.vcxproj.filters (new file)
@@ -0,0 +1,51 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup>
    <Filter Include="Source Files">
      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
      <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
    </Filter>
    <Filter Include="Header Files">
      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
      <Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
    </Filter>
    <Filter Include="Resource Files">
      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
    </Filter>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="dnvector.h">
      <Filter>Header Files</Filter>
    </ClInclude>
    <ClInclude Include="spmatrix.h">
      <Filter>Header Files</Filter>
    </ClInclude>
    <ClInclude Include="floattype.h">
      <Filter>Header Files</Filter>
    </ClInclude>
    <ClInclude Include="functions.h">
      <Filter>Header Files</Filter>
    </ClInclude>
    <ClInclude Include="regu_loss.h">
      <Filter>Header Files</Filter>
    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="dnvector.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
    <ClCompile Include="spmatrix.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
    <ClCompile Include="test_numerics.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
    <ClCompile Include="functions.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
    <ClCompile Include="regu_loss.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
  </ItemGroup>
</Project>
|
|
@ -0,0 +1,673 @@
|
|||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <random>
|
||||
#include <cstdint>
|
||||
#include <omp.h>
|
||||
|
||||
#include "dnvector.h"
|
||||
|
||||
// OpenMP 3.0 supports unsigned for-loop iterator, so can replace std::int64_t by size_t
|
||||
using std::int64_t;
|
||||
|
||||
namespace numerics {
|
||||
|
||||
// Construct an all-zero vector
|
||||
template <typename T>
|
||||
Vector<T>::Vector(const size_t n)
|
||||
{
|
||||
_length = n;
|
||||
_elements = new T[n];
|
||||
memset(_elements, 0, n * sizeof(T));
|
||||
_need_cleanup = true;
|
||||
}
|
||||
|
||||
// Constructor from another Vector
|
||||
template <typename T>
|
||||
Vector<T>::Vector(const Vector<T> &v)
|
||||
{
|
||||
_length = v._length;
|
||||
_elements = new T[_length];
|
||||
memcpy(_elements, v._elements, _length * sizeof(T));
|
||||
_need_cleanup = true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Vector<T>::Vector(const std::vector<float> &v)
|
||||
{
|
||||
_length = v.size();
|
||||
_elements = new T[_length];
|
||||
// cannot use memcpy() here since T may not be float
|
||||
//memcpy(_elements, v.data(), _length * sizeof(T));
|
||||
for (size_t i = 0; i < _length; i++) {
|
||||
_elements[i] = v[i];
|
||||
}
|
||||
_need_cleanup = true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Vector<T>::Vector(const std::vector<double> &v)
|
||||
{
|
||||
_length = v.size();
|
||||
_elements = new T[_length];
|
||||
// cannot use memcpy() here since T may not be double
|
||||
//memcpy(_elements, v.data(), _length * sizeof(T));
|
||||
for (size_t i = 0; i < _length; i++) {
|
||||
_elements[i] = v[i];
|
||||
}
|
||||
_need_cleanup = true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Vector<T>::Vector(const Vector<T> &v, const std::vector<size_t> &indices)
|
||||
{
|
||||
assert(*std::min_element(std::begin(indices), std::end(indices)) >= 0
|
||||
&& *std::max_element(std::begin(indices), std::end(indices)) < v.length());
|
||||
|
||||
_length = indices.size();
|
||||
_elements = new T[_length];
|
||||
for (size_t i = 0; i < _length; i++) {
|
||||
_elements[i] = v[indices[i]];
|
||||
}
|
||||
_need_cleanup = true;
|
||||
}
|
||||
|
||||
// Destructor, also called by derived class SubVector without cleaning up memory
|
||||
template <typename T>
|
||||
Vector<T>::~Vector()
|
||||
{
|
||||
if (_need_cleanup) delete[] _elements;
|
||||
}
|
||||
|
||||
// convert to std::vector<T>, with move return after c++11
|
||||
template <typename T>
|
||||
std::vector<T> Vector<T>::to_vector()
|
||||
{
|
||||
std::vector<T> v(_length);
|
||||
memcpy(v.data(), _elements, _length * sizeof(T));
|
||||
return std::move(v);
|
||||
}
|
||||
|
||||
// find pre-permutation vector v[p[i]] = this[i]
|
||||
template <typename T>
|
||||
void Vector<T>::pre_permute(const std::vector<size_t> &p, Vector<T> &v)
|
||||
{
|
||||
assert(v._length == _length && p.size() == _length);
|
||||
assert(*std::min_element(std::begin(p), std::end(p)) >= 0
|
||||
&& *std::max_element(std::begin(p), std::end(p)) < v.length());
|
||||
|
||||
auto elem = _elements;
|
||||
auto len = _length;
|
||||
for (size_t i = 0; i < len; i++) {
|
||||
v[p[i]] = elem[i];
|
||||
}
|
||||
}
|
||||
|
||||
// find post-permutation vector v[i] = this[p[i]]
|
||||
template <typename T>
|
||||
void Vector<T>::post_permute(const std::vector<size_t> &p, Vector<T> &v)
|
||||
{
|
||||
assert(v._length == _length && p.size() == _length);
|
||||
assert(*std::min_element(std::begin(p), std::end(p)) >= 0
|
||||
&& *std::max_element(std::begin(p), std::end(p)) < v.length());
|
||||
|
||||
auto elem = _elements;
|
||||
auto len = _length;
|
||||
for (size_t i = 0; i < len; i++) {
|
||||
v[i] = elem[p[i]];
|
||||
}
|
||||
}
|
||||
|
||||
// Copy from part of a longer vector x
|
||||
template <typename T>
|
||||
void Vector<T>::copy(const Vector<T> &v, const size_t start_idx)
|
||||
{
|
||||
assert(v._length >= start_idx + _length);
|
||||
memcpy(_elements, v._elements + start_idx, _length*sizeof(T));
|
||||
}
|
||||
|
||||
// vector concatenation: z[] = [ x[]; y[] ]
|
||||
template <typename T>
|
||||
void Vector<T>::concat(const Vector<T> &x, const Vector<T> &y)
|
||||
{
|
||||
assert(x._length + y._length == _length);
|
||||
memcpy(_elements, x._elements, x._length*sizeof(T));
|
||||
memcpy(_elements + x._length, y._elements, y._length*sizeof(T));
|
||||
}
|
||||
|
||||
// split vector into two z[] = [ x[]; y[] ]
|
||||
template <typename T>
|
||||
void Vector<T>::split(Vector<T> &x, Vector<T> &y) const
|
||||
{
|
||||
assert(x._length + y._length == _length);
|
||||
memcpy(x._elements, _elements, x._length * sizeof(T));
|
||||
memcpy(y._elements, _elements + x._length, y._length * sizeof(T));
|
||||
}
|
||||
|
||||
// Set every element to zero
|
||||
template <typename T>
|
||||
void Vector<T>::fill_zero()
|
||||
{
|
||||
memset(_elements, 0, _length*sizeof(T));
|
||||
}
|
||||
|
||||
// Fill Vector with random numbers in the interval [a, b)
|
||||
template <typename T>
|
||||
void Vector<T>::fill_rand(const T a, const T b)
|
||||
{
|
||||
std::mt19937_64 reng(std::random_device{}());
|
||||
std::uniform_real_distribution<T> unif(a, b);
|
||||
auto rgen = std::bind(unif, reng);
|
||||
std::generate(_elements, _elements + _length, rgen);
|
||||
}
|
||||
|
||||
// Fill every element with a constant
|
||||
template <typename T>
|
||||
void Vector<T>::fill(const T c)
|
||||
{
|
||||
auto elem = _elements;
|
||||
auto len = _length;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
elem[i] = c;
|
||||
}
|
||||
}
|
||||
|
||||
// Add a constant to each element
|
||||
template <typename T>
|
||||
void Vector<T>::add(const T c)
|
||||
{
|
||||
auto elem = _elements;
|
||||
auto len = _length;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
elem[i] += c;
|
||||
}
|
||||
}
|
||||
|
||||
// Return sum of elements
|
||||
template <typename T>
|
||||
T Vector<T>::sum() const
|
||||
{
|
||||
auto elem = _elements;
|
||||
auto len = _length;
|
||||
T s = 0;
|
||||
|
||||
#pragma omp parallel for reduction(+:s)
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
s += elem[i];
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
// Return the maximum value. No OpenMP used
|
||||
template <typename T>
|
||||
T Vector<T>::max() const
|
||||
{
|
||||
auto elem = _elements;
|
||||
auto len = _length;
|
||||
T a = elem[0];
|
||||
for (size_t i = 1; i < len; i++) {
|
||||
if (elem[i] > a) a = elem[i];
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
// Return the minimum value. No OpenMP used
|
||||
template <typename T>
|
||||
T Vector<T>::min() const
|
||||
{
|
||||
auto elem = _elements;
|
||||
auto len = _length;
|
||||
T a = elem[0];
|
||||
for (size_t i = 1; i < len; i++) {
|
||||
if (elem[i] < a) a = elem[i];
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
// Return 1-norm: sum_i abs(x[i])
|
||||
template <typename T>
|
||||
T Vector<T>::norm1() const
|
||||
{
|
||||
auto elem = _elements;
|
||||
auto len = _length;
|
||||
T a = 0;
|
||||
|
||||
#pragma omp parallel for reduction(+:a)
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
a += fabs(elem[i]);
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
// Returns 2-norm (Euclidean norm) of the vector
|
||||
template <typename T>
|
||||
T Vector<T>::norm2() const
|
||||
{
|
||||
auto elem = _elements;
|
||||
auto len = _length;
|
||||
T sum = 0;
|
||||
|
||||
#pragma omp parallel for reduction(+:sum)
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
sum += pow(elem[i], 2);
|
||||
}
|
||||
return sqrt(sum);
|
||||
}
|
||||
|
||||
// Return infinity norm: max_i abs(x[i])
|
||||
template <typename T>
|
||||
T Vector<T>::normInf() const
|
||||
{
|
||||
auto elem = _elements;
|
||||
auto len = _length;
|
||||
T a = 0, xi_abs;
|
||||
for (size_t i = 0; i < len; i++)
|
||||
{
|
||||
xi_abs = fabs(elem[i]);
|
||||
if (xi_abs > a) a = xi_abs;
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
// Return a vector of signs in {-1, 0, +1}
|
||||
template <typename T>
|
||||
void Vector<T>::sign(Vector<T> &s) const
|
||||
{
|
||||
assert(_length == s._length);
|
||||
auto elem = _elements;
|
||||
auto s_elem = s._elements;
|
||||
auto len = _length;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
s_elem[i] = elem[i] > 0 ? 1.0f : elem[i] < 0 ? -1 : 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
// Convert to vector of signs in {-1, 0, +1}
|
||||
template <typename T>
|
||||
void Vector<T>::to_sign()
|
||||
{
|
||||
auto elem = _elements;
|
||||
auto len = _length;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
elem[i] = elem[i] > 0 ? 1.0f : elem[i] < 0 ? -1.0f : 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
// Count number of positive elements
|
||||
template <typename T>
|
||||
size_t Vector<T>::count_positive() const
|
||||
{
|
||||
auto elem = _elements;
|
||||
auto len = _length;
|
||||
size_t count = 0;
|
||||
|
||||
#pragma omp parallel for reduction(+:count)
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
count += elem[i] > 0 ? 1 : 0;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
// Count number of negative elements
|
||||
template <typename T>
|
||||
size_t Vector<T>::count_negative() const
|
||||
{
|
||||
auto elem = _elements;
|
||||
auto len = _length;
|
||||
size_t count = 0;
|
||||
|
||||
#pragma omp parallel for reduction(+:count)
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
count += elem[i] < 0 ? 1 : 0;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
// Scale itself by a constant alpha
|
||||
template <typename T>
|
||||
void Vector<T>::scale(const T alpha)
|
||||
{
|
||||
auto elem = _elements;
|
||||
auto len = _length;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
elem[i] *= alpha;
|
||||
}
|
||||
}
|
||||
|
||||
// Return inner product with vector x
|
||||
template <typename T>
|
||||
T Vector<T>::dot(const Vector<T> &x) const
|
||||
{
|
||||
assert(_length == x._length);
|
||||
auto elem = _elements;
|
||||
auto x_elem = x._elements;
|
||||
auto len = x._length;
|
||||
T a = 0;
|
||||
|
||||
#pragma omp parallel for reduction(+:a)
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
a += elem[i] * x_elem[i];
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T Vector<T>::dot(const Vector<T> &x, const Vector<T> &y)
|
||||
{
|
||||
return x.dot(y);
|
||||
}
|
||||
|
||||
// y = alpha * x + y
|
||||
template <typename T>
|
||||
void Vector<T>::axpy(const T alpha, const Vector<T> &x)
|
||||
{
|
||||
assert(x._length == _length);
|
||||
auto x_elem = x._elements;
|
||||
auto y_elem = _elements;
|
||||
auto len = _length;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
y_elem[i] += alpha*x_elem[i];
|
||||
}
|
||||
}
|
||||
|
||||
// y = alpha * x + y
|
||||
template <typename T>
|
||||
void Vector<T>::axpy(const T alpha, const Vector<T> &x, Vector<T> &y)
|
||||
{
|
||||
assert(x._length == y._length);
|
||||
auto x_elem = x._elements;
|
||||
auto y_elem = y._elements;
|
||||
auto len = x._length;
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int64_t i = 0; i < len; i++) {
|
||||
y_elem[i] += alpha*x_elem[i];
|
||||
}
|
||||
}
|
||||
|
||||
/* A tricky mistake, this always shadowed by the previous static method!!!
|
||||
// this = alpha * x + y
|
||||
template <typename T>
|
||||
void Vector<T>::axpy(const T alpha, const Vector<T> &x, const Vector<T> &y)
|
||||
{
|
||||
this->copy(y);
|
||||
this->axpy(alpha, x);
|
||||
}
|
||||
*/

// Elementwise absolute value: vabs[i] = abs( v[i] )
template <typename T>
void Vector<T>::elem_abs(Vector<T> &v) const
{
	assert(v._length == _length);
	auto elem = _elements;
	auto v_elem = v._elements;
	auto len = _length;

	#pragma omp parallel for
	for (int64_t i = 0; i < len; i++) {
		v_elem[i] = fabs(elem[i]);
	}
}

// Elementwise inversion: vinv[i] = 1/v[i]
template <typename T>
void Vector<T>::elem_inv(Vector<T> &v) const
{
	assert(v._length == _length);
	auto elem = _elements;
	auto v_elem = v._elements;
	auto len = _length;

	#pragma omp parallel for
	for (int64_t i = 0; i < len; i++) {
		v_elem[i] = 1 / elem[i];
	}
}

// Elementwise square root: vsqrt[i] = sqrt( v[i] )
template <typename T>
void Vector<T>::elem_sqrt(Vector<T> &v) const
{
	assert(v._length == _length);
	auto elem = _elements;
	auto v_elem = v._elements;
	auto len = _length;

	#pragma omp parallel for
	for (int64_t i = 0; i < len; i++) {
		v_elem[i] = sqrt(elem[i]);
	}
}

// Elementwise exponential: vexp[i] = exp( v[i] )
template <typename T>
void Vector<T>::elem_exp(Vector<T> &v) const
{
	assert(v._length == _length);
	auto elem = _elements;
	auto v_elem = v._elements;
	auto len = _length;

	#pragma omp parallel for
	for (int64_t i = 0; i < len; i++) {
		v_elem[i] = exp(elem[i]);
	}
}

// Elementwise natural logarithm: vlog[i] = log( v[i] )
template <typename T>
void Vector<T>::elem_log(Vector<T> &v) const
{
	assert(v._length == _length);
	auto elem = _elements;
	auto v_elem = v._elements;
	auto len = _length;

	#pragma omp parallel for
	for (int64_t i = 0; i < len; i++) {
		v_elem[i] = log(elem[i]);
	}
}

// Elementwise exponential minus 1: vexpm1[i] = exp(v[i]) - 1
template <typename T>
void Vector<T>::elem_expm1(Vector<T> &v) const
{
	assert(v._length == _length);
	auto elem = _elements;
	auto v_elem = v._elements;
	auto len = _length;

	#pragma omp parallel for
	for (int64_t i = 0; i < len; i++) {
		v_elem[i] = expm1(elem[i]);
	}
}

// Elementwise natural logarithm of 1 plus: vlog1p[i] = log( 1 + v[i] )
template <typename T>
void Vector<T>::elem_log1p(Vector<T> &v) const
{
	assert(v._length == _length);
	auto elem = _elements;
	auto v_elem = v._elements;
	auto len = _length;

	#pragma omp parallel for
	for (int64_t i = 0; i < len; i++) {
		v_elem[i] = log1p(elem[i]);
	}
}

// Elementwise hinge loss: h[i] = max{0, 1 - yXw[i]}
template <typename T>
void Vector<T>::elem_hinge(Vector<T> &h) const
{
	assert(_length == h._length);
	auto elem = _elements;
	auto h_elem = h._elements;
	auto len = _length;

	#pragma omp parallel for
	for (int64_t i = 0; i < len; i++)
	{
		T diff = 1 - elem[i];	// per-iteration local: declaring it outside the loop would race across threads
		h_elem[i] = diff > 0 ? diff : 0;
	}
}

// Elementwise clip: b[i] = min( ub, max(lb, a[i]) )
template <typename T>
void Vector<T>::elem_clip(const T lb, const T ub, Vector<T> &b) const
{
	assert(_length == b._length);
	auto elem = _elements;
	auto b_elem = b._elements;
	auto len = _length;

	#pragma omp parallel for
	for (int64_t i = 0; i < len; i++)
	{
		T val = elem[i];	// per-iteration local, avoids a shared-variable race
		b_elem[i] = val > ub ? ub : val < lb ? lb : val;
	}
}

// Elementwise addition: z[i] = x[i] + y[i]
template <typename T>
void Vector<T>::elem_add(const Vector<T> &x, const Vector<T> &y, Vector<T> &z)
{
	assert(x._length == y._length && x._length == z._length);
	auto x_elem = x._elements;
	auto y_elem = y._elements;
	auto z_elem = z._elements;
	auto len = x._length;

	#pragma omp parallel for
	for (int64_t i = 0; i < len; i++) {
		z_elem[i] = x_elem[i] + y_elem[i];
	}
}

// Elementwise subtraction: z[i] = x[i] - y[i]
template <typename T>
void Vector<T>::elem_subtract(const Vector<T> &x, const Vector<T> &y, Vector<T> &z)
{
	assert(x._length == y._length && x._length == z._length);
	auto x_elem = x._elements;
	auto y_elem = y._elements;
	auto z_elem = z._elements;
	auto len = x._length;

	#pragma omp parallel for
	for (int64_t i = 0; i < len; i++) {
		z_elem[i] = x_elem[i] - y_elem[i];
	}
}

// Elementwise multiplication: z[i] = x[i] * y[i]
template <typename T>
void Vector<T>::elem_multiply(const Vector<T> &x, const Vector<T> &y, Vector<T> &z)
{
	assert(x._length == y._length && x._length == z._length);
	auto x_elem = x._elements;
	auto y_elem = y._elements;
	auto z_elem = z._elements;
	auto len = x._length;

	#pragma omp parallel for
	for (int64_t i = 0; i < len; i++) {
		z_elem[i] = x_elem[i] * y_elem[i];
	}
}

// Elementwise division: z[i] = x[i] / y[i]
template <typename T>
void Vector<T>::elem_divide(const Vector<T> &x, const Vector<T> &y, Vector<T> &z)
{
	assert(x._length == y._length && x._length == z._length);
	auto x_elem = x._elements;
	auto y_elem = y._elements;
	auto z_elem = z._elements;
	auto len = x._length;

	#pragma omp parallel for
	for (int64_t i = 0; i < len; i++) {
		z_elem[i] = x_elem[i] / y_elem[i];
	}
}

// Elementwise min: z[i] = min( x[i], y[i] )
template <typename T>
void Vector<T>::elem_min(const Vector<T> &x, const Vector<T> &y, Vector<T> &z)
{
	assert(x._length == y._length && x._length == z._length);
	auto x_elem = x._elements;
	auto y_elem = y._elements;
	auto z_elem = z._elements;
	auto len = x._length;

	#pragma omp parallel for
	for (int64_t i = 0; i < len; i++) {
		z_elem[i] = fmin(x_elem[i], y_elem[i]);
	}
}

// Elementwise max: z[i] = max( x[i], y[i] )
template <typename T>
void Vector<T>::elem_max(const Vector<T> &x, const Vector<T> &y, Vector<T> &z)
{
	assert(x._length == y._length && x._length == z._length);
	auto x_elem = x._elements;
	auto y_elem = y._elements;
	auto z_elem = z._elements;
	auto len = x._length;

	#pragma omp parallel for
	for (int64_t i = 0; i < len; i++) {
		z_elem[i] = fmax(x_elem[i], y_elem[i]);
	}
}

// Member functions of SubVector, a mask over memory owned by a Vector
template <typename T>
SubVector<T>::SubVector(const Vector<T> &v, const size_t start_idx, const size_t length)
{
	this->recast(v, start_idx, length);
}

template <typename T>
void SubVector<T>::recast(const Vector<T> &v, const size_t start_idx, const size_t length)
{
	assert(start_idx + length <= v.length());
	this->_length = length;
	this->_elements = v.data() + start_idx;
	this->_need_cleanup = false;
}

// instantiate the class templates for the two floating-point types
template class Vector<float>;
template class Vector<double>;
template class SubVector<float>;
template class SubVector<double>;
}
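As a quick illustration of the mask semantics implemented above, a hypothetical usage sketch (not part of the committed sources): two SubVector views share the memory of one Vector, so no copies are made.

#include "dnvector.h"

int main()
{
	numerics::Vector<double> v(10);
	v.fill_rand(0.0, 1.0);	// uniform samples in [0, 1)

	numerics::SubVector<double> head(v, 0, 5);	// view of v[0..4], no allocation
	numerics::SubVector<double> tail(v, 5, 5);	// view of v[5..9]

	double s = head.dot(tail);	// operates directly on v's memory
	(void)s;
	return 0;
}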
@ -0,0 +1,150 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#pragma once

#include <vector>
#include <cstdint>

namespace numerics {

template <typename T>
class Vector
{
protected:
	size_t _length;		// Length of vector
	T *_elements;		// Pointer to memory storing the elements
	bool _need_cleanup;	// Indicator to delete[] elements

	// Default constructor for creating the SubVector class (masks only)
	Vector() = default;

public:
	Vector(const size_t n);			// construct all-zero vector of length n
	Vector(const Vector<T> &v);		// copy constructor
	// the following two constructors avoid the need for a second template parameter
	Vector(const std::vector<float> &v);	// construct from std::vector<float>
	Vector(const std::vector<double> &v);	// construct from std::vector<double>
	Vector(const Vector<T> &v, const std::vector<size_t> &indices);
	// Destructor, also called by derived class SubVector without cleaning up memory
	~Vector();

	std::vector<T> to_vector();	// convert to std::vector<T>, using C++11 std::move()
	// find pre-permutation vector v[p[i]] = this[i]
	void pre_permute(const std::vector<size_t> &p, Vector<T> &v);
	// find post-permutation vector v[i] = this[p[i]]
	void post_permute(const std::vector<size_t> &p, Vector<T> &v);

	// Do not allow the copy-assignment operator due to SubVector cleanup issues!
	Vector<T>& operator=(const Vector<T> &v) = delete;

	// Return length of vector
	inline size_t length() const { return _length; }
	// Return a direct pointer to the data memory (e.g., used as an MPI buffer)
	inline T* data() const { return _elements; }

	// Overload operator [] with 0-based indexing, inlined for speed
	inline T& operator[] (const size_t i) { return _elements[i]; }
	inline const T& operator[] (const size_t i) const { return _elements[i]; }
	// To remove compilation warning of converting int64_t to size_t when using OpenMP
	inline T& operator[] (const int64_t i) { return _elements[i]; }
	inline const T& operator[] (const int64_t i) const { return _elements[i]; }

	// Copy from (part of a longer) vector v
	void copy(const Vector<T> &v, const size_t start_idx=0);
	// vector concatenation: z[] = [ x[]; y[] ]
	void concat(const Vector<T> &x, const Vector<T> &y);
	// split vector into two: z[] = [ x[]; y[] ]
	void split(Vector<T> &x, Vector<T> &y) const;
	// Set every element to zero
	void fill_zero();
	// Fill Vector with random numbers in the interval [a, b)
	void fill_rand(const T a, const T b);
	// Fill every element with a constant
	void fill(const T c);
	// Add a constant to each element
	void add(const T c);

	// Return sum of elements
	T sum() const;
	// Return the maximum element
	T max() const;
	// Return the minimum element
	T min() const;
	// Return 1-norm of the vector
	T norm1() const;
	// Return 2-norm (Euclidean norm) of the vector
	T norm2() const;
	// Return the infinity norm
	T normInf() const;

	// Return a vector of signs in {-1, 0, +1}
	void sign(Vector<T> &s) const;
	// Convert elements to signs in {-1, 0, +1}
	void to_sign();
	// Count number of positive elements
	size_t count_positive() const;
	// Count number of negative elements
	size_t count_negative() const;

	// Scale itself by a constant alpha
	void scale(const T alpha);
	// Return inner product with vector x
	T dot(const Vector<T> &x) const;
	static T dot(const Vector<T> &x, const Vector<T> &y);
	// y = alpha * x + y
	void axpy(const T alpha, const Vector<T> &x);
	static void axpy(const T alpha, const Vector<T> &x, Vector<T> &y);
	// this = alpha * x + y  (disabled: calls always resolve to the static overload above!)
	//void axpy(const T alpha, const Vector<T> &x, const Vector<T> &y);

	// Elementwise absolute value: vabs[i] = abs( v[i] )
	void elem_abs(Vector<T> &vabs) const;
	// Elementwise inversion: vinv[i] = 1/v[i]
	void elem_inv(Vector<T> &vinv) const;
	// Elementwise square root: vsqrt[i] = sqrt( v[i] )
	void elem_sqrt(Vector<T> &vsqrt) const;
	// Elementwise exponential: vexp[i] = exp( v[i] )
	void elem_exp(Vector<T> &vexp) const;
	// Elementwise natural logarithm: vlog[i] = log( v[i] )
	void elem_log(Vector<T> &vlog) const;
	// Elementwise exponential minus 1: vexpm1[i] = exp(v[i]) - 1
	void elem_expm1(Vector<T> &vexpm1) const;
	// Elementwise natural logarithm of 1 plus: vlog1p[i] = log( 1 + v[i] )
	void elem_log1p(Vector<T> &vlog1p) const;
	// Elementwise hinge loss: h[i] = max{0, 1 - yXw[i]}
	void elem_hinge(Vector<T> &h) const;
	// Elementwise clip: b[i] = min( ub, max(lb, a[i]) )
	void elem_clip(const T lb, const T ub, Vector<T> &b) const;

	// Elementwise addition: z[i] = x[i] + y[i]
	static void elem_add(const Vector<T> &x, const Vector<T> &y, Vector<T> &z);
	// Elementwise subtraction: z[i] = x[i] - y[i]
	static void elem_subtract(const Vector<T> &x, const Vector<T> &y, Vector<T> &z);
	// Elementwise multiplication: z[i] = x[i] * y[i]
	static void elem_multiply(const Vector<T> &x, const Vector<T> &y, Vector<T> &z);
	// Elementwise division: z[i] = x[i] / y[i]
	static void elem_divide(const Vector<T> &x, const Vector<T> &y, Vector<T> &z);
	// Elementwise min: z[i] = min( x[i], y[i] )
	static void elem_min(const Vector<T> &x, const Vector<T> &y, Vector<T> &z);
	// Elementwise max: z[i] = max( x[i], y[i] )
	static void elem_max(const Vector<T> &x, const Vector<T> &y, Vector<T> &z);
};


// This class defines a mask for a contiguous part of a Vector
// !!! Memory problem if using a SubVector after the referred Vector is destroyed!
template <typename T>
class SubVector final : public Vector<T>
{
public:
	// Use 0-based indexing for the starting index
	SubVector(const Vector<T> &v, const size_t start_idx, const size_t sub_length);
	void recast(const Vector<T> &v, const size_t start_idx, const size_t sub_length);

	// no memory allocated, so do not allow copy constructor and copy-assignment operator
	SubVector() = delete;
	SubVector(const SubVector<T> &) = delete;
	SubVector& operator=(const SubVector<T> &) = delete;
};
}
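A short, hypothetical round trip through the interface above (a sketch, not from the commit): construct from std::vector, apply the static axpy, and read off norms.

#include <iostream>
#include <vector>
#include "dnvector.h"

int main()
{
	std::vector<double> raw{ 1.0, -2.0, 3.0 };
	numerics::Vector<double> x(raw), y(raw);

	numerics::Vector<double>::axpy(2.0, x, y);	// y = 2*x + y, i.e., 3*raw
	std::cout << y.norm1() << "\n";	// prints 18 = 3*(1+2+3)
	std::cout << y.normInf() << "\n";	// prints 9
	return 0;
}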
@ -0,0 +1,23 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#pragma once

#include <type_traits>

// for convenience in using conditional MPI types
#define DOUBLE_VECTOR true
#define DOUBLE_MATRIX true

using floatT = std::conditional<DOUBLE_VECTOR, double, float>::type;
using spmatT = std::conditional<DOUBLE_MATRIX, double, float>::type;

// otherwise one can simply use the following
//using floatT = double;
//using spmatT = float;

#if DOUBLE_VECTOR
#define MPI_VECTOR_TYPE MPI_DOUBLE
#else
#define MPI_VECTOR_TYPE MPI_FLOAT
#endif
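The macro pair keeps the element type and the MPI datatype in sync; a hedged sketch of the intended pattern (the helper name is ours, the MPI calls are standard):

#include <mpi.h>
#include "floattype.h"
#include "dnvector.h"

// Sum a replicated vector across all ranks, in place.
void allreduce_sum(numerics::Vector<floatT> &v, MPI_Comm comm)
{
	// MPI_VECTOR_TYPE matches floatT by construction (MPI_DOUBLE or MPI_FLOAT)
	MPI_Allreduce(MPI_IN_PLACE, v.data(), (int)v.length(), MPI_VECTOR_TYPE, MPI_SUM, comm);
}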
@ -0,0 +1,740 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <assert.h>
#include <math.h>
#include <omp.h>
#include <cstdint>
#include <iostream>

// OpenMP 3.0 supports unsigned for-loop iterator, so can replace std::int64_t by size_t
#include "functions.h"

using std::int64_t;

namespace functions
{
	// mse for linear regression (1/n)*||X*w - y||^2 where n is length of y
	floatT regression_mse(const SparseMatrixCSR &X, const Vector &y, const Vector &w)
	{
		assert(X.nrows() == y.length() && X.ncols() == w.length());
		Vector yp(y);
		X.aAxby(1, w, -1, yp);	// yp = X * w - y
		return pow(yp.norm2(), 2) / yp.length();
	}

	// return number of prediction errors where sign(Xi'*w) != yi
	size_t binary_error_count(const SparseMatrixCSR &X, const Vector &y, const Vector &w)
	{
		assert(X.nrows() == y.length() && X.ncols() == w.length());
		Vector yp(y.length());
		X.aAxby(1, w, 0, yp);	// yp = X * w
		Vector::elem_multiply(y, yp, yp);
		return yp.count_negative();
	}

	// return error rate: number of errors (with sign(Xi'*w) != yi) divided by length of y
	floatT binary_error_rate(const SparseMatrixCSR &X, const Vector &y, const Vector &w)
	{
		assert(X.nrows() == y.length() && X.ncols() == w.length());
		Vector yp(y.length());
		X.aAxby(1, w, 0, yp);	// yp = X * w
		yp.to_sign();	// only keep signs in {-1, 0, +1}
		return (1 - yp.dot(y) / y.length()) / 2;
	}
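The closed form in binary_error_rate uses the fact that labels and signed predictions live in {-1, 0, +1}: with n_c correct and n_e erroneous nonzero signs among n examples,

\[ \hat{y}^\top y = n_c - n_e = n - 2 n_e \quad\Longrightarrow\quad \frac{n_e}{n} = \frac{1}{2}\left(1 - \frac{\hat{y}^\top y}{n}\right), \]

which is the returned expression; a zero prediction contributes half an error.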

	// Member functions of SquareLoss class: f(t) = (1/2)*(t-b)^2 ---------------------
	floatT SquareLoss::loss(const floatT Xiw, const floatT yi) const
	{
		return 0.5f * pow(Xiw - yi, 2);
	}

	void SquareLoss::loss(const Vector &Xw, const Vector &y, Vector &loss) const
	{
		assert(Xw.length() == y.length() && y.length() == loss.length());
		size_t len = y.length();

		#pragma omp parallel for
		for (int64_t i = 0; i < len; i++)
		{
			loss[i] = 0.5f * pow(Xw[i] - y[i], 2);
		}
	}

	// return F(Xw) = sum_i 0.5*(Xw[i]-y[i])^2
	floatT SquareLoss::sum_loss(const Vector &Xw, const Vector &y) const
	{
		assert(Xw.length() == y.length());
		size_t len = y.length();
		floatT sum = 0;

		#pragma omp parallel for reduction(+:sum)
		for (int64_t i = 0; i < len; i++)
		{
			sum += 0.5f * pow(Xw[i] - y[i], 2);
		}
		return sum;
	}

	// return the theta-weighted sum F(Xw) = sum_i theta[i]*0.5*(Xw[i]-y[i])^2
	floatT SquareLoss::sum_loss(const Vector &theta, const Vector &Xw, const Vector &y) const
	{
		assert(Xw.length() == y.length() && theta.length() == y.length());
		size_t len = y.length();
		floatT sum = 0;

		#pragma omp parallel for reduction(+:sum)
		for (int64_t i = 0; i < len; i++)
		{
			sum += 0.5f * pow(Xw[i] - y[i], 2) * theta[i];
		}
		return sum;
	}

	// return di = f'(Xiw) = Xiw - yi
	floatT SquareLoss::derivative(const floatT Xiw, const floatT yi) const
	{
		return Xiw - yi;
	}

	// return d = F'(Xw) = Xw - y
	void SquareLoss::derivative(const Vector &Xw, const Vector &y, Vector &d) const
	{
		assert(Xw.length() == y.length() && y.length() == d.length());
		Vector::elem_subtract(Xw, y, d);
	}

	floatT SquareLoss::second_derivative(const floatT Xiw, const floatT yi) const
	{
		return 1;
	}

	void SquareLoss::second_derivative(const Vector &Xw, const Vector &y, Vector &d) const
	{
		assert(Xw.length() == y.length() && y.length() == d.length());
		d.fill(1);
	}

	floatT SquareLoss::conjugate(const floatT ai, const floatT yi) const
	{
		return 0.5f * ai*ai + yi*ai;
	}

	void SquareLoss::conjugate(const Vector &a, const Vector &y, Vector &conj) const
	{
		assert(a.length() == y.length() && y.length() == conj.length());
		size_t len = y.length();

		#pragma omp parallel for
		for (int64_t i = 0; i < len; i++)
		{
			conj[i] = 0.5f * a[i] * a[i] + y[i] * a[i];
		}
	}

	// return f*(a) = (1/2)*||a||^2 + a'*y
	floatT SquareLoss::sum_conjugate(const Vector &a, const Vector &y) const
	{
		assert(a.length() == y.length());
		return 0.5f * a.dot(a) + a.dot(y);
	}

	floatT SquareLoss::sum_conjugate(const Vector &theta, const Vector &a, const Vector &y) const
	{
		assert(a.length() == y.length() && theta.length() == y.length());
		size_t len = y.length();

		floatT sum = 0;
		#pragma omp parallel for reduction(+:sum)
		for (int64_t i = 0; i < len; i++)
		{
			sum += (0.5f * a[i] * a[i] + y[i] * a[i]) * theta[i];
		}
		return sum;
	}

	floatT SquareLoss::conj_derivative(const floatT ai, const floatT yi) const
	{
		return ai + yi;
	}

	void SquareLoss::conj_derivative(const Vector &a, const Vector &y, Vector &cd) const
	{
		assert(a.length() == y.length() && y.length() == cd.length());
		Vector::elem_add(a, y, cd);
	}

	// Return argmin_b {sigma*(f*)(b) + (1/2)*||b - a||_2^2}
	floatT SquareLoss::conj_prox(const floatT sigma, const floatT ai, const floatT yi) const
	{
		return (ai - sigma*yi) / (1 + sigma);
	}

	// Return argmin_b {sigma*(f*)(b) + (1/2)*||b - a||_2^2}
	void SquareLoss::conj_prox(const floatT sigma, const Vector &a, const Vector &y, Vector &u) const
	{
		assert(a.length() == y.length() && y.length() == u.length());
		size_t len = a.length();

		#pragma omp parallel for
		for (int64_t i = 0; i < len; i++)
		{
			u[i] = (a[i] - sigma*y[i]) / (1 + sigma);
		}
	}
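The closed form used by both conj_prox overloads above follows from first-order optimality of the scalar subproblem:

\[ \min_b \; \sigma\left(\tfrac{1}{2}b^2 + y_i b\right) + \tfrac{1}{2}(b - a_i)^2 \;\Longrightarrow\; \sigma(b + y_i) + (b - a_i) = 0 \;\Longrightarrow\; b = \frac{a_i - \sigma y_i}{1 + \sigma}. \]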

	// Member functions of LogisticLoss class ------------------------------------------

	LogisticLoss::LogisticLoss(const floatT deltabd, const floatT epsilon, const int maxiter)
	{
		assert(deltabd > 0 && deltabd < 1 && epsilon > 0 && maxiter > 0);
		_deltabd = deltabd;
		_epsilon = epsilon;
		_maxiter = maxiter;

		// conjugate function value when the dual variable is within deltabd of the boundary of [-1,0]
		_xentrpy = deltabd*log(deltabd) + (1 - deltabd)*log1p(-deltabd);
		_xderiva = log1p(-deltabd) - log(deltabd);
	}

	floatT LogisticLoss::loss(const floatT Xiw, const floatT yi) const
	{
		floatT yxwi = yi * Xiw;
		return yxwi > 50 ? 0 : yxwi < -50 ? -yxwi : log1p(exp(-yxwi));
	}

	void LogisticLoss::loss(const Vector &Xw, const Vector &y, Vector &loss) const
	{
		assert(Xw.length() == y.length() && y.length() == loss.length());
		size_t len = y.length();

		#pragma omp parallel for
		for (int64_t i = 0; i < len; i++)
		{
			loss[i] = this->loss(Xw[i], y[i]);
		}
	}

	// Return sum_i log(1+exp(-y[i]*Xw[i])), parallelized with OpenMP
	floatT LogisticLoss::sum_loss(const Vector &Xw, const Vector &y) const
	{
		assert(Xw.length() == y.length());
		size_t len = y.length();
		double sum = 0;

		#pragma omp parallel for reduction(+:sum)
		for (int64_t i = 0; i < len; i++)
		{
			sum += this->loss(Xw[i], y[i]);
		}
		return floatT(sum);
	}

	// theta-weighted version: sum_i theta[i]*log(1+exp(-y[i]*Xw[i]))
	floatT LogisticLoss::sum_loss(const Vector &theta, const Vector &Xw, const Vector &y) const
	{
		assert(Xw.length() == y.length() && theta.length() == y.length());
		size_t len = y.length();
		double sum = 0;

		#pragma omp parallel for reduction(+:sum)
		for (int64_t i = 0; i < len; i++)
		{
			sum += this->loss(Xw[i], y[i]) * theta[i];
		}
		return floatT(sum);
	}

	floatT LogisticLoss::derivative(const floatT Xiw, const floatT yi) const
	{
		return -yi / (1 + exp(yi*Xiw));
	}

	// d[i] = -y[i]/(1+exp(y[i]*Xw[i])) so that the gradient is expressed as sum_i d[i]*X[i,:]
	void LogisticLoss::derivative(const Vector &Xw, const Vector &y, Vector &d) const
	{
		assert(Xw.length() == y.length() && Xw.length() == d.length());
		size_t len = y.length();

		#pragma omp parallel for
		for (int64_t i = 0; i < len; i++)
		{
			d[i] = -y[i] / (1 + exp(y[i] * Xw[i]));
		}
	}

	floatT LogisticLoss::second_derivative(const floatT Xiw, const floatT yi) const
	{
		return 1.0 / (2 + exp(yi*Xiw) + exp(-yi*Xiw));
	}

	// d[i] = 1/(2+exp(y[i]*Xw[i])+exp(-y[i]*Xw[i])), the diagonal curvature weights of the Hessian
	void LogisticLoss::second_derivative(const Vector &Xw, const Vector &y, Vector &d) const
	{
		assert(Xw.length() == y.length() && Xw.length() == d.length());
		size_t len = y.length();

		#pragma omp parallel for
		for (int64_t i = 0; i < len; i++)
		{
			d[i] = 1.0 / (2 + exp(y[i] * Xw[i]) + exp(-y[i] * Xw[i]));
		}
	}

	bool LogisticLoss::conj_feasible(const Vector &a, const Vector &y) const
	{
		assert(a.length() == y.length());
		//Vector ya(y.length());
		//Vector::elem_multiply(y, a, ya);
		//return (ya.max() <= 0 && ya.min() >= -1);

		size_t len = y.length();
		for (size_t i = 0; i < len; i++)
		{
			if (!this->conj_feasible(a[i], y[i])) { return false; }
		}
		return true;
	}

	floatT LogisticLoss::conjugate(const floatT ai, const floatT yi) const
	{
		// Need to check that every y_i*a_i belongs to [-1, 0], but can skip if initialized this way
		floatT t = yi*ai;
		assert(t >= -1 && t <= 0);
		return t > -_deltabd ? _xentrpy : t < -1.0 + _deltabd ? _xentrpy : (-t) * log(-t) + (1.0 + t) * log1p(t);
	}

	void LogisticLoss::conjugate(const Vector &a, const Vector &y, Vector &conj) const
	{
		assert(a.length() == y.length() && y.length() == conj.length());
		size_t len = y.length();

		#pragma omp parallel for
		for (int64_t i = 0; i < len; i++)
		{
			conj[i] = this->conjugate(a[i], y[i]);
		}
	}

	// Return (f*)(a) = sum_i (-ya[i])*log(-ya[i]) + (1+ya[i])*log(1+ya[i]) with ya[i] in [-1, 0]
	floatT LogisticLoss::sum_conjugate(const Vector &a, const Vector &y) const
	{
		assert(a.length() == y.length());
		size_t len = a.length();
		double sum = 0;

		#pragma omp parallel for reduction(+:sum)
		for (int64_t i = 0; i < len; i++)
		{
			sum += this->conjugate(a[i], y[i]);
		}
		return floatT(sum);
	}

	floatT LogisticLoss::sum_conjugate(const Vector &theta, const Vector &a, const Vector &y) const
	{
		assert(a.length() == y.length() && theta.length() == y.length());
		size_t len = a.length();
		double sum = 0;

		#pragma omp parallel for reduction(+:sum)
		for (int64_t i = 0; i < len; i++)
		{
			sum += this->conjugate(a[i], y[i]) * theta[i];
		}
		return floatT(sum);
	}

	floatT LogisticLoss::conj_derivative(const floatT ai, const floatT yi) const
	{
		floatT t = yi*ai;
		assert(t >= -1 && t <= 0);
		return t > -_deltabd ? yi*_xderiva : t < -1.0 + _deltabd ? -yi*_xderiva : yi*(log1p(t) - log(-t));
	}

	void LogisticLoss::conj_derivative(const Vector &a, const Vector &y, Vector &cd) const
	{
		assert(a.length() == y.length() && y.length() == cd.length());
		size_t len = y.length();

		#pragma omp parallel for
		for (int64_t i = 0; i < len; i++)
		{
			cd[i] = this->conj_derivative(a[i], y[i]);
		}
	}

	// Return argmin_b {sigma*(f*)(b) + (1/2)*(b - a)^2}
	floatT LogisticLoss::conj_prox(const floatT sigma, const floatT ai, const floatT yi) const
	{
		// using double here is critical for numerical stability
		double lb = _deltabd - 1.0, ub = -_deltabd;
		double epsilon = _epsilon;
		int maxiter = _maxiter;

		double yai = ai*yi;	// convert the sign first so one code path handles both labels
		double bi = fmin(ub, fmax(lb, yai));
		double f = 0, df = 0;
		for (int k = 0; k < maxiter; ++k)
		{
			f = bi - yai + sigma * log((1.0 + bi) / (-bi));
			if (fabs(f) < epsilon) break;
			df = 1.0 - sigma / (bi * (1.0 + bi));
			bi -= f / df;
			bi = fmin(ub, fmax(lb, bi));	// projecting back into (-1, 0) is critical for convergence
		}
		return floatT(bi * yi);
	}
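For reference, the residual f and its derivative df in the Newton loop above are exactly the optimality condition of the prox subproblem in the sign-normalized variable b = y_i b':

\[ g(b) = b - y_i a_i + \sigma \log\frac{1+b}{-b}, \qquad g'(b) = 1 - \frac{\sigma}{b(1+b)}. \]

Since b is kept in (-1, 0), we have b(1+b) < 0 and hence g'(b) >= 1, so the clipped Newton step is always well defined.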

	// Return argmin_b {sigma*(f*)(b) + (1/2)*||b - a||_2^2}
	void LogisticLoss::conj_prox(const floatT sigma, const Vector &a, const Vector &y, Vector &b) const
	{
		assert(a.length() == y.length() && y.length() == b.length());
		size_t len = y.length();

		#pragma omp parallel for
		for (int64_t i = 0; i < len; i++)
		{
			b[i] = this->conj_prox(sigma, a[i], y[i]);
		}
	}

	// Member functions of SmoothedHinge class ------------------------------------------

	floatT SmoothedHinge::loss(const floatT Xiw, const floatT yi) const
	{
		floatT yXwi = yi * Xiw;
		floatT t = 1 - yXwi;
		return yXwi >= 1 ? 0 : t >= _delta ? t - _delta / 2 : t*t / (2 * _delta);
	}

	void SmoothedHinge::loss(const Vector &Xw, const Vector &y, Vector &loss) const
	{
		assert(Xw.length() == y.length() && y.length() == loss.length());
		size_t len = y.length();

		#pragma omp parallel for
		for (int64_t i = 0; i < len; i++)
		{
			loss[i] = this->loss(Xw[i], y[i]);
		}
	}

	floatT SmoothedHinge::sum_loss(const Vector &Xw, const Vector &y) const
	{
		assert(Xw.length() == y.length());
		size_t len = y.length();
		floatT sum = 0;

		#pragma omp parallel for reduction(+:sum)
		for (int64_t i = 0; i < len; ++i)
		{
			sum += this->loss(Xw[i], y[i]);
		}
		return sum;
	}

	floatT SmoothedHinge::sum_loss(const Vector &theta, const Vector &Xw, const Vector &y) const
	{
		assert(Xw.length() == y.length() && theta.length() == y.length());
		size_t len = y.length();
		floatT sum = 0;

		#pragma omp parallel for reduction(+:sum)
		for (int64_t i = 0; i < len; ++i)
		{
			sum += this->loss(Xw[i], y[i]) * theta[i];
		}
		return sum;
	}

	floatT SmoothedHinge::derivative(const floatT Xiw, const floatT yi) const
	{
		floatT yXwi = yi * Xiw;
		floatT di = yXwi > 1 ? 0 : yXwi < 1 - _delta ? -1 : (yXwi - 1) / _delta;
		return yi*di;
	}

	// Compute d[i] so that the gradient is expressed as sum_i d[i]*X[i,:]
	void SmoothedHinge::derivative(const Vector &Xw, const Vector &y, Vector &d) const
	{
		assert(Xw.length() == y.length() && Xw.length() == d.length());
		size_t len = y.length();

		#pragma omp parallel for
		for (int64_t i = 0; i < len; i++)
		{
			d[i] = this->derivative(Xw[i], y[i]);
		}
	}

	floatT SmoothedHinge::second_derivative(const floatT Xiw, const floatT yi) const
	{
		floatT yXwi = yi * Xiw;
		floatT di = yXwi > 1 ? 0 : yXwi < 1 - _delta ? 0 : 1.0 / _delta;
		return di;	// note yi*yi == 1 for labels in {-1,+1}, so no extra factor of yi appears
	}

	// Compute the diagonal curvature weights d[i] of the Hessian sum_i d[i]*X[i,:]'*X[i,:]
	void SmoothedHinge::second_derivative(const Vector &Xw, const Vector &y, Vector &d) const
	{
		assert(Xw.length() == y.length() && Xw.length() == d.length());
		size_t len = y.length();

		#pragma omp parallel for
		for (int64_t i = 0; i < len; i++)
		{
			d[i] = this->second_derivative(Xw[i], y[i]);
		}
	}

	bool SmoothedHinge::conj_feasible(const Vector &a, const Vector &y) const
	{
		assert(a.length() == y.length());
		//Vector ya(y.length());
		//Vector::elem_multiply(y, a, ya);
		//return (ya.max() <= 0 && ya.min() >= -1);

		size_t len = y.length();
		for (size_t i = 0; i < len; i++)
		{
			if (!this->conj_feasible(a[i], y[i])) { return false; }
		}
		return true;
	}

	floatT SmoothedHinge::conjugate(const floatT ai, const floatT yi) const
	{
		floatT yai = yi*ai;
		assert(yai >= -1 && yai <= 0);
		return yai + (_delta / 2)*(yai*yai);
	}

	void SmoothedHinge::conjugate(const Vector &a, const Vector &y, Vector &conj) const
	{
		assert(a.length() == y.length() && y.length() == conj.length());
		size_t len = y.length();

		#pragma omp parallel for
		for (int64_t i = 0; i < len; i++)
		{
			conj[i] = this->conjugate(a[i], y[i]);
		}
	}

	floatT SmoothedHinge::sum_conjugate(const Vector &a, const Vector &y) const
	{
		assert(a.length() == y.length());
		size_t len = y.length();
		floatT sum = 0;

		#pragma omp parallel for reduction(+:sum)
		for (int64_t i = 0; i < len; i++)
		{
			floatT yai = y[i] * a[i];
			assert(yai >= -1 && yai <= 0);
			sum += yai + (_delta / 2)*(yai*yai);
			//sum += this->conjugate(a[i], y[i]);
		}
		return sum;
	}

	floatT SmoothedHinge::sum_conjugate(const Vector &theta, const Vector &a, const Vector &y) const
	{
		assert(a.length() == y.length() && theta.length() == y.length());
		size_t len = y.length();
		floatT sum = 0;

		#pragma omp parallel for reduction(+:sum)
		for (int64_t i = 0; i < len; i++)
		{
			floatT yai = y[i] * a[i];
			assert(yai >= -1 && yai <= 0);
			sum += (yai + (_delta / 2)*(yai*yai)) * theta[i];
			//sum += this->conjugate(a[i], y[i]) * theta[i];
		}
		return sum;
	}

	floatT SmoothedHinge::conj_derivative(const floatT ai, const floatT yi) const
	{
		floatT yai = yi*ai;
		assert(yai >= -1 && yai <= 0);
		return _delta*ai + yi;
	}

	void SmoothedHinge::conj_derivative(const Vector &a, const Vector &y, Vector &cd) const
	{
		assert(a.length() == y.length() && y.length() == cd.length());
		cd.copy(y);
		Vector::axpy(_delta, a, cd);
	}

	// Return argmin_b {sigma*(f*)(b) + (1/2)*||b - a||_2^2}
	floatT SmoothedHinge::conj_prox(const floatT sigma, const floatT ai, const floatT yi) const
	{
		// should not assert yi*ai in [-1, 0] here because ai can be arbitrary
		floatT bi = (yi*ai - sigma) / (1 + sigma*_delta);
		return yi*(bi > 0 ? 0 : bi < -1 ? -1 : bi);
	}

	// Return argmin_b {sigma*(f*)(b) + (1/2)*||b - a||_2^2}
	void SmoothedHinge::conj_prox(const floatT sigma, const Vector &a, const Vector &y, Vector &b) const
	{
		assert(a.length() == y.length() && y.length() == b.length());
		size_t len = y.length();
		floatT delta = _delta;

		// sigma listed explicitly: const variables are no longer predetermined shared in OpenMP 4.0+
		#pragma omp parallel for default(none) shared(a,y,b,len,delta,sigma)
		for (int64_t i = 0; i < len; i++)
		{
			floatT bi = (y[i]*a[i] - sigma) / (1 + sigma*delta);
			b[i] = y[i] * (bi > 0 ? 0 : bi < -1 ? -1 : bi);
		}
	}
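The closed form implemented in both conj_prox overloads above solves, in the sign-normalized variable b = y_i b',

\[ \min_{b \in [-1, 0]} \; \sigma\left(b + \tfrac{\delta}{2} b^2\right) + \tfrac{1}{2}(b - y_i a_i)^2 \;\Longrightarrow\; b = \frac{y_i a_i - \sigma}{1 + \sigma\delta}, \]

clipped to [-1, 0] before the sign is restored by multiplying with y_i.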

	// Member functions of SquaredL2Norm class ------------------------------------------

	floatT SquaredL2Norm::operator()(const Vector &w) const
	//floatT SquaredL2Norm::value(const Vector &w) const
	{
		return 0.5f * pow(w.norm2(), 2);
	}

	// Compute f*(v) = max_w {v'*w - f(w)}
	floatT SquaredL2Norm::conjugate(const Vector &v) const
	{
		return 0.5f * pow(v.norm2(), 2);
	}

	// Compute f*(v) = max_w {v'*w - f(w)}, with the maximizing w already computed
	floatT SquaredL2Norm::conjugate(const Vector &v, const Vector &w) const
	{
		assert(v.length() == w.length());
		return this->conjugate(v);
	}

	// prox() operator computes u = argmin_u { tau*f(u) + (1/2)*||u-w||_2^2 }
	void SquaredL2Norm::prox(const floatT tau, const Vector &w, Vector &u) const
	{
		assert(w.length() == u.length());
		if (&w != &u) { u.copy(w); }
		u.scale(1 / (1 + tau));
	}

	// Given vector v, compute g = argmax_w {v'*w - f(w)} = argmin_w {-v'*w + f(w)}
	void SquaredL2Norm::conj_grad(const Vector &v, Vector &w) const
	{
		assert(v.length() == w.length());
		if (&w != &v) { w.copy(v); }
	}

	// Member functions of ElasticNet class ------------------------------------------

	//floatT ElasticNet::value(const Vector &w) const
	floatT ElasticNet::operator()(const Vector &w) const
	{
		return 0.5f * pow(w.norm2(), 2) + _lambda1 * w.norm1();
	}

	// Compute f*(v) = max_w {v'*w - f(w)}
	floatT ElasticNet::conjugate(const Vector &v) const
	{
		Vector w(v.length());
		this->conj_grad(v, w);
		return v.dot(w) - (*this)(w);
	}

	// Compute f*(v) = max_w {v'*w - f(w)}, with the maximizing w already computed
	floatT ElasticNet::conjugate(const Vector &v, const Vector &w) const
	{
		assert(v.length() == w.length());
		return v.dot(w) - (*this)(w);
	}

	// Compute u = argmin_u { tau*f(u) + (1/2)*||u-w||_2^2 }
	// Note that u and w may be the same Vector, that is, &w == &u is allowed
	void ElasticNet::prox(const floatT tau, const Vector &w, Vector &u) const
	{
		assert(w.length() == u.length());
		floatT tl2 = 1 + tau;
		floatT tl1 = tau * _lambda1;
		size_t len = w.length();

		#pragma omp parallel for
		for (int64_t i = 0; i < len; i++)
		{
			floatT vneg = w[i] + tl1;
			floatT vpos = w[i] - tl1;
			u[i] = vpos > 0 ? vpos / tl2 : vneg < 0 ? vneg / tl2 : 0;
		}
	}
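ElasticNet::prox above is soft-thresholding followed by uniform shrinkage: with t_1 = \tau\lambda_1,

\[ u_i = \frac{\operatorname{sign}(w_i)\,\max(|w_i| - t_1,\; 0)}{1 + \tau}, \]

which is exactly what the two-branch ternary expression computes.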

	// Given vector v, compute g = argmax_w {v'*w - f(w)} = argmin_w {-v'*w + f(w)}
	void ElasticNet::conj_grad(const Vector &v, Vector &w) const
	{
		size_t len = v.length();
		floatT lambda1 = _lambda1;

		#pragma omp parallel for
		for (int64_t i = 0; i < len; i++)
		{
			floatT vneg = v[i] + lambda1;
			floatT vpos = v[i] - lambda1;
			w[i] = vpos > 0 ? vpos : vneg < 0 ? vneg : 0;
		}
	}

	// Member functions of OffsetQuadratic class ------------------------------------------

	OffsetQuadratic::OffsetQuadratic(const size_t dim)
	{
		_w0 = new Vector(dim);
		_dw = new Vector(dim);
	}

	OffsetQuadratic::~OffsetQuadratic()
	{
		if (_w0 != nullptr) delete _w0;
		if (_dw != nullptr) delete _dw;
	}

	floatT OffsetQuadratic::operator()(const Vector &w) const
	{
		Vector::elem_subtract(w, *_w0, *_dw);
		//Vector _dw(w);
		//Vector::axpy(-1, *_w0, *_dw);
		return 0.5f * pow(_dw->norm2(), 2);
	}

	floatT OffsetQuadratic::conjugate(const Vector &v) const
	{
		return 0.5f * pow(v.norm2(), 2) + v.dot(*_w0);
	}

	floatT OffsetQuadratic::conjugate(const Vector &v, const Vector &w) const
	{
		assert(v.length() == w.length());
		// could return v.dot(w) - (*this)(w);
		// but it is easier to call the one-argument version directly
		return this->conjugate(v);
	}

	void OffsetQuadratic::prox(const floatT tau, const Vector &w, Vector &u) const
	{
		assert(w.length() == u.length());
		if (&w != &u) { u.copy(w); }
		Vector::axpy(tau, *_w0, u);
		u.scale(1.0f / (1.0f + tau));
	}

	// Given vector v, compute g = argmax_w {v'*w - f(w)} = argmin_w {-v'*w + f(w)}
	// this version does not exploit sparsity and can be computationally very expensive;
	// see the efficient implementation in sdca
	void OffsetQuadratic::conj_grad(const Vector &v, Vector &w) const
	{
		assert(v.length() == w.length());
		//if (&w != &v) { w.copy(v); }
		//Vector::axpy(1.0f, *_w0, w);
		Vector::elem_add(*_w0, v, w);
	}
}
@ -0,0 +1,224 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#pragma once

#include <assert.h>
#include <stdexcept>

#include "floattype.h"
#include "dnvector.h"
#include "spmatrix.h"

namespace functions
{
	using Vector = numerics::Vector<floatT>;
	using SubVector = numerics::SubVector<floatT>;
	using SparseMatrixCSR = numerics::SparseMatrixCSR<spmatT, floatT>;
	using SubSparseMatrixCSR = numerics::SubSparseMatrixCSR<spmatT, floatT>;

	// functions to compute performance of linear regression and binary classification
	floatT regression_mse(const SparseMatrixCSR &X, const Vector &y, const Vector &w);
	floatT binary_error_rate(const SparseMatrixCSR &X, const Vector &y, const Vector &w);
	size_t binary_error_count(const SparseMatrixCSR &X, const Vector &y, const Vector &w);

	// An abstract class to serve as the interface for sums of parametrized smooth loss functions
	class ISmoothLoss
	{
	public:
		ISmoothLoss() = default;
		virtual char symbol() const = 0;
		virtual floatT smoothness() const = 0;	// smoothness parameter of a single component function
		virtual floatT loss(const floatT Xiw, const floatT yi) const = 0;
		// the vector form of loss() is provided to facilitate weighted sums of losses
		virtual void loss(const Vector &Xw, const Vector &y, Vector &loss) const = 0;
		// could also add weighted_sum_loss(), but it can be done by dot(weights, loss)
		virtual floatT sum_loss(const Vector &Xw, const Vector &y) const = 0;
		virtual floatT sum_loss(const Vector &theta, const Vector &Xw, const Vector &y) const = 0;
		// derivative() is defined such that the gradient is simply expressed as sum_i d[i]*X[i,:]
		virtual floatT derivative(const floatT Xiw, const floatT yi) const = 0;
		virtual void derivative(const Vector &Xw, const Vector &y, Vector &d) const = 0;
		virtual floatT second_derivative(const floatT Xiw, const floatT yi) const = 0;
		virtual void second_derivative(const Vector &Xw, const Vector &y, Vector &d) const = 0;
		virtual bool conj_feasible(const floatT ai, const floatT yi) const = 0;
		virtual bool conj_feasible(const Vector &a, const Vector &y) const = 0;
		virtual floatT conjugate(const floatT ai, const floatT yi) const = 0;
		// the vector form of conjugate() is provided to facilitate weighted sums of losses
		virtual void conjugate(const Vector &a, const Vector &y, Vector &conj) const = 0;
		virtual floatT sum_conjugate(const Vector &a, const Vector &y) const = 0;
		virtual floatT sum_conjugate(const Vector &theta, const Vector &a, const Vector &y) const = 0;
		virtual floatT conj_derivative(const floatT ai, const floatT yi) const = 0;
		virtual void conj_derivative(const Vector &a, const Vector &y, Vector &cd) const = 0;
		virtual floatT conj_prox(const floatT sigma, const floatT ai, const floatT yi) const = 0;
		virtual void conj_prox(const floatT sigma, const Vector &a, const Vector &y, Vector &u) const = 0;
	};

	// f_y(t) = (1/2)*(t-y)^2 where y is a real number for linear regression
	class SquareLoss : public ISmoothLoss
	{
	public:
		SquareLoss() {};
		inline char symbol() const override { return '2'; };
		inline floatT smoothness() const override { return 1.0f; };	// smoothness of (1/2)*(x-b)^2
		floatT loss(const floatT Xiw, const floatT yi) const override;
		void loss(const Vector &Xw, const Vector &y, Vector &loss) const override;
		floatT sum_loss(const Vector &Xw, const Vector &y) const override;
		floatT sum_loss(const Vector &theta, const Vector &Xw, const Vector &y) const override;
		floatT derivative(const floatT Xiw, const floatT yi) const override;
		void derivative(const Vector &Xw, const Vector &y, Vector &d) const override;
		floatT second_derivative(const floatT Xiw, const floatT yi) const override;
		void second_derivative(const Vector &Xw, const Vector &y, Vector &d) const override;
		bool conj_feasible(const floatT ai, const floatT yi) const override { return true; };
		bool conj_feasible(const Vector &a, const Vector &y) const override { return true; };
		floatT conjugate(const floatT ai, const floatT yi) const override;
		void conjugate(const Vector &a, const Vector &y, Vector &conj) const override;
		floatT sum_conjugate(const Vector &a, const Vector &y) const override;
		floatT sum_conjugate(const Vector &theta, const Vector &a, const Vector &y) const override;
		floatT conj_derivative(const floatT ai, const floatT yi) const override;
		void conj_derivative(const Vector &a, const Vector &y, Vector &cd) const override;
		floatT conj_prox(const floatT sigma, const floatT ai, const floatT yi) const override;
		void conj_prox(const floatT sigma, const Vector &a, const Vector &y, Vector &u) const override;
	};

	// f_y(t) = log(1+exp(-y*t)) where y is +1 or -1 for binary classification
	class LogisticLoss : public ISmoothLoss
	{
	private:
		// need to test whether the _deltabd boundary trick is really necessary for numerical stability
		floatT _deltabd;	// boundary margin within [-1, 0] for taking logarithms
		floatT _epsilon;	// stopping accuracy for Newton's method
		int _maxiter;		// maximum iterations for Newton's method

		floatT _xentrpy;	// entropy when the dual variable is close to the boundary of [-1,0]
		floatT _xderiva;	// derivative of the conjugate function near the boundary

	public:
		// may not work as desired when floatT = float: 1 + 1e-12 == 1 in single precision!
		LogisticLoss(const floatT deltabd = 1.0e-12f, const floatT epsilon = 1.0e-6f, const int maxiter = 100);
		inline char symbol() const override { return 'l'; };
		inline floatT smoothness() const override { return 0.25; }	// smoothness of log(1+exp(-b*t))
		floatT loss(const floatT Xiw, const floatT yi) const override;
		void loss(const Vector &Xw, const Vector &y, Vector &loss) const override;
		floatT sum_loss(const Vector &Xw, const Vector &y) const override;
		floatT sum_loss(const Vector &theta, const Vector &Xw, const Vector &y) const override;
		floatT derivative(const floatT Xiw, const floatT yi) const override;
		void derivative(const Vector &Xw, const Vector &y, Vector &d) const override;
		floatT second_derivative(const floatT Xiw, const floatT yi) const override;
		void second_derivative(const Vector &Xw, const Vector &y, Vector &d) const override;
		bool conj_feasible(const floatT ai, const floatT yi) const override { return (yi*ai <= 0 && yi*ai >= -1); };
		bool conj_feasible(const Vector &a, const Vector &y) const override;
		floatT conjugate(const floatT ai, const floatT yi) const override;
		void conjugate(const Vector &a, const Vector &y, Vector &conj) const override;
		floatT sum_conjugate(const Vector &a, const Vector &y) const override;
		floatT sum_conjugate(const Vector &theta, const Vector &a, const Vector &y) const override;
		floatT conj_derivative(const floatT ai, const floatT yi) const override;
		void conj_derivative(const Vector &a, const Vector &y, Vector &cd) const override;
		floatT conj_prox(const floatT sigma, const floatT ai, const floatT yi) const override;
		void conj_prox(const floatT sigma, const Vector &a, const Vector &y, Vector &u) const override;
	};

	// smoothed hinge constructed by adding strong convexity to the conjugate
	class SmoothedHinge : public ISmoothLoss
	{
	private:
		floatT _delta;	// conjugate smoothing parameter

	public:
		SmoothedHinge(const floatT delta = 1) { assert(delta > 0); _delta = delta; };
		inline char symbol() const override { return 's'; };
		inline floatT smoothness() const override { return 1.0f / _delta; };	// smoothness by conjugate smoothing
		floatT loss(const floatT Xiw, const floatT yi) const override;
		void loss(const Vector &Xw, const Vector &y, Vector &loss) const override;
		floatT sum_loss(const Vector &Xw, const Vector &y) const override;
		floatT sum_loss(const Vector &theta, const Vector &Xw, const Vector &y) const override;
		floatT derivative(const floatT Xiw, const floatT yi) const override;
		void derivative(const Vector &Xw, const Vector &y, Vector &d) const override;
		floatT second_derivative(const floatT Xiw, const floatT yi) const override;
		void second_derivative(const Vector &Xw, const Vector &y, Vector &d) const override;
		bool conj_feasible(const floatT ai, const floatT yi) const override { return (yi*ai <= 0 && yi*ai >= -1); };
		bool conj_feasible(const Vector &a, const Vector &y) const override;
		floatT conjugate(const floatT ai, const floatT yi) const override;
		void conjugate(const Vector &a, const Vector &y, Vector &conj) const override;
		floatT sum_conjugate(const Vector &a, const Vector &y) const override;
		floatT sum_conjugate(const Vector &theta, const Vector &a, const Vector &y) const override;
		floatT conj_derivative(const floatT ai, const floatT yi) const override;
		void conj_derivative(const Vector &a, const Vector &y, Vector &cd) const override;
		floatT conj_prox(const floatT sigma, const floatT ai, const floatT yi) const override;
		void conj_prox(const floatT sigma, const Vector &a, const Vector &y, Vector &u) const override;
	};

	//================================================================================================
	// An abstract class to serve as the interface for convex regularization functions
	class IUnitRegularizer
	{
	private:
		Vector *_offset = nullptr;
	public:
		IUnitRegularizer() = default;
		virtual char symbol() const = 0;	// enables customized algorithm implementations
		virtual Vector *offset() const { return _offset; };	// enables an offset for derived classes
		floatT convexity() const { return 1.0f; };	// strong convexity parameter
		virtual floatT l1penalty() const { return 0.0f; };	// return the l1 penalty parameter, if any
		virtual floatT operator()(const Vector &w) const = 0;	// return function value
		// Compute f*(v) = max_w {v'*w - f(w)}; can also use the maximizing w if already computed
		virtual floatT conjugate(const Vector &v) const = 0;
		virtual floatT conjugate(const Vector &v, const Vector &w) const = 0;
		// prox() operator computes u = argmin_u { tau*f(u) + (1/2)*||u-w||_2^2 }
		virtual void prox(const floatT tau, const Vector &w, Vector &u) const = 0;
		// return gradient of conjugate function: argmax_w {v'*w - f(w)} = argmin_w {-v'*w + f(w)}
		virtual void conj_grad(const Vector &v, Vector &w) const = 0;
	};

	// g(w) = (1/2)*||w||_2^2
	class SquaredL2Norm : public IUnitRegularizer
	{
	public:
		SquaredL2Norm() {};
		inline char symbol() const override { return '2'; };
		floatT operator()(const Vector &w) const override;
		floatT conjugate(const Vector &v) const override;
		floatT conjugate(const Vector &v, const Vector &w) const override;
		void prox(const floatT tau, const Vector &w, Vector &u) const override;
		void conj_grad(const Vector &w, Vector &g) const override;
	};

	// g(w) = (1/2)*||w||_2^2 + lambda1*||w||_1
	class ElasticNet : public IUnitRegularizer
	{
	private:
		floatT _lambda1;

	public:
		ElasticNet(const floatT lambda1) { assert(lambda1 > 0); _lambda1 = lambda1; };
		inline char symbol() const override { return 'e'; };
		inline floatT l1penalty() const override { return _lambda1; };
		floatT operator()(const Vector &w) const override;
		floatT conjugate(const Vector &v) const override;
		floatT conjugate(const Vector &v, const Vector &w) const override;
		void prox(const floatT tau, const Vector &w, Vector &u) const override;
		void conj_grad(const Vector &w, Vector &g) const override;
	};

	// R(w) = (1/2)*||w-w0||_2^2
	class OffsetQuadratic : public IUnitRegularizer
	{
	private:
		Vector *_w0;
		Vector *_dw;
	public:
		OffsetQuadratic(const size_t dim);
		~OffsetQuadratic();
		void update_offset(const Vector &w0) { _w0->copy(w0); };
		Vector *offset() const override { return _w0; };
		char symbol() const override { return 'q'; };
		floatT operator()(const Vector &w) const override;
		floatT conjugate(const Vector &v) const override;
		floatT conjugate(const Vector &v, const Vector &w) const override;
		void prox(const floatT tau, const Vector &w, Vector &u) const override;
		void conj_grad(const Vector &w, Vector &g) const override;
	};

	// still to implement: the following more sophisticated regularizer for sparsity
	// g(w) = (3/2)log(d)||w||_q^2 + (lambda1/lambda2)||w||_1 (when features have low infinity norm)
}
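To show how the two interfaces compose, a hypothetical objective evaluation in the style of this codebase (a sketch only; primal_objective is our name, and X.aAxby is assumed to compute yp = a*X*w + b*yp as used earlier in this commit):

#include "functions.h"

namespace functions
{
	// Regularized empirical risk: (1/n) * sum_i f(Xi'w, yi) + lambda * g(w)
	floatT primal_objective(const SparseMatrixCSR &X, const Vector &y, const Vector &w,
		const ISmoothLoss &f, const IUnitRegularizer &g, const floatT lambda)
	{
		Vector Xw(y.length());
		X.aAxby(1, w, 0, Xw);	// Xw = X * w
		return f.sum_loss(Xw, y) / y.length() + lambda * g(w);
	}
}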
@ -0,0 +1,573 @@
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <omp.h>
|
||||
#include <cstdint>
|
||||
|
||||
// OpenMP 3.0 supports unsigned for-loop iterator, so can replace std::int64_t by size_t
|
||||
#include "functions.h"
|
||||
|
||||
using std::int64_t;
|
||||
|
||||
namespace randalgms
|
||||
{
|
||||
// mse for linear regression (1/n)*||X*w - y||^2 where n is length of y
|
||||
floatT regression_mse(const SparseMatrixCSR &X, const Vector &y, const Vector &w)
|
||||
{
|
||||
assert(X.nrows() == y.length() && X.ncols() == w.length());
|
||||
Vector yp(y);
|
||||
X.aAxby(1, w, -1, yp); // yp = X * w - y
|
||||
return pow(yp.norm2(), 2) / yp.length();
|
||||
}
|
||||
|
||||
// return number of prediction errors where sign(Xi'*w) != yi
|
||||
size_t binary_error_count(const SparseMatrixCSR &X, const Vector &y, const Vector &w)
|
||||
{
|
||||
assert(X.nrows() == y.length() && X.ncols() == w.length());
|
||||
Vector yp(y.length());
|
||||
X.aAxby(1, w, 0, yp); // yp = X * w
|
||||
Vector::elem_multiply(y, yp, yp);
|
||||
return yp.count_negative();
|
||||
}
|
||||
|
||||
// return error rate: number of errors (with sign(Xi'*w) != yi) divided by length of y
|
||||
floatT binary_error_rate(const SparseMatrixCSR &X, const Vector &y, const Vector &w)
|
||||
{
|
||||
assert(X.nrows() == y.length() && X.ncols() == w.length());
|
||||
Vector yp(y.length());
|
||||
X.aAxby(1, w, 0, yp); // yp = X * w
|
||||
yp.to_sign(); // only keep signs in {-1, 0, +1}
|
||||
return (1 - yp.dot(y) / y.length()) / 2;
|
||||
}
|
||||
|
||||
// Member functions of SquareLoss class: f(t) = (1/2)*(t-b)^2 ---------------------
|
||||
inline floatT SquareLoss::loss(const float Xiw, const floatT yi) const
|
||||
{
|
||||
return pow(Xiw - yi, 2) / 2;
|
||||
}
|
||||
|
||||
void SquareLoss::loss(const Vector &Xw, const Vector y, Vector &loss) const
|
||||
{
|
||||
assert(Xw.length() == y.length() && y.length() == loss.length());
|
||||
size_t len = y.length();
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int64_t i = 0; i < len; i++)
|
||||
{
|
||||
loss[i] = pow(Xw[i] - y[i], 2) / 2;
|
||||
}
|
||||
}
|
||||
|
||||
// return F(Xw) = sum_i 0.5*(Xw[i]-y[i])^2
|
||||
floatT SquareLoss::sum_loss(const Vector &Xw, const Vector &y) const
|
||||
{
|
||||
assert(Xw.length() == y.length());
|
||||
size_t len = y.length();
|
||||
floatT sum = 0;
|
||||
|
||||
#pragma omp parallel for reduction(+:sum)
|
||||
for (int64_t i = 0; i < len; i++)
|
||||
{
|
||||
floatT zi = Xw[i] - y[i];
|
||||
sum += zi*zi / 2;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
// return di = f'(Xiw) = Xiw - yi
|
||||
inline floatT SquareLoss::derivative(const floatT Xiw, const floatT yi) const
|
||||
{
|
||||
return Xiw - yi;
|
||||
}
|
||||
|
||||
// return d = F'(Xw) = Xw - y
|
||||
void SquareLoss::derivative(const Vector &Xw, const Vector &y, Vector &d) const
|
||||
{
|
||||
assert(Xw.length() == y.length() && y.length() == d.length());
|
||||
Vector::elem_subtract(Xw, y, d);
|
||||
}
|
||||
|
||||
inline floatT SquareLoss::conjugate(const floatT &ai, const floatT yi) const
|
||||
{
|
||||
return ai*ai / 2 + yi*ai;
|
||||
}
|
||||
|
||||
void SquareLoss::conjugate(const Vector &a, const Vector &y, Vector &conj) const
|
||||
{
|
||||
assert(a.length() == y.length() && y.length() == conj.length());
|
||||
size_t len = y.length();
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int64_t i = 0; i < len; i++)
|
||||
{
|
||||
conj[i] = a[i] * a[i] / 2 + y[i] * a[i];
|
||||
}
|
||||
}
|
||||
|
||||
// return f*(a) = (1/2)*||a||^2 + a'*y
|
||||
floatT SquareLoss::sum_conjugate(const Vector &a, const Vector &y) const
|
||||
{
|
||||
assert(a.length() == y.length());
|
||||
return a.dot(a) / 2 + a.dot(y);
|
||||
}
|
||||
|
||||
// Return argmin_b {sigma*(f*)(b) + (1/2)*||b - a||_2^2}
|
||||
inline floatT SquareLoss::conj_prox(floatT sigma, const floatT ai, const floatT yi) const
|
||||
{
|
||||
return (ai - sigma*yi) / (1 + sigma);
|
||||
}
|
||||
|
||||
// Return argmin_b {sigma*(f*)(b) + (1/2)*||b - a||_2^2}
|
||||
void SquareLoss::conj_prox(floatT sigma, const Vector &a, const Vector &y, Vector &u) const
|
||||
{
|
||||
assert(a.length() == y.length() && y.length() == u.length());
|
||||
size_t len = a.length();
|
||||
|
||||
#pragma omp parallel for
|
||||
for (int64_t i = 0; i < len; i++)
|
||||
{
|
||||
u[i] = (a[i] - sigma*y[i]) / (1 + sigma);
|
||||
}
|
||||
}

    // Member functions of LogisticLoss class ------------------------------------------

    LogisticLoss::LogisticLoss(const floatT min_log, const floatT epsilon, const int maxiter)
    {
        assert(min_log > 0 && epsilon > 0 && maxiter > 0);
        _min_log = min_log;
        _epsilon = epsilon;
        _maxiter = maxiter;
    }

    inline floatT LogisticLoss::loss(const float Xiw, const floatT yi) const
    {
        floatT yxwi = yi * Xiw;
        return yxwi > 50 ? 0 : yxwi < -50 ? -yxwi : log(1 + exp(-yxwi));
    }

    void LogisticLoss::loss(const Vector &Xw, const Vector y, Vector &loss) const
    {
        assert(Xw.length() == y.length() && y.length() == loss.length());
        size_t len = y.length();

        #pragma omp parallel for
        for (int64_t i = 0; i < len; i++)
        {
            floatT yxwi = y[i] * Xw[i];
            loss[i] = yxwi > 50 ? 0 : yxwi < -50 ? -yxwi : log(1 + exp(-yxwi));
        }
    }

    // Return sum_i log(1+exp(-y[i]*Xw[i])), parallelized with OpenMP
    floatT LogisticLoss::sum_loss(const Vector &Xw, const Vector &y) const
    {
        assert(Xw.length() == y.length());
        size_t len = y.length();
        floatT sum = 0;

        #pragma omp parallel for reduction(+:sum)
        for (int64_t i = 0; i < len; i++)
        {
            floatT yxwi = y[i] * Xw[i];
            sum += yxwi > 50 ? 0 : yxwi < -50 ? -yxwi : log(1 + exp(-yxwi));
        }
        return sum;
    }

    inline floatT LogisticLoss::derivative(const floatT Xiw, const floatT yi) const
    {
        return -yi / (1 + exp(yi*Xiw));
    }

    // d[i] = -y[i]/(1+exp(y[i]*Xw[i])) so that the gradient is expressed as sum_i d[i]*X[i,:]
    void LogisticLoss::derivative(const Vector &Xw, const Vector &y, Vector &d) const
    {
        assert(Xw.length() == y.length() && y.length() == d.length());
        size_t len = y.length();

        #pragma omp parallel for
        for (int64_t i = 0; i < len; i++)
        {
            d[i] = -y[i] / (1 + exp(y[i] * Xw[i]));
        }
    }

    inline floatT LogisticLoss::conjugate(const floatT &ai, const floatT yi) const
    {
        assert(yi*ai >= -1 && yi*ai <= 0); // yi*ai in [-1, 0], may not be necessary here

        // naive clipping is fragile here: in single precision, 1e-12 - 1 evaluates to -1,
        // which would make log(1 + t) = log(0); better to bound the entropy argument away from zero
        double t = fmin(-_min_log, fmax(_min_log - 1, yi*ai)); // clip to [-1 + min_log, -min_log]
        return floatT((-t) * log(-t) + (1 + t) * log(1 + t));
    }

    void LogisticLoss::conjugate(const Vector &a, const Vector &y, Vector &conj) const
    {
        assert(a.length() == y.length() && y.length() == conj.length());
        size_t len = y.length();
        double lb = _min_log - 1, ub = -_min_log;

        #pragma omp parallel for
        for (int64_t i = 0; i < len; i++)
        {
            assert(y[i] * a[i] >= -1 && y[i] * a[i] <= 0);
            double t = fmin(ub, fmax(lb, y[i] * a[i]));
            conj[i] = floatT((-t) * log(-t) + (1 + t) * log(1 + t));
        }
    }

    // Return (f*)(a) = sum_i (-ya[i])*log(-ya[i]) + (1+ya[i])*log(1+ya[i]) with ya[i] in [-1, 0]
    floatT LogisticLoss::sum_conjugate(const Vector &a, const Vector &y) const
    {
        // Need to check that every a_j belongs to [-1, 0], but can skip if initialized that way
        assert(a.length() == y.length());
        size_t len = a.length();
        double lb = _min_log - 1, ub = -_min_log;
        double sum = 0;

        // this code may accumulate more numerical error than a sequential sum!
        #pragma omp parallel for reduction(+:sum)
        for (int64_t i = 0; i < len; i++)
        {
            assert(y[i] * a[i] >= -1 && y[i] * a[i] <= 0);
            double t = fmin(ub, fmax(lb, y[i] * a[i]));
            sum += (-t) * log(-t) + (1 + t) * log(1 + t);
        }
        return floatT(sum);
    }

    // Return argmin_b {sigma*(f*)(b) + (1/2)*(b - a)^2}
    floatT LogisticLoss::conj_prox(floatT sigma, const floatT ai, const floatT yi) const
    {
        floatT lb = _min_log - 1, ub = -_min_log;
        floatT epsilon = _epsilon;
        int maxiter = _maxiter;

        double yai = ai*yi; // flip the sign first so that one code path handles both labels
        double bi = fmin(ub, fmax(lb, yai));
        double f = 0, df = 0;
        for (int k = 0; k < maxiter; ++k)
        {
            f = bi - yai + sigma * log((1 + bi) / (-bi));
            if (fabs(f) < epsilon) break;
            df = 1.0 - sigma / (bi * (1 + bi));
            bi -= f / df;
            bi = fmin(ub, fmax(lb, bi)); // critical for convergence
        }
        return floatT(bi * yi);
    }
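
    // Note on the Newton iteration above: with t = y*b and f*(t) = (-t)*log(-t) + (1+t)*log(1+t),
    // the objective sigma*f*(t) + (1/2)*(t - y*a)^2 has
    //   first derivative  t - y*a + sigma*log((1+t)/(-t))      (the residual f in the loop),
    //   second derivative 1 - sigma/(t*(1+t)) > 0 on (-1, 0)   (the df in the loop),
    // and clipping each iterate back to [-1 + min_log, -min_log] keeps the logarithm's
    // argument valid and the iteration well defined.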

    // Return argmin_b {sigma*(f*)(b) + (1/2)*||b - a||_2^2}
    void LogisticLoss::conj_prox(floatT sigma, const Vector &a, const Vector &y, Vector &b) const
    {
        // Need to check that every a_j belongs to [-1, 0], but can skip because iterates are always clipped
        assert(a.length() == y.length());
        Vector ya(y.length());
        Vector::elem_multiply(y, a, ya); // ya[i] = y[i]*a[i]; here y[i] needs to be in {-1, +1}

        floatT lb = _min_log - 1, ub = -_min_log;
        floatT epsilon = _epsilon;
        int maxiter = _maxiter;
        size_t len = y.length();

        #pragma omp parallel for default(none) shared(y,ya,b,sigma,lb,ub,epsilon,maxiter,len)
        for (int64_t i = 0; i < len; i++)
        {
            double bi = fmin(ub, fmax(lb, ya[i]));
            double f = 0, df = 0;
            for (int k = 0; k < maxiter; ++k)
            {
                f = bi - ya[i] + sigma * log((1 + bi) / (-bi));
                if (fabs(f) < epsilon) break;
                df = 1.0 - sigma / (bi * (1 + bi));
                bi -= f / df;
                bi = fmin(ub, fmax(lb, bi)); // critical for convergence
            }
            b[i] = floatT(bi * y[i]);
        }
    }

    // Member functions of SmoothedHinge class ------------------------------------------

    inline floatT SmoothedHinge::loss(const float Xiw, const floatT yi) const
    {
        floatT yXwi = yi * Xiw;
        floatT t = 1 - yXwi;
        return yXwi >= 1 ? 0 : t >= _delta ? t - _delta / 2 : t*t / (2 * _delta);
    }

    void SmoothedHinge::loss(const Vector &Xw, const Vector y, Vector &loss) const
    {
        assert(Xw.length() == y.length() && y.length() == loss.length());
        size_t len = y.length();
        floatT delta = _delta;

        #pragma omp parallel for
        for (int64_t i = 0; i < len; i++)
        {
            floatT yXwi = y[i] * Xw[i];
            floatT t = 1 - yXwi;
            loss[i] = yXwi >= 1 ? 0 : t >= delta ? t - delta / 2 : t*t / (2 * delta);
        }
    }

    floatT SmoothedHinge::sum_loss(const Vector &Xw, const Vector &y) const
    {
        assert(Xw.length() == y.length());
        size_t len = y.length();
        floatT delta = _delta;
        floatT sum = 0;

        #pragma omp parallel for reduction(+:sum)
        for (int64_t i = 0; i < len; ++i)
        {
            floatT yXwi = y[i] * Xw[i];
            floatT t = 1.0f - yXwi;
            sum += yXwi >= 1 ? 0 : t >= delta ? t - delta / 2 : t*t / (2 * delta);
        }
        return sum;
    }

    inline floatT SmoothedHinge::derivative(const floatT Xiw, const floatT yi) const
    {
        floatT yXwi = yi * Xiw;
        // the factor yi makes the gradient equal to sum_i d[i]*X[i,:], consistent with LogisticLoss::derivative
        return yi * (yXwi > 1 ? 0 : yXwi < 1 - _delta ? -1 : (yXwi - 1) / _delta);
    }

    // Compute d[i] so that the gradient is expressed as sum_i d[i]*X[i,:]
    void SmoothedHinge::derivative(const Vector &Xw, const Vector &y, Vector &d) const
    {
        assert(Xw.length() == y.length() && y.length() == d.length());
        size_t len = y.length();
        floatT delta = _delta;

        #pragma omp parallel for
        for (int64_t i = 0; i < len; i++)
        {
            floatT yXwi = y[i] * Xw[i];
            d[i] = y[i] * (yXwi > 1 ? 0 : yXwi < 1 - delta ? -1 : (yXwi - 1) / delta);
        }
    }

    inline floatT SmoothedHinge::conjugate(const floatT &ai, const floatT yi) const
    {
        floatT yai = yi*ai;
        assert(yai >= -1 && yai <= 0);
        return yai + (_delta / 2)*(yai*yai);
    }

    void SmoothedHinge::conjugate(const Vector &a, const Vector &y, Vector &conj) const
    {
        assert(a.length() == y.length() && y.length() == conj.length());
        size_t len = y.length();

        #pragma omp parallel for
        for (int64_t i = 0; i < len; i++)
        {
            floatT yai = y[i] * a[i];
            assert(yai >= -1 && yai <= 0);
            conj[i] = yai + (_delta / 2)*(yai*yai);
        }
    }

    floatT SmoothedHinge::sum_conjugate(const Vector &a, const Vector &y) const
    {
        assert(a.length() == y.length());
        size_t len = y.length();
        floatT sum = 0;

        #pragma omp parallel for reduction(+:sum)
        for (int64_t i = 0; i < len; i++)
        {
            floatT yai = y[i] * a[i];
            assert(yai >= -1 && yai <= 0);
            sum += yai + (_delta / 2)*(yai*yai);
        }
        return sum;
    }

    // Return argmin_b {sigma*(f*)(b) + (1/2)*(b - a)^2}
    inline floatT SmoothedHinge::conj_prox(floatT sigma, const floatT ai, const floatT yi) const
    {
        // should not assert yi*ai in [-1, 0] here because ai can be arbitrary
        floatT bi = (yi*ai - sigma) / (1 + sigma*_delta);
        return yi*(bi > 0 ? 0 : bi < -1 ? -1 : bi);
    }
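
    // Derivation of the closed form above: writing t = y*b (with y in {-1,+1}), the objective
    // sigma*(t + (delta/2)*t^2) + (1/2)*(t - y*a)^2 is a strongly convex quadratic in t;
    // its unconstrained minimizer t = (y*a - sigma)/(1 + sigma*delta) is then projected onto
    // [-1, 0], the domain of the smoothed-hinge conjugate, which for a 1-D quadratic gives
    // the exact constrained minimizer.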

    // Return argmin_b {sigma*(f*)(b) + (1/2)*||b - a||_2^2}
    void SmoothedHinge::conj_prox(floatT sigma, const Vector &a, const Vector &y, Vector &b) const
    {
        assert(a.length() == y.length() && y.length() == b.length());
        size_t len = y.length();
        floatT delta = _delta;

        #pragma omp parallel for default(none) shared(a,y,b,sigma,len,delta)
        for (int64_t i = 0; i < len; i++)
        {
            floatT bi = (y[i]*a[i] - sigma) / (1 + sigma*delta);
            b[i] = y[i] * (bi > 0 ? 0 : bi < -1 ? -1 : bi);
        }
    }

    // Member functions of SquaredL2Norm class ------------------------------------------

    SquaredL2Norm::SquaredL2Norm(const floatT lambda)
    {
        assert(lambda > 0);
        _lambda = lambda;
        _is_unity = (lambda == 1.0f);
    }

    SquaredL2Norm::SquaredL2Norm(const SquaredL2Norm &g, bool unitize)
    {
        if (unitize) {
            _lambda = 1.0f;
            _is_unity = true;
        }
        else {
            _lambda = g._lambda;
            _is_unity = g._is_unity;
        }
    }

    inline floatT SquaredL2Norm::unitize()
    {
        floatT previous_lambda = _lambda;
        _lambda = 1.0f;
        _is_unity = true;
        return previous_lambda;
    }

    floatT SquaredL2Norm::operator()(const Vector &w) const
    //floatT SquaredL2Norm::value(const Vector &w) const
    {
        return (0.5f * _lambda)*pow(w.norm2(), 2);
    }

    // Compute f*(v) = max_w {v'*w - f(w)}
    floatT SquaredL2Norm::conjugate(const Vector &v) const
    {
        return (0.5f / _lambda)*pow(v.norm2(), 2);
    }
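
    // Why this is the conjugate: v'*w - (lambda/2)*||w||^2 is maximized at w = v/lambda,
    // and substituting gives ||v||^2/lambda - ||v||^2/(2*lambda) = ||v||^2/(2*lambda).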

    // Compute f*(v) = max_w {v'*w - f(w)}, with the maximizing w already computed
    floatT SquaredL2Norm::conjugate(const Vector &v, const Vector &w) const
    {
        return this->conjugate(v);
    }

    // prox() operator computes u = argmin_u { tau*f(u) + (1/2)*||u-w||_2^2 }
    void SquaredL2Norm::prox(const floatT tau, const Vector &w, Vector &u) const
    {
        if (&w != &u) { u.copy(w); }
        u.scale(1 / (1 + tau*_lambda));
    }

    // Given vector v, compute w = argmax_w {v'*w - f(w)} = argmin_w {-v'*w + f(w)}
    void SquaredL2Norm::conj_grad(const Vector &v, Vector &w) const
    {
        if (&w != &v) { w.copy(v); }
        if (!_is_unity) {
            w.scale(1.0f / _lambda);
        }
    }

    // Member functions of ElasticNet class ------------------------------------------

    ElasticNet::ElasticNet(const floatT lambda1, const floatT lambda2)
    {
        assert(lambda1 >= 0 && lambda2 > 0);
        _lambda1 = lambda1;
        _lambda2 = lambda2;
        _is_unity = (lambda2 == 1.0f);
    }

    ElasticNet::ElasticNet(const ElasticNet &g, bool unitize)
    {
        if (unitize) {
            _lambda1 = g._lambda1 / g._lambda2;
            _lambda2 = 1.0f;
            _is_unity = true;
        }
        else {
            _lambda1 = g._lambda1;
            _lambda2 = g._lambda2;
            _is_unity = g._is_unity;
        }
    }

    inline floatT ElasticNet::unitize()
    {
        floatT previous_lambda2 = _lambda2;
        _lambda1 = _lambda1 / _lambda2;
        _lambda2 = 1.0f;
        _is_unity = true;
        return previous_lambda2;
    }

    //floatT ElasticNet::value(const Vector &w) const
    floatT ElasticNet::operator()(const Vector &w) const
    {
        return (_lambda2 / 2)*pow(w.norm2(), 2) + _lambda1*w.norm1();
    }

    // Compute f*(v) = max_w {v'*w - f(w)}
    floatT ElasticNet::conjugate(const Vector &v) const
    {
        Vector w(v.length());
        this->conj_grad(v, w);
        return v.dot(w) - (*this)(w); // evaluate f at the maximizing w, not at v
    }

    // Compute f*(v) = max_w {v'*w - f(w)}, with the maximizing w already computed
    floatT ElasticNet::conjugate(const Vector &v, const Vector &w) const
    {
        assert(v.length() == w.length());
        return v.dot(w) - (*this)(w); // evaluate f at the maximizing w, not at v
    }

    // Compute u = argmin_u { tau*f(u) + (1/2)*||u-w||_2^2 }
    // Note that we allow u and w to be the same Vector, that is, &w == &u is allowed
    void ElasticNet::prox(const floatT tau, const Vector &w, Vector &u) const
    {
        floatT tl2 = 1 + tau * _lambda2;
        floatT tl1 = tau * _lambda1;
        size_t len = w.length();

        #pragma omp parallel for
        for (int64_t i = 0; i < len; i++)
        {
            floatT vneg = w[i] + tl1;
            floatT vpos = w[i] - tl1;
            u[i] = vpos > 0 ? vpos / tl2 : vneg < 0 ? vneg / tl2 : 0;
        }
    }

    // Given vector v, compute g = argmax_w {v'*w - f(w)} = argmin_w {-v'*w + f(w)}
    void ElasticNet::conj_grad(const Vector &v, Vector &g) const
    {
        size_t len = v.length();
        floatT lambda1 = _lambda1;
        floatT lambda2 = _lambda2;

        #pragma omp parallel for
        for (int64_t i = 0; i < len; i++)
        {
            floatT vneg = v[i] + lambda1;
            floatT vpos = v[i] - lambda1;
            g[i] = vpos > 0 ? vpos / lambda2 : vneg < 0 ? vneg / lambda2 : 0;
        }
    }
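
    // Coordinate-wise derivation for prox() and conj_grad() above: minimizing
    // tau*(lambda2/2)*u^2 + tau*lambda1*|u| + (1/2)*(u - w)^2 over u gives the soft-thresholding rule
    //   u = (w - tau*lambda1)/(1 + tau*lambda2) if w >  tau*lambda1,
    //   u = (w + tau*lambda1)/(1 + tau*lambda2) if w < -tau*lambda1,  and  u = 0 otherwise,
    // which is the vpos/vneg branch in prox(); for conj_grad(), the first-order condition of
    // max_w {v'*w - f(w)} is v = lambda2*w + lambda1*sign(w), whose solution is the same
    // soft-threshold with threshold lambda1 and scaling 1/lambda2.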
}

@ -0,0 +1,164 @@
#pragma once

#include <assert.h>
#include <stdexcept>

#include "floattype.h"

namespace randalgms
{
    // functions to compute performance of linear regression and binary classification
    floatT regression_mse(const SparseMatrixCSR &X, const Vector &y, const Vector &w);
    floatT binary_error_rate(const SparseMatrixCSR &X, const Vector &y, const Vector &w);
    size_t binary_error_count(const SparseMatrixCSR &X, const Vector &y, const Vector &w);

    // An abstract class serving as the interface for sums of parametrized smooth loss functions
    class ISmoothLoss
    {
    public:
        ISmoothLoss() = default;
        virtual floatT smoothness() const = 0; // smoothness parameter of a single component function
        virtual floatT loss(const float Xiw, const floatT yi) const = 0;
        virtual void loss(const Vector &Xw, const Vector y, Vector &loss) const = 0;
        // could also add weighted_sum_loss(), but it can be done by dot(weights, loss)
        virtual floatT sum_loss(const Vector &Xw, const Vector &y) const = 0;
        // derivative() is defined such that the gradient is simply expressed as sum_i d[i]*X[i,:]
        virtual floatT derivative(const floatT Xiw, const floatT yi) const = 0;
        virtual void derivative(const Vector &Xw, const Vector &y, Vector &d) const = 0;
        virtual floatT conjugate(const floatT &ai, const floatT yi) const = 0;
        virtual void conjugate(const Vector &a, const Vector &y, Vector &conj) const = 0;
        virtual floatT sum_conjugate(const Vector &a, const Vector &y) const = 0;
        virtual floatT conj_prox(floatT sigma, const floatT ai, const floatT yi) const = 0;
        virtual void conj_prox(floatT sigma, const Vector &a, const Vector &y, Vector &u) const = 0;
    };
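
    // A minimal usage sketch of this interface (illustrative, not part of the library):
    // by Fenchel's inequality, f(u) + f*(a) >= a*u for any dual variable a, so a
    // per-example duality-gap term can be computed as
    //
    //   floatT gap_i(const ISmoothLoss &f, floatT Xiw, floatT ai, floatT yi)
    //   {
    //       return f.loss(Xiw, yi) + f.conjugate(ai, yi) - ai * Xiw; // always >= 0
    //   }
    //
    // while conj_prox() is the building block of SDCA-style dual coordinate updates.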

    // f_y(t) = (1/2)*(t-y)^2 where y is a real number, for linear regression
    class SquareLoss : public ISmoothLoss
    {
    public:
        SquareLoss() {};
        inline floatT smoothness() const override { return 1.0f; }; // smoothness of (1/2)*(t-b)^2
        floatT loss(const float Xiw, const floatT yi) const override;
        void loss(const Vector &Xw, const Vector y, Vector &loss) const override;
        floatT sum_loss(const Vector &Xw, const Vector &y) const override;
        floatT derivative(const floatT Xiw, const floatT yi) const override;
        void derivative(const Vector &Xw, const Vector &y, Vector &d) const override;
        floatT conjugate(const floatT &ai, const floatT yi) const override;
        void conjugate(const Vector &a, const Vector &y, Vector &conj) const override;
        floatT sum_conjugate(const Vector &a, const Vector &y) const override;
        floatT conj_prox(floatT sigma, const floatT ai, const floatT yi) const override;
        void conj_prox(floatT sigma, const Vector &a, const Vector &y, Vector &u) const override;
    };

    // f_y(t) = log(1+exp(-y*t)) where y is +1 or -1, for binary classification
    class LogisticLoss : public ISmoothLoss
    {
    private:
        // need to test whether the min_log trick is really necessary for numerical stability
        floatT _min_log; // boundary margin within [-1, 0] for taking the logarithm
        floatT _epsilon; // stopping accuracy for Newton's method
        int _maxiter;    // maximum number of iterations for Newton's method

    public:
        // may not work as desired when floatT = float: 1 + 1e-12 == 1 in single precision anyway!
        LogisticLoss(const floatT min_log = 1.0e-12F, const floatT epsilon = 1.0e-6F, const int maxiter = 10);
        inline floatT smoothness() const override { return 0.25; } // smoothness of log(1+exp(-b*t))
        floatT loss(const float Xiw, const floatT yi) const override;
        void loss(const Vector &Xw, const Vector y, Vector &loss) const override;
        floatT sum_loss(const Vector &Xw, const Vector &y) const override;
        floatT derivative(const floatT Xiw, const floatT yi) const override;
        void derivative(const Vector &Xw, const Vector &y, Vector &d) const override;
        floatT conjugate(const floatT &ai, const floatT yi) const override;
        void conjugate(const Vector &a, const Vector &y, Vector &conj) const override;
        floatT sum_conjugate(const Vector &a, const Vector &y) const override;
        floatT conj_prox(floatT sigma, const floatT ai, const floatT yi) const override;
        void conj_prox(floatT sigma, const Vector &a, const Vector &y, Vector &u) const override;
    };

    // smoothed hinge, constructed by adding strong convexity to the conjugate
    class SmoothedHinge : public ISmoothLoss
    {
    private:
        floatT _delta; // conjugate smoothing parameter

    public:
        SmoothedHinge(const floatT delta = 1) { assert(delta > 0); _delta = delta; };
        inline floatT smoothness() const override { return 1.0f / _delta; }; // smoothness obtained by conjugate smoothing
        floatT loss(const float Xiw, const floatT yi) const override;
        void loss(const Vector &Xw, const Vector y, Vector &loss) const override;
        floatT sum_loss(const Vector &Xw, const Vector &y) const override;
        floatT derivative(const floatT Xiw, const floatT yi) const override;
        void derivative(const Vector &Xw, const Vector &y, Vector &d) const override;
        floatT conjugate(const floatT &ai, const floatT yi) const override;
        void conjugate(const Vector &a, const Vector &y, Vector &conj) const override;
        floatT sum_conjugate(const Vector &a, const Vector &y) const override;
        floatT conj_prox(floatT sigma, const floatT ai, const floatT yi) const override;
        void conj_prox(floatT sigma, const Vector &a, const Vector &y, Vector &u) const override;
    };

    //================================================================================================
    // An abstract class serving as the interface for convex regularization functions
    class IRegularizer
    {
    protected:
        bool _is_unity = false; // _is_unity == true iff this->convexity() == 1
    public:
        IRegularizer() = default;
        virtual char symbol() const = 0; // to enable customized algorithm implementations
        virtual floatT convexity() const = 0; // strong convexity parameter
        inline bool is_unity() const { return _is_unity; };
        // rescale so that the convexity parameter = 1; return the equivalent scale (the previous convexity)
        virtual floatT unitize() = 0;
        virtual floatT operator()(const Vector &w) const = 0; // return the function value
        virtual floatT conjugate(const Vector &v) const = 0;  // return the conjugate function value
        virtual floatT conjugate(const Vector &v, const Vector &w) const = 0;
        // prox() computes u = argmin_u { tau*f(u) + (1/2)*||u-w||_2^2 }
        virtual void prox(const floatT tau, const Vector &w, Vector &u) const = 0;
        // return the gradient of the conjugate function: argmax_w {v'*w - f(w)} = argmin_w {-v'*w + f(w)}
        virtual void conj_grad(const Vector &v, Vector &w) const = 0;
    };
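
    // Example of the unitize() contract (illustrative; the algorithmic use is hypothetical):
    //
    //   SquaredL2Norm g(lambda);
    //   floatT scale = g.unitize(); // now g.convexity() == 1 and scale == lambda
    //   // an algorithm written for unit convexity can then absorb 'scale' into its step sizes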

    // g(w) = (lambda2/2)*||w||_2^2
    class SquaredL2Norm : public IRegularizer
    {
    private:
        floatT _lambda;

    public:
        SquaredL2Norm(const floatT lambda);
        SquaredL2Norm(const SquaredL2Norm &g, bool unitize);
        inline char symbol() const override { return '2'; };
        inline floatT convexity() const override { return _lambda; };
        inline floatT unitize() override;
        floatT operator()(const Vector &w) const override;
        floatT conjugate(const Vector &v) const override;
        floatT conjugate(const Vector &v, const Vector &w) const override;
        void prox(const floatT tau, const Vector &w, Vector &u) const override;
        void conj_grad(const Vector &w, Vector &g) const override;
    };

    // g(w) = (lambda2/2)*||w||_2^2 + lambda1*||w||_1
    class ElasticNet : public IRegularizer
    {
    private:
        floatT _lambda1, _lambda2;

    public:
        ElasticNet(const floatT lambda1, const floatT lambda2);
        ElasticNet(const ElasticNet &g, bool unitize);
        inline char symbol() const override { return 'e'; };
        inline floatT convexity() const override { return _lambda2; };
        inline floatT unitize() override;
        floatT operator()(const Vector &w) const override;
        floatT conjugate(const Vector &v) const override;
        floatT conjugate(const Vector &v, const Vector &w) const override;
        void prox(const floatT tau, const Vector &w, Vector &u) const override;
        void conj_grad(const Vector &w, Vector &g) const override;

        // member functions specific to this class, not defined in IRegularizer
        inline floatT l1parameter() const { return _lambda1; };
    };

    // g(w) = (3/2)log(d)||w||_q^2 + (lambda1/lambda2)||w||_1 (when features have low infinity norm)
}

@ -0,0 +1,103 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <string>

#include "regu_loss.h"

namespace functions
{
    RegularizedLoss::RegularizedLoss(const SparseMatrixCSR &A, const Vector &b, const char f, double lambda,
        const char g, const double l1weight)
    {
        assert(A.nrows() == b.length());

        _A = new SubSparseMatrixCSR(A, 0, A.nrows());
        _b = new SubVector(b, 0, b.length());
        _lambda = lambda;

        _Ax = new Vector(A.nrows());
        _drv = new Vector(A.nrows());

        // such conditional construction should be put in a reusable function!
        switch (f)
        {
        case 'L':
        case 'l':
            _f = new LogisticLoss();
            break;
        case 'S':
        case 's':
            _f = new SmoothedHinge();
            break;
        case '2':
            _f = new SquareLoss();
            break;
        default:
            throw std::runtime_error("Loss function type " + std::string(1, f) + " not defined.");
        }

        switch (g)
        {
        case '2':
            _g = new SquaredL2Norm(1.0f); // unit convexity; _lambda carries the overall scale
            break;
        case 'E':
        case 'e':
            _g = new ElasticNet(l1weight / lambda, 1.0f); // unit convexity, as above
            break;
        default:
            throw std::runtime_error("Regularizer type " + std::string(1, g) + " not defined.");
        }
    }

    RegularizedLoss::~RegularizedLoss()
    {
        delete _A;
        delete _b;
        delete _f;
        delete _g;
        delete _Ax;
        delete _drv;
    }

    double RegularizedLoss::avg_loss(const Vector &x)
    {
        return this->sum_loss(x) / _b->length();
    }

    double RegularizedLoss::sum_loss(const Vector &x)
    {
        _A->aAxby(1.0, x, 0, *_Ax);
        return _f->sum_loss(*_Ax, *_b);
    }

    double RegularizedLoss::regu_loss(const Vector &x)
    {
        return this->sum_loss(x) / _b->length() + _lambda*(*_g)(x);
    }

    double RegularizedLoss::avg_grad(const Vector &x, Vector &g)
    {
        double sum_loss = this->sum_grad(x, g);
        g.scale(1.0 / _b->length());
        return sum_loss / _b->length();
    }

    double RegularizedLoss::sum_grad(const Vector &x, Vector &g)
    {
        _A->aAxby(1.0, x, 0, *_Ax);
        _f->derivative(*_Ax, *_b, *_drv);
        _A->aATxby(1.0, *_drv, 0, g);
        return _f->sum_loss(*_Ax, *_b);
    }

    double RegularizedLoss::regu_grad(const Vector &x, Vector &g)
    {
        double sum_loss = this->sum_grad(x, g);
        g.scale(1.0 / _b->length());
        g.axpy(_lambda, x); // this only works for L2 regularization!
        return sum_loss / _b->length() + _lambda*(*_g)(x);
    }
}

@ -0,0 +1,33 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#pragma once

#include "functions.h"

namespace functions {

    class RegularizedLoss {
    private:
        SubSparseMatrixCSR *_A;
        SubVector *_b;
        ISmoothLoss *_f; // really should be an owned object here
        double _lambda;
        IRegularizer *_g; // the regularizer interface declared in functions.h

        // working buffers
        Vector *_Ax, *_drv;

    public:
        RegularizedLoss(const SparseMatrixCSR &A, const Vector &b, const char f, double lambda, const char g,
            const double l1weight = 0.0);
        ~RegularizedLoss();
        double avg_loss(const Vector &x);
        double sum_loss(const Vector &x);
        double regu_loss(const Vector &x);
        // for nonsmooth regularizers, grad means the gradient mapping
        double avg_grad(const Vector &x, Vector &g);
        double sum_grad(const Vector &x, Vector &g);
        double regu_grad(const Vector &x, Vector &g);
    };
}
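
// A minimal usage sketch (illustrative only; the data matrix X, labels y, and the
// step size below are hypothetical):
//
//   functions::RegularizedLoss obj(X, y, 'l', 1.0e-4, '2'); // logistic loss + L2 regularizer
//   Vector w(X.ncols()), grad(X.ncols());
//   double objval = obj.regu_loss(w);   // (1/n)*sum_i f(A[i,:]*w, b[i]) + lambda*g(w)
//   obj.regu_grad(w, grad);             // gradient of the regularized objective
//   w.axpy(-0.1, grad);                 // one plain gradient-descent step: w -= 0.1*grad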

@ -0,0 +1,615 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <assert.h>
#include <stdexcept>
#include <algorithm>
#include <numeric>
#include <cstdint>
#include <vector>
#include <omp.h>

#include "spmatrix.h"

// OpenMP 3.0 supports unsigned for-loop iterators, so std::int64_t could be replaced by size_t
using std::int64_t;

namespace numerics
{
    // constructor from three vectors; column indices use 0-based indexing and need not be ordered
    template <typename floatT, typename T>
    SparseMatrixCSR<floatT, T>::SparseMatrixCSR(std::vector<floatT> &vals, std::vector<size_t> &cidx,
        std::vector<size_t> &rptr, const bool has_bias_feature, const bool make_copy)
    {
        _nrows = rptr.size() - 1;
        _ncols = *std::max_element(cidx.begin(), cidx.end()) + 1;
        _nnzs = vals.size();

        if (make_copy) // make deep copies using std::vector<>'s overloaded operator=()
        {
            _mem_values = vals;
            _mem_colidx = cidx;
            _mem_rowptr = rptr;
        }
        else // swap the vector<>s, so the argument memory is released at exit to reduce the memory footprint
        {
            _mem_values.swap(vals);
            _mem_colidx.swap(cidx);
            _mem_rowptr.swap(rptr);
        }

        // these pointers only point to existing memory, no need to use new and delete[]
        _values = _mem_values.data();
        _colidx = _mem_colidx.data();
        _rowstr = _mem_rowptr.data();
        _rowend = _mem_rowptr.data() + 1;

        // specify bias feature
        _has_bias_feature = has_bias_feature;
    }

    // copy constructor, can take a transpose if specified. A could be a SubSparseMatrixCSR!
    template <typename floatT, typename T>
    SparseMatrixCSR<floatT, T>::SparseMatrixCSR(const SparseMatrixCSR<floatT, T> &A, bool transpose)
    {
        if (!transpose)
        {
            _nrows = A._nrows;
            _ncols = A._ncols;
            _nnzs = A._nnzs;
            _mem_values.resize(_nnzs);
            _mem_colidx.resize(_nnzs);
            _mem_rowptr.resize(_nrows + 1);
            size_t k = 0;
            for (size_t i = 0; i < _nrows; i++) {
                _mem_rowptr[i] = k;
                for (size_t j = A._rowstr[i]; j < A._rowend[i]; j++)
                {
                    _mem_colidx[k] = A._colidx[j];
                    _mem_values[k] = A._values[j];
                    k++;
                }
            }
            _mem_rowptr[_nrows] = k;

            _values = _mem_values.data();
            _colidx = _mem_colidx.data();
            _rowstr = _mem_rowptr.data();
            _rowend = _mem_rowptr.data() + 1;
            _has_bias_feature = A.has_bias_feature();
            return;
        }

        // the following code does not work if A is a SubSparseMatrix, contiguous or not
        /*
        if (!transpose) // use std::vector<>::operator= to make deep copies
        {
            _nrows = A._nrows;
            _ncols = A._ncols;
            _nnzs = A._nnzs;
            _mem_values = A._mem_values;
            _mem_colidx = A._mem_colidx;
            _mem_rowptr = A._mem_rowptr;
            _values = _mem_values.data();
            _colidx = _mem_colidx.data();
            _rowstr = _mem_rowptr.data();
            _rowend = _mem_rowptr.data() + 1;
            _has_bias_feature = A.has_bias_feature();
            return;
        }
        */

        // SparseMatrixCSR transpose, works for unsorted column indices
        size_t m, n, nnz, j, p, q;
        size_t *Trwp, *Tcli;
        floatT *Tval;
        const size_t* Arstr = A._rowstr;
        const size_t* Arend = A._rowend;
        const size_t* Acli = A._colidx;
        const floatT* Aval = A._values;

        m = A._nrows;
        n = A._ncols;
        nnz = A._nnzs;
        _nrows = n;
        _ncols = m;
        _nnzs = nnz;

        _mem_values.resize(nnz);
        _mem_colidx.resize(nnz);
        _mem_rowptr.resize(n + 1);
        Trwp = _mem_rowptr.data();
        Tcli = _mem_colidx.data();
        Tval = _mem_values.data();

        // count the number of nonzeros in each column of A (= each row of the transpose)
        std::vector<size_t> ws(n, 0);
        for (j = 0; j < m; j++) {
            for (p = Arstr[j]; p < Arend[j]; p++) {
                ws[Acli[p]]++;
            }
        }
        // prefix sums give the transpose row pointers; reuse ws as insertion cursors
        Trwp[0] = 0;
        for (j = 0; j < n; j++) {
            Trwp[j + 1] = Trwp[j] + ws[j];
            ws[j] = Trwp[j];
        }
        // scatter pass: place each entry into its column bucket
        for (j = 0; j < m; j++) {
            for (p = Arstr[j]; p < Arend[j]; p++) {
                Tcli[q = ws[Acli[p]]++] = j;
                Tval[q] = Aval[p];
            }
        }

        _values = _mem_values.data();
        _colidx = _mem_colidx.data();
        _rowstr = _mem_rowptr.data();
        _rowend = _mem_rowptr.data() + 1;
        // hard to tell whether the bias feature is meaningful in the transpose
        _has_bias_feature = A.has_bias_feature();
    }
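
    // A small worked example of the counting-sort transpose above (illustrative):
    // for the 2x3 matrix [a 0 b; 0 c 0], stored as values = [a b c], colidx = [0 2 1],
    // rowptr = [0 2 3], the per-column counts are ws = [1 1 1], so the prefix sums give
    // the transpose row pointers Trwp = [0 1 2 3]; the scatter pass then produces the
    // 3x2 transpose [a 0; 0 c; b 0] with Tval = [a c b] and Tcli = [0 1 0].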

    // DANGEROUS TO USE ON A TRANSPOSED DATA MATRIX WHERE THE BIAS ENTRIES ARE THE LAST ROW!
    template <typename floatT, typename T>
    bool SparseMatrixCSR<floatT, T>::set_bias_feature(const floatT c)
    {
        if (_has_bias_feature) {
            auto vals = _values;
            auto rend = _rowend;
            auto nrows = _nrows;
            for (size_t i = 0; i < nrows; i++) {
                vals[rend[i] - 1] = c;
            }
        }

        return _has_bias_feature;
    }

    // DANGEROUS TO USE ON A TRANSPOSED DATA MATRIX WHERE THE BIAS ENTRIES ARE THE LAST ROW!
    // reset the number of columns only if n is larger than this->_ncols
    template <typename floatT, typename T>
    bool SparseMatrixCSR<floatT, T>::reset_ncols(const size_t n)
    {
        if (n <= _ncols) return false;

        _ncols = n;
        if (_has_bias_feature) {
            auto cidx = _colidx;
            auto rend = _rowend;
            auto nrows = _nrows;
            for (size_t i = 0; i < nrows; i++) {
                cidx[rend[i] - 1] = n - 1;
            }
        }
        return true;
    }

    // return the number of elements in each column
    template <typename floatT, typename T>
    void SparseMatrixCSR<floatT, T>::col_elem_counts(std::vector<size_t> &counts) const
    {
        counts.resize(_ncols);
        std::fill(counts.begin(), counts.end(), 0);
        for (size_t i = 0; i < _nrows; i++) {
            for (size_t p = _rowstr[i]; p < _rowend[i]; p++) {
                counts[_colidx[p]]++;
            }
        }
    }

    template <typename floatT, typename T>
    void SparseMatrixCSR<floatT, T>::row_elem_counts(std::vector<size_t> &counts) const
    {
        counts.resize(_nrows);
        std::fill(counts.begin(), counts.end(), 0);
        for (size_t i = 0; i < _nrows; i++) {
            counts[i] = _rowend[i] - _rowstr[i];
        }
    }

    // scale the ith row of the sparse CSR matrix by y[i]
    template <typename floatT, typename T>
    void SparseMatrixCSR<floatT, T>::scale_rows(const Vector<T> &y)
    {
        assert(y.length() == _nrows);

        floatT *vals = _values;
        size_t nrows = _nrows;

        #pragma omp parallel for
        for (int64_t i = 0; i < nrows; i++)
        {
            floatT yi = y[i];
            size_t endidx = _rowend[i];
            for (size_t j = _rowstr[i]; j < endidx; j++)
            {
                vals[j] *= yi;
            }
        }
    }

    // on return, sqrd2norms[i] is the squared 2-norm of the ith row of the sparse CSR matrix
    template <typename floatT, typename T>
    void SparseMatrixCSR<floatT, T>::row_sqrd2norms(Vector<T> &sqrd2norms) const
    {
        assert(sqrd2norms.length() == _nrows);

        floatT *vals = _values;
        size_t nrows = _nrows;

        #pragma omp parallel for
        for (int64_t i = 0; i < nrows; i++)
        {
            T s = 0;
            size_t endidx = _rowend[i];
            for (size_t j = _rowstr[i]; j < endidx; j++)
            {
                s += vals[j] * vals[j];
            }
            sqrd2norms[i] = s;
        }
    }

    // scale each row so that it has the target 2-norm
    template <typename floatT, typename T>
    void SparseMatrixCSR<floatT, T>::normalize_rows(const T tgt2norm)
    {
        floatT *vals = _values;
        size_t nrows = _nrows;

        #pragma omp parallel for
        for (int64_t i = 0; i < nrows; i++)
        {
            T s = 0;
            size_t endidx = _rowend[i];
            for (size_t j = _rowstr[i]; j < endidx; j++)
            {
                s += vals[j] * vals[j];
            }
            if (s > 0) {
                s = tgt2norm / sqrt(s);
                for (size_t j = _rowstr[i]; j < endidx; j++)
                {
                    vals[j] *= s;
                }
            }
        }
    }

    // return the Frobenius norm of the matrix
    template <typename floatT, typename T>
    T SparseMatrixCSR<floatT, T>::Frobenious_norm() const
    {
        Vector<T> L(_nrows);
        this->row_sqrd2norms(L);
        return sqrt(L.sum());
    }

    // to test whether two matrices have the same values
    template <typename floatT, typename T>
    T SparseMatrixCSR<floatT, T>::norm_of_difference(SparseMatrixCSR<floatT, T> &A)
    {
        if (_nrows != A._nrows || _ncols != A._ncols || _nnzs != A._nnzs) return -1;

        T diff_sqrd = 0.0;

        floatT *vals = _values;
        floatT *A_vals = A._values;
        size_t *colidx = _colidx;
        size_t *A_colidx = A._colidx;
        size_t nrows = _nrows;
        for (int64_t i = 0; i < nrows; i++)
        {
            if (_rowend[i] - _rowstr[i] != A._rowend[i] - A._rowstr[i]) return -1;

            size_t ni = _rowend[i] - _rowstr[i];
            size_t jj, jA;
            for (size_t j = 0; j < ni; j++)
            {
                jj = _rowstr[i] + j;
                jA = A._rowstr[i] + j;
                if (colidx[jj] != A_colidx[jA]) return -1;
                diff_sqrd += std::pow(vals[jj] - A_vals[jA], 2);
            }
        }

        return std::sqrt(diff_sqrd);
    }

    // y = alpha * A * x + beta * y
    template <typename floatT, typename T>
    void SparseMatrixCSR<floatT, T>::aAxby(const T alpha, const Vector<T> &x, const T beta, Vector<T> &y) const
    {
        assert(y.length() == _nrows && x.length() == _ncols);

        floatT *vals = _values;
        size_t *cidx = _colidx;
        size_t nrows = _nrows;

        #pragma omp parallel for
        for (int64_t i = 0; i < nrows; i++)
        {
            T Aix = 0;
            size_t endidx = _rowend[i];
            for (size_t j = _rowstr[i]; j < endidx; j++)
            {
                Aix += vals[j] * x[cidx[j]];
            }
            y[i] *= beta;
            y[i] += alpha*Aix;
        }
    }

    // y[i] = alpha * A[rowperm[i],:] * x + beta * y[i]
    template <typename floatT, typename T>
    void SparseMatrixCSR<floatT, T>::aAxby(const T alpha, const Vector<T> &x, const T beta, Vector<T> &y, std::vector<size_t> &rowperm) const
    {
        assert(y.length() == _nrows && x.length() == _ncols && rowperm.size() == _nrows);
        assert(*std::min_element(std::begin(rowperm), std::end(rowperm)) >= 0
            && *std::max_element(std::begin(rowperm), std::end(rowperm)) < _nrows);

        floatT *vals = _values;
        size_t *cidx = _colidx;
        size_t nrows = _nrows;

        #pragma omp parallel for
        for (int64_t i = 0; i < nrows; i++)
        {
            T Aix = 0;
            size_t endidx = _rowend[rowperm[i]];
            for (size_t j = _rowstr[rowperm[i]]; j < endidx; j++)
            {
                Aix += vals[j] * x[cidx[j]];
            }
            y[i] *= beta;
            y[i] += alpha*Aix;
        }
    }

    // block matrix-vector product: y = alpha * A(:,start:end) * x + beta * y; Vector x has length end-start+1
    template <typename floatT, typename T>
    void SparseMatrixCSR<floatT, T>::aAxbyBlock(const T alpha, const size_t start, const Vector<T> &x,
        const T beta, Vector<T> &y) const
    {
        assert(y.length() == _nrows && start + x.length() <= _ncols);

        floatT *vals = _values;
        size_t *cidx = _colidx;
        size_t nrows = _nrows;
        size_t blockend = start + x.length();

        #pragma omp parallel for
        for (int64_t i = 0; i < nrows; i++)
        {
            T Aix = 0;
            size_t endidx = _rowend[i];
            size_t idx;
            for (size_t j = _rowstr[i]; j < endidx; j++)
            {
                idx = cidx[j];
                if (idx >= start && idx < blockend)
                {
                    Aix += vals[j] * x[idx - start];
                }
            }
            y[i] *= beta;
            y[i] += alpha*Aix;
        }
    }

    // y = alpha * A' * x + beta * y
    template <typename floatT, typename T>
    void SparseMatrixCSR<floatT, T>::aATxby(const T alpha, const Vector<T> &x, const T beta, Vector<T> &y) const
    {
        assert(x.length() == _nrows && y.length() == _ncols);

        // scale y first, before entering the OpenMP region with its potential race conditions
        y.scale(beta);

        floatT *vals = _values;
        size_t *cidx = _colidx;
        size_t nrows = _nrows;

        #pragma omp parallel for
        for (int64_t i = 0; i < nrows; i++)
        {
            size_t endidx = _rowend[i];
            for (size_t j = _rowstr[i]; j < endidx; j++)
            {
                #pragma omp atomic
                y[cidx[j]] += alpha * vals[j] * x[i];
            }
        }
    }

    // y = alpha * A[rowperm,:]' * x + beta * y
    template <typename floatT, typename T>
    void SparseMatrixCSR<floatT, T>::aATxby(const T alpha, const Vector<T> &x, const T beta, Vector<T> &y, std::vector<size_t> &rowperm) const
    {
        assert(x.length() == _nrows && y.length() == _ncols && rowperm.size() == _nrows);
        assert(*std::min_element(std::begin(rowperm), std::end(rowperm)) >= 0
            && *std::max_element(std::begin(rowperm), std::end(rowperm)) < _nrows);

        // scale y first, before entering the OpenMP region with its potential race conditions
        y.scale(beta);

        floatT *vals = _values;
        size_t *cidx = _colidx;
        size_t nrows = _nrows;

        #pragma omp parallel for
        for (int64_t i = 0; i < nrows; i++)
        {
            size_t endidx = _rowend[rowperm[i]];
            for (size_t j = _rowstr[rowperm[i]]; j < endidx; j++)
            {
                #pragma omp atomic
                y[cidx[j]] += alpha * vals[j] * x[i];
            }
        }
    }

    // block matrix-transpose-vector product: y = alpha * A(:,start:end)' * x + beta * y; y has length end-start+1
    template <typename floatT, typename T>
    void SparseMatrixCSR<floatT, T>::aATxbyBlock(const T alpha, const Vector<T> &x, const T beta,
        const size_t start, Vector<T> &y) const
    {
        assert(x.length() == _nrows && start + y.length() <= _ncols);

        // scale y first, before entering the OpenMP region with its potential race conditions
        y.scale(beta);

        floatT *vals = _values;
        size_t *cidx = _colidx;
        size_t nrows = _nrows;
        size_t blockend = start + y.length(); // the column block is spanned by y, not x

        #pragma omp parallel for
        for (int64_t i = 0; i < nrows; i++)
        {
            size_t endidx = _rowend[i];
            size_t cj;
            for (size_t j = _rowstr[i]; j < endidx; j++)
            {
                cj = cidx[j];
                if (cj >= start && cj < blockend)
                {
                    #pragma omp atomic
                    y[cj - start] += alpha * vals[j] * x[i];
                }
            }
        }
    }

    // return the inner product of A(row,:) and vector x, where row is a zero-based index
    template <typename floatT, typename T>
    T SparseMatrixCSR<floatT, T>::row_dot(const size_t row, const Vector<T> &x) const
    {
        assert(row < _nrows && x.length() == _ncols);

        floatT *vals = _values;
        size_t *cidx = _colidx;
        size_t stridx = _rowstr[row];
        size_t endidx = _rowend[row];
        T a = 0;

        //#pragma omp parallel for reduction(+:a) // dramatic slowdown for small numbers of features
        //#pragma omp parallel for reduction(+:a) if (endidx-stridx > 10000) // not really helpful
        for (int64_t j = stridx; j < endidx; j++)
        {
            a += vals[j] * x[cidx[j]];
        }
        return a;
    }

    // y = alpha * A(row,:) + y, where row is a zero-based index
    template <typename floatT, typename T>
    void SparseMatrixCSR<floatT, T>::row_add(const size_t row, const T alpha, Vector<T> &y) const
    {
        assert(row < _nrows && y.length() == _ncols);

        floatT *vals = _values;
        size_t *cidx = _colidx;
        size_t stridx = _rowstr[row];
        size_t endidx = _rowend[row];

        //#pragma omp parallel for // dramatic slowdown for small numbers of features, needs a condition
        //#pragma omp parallel for if (endidx-stridx > 10000) // even this is not really helpful
        for (int64_t j = stridx; j < endidx; j++)
        {
            y[cidx[j]] += alpha * vals[j];
        }
    }

    // Sparse soft-thresholding with threshold c, using only the sparsity pattern of the row, not its values!
    // Only reads elements of v and modifies elements of w that correspond to nonzero elements in the specified row
    template <typename floatT, typename T>
    void SparseMatrixCSR<floatT, T>::row_sst(const size_t row, const T c, const Vector<T> &v, Vector<T> &w) const
    {
        assert(v.length() == _ncols && w.length() == _ncols);

        size_t stridx = _rowstr[row];
        size_t endidx = _rowend[row];
        size_t *cidx = _colidx;

        //#pragma omp parallel for // dramatic slowdown for small numbers of features, needs a condition
        //#pragma omp parallel for if (endidx-stridx > 10000) // even this is not really helpful
        for (int64_t j = stridx; j < endidx; j++)
        {
            size_t idx = cidx[j];
            T vlb = v[idx] - c;
            T vub = v[idx] + c;
            w[idx] = vlb > 0 ? vlb : vub < 0 ? vub : 0;
        }
    }
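
    // The branch in row_sst() is the usual soft-thresholding operator written without fabs();
    // an equivalent scalar form (illustrative) is
    //
    //   template <typename T>
    //   T soft_threshold(T v, T c) { return v > c ? v - c : v < -c ? v + c : T(0); }
    //
    // applied only at the coordinates where the given row has nonzero entries, which is what
    // makes lazy, per-row updates cheap in sparse coordinate methods.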

    // Block-rows submatrix of SparseMatrixCSR with 0-based indexing; only pointers, no memory allocation
    template <typename floatT, typename T>
    SubSparseMatrixCSR<floatT, T>::SubSparseMatrixCSR(const SparseMatrixCSR<floatT, T> &A, const size_t row_start, const size_t num_rows)
    {
        this->recast(A, row_start, num_rows);
    }

    // this works for a submatrix of a submatrix, contiguous or not
    template <typename floatT, typename T>
    void SubSparseMatrixCSR<floatT, T>::recast(const SparseMatrixCSR<floatT, T> &A, const size_t row_start, const size_t num_rows)
    {
        std::vector<size_t> idx(num_rows);
        std::iota(std::begin(idx), std::end(idx), row_start);
        this->recast(A, idx);

        // The following code does not work for a submatrix of a submatrix if the rows are not contiguous!
        /*
        assert(num_rows > 0 && row_start + num_rows <= A._nrows);

        _nrows = num_rows;
        _ncols = A._ncols;
        _nnzs = A._rowend[row_start + num_rows - 1] - A._rowstr[row_start];

        _values = A._values;
        _colidx = A._colidx;
        _rowstr = A._rowstr + row_start;
        _rowend = A._rowend + row_start;

        _has_bias_feature = A.has_bias_feature();
        */
    }

    template <typename floatT, typename T>
    SubSparseMatrixCSR<floatT, T>::SubSparseMatrixCSR(const SparseMatrixCSR<floatT, T> &A, const std::vector<size_t> &row_indices)
    {
        this->recast(A, row_indices);
    }

    template <typename floatT, typename T>
    void SubSparseMatrixCSR<floatT, T>::recast(const SparseMatrixCSR<floatT, T> &A, const std::vector<size_t> &row_indices)
    {
        assert(*min_element(row_indices.begin(), row_indices.end()) >= 0);
        assert(*max_element(row_indices.begin(), row_indices.end()) < A._nrows);

        _nrows = row_indices.size();
        _ncols = A._ncols;

        _values = A._values;
        _colidx = A._colidx;
        _mem_rowstr.resize(_nrows);
        _mem_rowend.resize(_nrows);
        _nnzs = 0;
        for (size_t i = 0; i < _nrows; i++) {
            _mem_rowstr[i] = A._rowstr[row_indices[i]];
            _mem_rowend[i] = A._rowend[row_indices[i]];
            _nnzs += _mem_rowend[i] - _mem_rowstr[i];
        }
        _rowstr = _mem_rowstr.data();
        _rowend = _mem_rowend.data();

        _has_bias_feature = A.has_bias_feature();
    }

    // instantiate the templates with different float types
    template class SparseMatrixCSR<float, float>;
    template class SparseMatrixCSR<float, double>;
    template class SparseMatrixCSR<double, float>;
    template class SparseMatrixCSR<double, double>;
    template class SubSparseMatrixCSR<float, float>;
    template class SubSparseMatrixCSR<float, double>;
    template class SubSparseMatrixCSR<double, float>;
    template class SubSparseMatrixCSR<double, double>;
}

@ -0,0 +1,126 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#pragma once

#include <vector>

#include "dnvector.h"

namespace numerics {

    template <typename, typename> class SubSparseMatrixCSR;

    // Sparse matrix storage with value type floatT, interacting with Vector of type T
    template <typename floatT, typename T>
    class SparseMatrixCSR
    {
        // a derived class cannot access private/protected members of other instances of the parent class
        friend class SubSparseMatrixCSR<floatT, T>;

    private:
        // std::vector<>'s are used to better interface with memory allocated when reading from file
        std::vector<floatT> _mem_values;
        std::vector<size_t> _mem_colidx;
        std::vector<size_t> _mem_rowptr; // compact storage, only used with the NIST format for pointers!

    protected:
        size_t _nrows; // number of rows
        size_t _ncols; // number of columns
        size_t _nnzs;  // number of non-zero elements
        // pointers into the stored memory, convenient for the Python interface and the derived SubSparseMatrixCSR
        floatT *_values; // size _nnzs, pointing to the nonzero float/double elements of the sparse matrix
        size_t *_colidx; // size _nnzs, column indices (0-based) for the elements in _values
        size_t *_rowstr; // size _nrows, rowstr[i]-rowstr[0] is the index of the first non-zero in row i
        size_t *_rowend; // size _nrows, rowend[i]-rowstr[0] is one past the index of the last non-zero in row i
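
        // Illustrative example of the four pointer arrays above: the 2x3 matrix [a 0 b; 0 c 0]
        // is stored as _values = [a b c], _colidx = [0 2 1], and (with the compact storage)
        // _rowstr = [0 2] and _rowend = [2 3], both aliasing the single buffer _mem_rowptr = [0 2 3];
        // a SubSparseMatrixCSR built from scattered row indices instead points _rowstr/_rowend
        // at its own separately stored index arrays.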

        bool _has_bias_feature = false;

        // this constructor allows derived classes, but is protected to prevent explicit use
        SparseMatrixCSR() = default;

    public:

        // This constructor swaps the vector<>'s by default, so the input memory is gone after the call
        SparseMatrixCSR(std::vector<floatT> &vals, std::vector<size_t> &cidx, std::vector<size_t> &rptr,
            const bool has_bias_feature = false, const bool make_copy = false);

        // copy constructor, can take a transpose if specified; works with SubSparseMatrixCSR as input
        SparseMatrixCSR(const SparseMatrixCSR<floatT, T> &A, bool transpose = false);

        // An explicit destructor is not necessary because std::vector<> is used to store the data
        //~SparseMatrixCSR();
        // For ease of memory management, we do not allow the copy-assignment operator
        SparseMatrixCSR<floatT, T>& operator=(const SparseMatrixCSR<floatT, T> &A) = delete;

        // to test whether two matrices have the same values
        T norm_of_difference(SparseMatrixCSR<floatT, T> &A);

        inline size_t nrows() const { return _nrows; }
        inline size_t ncols() const { return _ncols; }
        inline size_t nnzs() const { return _nnzs; }

        inline bool has_bias_feature() const { return _has_bias_feature; }
        // THE FOLLOWING TWO ARE DANGEROUS TO USE ON A TRANSPOSED DATA MATRIX WHERE THE BIAS ENTRIES ARE THE LAST ROW!
        bool set_bias_feature(const floatT c); // only works if _has_bias_feature == true
        bool reset_ncols(const size_t n); // also needs to reset the bias feature index if it exists

        // return the number of elements in each column
        void col_elem_counts(std::vector<size_t> &counts) const;
        void row_elem_counts(std::vector<size_t> &counts) const;

        // scale the ith row of the sparse CSR matrix by y[i]
        void scale_rows(const Vector<T> &y);
        // on return, sqrd2norms[i] is the squared 2-norm of the ith row of the sparse CSR matrix
        void row_sqrd2norms(Vector<T> &sqrd2norms) const;
        // scale each row so that it has the target 2-norm
        void normalize_rows(const T tgt2norm = 1);
        // return the Frobenius norm of the matrix
        T Frobenious_norm() const;

        // y = alpha * A * x + beta * y
        void aAxby(const T alpha, const Vector<T> &x, const T beta, Vector<T> &y) const;
        // y[i] = alpha * A[rowperm[i],:] * x + beta * y[i]
        void aAxby(const T alpha, const Vector<T> &x, const T beta, Vector<T> &y, std::vector<size_t> &rowperm) const;
        // y = alpha * A' * x + beta * y
        void aATxby(const T alpha, const Vector<T> &x, const T beta, Vector<T> &y) const;
        // y = alpha * A[rowperm,:]' * x + beta * y
        void aATxby(const T alpha, const Vector<T> &x, const T beta, Vector<T> &y, std::vector<size_t> &rowperm) const;

        // The following two functions are not efficient in taking advantage of submatrices
        // block matrix-vector product: y = alpha * A(:,start:end) * x + beta * y; Vector x has length end-start+1
        void aAxbyBlock(const T alpha, const size_t start, const Vector<T> &x, const T beta, Vector<T> &y) const;
        // block matrix-transpose-vector product: y = alpha * A(:,start:end)' * x + beta * y; y has length end-start+1
        void aATxbyBlock(const T alpha, const Vector<T> &x, const T beta, const size_t start, Vector<T> &y) const;

        // return the inner product of A(idx,:) and vector x
        T row_dot(const size_t rowidx, const Vector<T> &x) const;
        // y = alpha * A(idx,:) + y
        void row_add(const size_t rowidx, const T alpha, Vector<T> &y) const;
        // sparse soft-thresholding using the sparsity pattern of one particular row
        void row_sst(const size_t row, const T c, const Vector<T> &v, Vector<T> &w) const;
    };

    // This class defines a mask for a submatrix consisting of selected rows of a sparse matrix
    template <typename floatT, typename T>
    class SubSparseMatrixCSR final : public SparseMatrixCSR<floatT, T>
    {
    private:
        std::vector<size_t> _mem_rowstr; // store NIST-format row indices when the sub-matrix
        std::vector<size_t> _mem_rowend; // is constructed from a subset of row indices

    public:
        // constructors from a SparseMatrixCSR with 0-based indexing
        SubSparseMatrixCSR(const SparseMatrixCSR<floatT, T> &A, const size_t row_start, const size_t num_rows);
        SubSparseMatrixCSR(const SparseMatrixCSR<floatT, T> &A, const std::vector<size_t> &row_indices);
        void recast(const SparseMatrixCSR<floatT, T> &A, const size_t row_start, const size_t num_rows);
        void recast(const SparseMatrixCSR<floatT, T> &A, const std::vector<size_t> &row_indices);

        // For ease of memory management, we do not allow the copy constructor and copy-assignment operator
        SubSparseMatrixCSR() = delete;
        SubSparseMatrixCSR(const SubSparseMatrixCSR<floatT, T>&) = delete;
        SubSparseMatrixCSR(const SubSparseMatrixCSR<floatT, T>&, bool) = delete;
        SubSparseMatrixCSR<floatT, T>& operator=(const SubSparseMatrixCSR<floatT, T>&) = delete;
    };
}
|
||||
|
|
@ -0,0 +1,124 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <vector>
#include <iostream>
#include <chrono>
#include <random>
#include <numeric>
#include <algorithm>

#include "utils.h"
#include "timer.h"
#include "spmatrix.h"
#include "floattype.h"

using namespace std;    // for the unqualified string and vector below (in case utils.h does not already import std)

using Vector = numerics::Vector<floatT>;
using SubVector = numerics::SubVector<floatT>;
using SparseMatrixCSR = numerics::SparseMatrixCSR<spmatT, floatT>;
using SubSparseMatrixCSR = numerics::SubSparseMatrixCSR<spmatT, floatT>;

int main(int argc, char* argv[])
{
    //string filename = "C:/Data/LIBSVM/splice1percent100_1";
    //string filename = "C:/Data/LIBSVM/ads1percent100_1";
    //string filename = "C:/Data/LIBSVM/rcv1_test.binary";
    string filename = "C:/Data/LIBSVM/rcv1_train.binary";
    //string filename = "C:/Data/LIBSVM/news20.binary";
    //string filename = "C:/Data/LIBSVM/covtype.libsvm.binary.scale";

    vector<spmatT> labels;
    vector<spmatT> weights;
    vector<spmatT> values;
    vector<size_t> colidx;
    vector<size_t> rowptr;

    load_datafile(filename, labels, weights, values, colidx, rowptr, false, true, true);

    SparseMatrixCSR X(values, colidx, rowptr, false, false);
    Vector y(labels);
    size_t N = X.nrows();
    size_t D = X.ncols();
    std::cout << "Sparse matrix with " << N << " rows, " << D << " cols, " << X.nnzs() << " nnzs." << std::endl;

    // test matrix-vector multiplication -----------------------------------------
    Vector v(D), Xv(N), u(N), XTu(D);
    u.fill_rand(0, 1);
    v.fill_rand(0, 1);

    HighResTimer timer;
    X.aAxby(1, v, 0, Xv);
    float t_elpsd = timer.seconds_from_last();
    std::cout << "Matrix-Vector multiplication took " << t_elpsd << " seconds." << std::endl;

    timer.reset();
    X.aATxby(1, u, 0, XTu);
    t_elpsd = timer.seconds_from_last();
    std::cout << "Matrix.T-Vector multiplication took " << t_elpsd << " seconds." << std::endl;

    // ---------------------------------------------------------------------------
    // test submatrix splitting with random indices, in preparation for the DSCOVR algorithms
    size_t m = 10;
    size_t K = 20;
    std::vector<size_t> row_ids = random_permutation(N);
    std::vector<size_t> col_ids = random_permutation(D);

    std::vector<SparseMatrixCSR*> XiT(m);
    std::vector<std::vector<SubSparseMatrixCSR*>> XiTk(m);
    for (size_t i = 0; i < m; i++) { XiTk[i].resize(K); }

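    // balanced partition: the first (N % m) row blocks and the first (D % K) column
    // blocks each get one extra element, so block sizes differ by at most one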
    size_t row_stride = N / m;
    size_t row_remain = N % m;
    size_t col_stride = D / K;
    size_t col_remain = D % K;
    SubSparseMatrixCSR Xi(X, 0, 1);    // temporary initialization
    size_t row_start = 0;
    for (size_t i = 0; i < m; i++) {
        size_t n_rows = i < row_remain ? row_stride + 1 : row_stride;
        std::vector<size_t> subrows(row_ids.begin() + row_start, row_ids.begin() + (row_start + n_rows));
        Xi.recast(X, subrows);
        XiT[i] = new SparseMatrixCSR(Xi, true);
        row_start += n_rows;

        size_t col_start = 0;
        for (size_t k = 0; k < K; k++) {
            size_t n_cols = k < col_remain ? col_stride + 1 : col_stride;
            std::vector<size_t> subcols(col_ids.begin() + col_start, col_ids.begin() + (col_start + n_cols));
            XiTk[i][k] = new SubSparseMatrixCSR(*XiT[i], subcols);    // actually rows of XiT after the transpose
            col_start += n_cols;
        }
    }

    // test matrix-vector multiplications using the split submatrices
    Vector Xv2(N), Xv3(N), uperm(N), XTu2(D);
    SubVector Xiv(Xv2, 0, N);
    u.post_permute(row_ids, uperm);
    SubVector ui(uperm, 0, N);
    row_start = 0;
    for (size_t i = 0; i < m; i++) {
        Xiv.recast(Xv2, row_start, XiT[i]->ncols());
        XiT[i]->aATxby(1, v, 0, Xiv);
        ui.recast(uperm, row_start, XiT[i]->ncols());
        XiT[i]->aAxby(1, ui, 1, XTu2);
        row_start += Xiv.length();
    }
    std::cout << "||X*v|| = " << Xv.norm2() << std::endl;
    std::cout << "||[X1*v; ...; Xm*v]|| = " << Xv2.norm2() << std::endl;
    Xv2.pre_permute(row_ids, Xv3);
    Vector::axpy(-1, Xv, Xv3);
    std::cout << "||X*v - [X1*v; ...; Xm*v]|| = " << Xv3.norm2() << std::endl;

    std::cout << "||X'*u|| = " << XTu.norm2() << std::endl;
    std::cout << "||X1'*u1 + ... + Xm'*um|| = " << XTu2.norm2() << std::endl;
    Vector::axpy(-1, XTu, XTu2);
    std::cout << "||X'*u - (X1'*u1 + ... + Xm'*um)|| = " << XTu2.norm2() << std::endl;

    // delete pointers allocated with "new"
    for (size_t i = 0; i < m; i++) {
        delete XiT[i];
        for (size_t k = 0; k < K; k++) {
            delete XiTk[i][k];
        }
    }
    return 0;
}
@ -0,0 +1,141 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
      <Configuration>Debug</Configuration>
      <Platform>Win32</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|Win32">
      <Configuration>Release</Configuration>
      <Platform>Win32</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Debug|x64">
      <Configuration>Debug</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|x64">
      <Configuration>Release</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="display.h" />
    <ClInclude Include="randalgms.h" />
    <ClInclude Include="sampling.h" />
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="display.cpp" />
    <ClCompile Include="rpcd_cocoa.cpp" />
    <ClCompile Include="sdca.cpp" />
    <ClCompile Include="sdca_weighted.cpp" />
    <ClCompile Include="test_sdca.cpp" />
  </ItemGroup>
  <PropertyGroup Label="Globals">
    <ProjectGuid>{F2136A94-90B2-44EE-B812-9FA8C0EDD175}</ProjectGuid>
    <RootNamespace>Arica</RootNamespace>
    <ProjectName>SDCA</ProjectName>
    <WindowsTargetPlatformVersion>10.0.16299.0</WindowsTargetPlatformVersion>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <CharacterSet>MultiByte</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>MultiByte</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
    <ConfigurationType>StaticLibrary</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <CharacterSet>MultiByte</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
    <ConfigurationType>StaticLibrary</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>MultiByte</CharacterSet>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
  </ImportGroup>
  <ImportGroup Label="Shared">
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup />
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <SDLCheck>true</SDLCheck>
    </ClCompile>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <SDLCheck>true</SDLCheck>
      <AdditionalIncludeDirectories>../Numerics;../Utils</AdditionalIncludeDirectories>
      <OpenMPSupport>false</OpenMPSupport>
    </ClCompile>
    <Link>
      <AdditionalDependencies>Numerics.lib;Utils.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
      <SubSystem>Console</SubSystem>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <SDLCheck>true</SDLCheck>
    </ClCompile>
    <Link>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <SDLCheck>true</SDLCheck>
      <AdditionalIncludeDirectories>../Numerics;../Utils</AdditionalIncludeDirectories>
      <OpenMPSupport>true</OpenMPSupport>
      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    </ClCompile>
    <Link>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
      <AdditionalDependencies>Numerics.lib;Utils.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <SubSystem>Console</SubSystem>
    </Link>
  </ItemDefinitionGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
</Project>
@ -0,0 +1,54 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <string>
#include <iostream>
#include <iomanip>
#include <vector>
#include <numeric>
#include <random>
#include <algorithm>

#include "display.h"

namespace randalgms {

    void display_header(std::string algm_info)
    {
        std::cout << std::endl << algm_info << std::endl;
        std::cout << "epoch primal dual gap time" << std::endl;
    }

    void display_progress(int epoch, double primal_obj, double dual_obj, double time_from_start)
    {
        std::cout << std::setw(3) << epoch
            << std::fixed << std::setprecision(6)
            << std::setw(10) << primal_obj
            << std::setw(10) << dual_obj
            << std::scientific << std::setprecision(3)
            << std::setw(12) << primal_obj - dual_obj
            << std::fixed << std::setprecision(3)
            << std::setw(9) << time_from_start
            << std::endl;
    }

    void display_header_stage(std::string algm_info)
    {
        std::cout << std::endl << algm_info << std::endl;
        std::cout << "stage epoch primal dual gap time" << std::endl;
    }

    void display_progress_stage(int stage, int epoch, double primal_obj, double dual_obj, double time_from_start)
    {
        std::cout << std::setw(3) << stage
            << std::setw(6) << epoch
            << std::fixed << std::setprecision(6)
            << std::setw(10) << primal_obj
            << std::setw(10) << dual_obj
            << std::scientific << std::setprecision(3)
            << std::setw(12) << primal_obj - dual_obj
            << std::fixed << std::setprecision(3)
            << std::setw(9) << time_from_start
            << std::endl;
    }
}
@ -0,0 +1,14 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#pragma once

#include <string>

namespace randalgms
{
    void display_header(std::string algm_info);
    void display_progress(int epoch, double primal_obj, double dual_obj, double time_from_start);
    void display_header_stage(std::string algm_info);
    void display_progress_stage(int stage, int epoch, double primal_obj, double dual_obj, double time_from_start);
}
@ -0,0 +1,25 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#pragma once

#include "floattype.h"
#include "functions.h"

using namespace functions;

namespace randalgms
{
    // SDCA needs the regularization function to be 1-strongly convex (unit strong convexity)
    int sdca(const SparseMatrixCSR &X, const Vector &y, const ISmoothLoss &f, const double lambda,
        const IUnitRegularizer &g, const int max_epoch, const double eps_gap, Vector &wio, Vector &aio,
        const char opt_sample = 'p', const char opt_update = 'd', const bool display = true);

    int sdca(const SparseMatrixCSR &X, const Vector &y, const Vector &theta, const ISmoothLoss &f, const double lambda,
        const IUnitRegularizer &g, const int max_epoch, const double eps_gap, Vector &wio, Vector &aio,
        const char opt_sample = 'p', const char opt_update = 'd', const bool display = true);

    // RPCD algorithm for solving CoCoA local optimization problems
    int rpcd_cocoa(const SparseMatrixCSR &X, const Vector &y, const ISmoothLoss &f, const double lambda_N, const double sigma,
        const int max_epoch, const Vector &Xw, const Vector &a0, Vector &a1, const char opt_sample = 'p', const bool display = true);
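
    // Minimal usage sketch (illustrative only; LogisticLoss, SquaredL2Norm, and the
    // data X, y are assumed to be set up as in test_sdca.cpp):
    //
    //   Vector w(X.ncols()), a(X.nrows());
    //   LogisticLoss f;
    //   SquaredL2Norm g;
    //   int epochs = sdca(X, y, f, 1.0e-5, g, 100, 1e-8, w, a, 'p', 'd');
    //
    // opt_sample: 'u' uniform, 'w' weighted, 'p' permutation;
    // opt_update: 'd' dual prox, 'p' primal gradient, 'b' conjugate-free (see sdca.cpp).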
}
@ -0,0 +1,111 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <stdexcept>
#include <string>
#include <cmath>    // for std::pow used below

#include "randalgms.h"
#include "sampling.h"
#include "timer.h"
#include "display.h"

namespace randalgms {

    // RPCD algorithm for solving CoCoA local optimization problems
    int rpcd_cocoa(const SparseMatrixCSR &X, const Vector &y, const ISmoothLoss &f, const double lambda_N, const double sigma,
        const int max_epoch, const Vector &Xw, const Vector &a0, Vector &a1, const char opt_sample, const bool display)
    {
        size_t n = X.nrows();
        size_t d = X.ncols();

        if (y.length() != n || Xw.length() != n || a0.length() != n || a1.length() != n) {
            throw std::runtime_error("RPCD(CoCoA): Input/output matrix and vector dimensions do not match.");
        }

        // start tracking time elapsed
        HighResTimer timer;

        // initialize primal-dual variables and working space variables
        Vector L(n);    // vector to store the row-wise Lipschitz constants
        X.row_sqrd2norms(L);

        // construct the random index generator after vector L is computed
        IRandomIndexGenerator *randidx;
        switch (opt_sample) {
        case 'u':    // i.i.d. uniform sampling
            randidx = new RandIndex_Unifom(n);
            break;
        case 'w':    // i.i.d. weighted sampling proportional to L[i]
            randidx = new RandIndex_Weighted(L.to_vector());
            break;
        case 'p':    // random permutation for each epoch
            randidx = new RandIndex_Permut(n);
            break;
        default:
            throw std::runtime_error("RPCD(CoCoA): sampling method " + std::string(1, opt_sample) + " is not defined.");
        }

        // scale L for convenience of computation: L[i] = ||X_i||^2 * sigma / (lambda * n)
        L.scale(sigma / lambda_N);

        // check that the initial a is feasible for the conjugate loss; reset it to zero if not
        a1.copy(a0);
        if (!f.conj_feasible(a1, y)) {
            a1.fill_zero();
        }
        double Da = f.sum_conjugate(a1, y);    // objective value with initial a1 = a0
        Da /= n;

        double t_elpsd = timer.seconds_from_start();
        if (display) {
            display_header("RPCD(CoCoA) with options: sample(" + std::string(1, opt_sample) + ").");
            display_progress(0, 0, Da, t_elpsd);    // no primal objective is tracked here
        }

        Vector XTda(d);    // XTda = X'(a1 - a0), needed for the gradient of the quadratic term (sigma/(2*lambda_N))*||X'(a1 - a0)||^2
        size_t i;
        double ai, yi, Li, gi, ai1;    // per-coordinate working variables
        int epoch = 0;
        while (epoch < max_epoch)
        {
            // iterate over data once for each epoch
            for (int64_t iter = 0; iter < n; iter++)
            {
                i = randidx->next();

                ai = a1[i];
                yi = y[i];
                Li = L[i];

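                // partial gradient of the local (CoCoA) objective at coordinate i:
                // the fixed linear term -Xw[i] plus (sigma/lambda_N) * <X_i, X'(a1 - a0)>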
                gi = -Xw[i] + (sigma / lambda_N)*X.row_dot(i, XTda);

                if (Li > 0)
                {
                    ai1 = f.conj_prox(1.0f / Li, ai - gi / Li, yi);
                }
                else
                {
                    ai1 = ai;    // empty row: leave the dual variable unchanged (ai1 was uninitialized in this case)
                }

                if (ai1 != ai)
                {
                    X.row_add(i, ai1 - ai, XTda);
                    a1[i] = ai1;
                }
            }

            epoch++;

            Da = f.sum_conjugate(a1, y) - Xw.dot(a1) + 0.5*(sigma / lambda_N)*std::pow(XTda.norm2(), 2);
            Da /= n;

            t_elpsd = timer.seconds_from_start();
            if (display) {
                display_progress(epoch, 0, Da, t_elpsd);
            }
        }

        // delete pointers initialized with "new"
        delete randidx;

        return epoch;
    }
}
@ -0,0 +1,78 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#pragma once

#include <algorithm>
#include <numeric>
#include <random>
#include <vector>

namespace randalgms {

    // an abstract class to serve as the interface to random index generators
    class IRandomIndexGenerator
    {
    protected:
        size_t _n;                // number of indices; generated values lie in 0, 1, ..., n-1
        std::mt19937_64 _rng;     // random number generator, seeded in the constructor

    public:
        IRandomIndexGenerator(const size_t n) : _n(n) {
            std::random_device rd;
            _rng.seed(rd());
        }
        virtual ~IRandomIndexGenerator() = default;    // virtual destructor: generators are deleted through base pointers
        virtual size_t operator()() = 0;
        virtual size_t next() = 0;
    };

    class RandIndex_Unifom : public IRandomIndexGenerator
    {
    private:
        std::uniform_int_distribution<size_t> _unif;

    public:
        RandIndex_Unifom(const size_t n) : IRandomIndexGenerator(n), _unif(0, n - 1) {}

        size_t operator()() override { return _unif(_rng); }
        size_t next() override { return _unif(_rng); }
    };

    class RandIndex_Permut : public IRandomIndexGenerator
    {
    private:
        std::vector<size_t> _indices;
        size_t _count;

    public:
        RandIndex_Permut(const size_t n) : IRandomIndexGenerator(n), _indices(n), _count(0) {
            std::iota(_indices.begin(), _indices.end(), 0);        // 0, 1, ..., n-1
            std::shuffle(_indices.begin(), _indices.end(), _rng);  // random permutation
        }

        size_t operator()() override { return this->next(); }

        size_t next() override {
            if (_count == _n) {
                std::shuffle(_indices.begin(), _indices.end(), _rng);  // reshuffle for a new epoch
                _count = 0;
            }
            return _indices[_count++];
        }
    };

    class RandIndex_Weighted : public IRandomIndexGenerator
    {
    private:
        std::discrete_distribution<size_t> _distr;

    public:
        RandIndex_Weighted(const std::vector<float> &weights)
            : IRandomIndexGenerator(weights.size()), _distr(weights.begin(), weights.end()) {}
        RandIndex_Weighted(const std::vector<double> &weights)
            : IRandomIndexGenerator(weights.size()), _distr(weights.begin(), weights.end()) {}

        size_t operator()() override { return _distr(_rng); }
        size_t next() override { return _distr(_rng); }
    };
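
    // Minimal usage sketch (illustrative only; n and the weights are placeholders):
    //
    //   RandIndex_Permut perm(n);      // fresh random permutation per epoch ('p' option)
    //   size_t i = perm.next();        // next index in the current permutation
    //
    //   RandIndex_Weighted wgt(std::vector<double>{ 0.5, 0.3, 0.2 });
    //   size_t j = wgt();              // i.i.d. draw with probability proportional to the weights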
}
@ -0,0 +1,192 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <stdexcept>
#include <string>
#include <cmath>    // for std::abs used below

#include "randalgms.h"
#include "sampling.h"
#include "timer.h"
#include "display.h"

namespace randalgms {

    // SDCA needs the regularization function to be 1-strongly convex (unit strong convexity)
    int sdca(const SparseMatrixCSR &X, const Vector &y, const ISmoothLoss &f, const double lambda,
        const IUnitRegularizer &g, const int max_epoch, const double eps_gap, Vector &w, Vector &a,
        const char opt_sample, const char opt_update, const bool display)
    {
        size_t n = X.nrows();
        size_t d = X.ncols();

        if (y.length() != n || w.length() != d || a.length() != n) {
            throw std::runtime_error("SDCA: Input/output matrix and vector dimensions do not match.");
        }

        // start tracking time elapsed
        HighResTimer timer;

        // initialize primal-dual variables and working space variables
        Vector L(n);    // vector to store the row-wise Lipschitz constants
        X.row_sqrd2norms(L);

        // construct the random index generator after vector L is computed
        IRandomIndexGenerator *randidx;
        switch (opt_sample) {
        case 'u':    // i.i.d. uniform sampling
            randidx = new RandIndex_Unifom(n);
            break;
        case 'w':    // i.i.d. weighted sampling proportional to L[i]
            randidx = new RandIndex_Weighted(L.to_vector());
            break;
        case 'p':    // random permutation for each epoch
            randidx = new RandIndex_Permut(n);
            break;
        default:
            throw std::runtime_error("SDCA: sampling method " + std::string(1, opt_sample) + " is not defined.");
        }

        double gammainv = f.smoothness();    // f is (1/gamma)-smooth
        double lambda_n = lambda*n;          // SDCA assumes g has unit strong convexity
        L.scale(1.0 / lambda_n);             // scale L for easy computation: L[i] = ||X_i||^2 / (lambda * n)

        // check that the initial a is feasible for the conjugate loss; reset it to zero if not
        if (!f.conj_feasible(a, y)) {
            a.fill_zero();
        }
        // the initial dual variable a is used (not w) because SDCA is fundamentally a dual algorithm
        Vector v(d);     // intermediate variable to compute the next w
        Vector Xw(n);    // vector to store X * w
        X.aATxby(-1.0 / lambda_n, a, 0, v);    // v = - (X' * a) / (lambda * n)
        g.conj_grad(v, w);                     // w = \nabla g*(v) determined by a
        X.aAxby(1.0f, w, 0, Xw);               // Xw = X * w
        // compute primal and dual objective values, and the duality gap
        double Pw = f.sum_loss(Xw, y) / n + lambda*g(w);
        double Da = -f.sum_conjugate(a, y) / n - lambda*g.conjugate(v, w);
        double Pw_Da = Pw - Da;

        Vector beta(n);    // auxiliary points for the conjugate-free update
        f.conj_derivative(a, y, beta);

        double t_elpsd = timer.seconds_from_start();
        if (display) {
            display_header("SDCA with options: sample(" + std::string(1, opt_sample)
                + "), update(" + std::string(1, opt_update) + ").");
            display_progress(0, Pw, Da, t_elpsd);
        }

        // SDCA main loop
        size_t i;
        double ai, yi, Li, Xiw, ai1, da;    // per-coordinate working variables
        double cfp = 2;                     // damping parameter used only by the conjugate-free 'b' update
        int epoch = 0;
        while (epoch < max_epoch)
        {
            // iterate over data once for each epoch
            for (int64_t iter = 0; iter < n; iter++)
            {
                i = randidx->next();

                ai = a[i];
                yi = y[i];
                Li = L[i];

                // compute Xi'*w, exploiting the structure of g
                switch (g.symbol()) {
                case 'q':
                    Xiw = X.row_dot(i, v) + X.row_dot(i, *g.offset());
                    break;
                default:
                    Xiw = X.row_dot(i, w);
                    break;
                }

                da = 0;
                if (Li > 0)
                {
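                    // coordinate update options:
                    //   'd' : standard SDCA dual update via the conjugate prox mapping f.conj_prox
                    //   'p' : primal update, a damped step using the primal derivative f.derivative
                    //   'b' : conjugate-free update that maintains the auxiliary points beta[i]
                    //         (the first epoch falls back to the primal-style step to initialize beta)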
                    switch (opt_update) {
                    case 'b':    // conjugate-free update, based on a Bregman divergence
                        if (epoch == 0) {
                            da = (f.derivative(Xiw, yi) - ai) / (1.0 + Li * gammainv);
                            ai1 = ai + da;
                            beta[i] = f.conj_derivative(ai1, yi);
                        }
                        else {
                            beta[i] = ((Li*gammainv / cfp)*beta[i] + Xiw) / (1.0 + Li * gammainv / cfp);
                            ai1 = f.derivative(beta[i], yi);
                            da = ai1 - ai;
                        }
                        break;
                    case 'p':    // primal update, use the primal function gradient
                        da = (f.derivative(Xiw, yi) - ai) / (1.0 + Li*gammainv);
                        ai1 = ai + da;
                        break;
                    case 'd':    // dual update, use the dual (conjugate) function prox mapping
                    default:
                        ai1 = f.conj_prox(1.0f / Li, ai + Xiw / Li, yi);
                        da = ai1 - ai;
                        break;
                    }
                }

                //if (da != 0)
                if (std::abs(da) > 1e-8)
                {
                    switch (g.symbol()) {
                    case '2':    // regularization g is the unit L2 norm squared
                        X.row_add(i, -da / lambda_n, w);
                        break;
                    case 'q':    // regularization g is an offset quadratic
                        X.row_add(i, -da / lambda_n, v);
                        break;
                    case 'e':    // regularization g is the elastic net (unit L2 + L1)
                        X.row_add(i, -da / lambda_n, v);
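                        // row_sst is presumably a row-restricted soft-threshold: w gets the
                        // L1-prox of v with penalty g.l1penalty() on the support of row i
                        // (an assumption based on the name and the elastic net context)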
                        X.row_sst(i, g.l1penalty(), v, w);
                        break;
                    default:    // generic update w = \nabla g*(v)
                        X.row_add(i, -da / lambda_n, v);
                        g.conj_grad(v, w);
                        break;
                    }

                    a[i] = ai1;
                }
            }

            epoch++;

            // re-calculate v if necessary to combat numerical error accumulation (useful for async?)
            //X.aATxby(-1.0 / lambda_n, a, 0, v);
            //g.conj_grad(v, w);

            // compute primal and dual objective values and duality gap
            if (g.symbol() == 'q') {
                Vector::elem_add(*g.offset(), v, w);
            }
            X.aAxby(1.0f, w, 0, Xw);
            Pw = f.sum_loss(Xw, y) / n + lambda*g(w);
            if (g.symbol() == '2') {
                Da = -f.sum_conjugate(a, y) / n - lambda*g.conjugate(w);
            }
            else {
                Da = -f.sum_conjugate(a, y) / n - lambda*g.conjugate(v);
            }

            t_elpsd = timer.seconds_from_start();
            if (display) {
                display_progress(epoch, Pw, Da, t_elpsd);
            }

            Pw_Da = Pw - Da;
            if (Pw_Da < eps_gap) {
                break;
            }
        }

        // delete pointers initialized with "new"
        delete randidx;

        return epoch;
    }
}
@ -0,0 +1,178 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <stdexcept>
#include <string>

#include "randalgms.h"
#include "sampling.h"
#include "timer.h"
#include "display.h"

namespace randalgms {

    // SDCA needs the regularization function to be 1-strongly convex (unit strong convexity)
    int sdca(const SparseMatrixCSR &X, const Vector &y, const Vector &theta, const ISmoothLoss &f, const double lambda,
        const IUnitRegularizer &g, const int max_epoch, const double eps_gap, Vector &w, Vector &a,
        const char opt_sample, const char opt_update, const bool display)
    {
        size_t n = X.nrows();
        size_t d = X.ncols();

        if (y.length() != n || theta.length() != n || w.length() != d || a.length() != n) {
            throw std::runtime_error("SDCA: Input/output matrix and vector dimensions do not match.");
        }

        // start tracking time elapsed
        HighResTimer timer;

        // initialize primal-dual variables and working space variables
        Vector L(n);    // vector to store the row-wise Lipschitz constants
        X.row_sqrd2norms(L);

        // construct the random index generator after vector L is computed
        IRandomIndexGenerator *randidx;
        switch (opt_sample) {
        case 'u':    // i.i.d. uniform sampling
            randidx = new RandIndex_Unifom(n);
            break;
        case 'w':    // i.i.d. weighted sampling proportional to L[i]
            randidx = new RandIndex_Weighted(L.to_vector());
            break;
        case 'p':    // random permutation for each epoch
            randidx = new RandIndex_Permut(n);
            break;
        default:
            throw std::runtime_error("SDCA: sampling method " + std::string(1, opt_sample) + " is not defined.");
        }

        double gammainv = f.smoothness();    // f is (1/gamma)-smooth
        double lambda_n = lambda*n;          // SDCA assumes g has unit strong convexity
        L.scale(1.0 / lambda_n);             // scale L for easy computation: L[i] = ||X_i||^2 / (lambda * n)
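        // weighted case: each loss term carries weight theta[i], so the effective
        // per-coordinate constant becomes L[i] = theta[i] * ||X_i||^2 / (lambda * n),
        // and every v/w update below scales the dual change da by theta[i]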
        Vector::elem_multiply(L, theta, L);    // modification for the update in the weighted case

        // check that the initial a is feasible for the conjugate loss; reset it to zero if not
        if (!f.conj_feasible(a, y)) {
            a.fill_zero();
        }
        // the initial dual variable a is used (not w) because SDCA is fundamentally a dual algorithm
        Vector v(d);     // intermediate variable to compute the next w
        Vector Xw(n);    // vector to store X * w
        Vector::elem_multiply(theta, a, Xw);   // reuse Xw's memory to form theta .* a (Xw is overwritten below)
        X.aATxby(-1.0 / lambda_n, Xw, 0, v);   // v = - (X' * (theta .* a)) / (lambda * n)
        g.conj_grad(v, w);                     // w = \nabla g*(v) determined by a
        X.aAxby(1.0f, w, 0, Xw);               // Xw = X * w
        // compute primal and dual objective values, and the duality gap
        double Pw = f.sum_loss(theta, Xw, y) / n + lambda*g(w);
        double Da = -f.sum_conjugate(theta, a, y) / n - lambda*g.conjugate(v, w);
        double Pw_Da = Pw - Da;

        double t_elpsd = timer.seconds_from_start();
        if (display) {
            display_header("SDCA with options: sample(" + std::string(1, opt_sample)
                + "), update(" + std::string(1, opt_update) + ").");
            display_progress(0, Pw, Da, t_elpsd);
        }

        // SDCA main loop
        size_t i;
        double ai, yi, Li, Xiw, ai1, da;    // per-coordinate working variables
        int epoch = 0;
        while (epoch < max_epoch)
        {
            // iterate over data once for each epoch
            for (int64_t iter = 0; iter < n; iter++)
            {
                i = randidx->next();

                ai = a[i];
                yi = y[i];
                Li = L[i];

                // compute Xi'*w, exploiting the structure of g
                switch (g.symbol()) {
                case 'q':
                    Xiw = X.row_dot(i, v) + X.row_dot(i, *g.offset());
                    break;
                default:
                    Xiw = X.row_dot(i, w);
                    break;
                }

                da = 0;
                if (Li > 0)
                {
                    switch (opt_update) {
                    case 'p':    // primal update, use the primal function gradient
                        da = (f.derivative(Xiw, yi) - ai) / (1.0 + Li*gammainv);
                        ai1 = ai + da;
                        break;
                    case 'd':    // dual update, use the dual (conjugate) function prox mapping
                    default:
                        ai1 = f.conj_prox(1.0f / Li, ai + Xiw / Li, yi);
                        da = ai1 - ai;
                        break;
                    }
                }

                if (da != 0)
                {
                    switch (g.symbol()) {
                    case '2':    // regularization g is the unit L2 norm squared
                        X.row_add(i, -theta[i] * da / lambda_n, w);
                        break;
                    case 'q':    // regularization g is an offset quadratic
                        X.row_add(i, -theta[i] * da / lambda_n, v);
                        break;
                    case 'e':    // regularization g is the elastic net (unit L2 + L1)
                        X.row_add(i, -theta[i] * da / lambda_n, v);
                        X.row_sst(i, g.l1penalty(), v, w);
                        break;
                    default:    // generic update w = \nabla g*(v)
                        X.row_add(i, -theta[i] * da / lambda_n, v);
                        g.conj_grad(v, w);
                        break;
                    }

                    a[i] = ai1;
                }
            }

            epoch++;

            // re-calculate v if necessary to combat numerical error accumulation (useful for async?)
            //X.aATxby(-1.0 / lambda_n, a, 0, v);    // here we would need to multiply by theta as well!
            //g.conj_grad(v, w);

            // compute primal and dual objective values and duality gap
            if (g.symbol() == 'q') {
                Vector::elem_add(*g.offset(), v, w);
            }
            X.aAxby(1.0f, w, 0, Xw);
            Pw = f.sum_loss(theta, Xw, y) / n + lambda*g(w);
            if (g.symbol() == '2') {
                Da = -f.sum_conjugate(theta, a, y) / n - lambda*g.conjugate(w);
            }
            else {
                Da = -f.sum_conjugate(theta, a, y) / n - lambda*g.conjugate(v);
            }

            t_elpsd = timer.seconds_from_start();
            if (display) {
                display_progress(epoch, Pw, Da, t_elpsd);
            }

            Pw_Da = Pw - Da;
            if (Pw_Da < eps_gap) {
                break;
            }
        }

        // delete pointers initialized with "new"
        delete randidx;

        return epoch;
    }
}
@ -0,0 +1,110 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <string>
#include <iostream>
#include <algorithm>
#include <random>
#include <numeric>
#include <stdexcept>
#include <ctime>
#include <math.h>

#include "utils.h"
#include "randalgms.h"

using namespace std;
using namespace randalgms;

//int main_test_sdca(int argc, char* argv[])
int main(int argc, char* argv[])
{
    if (argc < 2) {
        std::cout << "Need to specify training data file." << std::endl;
        std::exit(0);
    }
    string trainingfile = argv[1];
    string testingfile;
    if (argc >= 3) { testingfile = argv[2]; }

    // read training and testing files
    vector<spmatT> labels;
    vector<spmatT> weights;
    vector<spmatT> values;
    vector<size_t> colidx;
    vector<size_t> rowptr;

    std::cout << "Loading training data ... " << std::endl;
    size_t n_examples = load_datafile(trainingfile, labels, weights, values, colidx, rowptr, false, true, true);
    SparseMatrixCSR X(values, colidx, rowptr, false);
    Vector y(labels);

    size_t dim = X.ncols();

    SparseMatrixCSR *X_test = nullptr;
    Vector *y_test = nullptr;
    if (!testingfile.empty()) {
        std::cout << "Loading test data ... " << std::endl;
        load_datafile(testingfile, labels, weights, values, colidx, rowptr, false, true, true);
        X_test = new SparseMatrixCSR(values, colidx, rowptr, false);
        y_test = new Vector(labels);

        dim = X.ncols() >= X_test->ncols() ? X.ncols() : X_test->ncols();
        X.reset_ncols(dim);
        X_test->reset_ncols(dim);
    }

    // train using SDCA and display error rates
    Vector w(dim), a(X.nrows());
    ISmoothLoss *f = new LogisticLoss();
    //ISmoothLoss *f = new SmoothedHinge();
    //ISmoothLoss *f = new SquareLoss();
    IUnitRegularizer *g = new SquaredL2Norm();
    //IUnitRegularizer *g = new ElasticNet(1);
    OffsetQuadratic q(dim);
    //IUnitRegularizer *g = &q;
    q.update_offset(w);

    double lambda = 1.0e-5;
    int max_epoch = 100;
    double eps_gap = 1e-8;

    X.normalize_rows();
    sdca(X, y, *f, lambda, *g, max_epoch, eps_gap, w, a, 'p', 'd');
    a.fill_zero();
    sdca(X, y, *f, lambda, *g, max_epoch, eps_gap, w, a, 'p', 'p');
    a.fill_zero();
    sdca(X, y, *f, lambda, *g, max_epoch, eps_gap, w, a, 'p', 'b');

    /*
    q.update_offset(w);
    a.fill_zero();
    sdca(X, y, *f, lambda, *g, max_epoch, eps_gap, w, a, 'u', 'd');
    // shuffle the row indices and do it again
    std::vector<size_t> idx(X.nrows());
    std::iota(std::begin(idx), std::end(idx), 0);
    std::shuffle(std::begin(idx), std::end(idx), std::default_random_engine(0));
    SubSparseMatrixCSR Xshfl(X, idx);
    Vector yshfl(y, idx);
    // w.fill_zero(); // no need to clean w because only a is used to initialize SDCA
    a.fill_zero();
    Vector ashfl(a, idx);
    sdca(Xshfl, yshfl, *f, lambda, *g, max_epoch, eps_gap, w, ashfl, 'w', 'd');
    */

    float train_err = binary_error_rate(X, y, w);
    std::cout << "Training error rate = " << train_err * 100 << " %" << std::endl;
    if (!testingfile.empty()) {
        float test_err = binary_error_rate(*X_test, *y_test, w);
        std::cout << "Testing error rate = " << test_err * 100 << " %" << std::endl;
    }

    // do not forget to delete object pointers!
    if (X_test != nullptr) { delete X_test; }
    if (y_test != nullptr) { delete y_test; }
    delete f;
    if (g->symbol() != 'q') delete g;

    return 0;
}
@ -0,0 +1,40 @@
========================================================================
    CONSOLE APPLICATION : SplitData Project Overview
========================================================================

AppWizard has created this SplitData application for you.

This file contains a summary of what you will find in each of the files that
make up your SplitData application.


SplitData.vcxproj
    This is the main project file for VC++ projects generated using an Application Wizard.
    It contains information about the version of Visual C++ that generated the file, and
    information about the platforms, configurations, and project features selected with the
    Application Wizard.

SplitData.vcxproj.filters
    This is the filters file for VC++ projects generated using an Application Wizard.
    It contains information about the association between the files in your project
    and the filters. This association is used in the IDE to show grouping of files with
    similar extensions under a specific node (e.g., ".cpp" files are associated with the
    "Source Files" filter).

SplitData.cpp
    This is the main application source file.

/////////////////////////////////////////////////////////////////////////////
Other standard files:

StdAfx.h, StdAfx.cpp
    These files are used to build a precompiled header (PCH) file
    named SplitData.pch and a precompiled types file named StdAfx.obj.

/////////////////////////////////////////////////////////////////////////////
Other notes:

AppWizard uses "TODO:" comments to indicate parts of the source code you
should add to or customize.

/////////////////////////////////////////////////////////////////////////////
@ -0,0 +1,48 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

// SplitData.cpp : Defines the entry point for the console application.
#include <string>
#include <iostream>
#include <iomanip>
#include <vector>
#include <algorithm>
#include <omp.h>

#include "timer.h"
#include "utils.h"
#include "functions.h"
#include "split_file.h"

using namespace functions;

int main(int argc, char* argv[])
{
    if (argc < 3) {
        std::cout << "Need to specify input file name and number of files to split." << std::endl;
        std::exit(0);
    }
    std::string filename(argv[1]);
    std::string numsplit(argv[2]);
    int n = std::stoi(numsplit);
    char sorted = 'n';
    if (argc >= 4) {
        sorted = argv[3][0];    // optional third argument: 's' (or 'S') selects the sorted split
    }

    // record time for reading and writing files
    HighResTimer timer;
    timer.start();

    size_t N;
    if (sorted == 's' || sorted == 'S') {
        N = split_data_sorted(filename, n);
    }
    else {
        N = split_datafile(filename, n);
    }

    std::cout << "Read and wrote " << N << " lines in " << timer.seconds_from_start() << " seconds." << std::endl;

    return 0;
}
@ -0,0 +1,160 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
      <Configuration>Debug</Configuration>
      <Platform>Win32</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|Win32">
      <Configuration>Release</Configuration>
      <Platform>Win32</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Debug|x64">
      <Configuration>Debug</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|x64">
      <Configuration>Release</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
  </ItemGroup>
  <PropertyGroup Label="Globals">
    <VCProjectVersion>15.0</VCProjectVersion>
    <ProjectGuid>{AF0B3A60-1C23-4229-96EC-E1071F4DAFDB}</ProjectGuid>
    <Keyword>Win32Proj</Keyword>
    <RootNamespace>SplitData</RootNamespace>
    <WindowsTargetPlatformVersion>10.0.16299.0</WindowsTargetPlatformVersion>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
  </ImportGroup>
  <ImportGroup Label="Shared">
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <PrecompiledHeader>NotUsing</PrecompiledHeader>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <OpenMPSupport>true</OpenMPSupport>
      <AdditionalIncludeDirectories>../Utils;../Numerics</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
      <AdditionalDependencies>Utils.lib;Numerics.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
      <PrecompiledHeader>
      </PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <ClCompile>
      <PrecompiledHeader>
      </PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>../Utils;../Numerics</AdditionalIncludeDirectories>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
      <AdditionalDependencies>Utils.lib;Numerics.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <PrecompiledHeader>
      </PrecompiledHeader>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
    </Link>
  </ItemDefinitionGroup>
  <ItemGroup>
    <Text Include="ReadMe.txt" />
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="SplitData.cpp" />
    <ClCompile Include="split_file.cpp" />
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="split_file.h" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
</Project>
@ -0,0 +1,33 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup>
    <Filter Include="Source Files">
      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
      <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
    </Filter>
    <Filter Include="Header Files">
      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
      <Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
    </Filter>
    <Filter Include="Resource Files">
      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
    </Filter>
  </ItemGroup>
  <ItemGroup>
    <Text Include="ReadMe.txt" />
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="SplitData.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
    <ClCompile Include="split_file.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="split_file.h">
      <Filter>Header Files</Filter>
    </ClInclude>
  </ItemGroup>
</Project>
@ -0,0 +1,126 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <algorithm>
#include <numeric>
#include <stdexcept>
#include <cstdio>
#include <cstdint>
#include <random>
#include <omp.h>

#include "utils.h"
#include "timer.h"
#include "split_file.h"

// OpenMP 3.0 supports unsigned for-loop iterators, so std::int64_t could be replaced by size_t
using std::int64_t;
// using-declarations for the unqualified names below (in case utils.h does not already import std)
using std::ifstream;
using std::ofstream;
using std::runtime_error;

size_t split_datafile(const std::string dataname, const unsigned int n)
{
    ifstream ifs(dataname);
    if (!ifs.is_open()) {
        throw runtime_error("Data file " + dataname + " cannot be opened.");
    }

    size_t buffer_size = 256 * 1024;
    struct FileBuffer {
        char buf[256 * 1024];
    };
    std::vector<FileBuffer> buffer(n);

    std::vector<ofstream> ofs(n);
    for (unsigned int i = 0; i < n; i++) {
        ofs[i].rdbuf()->pubsetbuf(buffer[i].buf, buffer_size);
        std::string filename = dataname + std::to_string(n) + '_' + std::to_string(i + 1);
        ofs[i].open(filename);
    }

    // uniform random generator
    std::random_device rdv;
    std::mt19937_64 rng(rdv());
    std::uniform_int_distribution<size_t> unif(0, n - 1);

    std::string line;
    size_t count = 0;
    while (std::getline(ifs, line)) {
        size_t i = unif(rng);
        //size_t i = count % n;
        ofs[i] << line << '\n';
        count++;
    }

    // close all files
    ifs.close();
    for (unsigned int i = 0; i < n; i++) {
        ofs[i].close();
    }

    return count;
}

size_t split_data_sorted(const std::string dataname, const unsigned int n)
{
    ifstream ifs(dataname);
    if (!ifs.is_open()) {
        throw runtime_error("Data file " + dataname + " cannot be opened.");
    }

    // keep only the first two characters of each line and sort those lexicographically
    std::vector<std::string> alllines;

    std::string line;
    size_t count = 0;
    while (std::getline(ifs, line)) {
        alllines.push_back(line.substr(0, 2));
        count++;
    }
    ifs.close();

    std::vector<size_t> indices(count);
    std::vector<size_t> file_id(count, 0);

    std::iota(indices.begin(), indices.end(), 0);

    //std::sort(alllines.begin(), alllines.end());
    //std::sort(alllines.begin(), alllines.end(), [](string s1, string s2) {return (s1[0] > s2[0]); });
    std::sort(indices.begin(), indices.end(), [&alllines](size_t i1, size_t i2) {return (alllines[i1] > alllines[i2]); });

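    // assign the sorted lines to files in contiguous blocks of size count/n;
    // the remaining (count % n) lines keep their initial file_id of 0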
    size_t quotient = count / n;
    for (size_t i = 0; i < n*quotient; i++) {
        file_id[indices[i]] = i / quotient;
    }

    // reopen the input file and then write to the output files
    size_t buffer_size = 256 * 1024;
    struct FileBuffer {
        char buf[256 * 1024];
    };
    std::vector<FileBuffer> buffer(n);

    std::vector<ofstream> ofs(n);
    for (unsigned int i = 0; i < n; i++) {
        ofs[i].rdbuf()->pubsetbuf(buffer[i].buf, buffer_size);
        std::string filename = dataname + std::to_string(n) + "_s0_" + std::to_string(i + 1);
        ofs[i].open(filename);
    }

    ifs.open(dataname);
    count = 0;
    while (std::getline(ifs, line)) {
        ofs[file_id[count]] << line << '\n';
        count++;
    }

    // close all files
    ifs.close();
    for (unsigned int i = 0; i < n; i++) {
        ofs[i].close();
    }

    return count;
}
@ -0,0 +1,10 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#pragma once

#include <string>

// split a data file into n smaller files
size_t split_datafile(const std::string dataname, const unsigned int n);
size_t split_data_sorted(const std::string dataname, const unsigned int n);
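
// Minimal usage sketch (illustrative; "data.txt" is a placeholder):
//   split_datafile("data.txt", 4);      // writes data.txt4_1 ... data.txt4_4 (random assignment)
//   split_data_sorted("data.txt", 4);   // writes data.txt4_s0_1 ... data.txt4_s0_4 (contiguous sorted blocks)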
@ -0,0 +1,132 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
      <Configuration>Debug</Configuration>
      <Platform>Win32</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|Win32">
      <Configuration>Release</Configuration>
      <Platform>Win32</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Debug|x64">
      <Configuration>Debug</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|x64">
      <Configuration>Release</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
  </ItemGroup>
  <PropertyGroup Label="Globals">
    <ProjectGuid>{3E2AC248-2A09-4F04-9FD8-4F2DEA665B1C}</ProjectGuid>
    <RootNamespace>Utils</RootNamespace>
    <WindowsTargetPlatformVersion>10.0.16299.0</WindowsTargetPlatformVersion>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <CharacterSet>MultiByte</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>MultiByte</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
    <ConfigurationType>StaticLibrary</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <CharacterSet>MultiByte</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
    <ConfigurationType>StaticLibrary</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v141</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>MultiByte</CharacterSet>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
  </ImportGroup>
  <ImportGroup Label="Shared">
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup />
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <SDLCheck>true</SDLCheck>
    </ClCompile>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <SDLCheck>true</SDLCheck>
      <OpenMPSupport>false</OpenMPSupport>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <SDLCheck>true</SDLCheck>
    </ClCompile>
    <Link>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <SDLCheck>true</SDLCheck>
      <OpenMPSupport>true</OpenMPSupport>
      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    </ClCompile>
    <Link>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <SubSystem>Console</SubSystem>
    </Link>
  </ItemDefinitionGroup>
  <ItemGroup>
    <ClCompile Include="loaddata.cpp" />
    <ClCompile Include="loadlist.cpp" />
    <ClCompile Include="test_utils.cpp" />
    <ClCompile Include="utils.cpp" />
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="timer.h" />
    <ClInclude Include="utils.h" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
</Project>
@ -0,0 +1,39 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup>
    <Filter Include="Source Files">
      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
      <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
    </Filter>
    <Filter Include="Header Files">
      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
      <Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
    </Filter>
    <Filter Include="Resource Files">
      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
    </Filter>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="loaddata.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
    <ClCompile Include="test_utils.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
    <ClCompile Include="utils.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
    <ClCompile Include="loadlist.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="utils.h">
      <Filter>Header Files</Filter>
    </ClInclude>
    <ClInclude Include="timer.h">
      <Filter>Header Files</Filter>
    </ClInclude>
  </ItemGroup>
</Project>

@ -0,0 +1,167 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <algorithm>
#include <stdexcept>
#include <cstdio>
#include <cstdint>
#include <chrono>
#include <omp.h>

#include "utils.h"
#include "timer.h"

// OpenMP 3.0 supports unsigned for-loop iterators, so std::int64_t could be replaced by size_t
using std::int64_t;
using namespace std;

// load a data file with line format: <label>[:<weight>] [<colidx>[:<value>]] ...
template <typename T>
size_t load_datafile(const string filename, vector<T> &labels, vector<T> &weights,
    vector<T> &values, vector<size_t> &colidx, vector<size_t> &rowptr,
    const bool add_bias, const bool disp_timing, const bool binary, const size_t start_colidx,
    const bool skip_first)
{
    ifstream ifs(filename);
    if (!ifs.is_open()) {
        throw runtime_error("Data file " + filename + " cannot be opened.");
    }

    // record time for reading the file and for post-processing
    HighResTimer timer;
    timer.start();

    // work with new vectors and swap them with the output vectors at the end
    vector<T> _labels;
    vector<T> _weights;
    vector<T> _values;
    vector<size_t> _colidx;
    vector<size_t> _rowptr;

    string line;
    T lbl, wgh, val;
    size_t idx, max_idx;
    const char *pStart;
    char *pEnd;

    try {
        while (getline(ifs, line))
        {
            // pStart has a null character '\0' appended at the end by c_str()
            pStart = line.c_str();

            // first read the label, and possibly a weight for the example
            lbl = (T)strtod(pStart, &pEnd);
            if (*pEnd == ':') {
                wgh = (T)strtod(pEnd + 1, &pEnd);   // pEnd + 1 to skip ':'
            }
            else {
                wgh = 1.0f;
            }

            if (binary) { lbl = lbl > 0.5 ? 1 : -1; }   // use {+1, -1} for binary labels
            _labels.push_back(lbl);
            _weights.push_back(wgh);
            _rowptr.push_back(_values.size());

            // some files have redundant information about the dimensionality of the feature vector
            if (skip_first) {
                idx = strtoul(pEnd, &pEnd, 10);     // idx is unsigned long int with base 10
            }

            // read features in the example; test for the null character '\0' at the line end
            max_idx = 0;
            while (*pEnd)   // if not reaching end of line '\0'
            {
                idx = strtoul(pEnd, &pEnd, 10);     // idx is unsigned long int with base 10
                if (*pEnd == ':') {
                    val = (T)strtod(pEnd + 1, &pEnd);   // pEnd + 1 to skip ':'
                }
                else {
                    val = 1.0f;
                }
                _values.push_back(val);
                _colidx.push_back(idx);
                max_idx = idx > max_idx ? idx : max_idx;

                // skip white space, only necessary at the end of each line to move to '\0'
                while (isspace(*pEnd)) { pEnd++; }
            }

            if (add_bias)   // only add the bias after processing all features in the example
            {
                _values.push_back(1.0f);        // can reset value in SparseMatrixCSR
                _colidx.push_back(max_idx + 1); // will be reset after reading all data
            }
        }
        // set the last element of the row pointers in SparseMatrixCSR format
        _rowptr.push_back(_values.size());
    }
    catch (...)
    {
        throw runtime_error("Something went wrong when reading data file: " + filename);
    }

    ifs.close();

    float t_loading = timer.seconds_from_last();

    // make sure the column index starts at 0, as needed by sparse matrix operations
    size_t num_vals = _colidx.size();
    size_t min_idx = *min_element(_colidx.begin(), _colidx.end());
    // for distributed computing, the starting column index must be specified explicitly
    if (min_idx < start_colidx) {
        throw runtime_error("Error in loading data file " + filename + ": min_idx smaller than start_colidx.");
    }
    if (start_colidx > 0)
    {
        #pragma omp parallel for
        for (int64_t i = 0; i < (int64_t)num_vals; i++) {
            _colidx[i] -= start_colidx;
        }
    }
    // reset the bias index for all examples to be the same last index
    size_t num_examples = _labels.size();
    max_idx = *max_element(_colidx.begin(), _colidx.end());
    if (add_bias)
    {
        #pragma omp parallel for
        for (int64_t i = 0; i < (int64_t)num_examples; i++) {
            size_t bias_idx = _rowptr[i + 1] - 1;
            _colidx[bias_idx] = max_idx;    // max_idx already accounts for the bias index
        }
    }

    // swap vectors to avoid reallocating memory; this also cleans up the input vectors
    labels.swap(_labels);
    weights.swap(_weights);
    values.swap(_values);
    colidx.swap(_colidx);
    rowptr.swap(_rowptr);

    // display timing if disp_timing == true
    float t_process = timer.seconds_from_last();
    if (disp_timing) {
        std::cout << std::endl << "Loading datafile: " << filename << std::endl;
        std::cout << "Time for loading datafile: " << t_loading << " seconds." << std::endl;
        std::cout << "Time for post-processing: " << t_process << " seconds." << std::endl;
        std::cout << "Number of examples: " << num_examples << std::endl;
        std::cout << "Number of features: " << max_idx + 1 << std::endl;
        std::cout << "Number of nonzeros: " << values.size() << std::endl;
        std::cout << std::endl;
    }

    return num_examples;
}

// Instantiate function templates with float and double types
template size_t load_datafile<float>(const string filename, vector<float> &labels, vector<float> &weights,
    vector<float> &vals, vector<size_t> &cidx, vector<size_t> &rptr,
    const bool add_bias, const bool disp_timing, const bool binary, const size_t start_colidx, const bool skip_first);
template size_t load_datafile<double>(const string filename, vector<double> &labels, vector<double> &weights,
    vector<double> &vals, vector<size_t> &cidx, vector<size_t> &rptr,
    const bool add_bias, const bool disp_timing, const bool binary, const size_t start_colidx, const bool skip_first);
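
For concreteness, here is a minimal usage sketch (not part of the commit; the file name tiny.txt and its contents are hypothetical) showing how a LIBSVM-style file maps to the CSR arrays built above:

// Hypothetical usage sketch for load_datafile; assumes this translation
// unit is linked with loaddata.cpp and utils.h is on the include path.
#include <fstream>
#include <iostream>
#include "utils.h"

int main()
{
    // two examples in LIBSVM-style format: <label> <colidx>:<value> ...
    std::ofstream("tiny.txt") << "+1 1:0.5 3:1.0\n-1 2:2.0\n";

    std::vector<float> labels, weights, values;
    std::vector<size_t> colidx, rowptr;
    // start_colidx = 1 shifts 1-based LIBSVM indices to 0-based CSR indices
    size_t n = load_datafile<float>("tiny.txt", labels, weights, values,
                                    colidx, rowptr, false, false, true, 1, false);

    // expected CSR layout for this input:
    //   values = {0.5, 1.0, 2.0}, colidx = {0, 2, 1}, rowptr = {0, 2, 3}
    for (size_t i = 0; i < n; i++) {
        std::cout << "row " << i << ": label " << labels[i]
                  << ", nnz " << rowptr[i + 1] - rowptr[i] << std::endl;
    }
    return 0;
}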
@ -0,0 +1,172 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <algorithm>
#include <stdexcept>
#include <cstdio>
#include <cstdint>
#include <chrono>
#include <omp.h>

#include "utils.h"
#include "timer.h"

// OpenMP 3.0 supports unsigned for-loop iterators, so std::int64_t could be replaced by size_t
using std::int64_t;
using namespace std;

// load a list of data files with line format: <label>[:<weight>] [<colidx>[:<value>]] ...
template <typename T>
size_t load_datafile(const list<string> filelist, vector<T> &labels, vector<T> &weights,
    vector<T> &values, vector<size_t> &colidx, vector<size_t> &rowptr,
    const bool add_bias, const bool disp_timing, const bool binary, const size_t start_colidx,
    const bool skip_first)
{
    // work with new vectors and swap them with the output vectors at the end
    vector<T> _labels;
    vector<T> _weights;
    vector<T> _values;
    vector<size_t> _colidx;
    vector<size_t> _rowptr;

    string line;
    T lbl, wgh, val;
    size_t idx, max_idx;
    const char *pStart;
    char *pEnd;

    // record time for reading the files and for post-processing
    HighResTimer timer;
    timer.start();

    for (auto & filename : filelist)
    {
        ifstream ifs(filename);
        if (!ifs.is_open()) {
            throw runtime_error("Data file " + filename + " cannot be opened.");
        }

        try {
            while (getline(ifs, line))
            {
                // pStart has a null character '\0' appended at the end by c_str()
                pStart = line.c_str();

                // first read the label, and possibly a weight for the example
                lbl = (T)strtod(pStart, &pEnd);
                if (*pEnd == ':') {
                    wgh = (T)strtod(pEnd + 1, &pEnd);   // pEnd + 1 to skip ':'
                }
                else {
                    wgh = 1.0f;
                }

                if (binary) { lbl = lbl > 0.5 ? 1 : -1; }   // use {+1, -1} for binary labels
                _labels.push_back(lbl);
                _weights.push_back(wgh);
                _rowptr.push_back(_values.size());

                // some files have redundant information about the dimensionality of the feature vector
                if (skip_first) {
                    idx = strtoul(pEnd, &pEnd, 10);     // idx is unsigned long int with base 10
                }

                // read features in the example; test for the null character '\0' at the line end
                max_idx = 0;
                while (*pEnd)   // if not reaching end of line '\0'
                {
                    idx = strtoul(pEnd, &pEnd, 10);     // idx is unsigned long int with base 10
                    if (*pEnd == ':') {
                        val = (T)strtod(pEnd + 1, &pEnd);   // pEnd + 1 to skip ':'
                    }
                    else {
                        val = 1.0f;
                    }
                    _values.push_back(val);
                    _colidx.push_back(idx);
                    max_idx = idx > max_idx ? idx : max_idx;

                    // skip white space, only necessary at the end of each line to move to '\0'
                    while (isspace(*pEnd)) { pEnd++; }
                }

                if (add_bias)   // only add the bias after processing all features in the example
                {
                    _values.push_back(1.0f);        // can reset value in SparseMatrixCSR
                    _colidx.push_back(max_idx + 1); // will be reset after reading all data
                }
            }
        }
        catch (...)
        {
            throw runtime_error("Something went wrong when reading data file: " + filename);
        }
        ifs.close();
    }
    // set the last element of the row pointers in SparseMatrixCSR format
    _rowptr.push_back(_values.size());

    float t_loading = timer.seconds_from_last();

    // make sure the column index starts at 0, as needed by sparse matrix operations
    size_t num_vals = _colidx.size();
    size_t min_idx = *min_element(_colidx.begin(), _colidx.end());
    // for distributed computing, the starting column index must be specified explicitly
    if (min_idx < start_colidx) {
        throw runtime_error("Error in loading data file: min_idx smaller than start_colidx.");
    }
    if (start_colidx > 0)
    {
        #pragma omp parallel for
        for (int64_t i = 0; i < (int64_t)num_vals; i++) {
            _colidx[i] -= start_colidx;
        }
    }
    // reset the bias index for all examples to be the same last index
    size_t num_examples = _labels.size();
    max_idx = *max_element(_colidx.begin(), _colidx.end());
    if (add_bias)
    {
        #pragma omp parallel for
        for (int64_t i = 0; i < (int64_t)num_examples; i++) {
            size_t bias_idx = _rowptr[i + 1] - 1;
            _colidx[bias_idx] = max_idx;    // max_idx already accounts for the bias index
        }
    }

    // swap vectors to avoid reallocating memory; this also cleans up the input vectors
    labels.swap(_labels);
    weights.swap(_weights);
    values.swap(_values);
    colidx.swap(_colidx);
    rowptr.swap(_rowptr);

    // display timing if disp_timing == true
    float t_process = timer.seconds_from_last();
    if (disp_timing) {
        std::cout << std::endl << "Loading datafiles:" << std::endl;
        for (auto & filename : filelist) {
            std::cout << "    " << filename << std::endl;
        }
        std::cout << "Time for loading datafiles: " << t_loading << " seconds." << std::endl;
        std::cout << "Time for post-processing: " << t_process << " seconds." << std::endl;
        std::cout << "Number of examples: " << num_examples << std::endl;
        std::cout << "Number of features: " << max_idx + 1 << std::endl;
        std::cout << "Number of nonzeros: " << values.size() << std::endl;
        std::cout << std::endl;
    }

    return num_examples;
}

// Instantiate function templates with float and double types
template size_t load_datafile<float>(const list<string> filelist, vector<float> &labels, vector<float> &weights,
    vector<float> &vals, vector<size_t> &cidx, vector<size_t> &rptr,
    const bool add_bias, const bool disp_timing, const bool binary, const size_t start_colidx, const bool skip_first);
template size_t load_datafile<double>(const list<string> filelist, vector<double> &labels, vector<double> &weights,
    vector<double> &vals, vector<size_t> &cidx, vector<size_t> &rptr,
    const bool add_bias, const bool disp_timing, const bool binary, const size_t start_colidx, const bool skip_first);
@ -0,0 +1,47 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <iostream>
#include <algorithm>
#include <vector>
#include "utils.h"

int main(int argc, char* argv[])
{
    using floatT = float;

    vector<floatT> labels;
    vector<floatT> weights;
    vector<floatT> values;
    vector<size_t> colidx;
    vector<size_t> rowptr;

    //string filename;
    //filename = argv[1];
    //filename = "C:/Data/LIBSVMdata/covtype.libsvm.binary.scale";  // 4 seconds
    //filename = "C:/Data/LIBSVMdata/splice1percent100_1";          // 12 seconds
    //filename = "C:/Data/LIBSVMdata/ads1percent100_1";             // 4 seconds
    //filename = "C:/Data/LIBSVMdata/rcv1_test.binary";             // 50 seconds
    //filename = "C:/Data/LIBSVMdata/rcv1_train.binary";            // 2 seconds
    //filename = "C:/Data/LIBSVMdata/news20.binary";                // 6 seconds
    //filename = "C:/Data/MSFT_Ads/TrainingFIDOnly2.10G";           // 6 minutes
    //filename = "//gcr/scratch/AZ-USCentral/lixiao/Data/MSFT_Ads/TrainingFIDOnly2.10G";  // 25 minutes

    const string files[] =
    {
        //"C:/Data/LIBSVMdata/ads1percent100_1",
        //"C:/Data/LIBSVMdata/rcv1_test.binary",
        //"C:/Data/LIBSVMdata/news20.binary"
        "C:/Data/part-00000.txt",
        "C:/Data/part-r-00001.txt",
    };

    for (auto & filename : files) {
        size_t n_examples = load_datafile(filename, labels, weights, values, colidx, rowptr, false, true, true);
    }

    std::list<string> filelist;
    filelist.push_back("C:/Data/part-00000.txt");
    filelist.push_back("C:/Data/part-r-00001.txt");
    size_t n_examples = load_datafile(filelist, labels, weights, values, colidx, rowptr, false, true, true);

    return 0;
}
@ -0,0 +1,32 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#pragma once
#include <chrono>

using HighResClock = std::chrono::high_resolution_clock;
using FloatSeconds = std::chrono::duration<float>;

class HighResTimer
{
private:
    HighResClock::time_point t_start, t_last;
public:
    HighResTimer() { this->start(); }
    void start() { this->reset(); }
    void reset() {
        t_start = HighResClock::now();
        t_last = t_start;
    }
    float seconds_from_start() {
        t_last = HighResClock::now();
        FloatSeconds t_elpsd = t_last - t_start;
        return t_elpsd.count();
    }
    float seconds_from_last() {
        auto t_now = HighResClock::now();
        FloatSeconds t_elpsd = t_now - t_last;
        t_last = t_now;
        return t_elpsd.count();
    }
};
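
A minimal usage sketch for HighResTimer (illustrative only, not part of the commit; the loop bodies are stand-ins for real work):

// Illustrative use of HighResTimer
#include <iostream>
#include "timer.h"

int main()
{
    HighResTimer timer;     // the constructor calls start()
    volatile double s = 0;
    for (int i = 0; i < 1000000; i++) { s = s + i; }
    // seconds_from_last() measures from the previous query (or from start)
    std::cout << "phase 1: " << timer.seconds_from_last() << " s" << std::endl;
    for (int i = 0; i < 1000000; i++) { s = s + i; }
    // seconds_from_start() measures from construction or the last reset()
    std::cout << "total:   " << timer.seconds_from_start() << " s" << std::endl;
    return 0;
}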
@ -0,0 +1,20 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#include <vector>
#include <numeric>
#include <random>
#include <algorithm>

#include "utils.h"

// generate a random permutation vector for indices 0, 1, 2, ..., N-1
std::vector<size_t> random_permutation(const size_t N, const int seed)
{
    std::vector<size_t> p(N);
    std::iota(p.begin(), p.end(), 0);
    std::random_device rdev;
    unsigned int rand_seed = seed > 0 ? seed : rdev();
    std::shuffle(p.begin(), p.end(), std::default_random_engine(rand_seed));
    return p;   // returning by value enables NRVO; wrapping in std::move would inhibit it
}
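
A short usage sketch for random_permutation (illustrative, not part of the commit; note that a non-positive seed draws a nondeterministic seed from std::random_device):

// Illustrative use of random_permutation
#include <iostream>
#include "utils.h"

int main()
{
    auto p = random_permutation(5, 123);    // fixed positive seed: reproducible
    auto q = random_permutation(5);         // default seed -1: nondeterministic
    for (size_t i : p) std::cout << i << ' ';
    std::cout << std::endl;
    for (size_t i : q) std::cout << i << ' ';
    std::cout << std::endl;
    return 0;
}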
@ -0,0 +1,28 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

#pragma once

#include <list>
#include <string>
#include <vector>

using namespace std;

// generate a random permutation vector for indices 0, 1, 2, ..., N-1
std::vector<size_t> random_permutation(const size_t N, const int seed = -1);

// Load a data file with line format: <label>[:<weight>] [<colidx>[:<value>]] ...
// By default LIBSVM colidx starts at 1, but SparseMatrixCSR starts at 0
template <typename T>
size_t load_datafile(const string filename, vector<T> &labels, vector<T> &weights,
    vector<T> &values, vector<size_t> &colidx, vector<size_t> &rowptr,
    const bool add_bias = false, const bool disp_timing = false, const bool binary = false,
    const size_t start_colidx = 1, const bool skip_first = false);

// Needs a small change to read the Criteo dataset (skip the second entry of each row)
template <typename T>
size_t load_datafile(const list<string> filelist, vector<T> &labels, vector<T> &weights,
    vector<T> &values, vector<size_t> &colidx, vector<size_t> &rowptr,
    const bool add_bias = false, const bool disp_timing = false, const bool binary = false,
    const size_t start_colidx = 1, const bool skip_first = false);
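
To make the line format concrete, a hypothetical worked example (the data values are made up):

// Example: the input line
//     -1:0.25 4:1.5 7 12:0.5
// parses as label -1, example weight 0.25, and features
// {(4, 1.5), (7, 1.0), (12, 0.5)}; a bare index such as 7 gets value 1.0.
// With the default start_colidx = 1, the stored 0-based column indices
// become 3, 6, and 11.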
@ -0,0 +1,87 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.26730.8
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Numerics", "Numerics\Numerics.vcxproj", "{3BB4DA69-B2E0-46D0-86DD-1757C5922506}"
	ProjectSection(ProjectDependencies) = postProject
		{3E2AC248-2A09-4F04-9FD8-4F2DEA665B1C} = {3E2AC248-2A09-4F04-9FD8-4F2DEA665B1C}
	EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Utils", "Utils\Utils.vcxproj", "{3E2AC248-2A09-4F04-9FD8-4F2DEA665B1C}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SplitData", "SplitData\SplitData.vcxproj", "{AF0B3A60-1C23-4229-96EC-E1071F4DAFDB}"
	ProjectSection(ProjectDependencies) = postProject
		{3E2AC248-2A09-4F04-9FD8-4F2DEA665B1C} = {3E2AC248-2A09-4F04-9FD8-4F2DEA665B1C}
		{3BB4DA69-B2E0-46D0-86DD-1757C5922506} = {3BB4DA69-B2E0-46D0-86DD-1757C5922506}
	EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LBFGS", "LBFGS\LBFGS.vcxproj", "{2AF44FB9-F85D-48D0-AC4F-EE06CFFB398E}"
	ProjectSection(ProjectDependencies) = postProject
		{3E2AC248-2A09-4F04-9FD8-4F2DEA665B1C} = {3E2AC248-2A09-4F04-9FD8-4F2DEA665B1C}
		{3BB4DA69-B2E0-46D0-86DD-1757C5922506} = {3BB4DA69-B2E0-46D0-86DD-1757C5922506}
		{F2136A94-90B2-44EE-B812-9FA8C0EDD175} = {F2136A94-90B2-44EE-B812-9FA8C0EDD175}
	EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SDCA", "SDCA\SDCA.vcxproj", "{F2136A94-90B2-44EE-B812-9FA8C0EDD175}"
	ProjectSection(ProjectDependencies) = postProject
		{3E2AC248-2A09-4F04-9FD8-4F2DEA665B1C} = {3E2AC248-2A09-4F04-9FD8-4F2DEA665B1C}
		{3BB4DA69-B2E0-46D0-86DD-1757C5922506} = {3BB4DA69-B2E0-46D0-86DD-1757C5922506}
	EndProjectSection
EndProject
Global
	GlobalSection(SolutionConfigurationPlatforms) = preSolution
		Debug|x64 = Debug|x64
		Debug|x86 = Debug|x86
		Release|x64 = Release|x64
		Release|x86 = Release|x86
	EndGlobalSection
	GlobalSection(ProjectConfigurationPlatforms) = postSolution
		{3BB4DA69-B2E0-46D0-86DD-1757C5922506}.Debug|x64.ActiveCfg = Debug|x64
		{3BB4DA69-B2E0-46D0-86DD-1757C5922506}.Debug|x64.Build.0 = Debug|x64
		{3BB4DA69-B2E0-46D0-86DD-1757C5922506}.Debug|x86.ActiveCfg = Debug|Win32
		{3BB4DA69-B2E0-46D0-86DD-1757C5922506}.Debug|x86.Build.0 = Debug|Win32
		{3BB4DA69-B2E0-46D0-86DD-1757C5922506}.Release|x64.ActiveCfg = Release|x64
		{3BB4DA69-B2E0-46D0-86DD-1757C5922506}.Release|x64.Build.0 = Release|x64
		{3BB4DA69-B2E0-46D0-86DD-1757C5922506}.Release|x86.ActiveCfg = Release|Win32
		{3BB4DA69-B2E0-46D0-86DD-1757C5922506}.Release|x86.Build.0 = Release|Win32
		{3E2AC248-2A09-4F04-9FD8-4F2DEA665B1C}.Debug|x64.ActiveCfg = Debug|x64
		{3E2AC248-2A09-4F04-9FD8-4F2DEA665B1C}.Debug|x64.Build.0 = Debug|x64
		{3E2AC248-2A09-4F04-9FD8-4F2DEA665B1C}.Debug|x86.ActiveCfg = Debug|Win32
		{3E2AC248-2A09-4F04-9FD8-4F2DEA665B1C}.Debug|x86.Build.0 = Debug|Win32
		{3E2AC248-2A09-4F04-9FD8-4F2DEA665B1C}.Release|x64.ActiveCfg = Release|x64
		{3E2AC248-2A09-4F04-9FD8-4F2DEA665B1C}.Release|x64.Build.0 = Release|x64
		{3E2AC248-2A09-4F04-9FD8-4F2DEA665B1C}.Release|x86.ActiveCfg = Release|Win32
		{3E2AC248-2A09-4F04-9FD8-4F2DEA665B1C}.Release|x86.Build.0 = Release|Win32
		{AF0B3A60-1C23-4229-96EC-E1071F4DAFDB}.Debug|x64.ActiveCfg = Debug|x64
		{AF0B3A60-1C23-4229-96EC-E1071F4DAFDB}.Debug|x64.Build.0 = Debug|x64
		{AF0B3A60-1C23-4229-96EC-E1071F4DAFDB}.Debug|x86.ActiveCfg = Debug|Win32
		{AF0B3A60-1C23-4229-96EC-E1071F4DAFDB}.Debug|x86.Build.0 = Debug|Win32
		{AF0B3A60-1C23-4229-96EC-E1071F4DAFDB}.Release|x64.ActiveCfg = Release|x64
		{AF0B3A60-1C23-4229-96EC-E1071F4DAFDB}.Release|x64.Build.0 = Release|x64
		{AF0B3A60-1C23-4229-96EC-E1071F4DAFDB}.Release|x86.ActiveCfg = Release|Win32
		{AF0B3A60-1C23-4229-96EC-E1071F4DAFDB}.Release|x86.Build.0 = Release|Win32
		{2AF44FB9-F85D-48D0-AC4F-EE06CFFB398E}.Debug|x64.ActiveCfg = Debug|x64
		{2AF44FB9-F85D-48D0-AC4F-EE06CFFB398E}.Debug|x64.Build.0 = Debug|x64
		{2AF44FB9-F85D-48D0-AC4F-EE06CFFB398E}.Debug|x86.ActiveCfg = Debug|Win32
		{2AF44FB9-F85D-48D0-AC4F-EE06CFFB398E}.Debug|x86.Build.0 = Debug|Win32
		{2AF44FB9-F85D-48D0-AC4F-EE06CFFB398E}.Release|x64.ActiveCfg = Release|x64
		{2AF44FB9-F85D-48D0-AC4F-EE06CFFB398E}.Release|x64.Build.0 = Release|x64
		{2AF44FB9-F85D-48D0-AC4F-EE06CFFB398E}.Release|x86.ActiveCfg = Release|Win32
		{2AF44FB9-F85D-48D0-AC4F-EE06CFFB398E}.Release|x86.Build.0 = Release|Win32
		{F2136A94-90B2-44EE-B812-9FA8C0EDD175}.Debug|x64.ActiveCfg = Debug|x64
		{F2136A94-90B2-44EE-B812-9FA8C0EDD175}.Debug|x64.Build.0 = Debug|x64
		{F2136A94-90B2-44EE-B812-9FA8C0EDD175}.Debug|x86.ActiveCfg = Debug|Win32
		{F2136A94-90B2-44EE-B812-9FA8C0EDD175}.Debug|x86.Build.0 = Debug|Win32
		{F2136A94-90B2-44EE-B812-9FA8C0EDD175}.Release|x64.ActiveCfg = Release|x64
		{F2136A94-90B2-44EE-B812-9FA8C0EDD175}.Release|x64.Build.0 = Release|x64
		{F2136A94-90B2-44EE-B812-9FA8C0EDD175}.Release|x86.ActiveCfg = Release|Win32
		{F2136A94-90B2-44EE-B812-9FA8C0EDD175}.Release|x86.Build.0 = Release|Win32
	EndGlobalSection
	GlobalSection(SolutionProperties) = preSolution
		HideSolutionNode = FALSE
	EndGlobalSection
	GlobalSection(ExtensibilityGlobals) = postSolution
		SolutionGuid = {53CC80D2-85F9-430F-8BEE-5A3F752CE6C2}
	EndGlobalSection
EndGlobal