Added stripped down NVTT library.

2015-12-20 20:40:35 -08:00 · 2015-12-20 20:40:35 -08:00 · 8ab70bd8cf
--- a/3rdparty/nvtt/NVIDIA_Texture_Tools_LICENSE.txt
+++ b/3rdparty/nvtt/NVIDIA_Texture_Tools_LICENSE.txt
@ -0,0 +1,24 @@
+NVIDIA Texture Tools 2.0 is licensed under the MIT license.
+
+Copyright (c) 2007 NVIDIA Corporation
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
--- a/3rdparty/nvtt/bc6h/bits.h
+++ b/3rdparty/nvtt/bc6h/bits.h
@ -0,0 +1,76 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#pragma once
+#ifndef _ZOH_BITS_H
+#define _ZOH_BITS_H
+
+// read/write a bitstream
+
+#include "nvcore/Debug.h"
+
+namespace ZOH {
+
+// Bit-level reader/writer over a caller-owned byte buffer.
+//
+// Stream bit n lives in byte n>>3 at bit position n&7 (LSB-first within each
+// byte), and multi-bit values are transferred least-significant bit first.
+// The class only stores the pointer it is given; it never allocates or frees.
+class Bits
+{
+public:
+
+	// Writable stream over data, which must provide at least maxdatabits bits.
+	// Starts empty: getsize() is 0 until something is written.
+	Bits(char *data, int maxdatabits)
+		: bptr(0), bend(0), bits(data), cbits(data), maxbits(maxdatabits), readonly(0)
+	{
+		nvAssert (data && maxdatabits > 0);
+	}
+
+	// Read-only stream over data, of which availdatabits bits may be read.
+	// There is no writable view, so bits is left null.
+	Bits(const char *data, int availdatabits)
+		: bptr(0), bend(availdatabits), bits(0), cbits(data), maxbits(availdatabits), readonly(1)
+	{
+		nvAssert (data && availdatabits > 0);
+	}
+
+	// Append the low nbits bits of value (0 <= nbits < 32), lowest bit first.
+	void write(int value, int nbits) {
+		nvAssert (nbits >= 0 && nbits < 32);
+		nvAssert (sizeof(int)>= 4);
+		for (int i=0; i<nbits; ++i)
+			writeone(value>>i);
+	}
+
+	// Read nbits bits (0 <= nbits < 32); the first bit read becomes bit 0 of
+	// the result. Bits requested past the end of the stream read as 0.
+	int read(int nbits) {
+		nvAssert (nbits >= 0 && nbits < 32);
+		nvAssert (sizeof(int)>= 4);
+		int out = 0;
+		for (int i=0; i<nbits; ++i)
+			out |= readone() << i;
+		return out;
+	}
+
+	// Current bit position, i.e. the next bit to be read or written.
+	int getptr() const { return bptr; }
+	// Move the current bit position; ptr must lie inside the buffer.
+	void setptr(int ptr) { nvAssert (ptr >= 0 && ptr < maxbits); bptr = ptr; }
+	// Number of valid bits in the stream (last written bit + 1).
+	int getsize() const { return bend; }
+
+private:
+	int	bptr;		// next bit to read
+	int bend;		// last written bit + 1
+	char *bits;		// ptr to user bit stream (null for a read-only stream)
+	const char *cbits;	// ptr to const user bit stream (always valid)
+	int maxbits;	// max size of user bit stream
+	char readonly;	// 1 if this is a read-only stream
+
+	// Fetch one bit and advance. Asserts in debug builds and returns 0 in
+	// release builds when reading past the last valid bit.
+	int readone() {
+		nvAssert (bptr < bend);
+		if (bptr >= bend) return 0;
+		// cbits aliases the buffer for both stream kinds, so no branch on readonly is needed
+		int bit = cbits[bptr>>3] & (1 << (bptr & 7));
+		++bptr;
+		return bit != 0;
+	}
+
+	// Store the low bit of 'bit' and advance, growing bend when the write
+	// extends the stream. Asserts in debug builds; in release builds a write
+	// to a read-only stream or past maxbits is silently dropped.
+	void writeone(int bit) {
+		nvAssert (!readonly); // "Writing a read-only bit stream"
+		nvAssert (bptr < maxbits);
+		if (readonly || bptr >= maxbits) return;
+		if (bit&1)
+			bits[bptr>>3] |= 1 << (bptr & 7);
+		else
+			bits[bptr>>3] &= ~(1 << (bptr & 7));
+		if (bptr++ >= bend) bend = bptr;
+	}
+};
+
+}
+
+#endif
--- a/3rdparty/nvtt/bc6h/shapes_two.h
+++ b/3rdparty/nvtt/bc6h/shapes_two.h
@ -0,0 +1,133 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#pragma once
+#ifndef _ZOH_SHAPES_TWO_H
+#define _ZOH_SHAPES_TWO_H
+
+// shapes for two regions
+
+// Dimensions of the two-region tables in this header:
+//   NREGIONS  - number of regions; entries of shapes[] are 0 or 1
+//   NSHAPES   - number of 4x4 shapes stored in shapes[]
+//   SHAPEBITS - 6, and NSHAPES == 1 << 6 (presumably the width of a shape
+//               index field -- confirm against the code that uses it)
+#define NREGIONS 2
+#define NSHAPES 64
+#define SHAPEBITS 6
+
+// Region number (0 or 1) of every pixel of each of the NSHAPES 4x4 shapes.
+//
+// Layout, as indexed by the REGION() macro: the table is a stack of 16 bands
+// of 64 ints (one blank-line-separated group below per band). Each band holds
+// four shapes side by side: source row y of a band lists pixels x=0..3 of the
+// band's first shape, then of its second, third and fourth shape. Shape si is
+// therefore column (si&3) of band (si>>2).
+static const int shapes[NSHAPES*16] = 
+{
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   0, 0, 0, 1,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   1, 1, 1, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 0,   0, 1, 1, 1,   
+1, 0, 0, 0,   0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 1, 1,   
+1, 1, 1, 0,   0, 0, 0, 0,   1, 0, 0, 0,   0, 0, 0, 1,   
+1, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 0, 0, 0,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 1, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 0, 0,   1, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   0, 0, 0, 1,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 0,   1, 1, 0, 0,   0, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 1,   1, 0, 0, 1,   
+1, 1, 1, 0,   1, 1, 1, 1,   1, 0, 0, 0,   1, 0, 0, 1,   
+1, 0, 0, 0,   0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   0, 0, 1, 1,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   0, 0, 1, 1,   
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   1, 1, 0, 0,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 1, 1,   0, 1, 0, 1,   0, 1, 1, 0,   0, 1, 0, 1,   
+1, 1, 0, 0,   0, 1, 0, 1,   1, 0, 0, 1,   1, 0, 1, 0,   
+0, 0, 1, 1,   1, 0, 1, 0,   0, 1, 1, 0,   1, 0, 1, 0,   
+1, 1, 0, 0,   1, 0, 1, 0,   1, 0, 0, 1,   0, 1, 0, 1,   
+
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   1, 0, 1, 1,   
+1, 1, 0, 0,   1, 1, 0, 0,   0, 1, 0, 0,   1, 1, 0, 1,   
+1, 1, 1, 0,   1, 0, 0, 0,   1, 1, 0, 0,   1, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 0, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   0, 1, 1, 0,   
+0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   0, 0, 0, 0,   
+
+0, 1, 0, 0,   0, 0, 1, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+1, 1, 1, 0,   0, 1, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 1, 0, 0,   0, 0, 1, 0,   0, 1, 1, 1,   1, 1, 1, 0,   
+0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 0,   0, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 1, 1,   
+1, 1, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   1, 1, 0, 0,   
+0, 0, 1, 1,   1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   
+
+0, 1, 1, 0,   0, 1, 1, 0,   0, 1, 1, 1,   0, 0, 0, 1,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 1, 1, 0,   1, 0, 0, 0,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 0, 0, 0,   1, 1, 1, 0,   
+1, 0, 0, 1,   1, 0, 0, 1,   0, 0, 0, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+1, 1, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 0,   0, 1, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 1, 1, 1,   
+
+};
+
+// Region (0 or 1) of pixel (x,y), with x and y in 0..3, for shape index si.
+// The index selects the band of four shapes ((si)>>2, 64 ints per band), the
+// shape's 4-int-wide column inside the band ((si)&3), then row y (16 ints per
+// row) and finally column x.
+#define	REGION(x,y,si)	shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16]
+
+// Two entries per shape, one per region, read through
+// SHAPEINDEX_TO_COMPRESSED_INDICES(si,region). The region-0 entry is 0 for
+// every shape; the region-1 entry is one of 2, 6, 8 or 15.
+// NOTE(review): going by the name, each entry is presumably the pixel position
+// (0..15) whose index is stored in compressed form -- confirm against the
+// two-region encoder/decoder, which is not part of this view.
+static const int shapeindex_to_compressed_indices[NSHAPES*2] = 
+{
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+
+	0,15,  0, 2,  0, 8,  0, 2,
+	0, 2,  0, 8,  0, 8,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 8,  0, 8,  0, 2,  0, 2,
+
+	0,15,  0,15,  0, 6,  0, 8,
+	0, 2,  0, 8,  0,15,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 2,  0,15,  0,15,  0, 6,
+
+	0, 6,  0, 2,  0, 6,  0, 8,
+	0,15,  0,15,  0, 2,  0, 2,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0, 2,  0, 2,  0,15
+
+};
+// Table entry for region 'region' (0 or 1) of shape 'si': entries are stored
+// as consecutive pairs, region 0 first.
+#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region)  shapeindex_to_compressed_indices[(si)*2+(region)]
+
+#endif
--- a/3rdparty/nvtt/bc6h/tile.h
+++ b/3rdparty/nvtt/bc6h/tile.h
@ -0,0 +1,83 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#pragma once
+#ifndef _ZOH_TILE_H
+#define _ZOH_TILE_H
+
+#include "zoh_utils.h"
+#include "nvmath/Vector.h"
+#include <math.h>
+
+namespace ZOH {
+
+//#define	USE_IMPORTANCE_MAP	1		// define this if you want to increase importance of some pixels in tile
+// A block of up to TILE_W x TILE_H pixels plus a per-pixel importance weight.
+// size_x/size_y give the part of the fixed-size arrays that is actually in
+// use; pixel (x,y) is stored at data[y][x].
+class Tile
+{
+public:
+	// NOTE: this returns the appropriately-clamped BIT PATTERN of the half as an INTEGRAL float value
+	static float half2float(uint16 h)
+	{
+		return (float) Utils::ushort_to_format(h);
+	}
+	// NOTE: this is the inverse of the above operation
+	static uint16 float2half(float f)
+	{
+		return Utils::format_to_ushort((int)f);
+	}
+
+	// look for adjacent pixels that are identical. if there are enough of them, increase their importance
+	// (every used pixel gets weight 5 when match_4_neighbor() reports a match, 1 otherwise;
+	// without USE_IMPORTANCE_MAP that test is always false, so all weights become 1)
+	void generate_importance_map()
+	{
+		// initialize
+		for (int y=0; y<size_y; ++y)
+		for (int x=0; x<size_x; ++x)
+		{
+			// my importance is increased if I am identical to any of my 4-neighbors
+			importance_map[y][x] = match_4_neighbor(x,y) ? 5.0f : 1.0f;
+		}
+	}
+
+	// true when pixel (xn,yn) lies inside the used area and all three
+	// channels compare equal to those of pixel (x,y)
+	bool is_equal(int x, int y, int xn, int yn)
+	{
+		if (xn < 0 || xn >= size_x || yn < 0 || yn >= size_y)
+			return false;
+		return( (data[y][x].x == data[yn][xn].x) &&
+				(data[y][x].y == data[yn][xn].y) &&
+				(data[y][x].z == data[yn][xn].z) );
+	}
+
+#ifdef USE_IMPORTANCE_MAP
+	// true when (x,y) equals its left, right, upper or lower neighbor
+	bool match_4_neighbor(int x, int y)
+	{
+		return is_equal(x,y,x-1,y) || is_equal(x,y,x+1,y) || is_equal(x,y,x,y-1) || is_equal(x,y,x,y+1);
+	}
+#else
+	// importance weighting is compiled out: never report a match
+	bool match_4_neighbor(int, int)
+	{
+		return false;
+	}
+#endif
+
+	// An empty tile. The sizes used to be left indeterminate here even though
+	// generate_importance_map() and is_equal() read them; zero makes those
+	// loops and bounds checks well defined until the caller sets real sizes.
+	Tile() : size_x(0), size_y(0) {}
+	~Tile() {}
+	// A tile whose used area is xs by ys pixels; pixel data is not initialized.
+	Tile(int xs, int ys) : size_x(xs), size_y(ys) {}
+
+	static const int TILE_H = 4;
+	static const int TILE_W = 4;
+	static const int TILE_TOTAL = TILE_H * TILE_W;
+	nv::Vector3 data[TILE_H][TILE_W];
+	float importance_map[TILE_H][TILE_W];
+	int	size_x, size_y;			// actual size of tile
+};
+
+}
+
+#endif // _ZOH_TILE_H
--- a/3rdparty/nvtt/bc6h/zoh.cpp
+++ b/3rdparty/nvtt/bc6h/zoh.cpp
@ -0,0 +1,197 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// the zoh compressor and decompressor
+
+#include "tile.h"
+#include "zoh.h"
+
+#include <string.h> // memcpy
+
+using namespace ZOH;
+
+
+// True when the low five bits of the block's first byte are 0x03, 0x07, 0x0b
+// or 0x0f. decompress() sends such blocks to decompressone().
+bool ZOH::isone(const char *block)
+{
+	switch (block[0] & 0x1F)
+	{
+	case 0x03:
+	case 0x07:
+	case 0x0b:
+	case 0x0f:
+		return true;
+	default:
+		return false;
+	}
+}
+
+// Runs both compressone() and compresstwo() on the tile and copies the
+// candidate with the smaller returned error value into block (the one-region
+// candidate wins ties). block must have room for ZOH::BLOCKSIZE bytes.
+void ZOH::compress(const Tile &t, char *block)
+{
+	char candidate_one[ZOH::BLOCKSIZE];
+	char candidate_two[ZOH::BLOCKSIZE];
+
+	const float err_one = ZOH::compressone(t, candidate_one);
+	const float err_two = ZOH::compresstwo(t, candidate_two);
+
+	const char *winner = (err_one <= err_two) ? candidate_one : candidate_two;
+	memcpy(block, winner, ZOH::BLOCKSIZE);
+}
+
+// Decodes one block into t, choosing the decoder from the block's mode bits
+// as classified by isone().
+void ZOH::decompress(const char *block, Tile &t)
+{
+	if (!ZOH::isone(block))
+	{
+		ZOH::decompresstwo(block, t);
+		return;
+	}
+
+	ZOH::decompressone(block, t);
+}
+
+/*
+void ZOH::compress(string inf, string zohf)
+{
+	Array2D<Rgba> pixels;
+	int w, h;
+	char block[ZOH::BLOCKSIZE];
+
+	Exr::readRgba(inf, pixels, w, h);
+	FILE *zohfile = fopen(zohf.c_str(), "wb");
+	if (zohfile == NULL) throw "Unable to open .zoh file for write";
+
+	// stuff for progress bar O.o
+	int ntiles = ((h+Tile::TILE_H-1)/Tile::TILE_H)*((w+Tile::TILE_W-1)/Tile::TILE_W);
+	int tilecnt = 0;
+	int ndots = 25;
+	int dotcnt = 0;
+	printf("Progress [");
+	for (int i=0; i<ndots;++i) printf(" ");
+	printf("]\rProgress ["); fflush(stdout);
+
+	// convert to tiles and compress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = min(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			int xsize = min(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			t.insert(pixels, x, y);
+
+			ZOH::compress(t, block);
+			if (fwrite(block, sizeof(char), ZOH::BLOCKSIZE, zohfile) != ZOH::BLOCKSIZE)
+				throw "File error on write";
+
+			// progress bar
+			++tilecnt;
+			if (tilecnt > (ntiles * dotcnt)/ndots) { printf("."); fflush(stdout); ++dotcnt; }
+		}
+	}
+
+	printf("]\n");		// advance to next line finally
+
+	if (fclose(zohfile)) throw "Close failed on .zoh file";
+}
+
+static int str2int(std::string s)
+{
+	int thing;
+	std::stringstream str (stringstream::in | stringstream::out);
+	str << s;
+	str >> thing;
+	return thing;
+}
+
+// zoh file name is ...-w-h.zoh, extract width and height
+static void extract(string zohf, int &w, int &h)
+{
+	size_t n = zohf.rfind('.', zohf.length()-1);
+	size_t n1 = zohf.rfind('-', n-1);
+	size_t n2 = zohf.rfind('-', n1-1);
+	string width = zohf.substr(n2+1, n1-n2-1);
+	w = str2int(width);
+	string height = zohf.substr(n1+1, n-n1-1);
+	h = str2int(height);
+}
+
+static int mode_to_prec[] = {
+	10,7,11,10,
+	10,7,11,11,
+	10,7,11,12,
+	10,7,9,16,
+	10,7,8,-1,
+	10,7,8,-1,
+	10,7,8,-1,
+	10,7,6,-1,
+};
+
+static int shapeindexhist[32], modehist[32], prechistone[16], prechisttwo[16], oneregion, tworegions;
+
+static void stats(char block[ZOH::BLOCKSIZE])
+{
+	char mode = block[0] & 0x1F; if ((mode & 0x3) == 0) mode = 0; if ((mode & 0x3) == 1) mode = 1; modehist[mode]++;
+	int prec = mode_to_prec[mode];
+	nvAssert (prec != -1);
+	if (!ZOH::isone(block))
+	{
+		tworegions++;
+		prechisttwo[prec]++;
+		int shapeindex = ((block[0] & 0xe0) >> 5) | ((block[1] & 0x3) << 3);
+		shapeindexhist[shapeindex]++;
+	}
+	else
+	{
+		oneregion++;
+		prechistone[prec]++;
+	}
+}
+
+static void printstats()
+{
+	printf("\nPrecision histogram 10b to 16b one region: "); for (int i=10; i<=16; ++i) printf("%d,", prechistone[i]);
+	printf("\nPrecision histogram 6b to 11b two regions: "); for (int i=6; i<=11; ++i) printf("%d,", prechisttwo[i]);
+	printf("\nMode histogram: "); for (int i=0; i<32; ++i) printf("%d,", modehist[i]);
+	printf("\nShape index histogram: "); for (int i=0; i<32; ++i) printf("%d,", shapeindexhist[i]);
+	printf("\nOne region %5.2f%%  Two regions %5.2f%%", 100.0*oneregion/float(oneregion+tworegions), 100.0*tworegions/float(oneregion+tworegions));
+	printf("\n");
+}
+
+void ZOH::decompress(string zohf, string outf)
+{
+	Array2D<Rgba> pixels;
+	int w, h;
+	char block[ZOH::BLOCKSIZE];
+
+	extract(zohf, w, h);
+	FILE *zohfile = fopen(zohf.c_str(), "rb");
+	if (zohfile == NULL) throw "Unable to open .zoh file for read";
+	pixels.resizeErase(h, w);
+
+	// convert to tiles and decompress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = min(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			int xsize = min(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			if (fread(block, sizeof(char), ZOH::BLOCKSIZE, zohfile) != ZOH::BLOCKSIZE)
+				throw "File error on read";
+
+			stats(block);	// collect statistics
+
+			ZOH::decompress(block, t);
+
+			t.extract(pixels, x, y);
+		}
+	}
+	if (fclose(zohfile)) throw "Close failed on .zoh file";
+	Exr::writeRgba(outf, pixels, w, h);
+
+#ifndef EXTERNAL_RELEASE
+	printstats();	// print statistics
+#endif
+}
+*/
--- a/3rdparty/nvtt/bc6h/zoh.h
+++ b/3rdparty/nvtt/bc6h/zoh.h
@ -0,0 +1,65 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#pragma once
+#ifndef _ZOH_H
+#define _ZOH_H
+
+#include "tile.h"
+
+// Public interface of the ZOH block codec: shared constants, endpoint
+// containers and the compress/decompress entry points.
+namespace ZOH {
+
+// UNUSED ZOH MODES are 0x13, 0x17, 0x1b, 0x1f
+
+// Array extents used by the declarations below.
+static const int NREGIONS_TWO	= 2;
+static const int NREGIONS_ONE	= 1;
+static const int NCHANNELS		= 3;
+
+// A pair of endpoints A/B held as float vectors.
+struct FltEndpts
+{
+    nv::Vector3 A;
+    nv::Vector3 B;
+};
+
+// A pair of endpoints A/B held as one signed int per channel.
+struct IntEndpts
+{
+	int A[NCHANNELS];
+	int B[NCHANNELS];
+};
+
+// A pair of endpoints A/B held as one unsigned int per channel.
+// NOTE(review): presumably the values as packed into the block -- confirm in
+// the encoder, which is not part of this header.
+struct ComprEndpts
+{
+	uint A[NCHANNELS];
+	uint B[NCHANNELS];
+};
+
+// Size of one compressed block in bytes and in bits (16 * 8 == 128).
+static const int BLOCKSIZE=16;
+static const int BITSIZE=128;
+
+// Top-level entry points (zoh.cpp). compress() runs both the one-region and
+// the two-region compressor and keeps the result with the smaller returned
+// error; decompress() picks the decoder with isone().
+void compress(const Tile &t, char *block);
+void decompress(const char *block, Tile &t);
+
+// Per-layout codecs. The compressors write BLOCKSIZE bytes to block and return
+// the error value that compress() compares.
+float compressone(const Tile &t, char *block);
+float compresstwo(const Tile &t, char *block);
+void decompressone(const char *block, Tile &t);
+void decompresstwo(const char *block, Tile &t);
+
+// NOTE(review): the rough*/refine* helpers are only declared here; judging by
+// their signatures rough* estimates endpoints for a shape and refine* encodes
+// a block from them, each returning an error value -- confirm in zohone.cpp /
+// zohtwo.cpp.
+float refinetwo(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_TWO], char *block);
+float roughtwo(const Tile &tile, int shape, FltEndpts endpts[NREGIONS_TWO]);
+
+float refineone(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_ONE], char *block);
+float roughone(const Tile &tile, int shape, FltEndpts endpts[NREGIONS_ONE]);
+
+// True when the low five bits of block[0] are 0x03, 0x07, 0x0b or 0x0f.
+bool isone(const char *block);
+
+}
+
+#endif // _ZOH_H
--- a/3rdparty/nvtt/bc6h/zoh_utils.cpp
+++ b/3rdparty/nvtt/bc6h/zoh_utils.cpp
@ -0,0 +1,324 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Utility and common routines
+
+#include "zoh_utils.h"
+#include "nvmath/Vector.inl"
+#include <math.h>
+
+using namespace nv;
+using namespace ZOH;
+
+// Interpolation weights in 1/64 units: entry i is i*64/denom rounded to the
+// nearest integer, for denom 7 (8 entries) and denom 15 (16 entries). The
+// lerp() overloads read entry i for endpoint b and entry denom-i for a.
+static const int denom7_weights_64[] = {0, 9, 18, 27, 37, 46, 55, 64};										// divided by 64
+static const int denom15_weights_64[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64};		// divided by 64
+
+// Storage for the global format selector. It has static storage duration and
+// no initializer, so it starts out as the first enumerator, UNSIGNED_F16.
+/*static*/ Format Utils::FORMAT;
+
+// Integer interpolation between a and b at step i of denom (denom must be 3,
+// 7 or 15 and 0 <= i <= denom): returns (a*w[denom-i] + b*w[i] + 32) >> 6
+// with the 1/64-unit weight tables above. denom 3 reuses every fifth entry of
+// the fifteenths table.
+//
+// An unsupported denom used to leave 'weights' uninitialized in builds where
+// nvDebugCheck compiles away; it now goes through nvUnreachable(), like the
+// Vector3 overload, and falls back to returning a if execution continues.
+int Utils::lerp(int a, int b, int i, int denom)
+{
+	nvDebugCheck (denom == 3 || denom == 7 || denom == 15);
+	nvDebugCheck (i >= 0 && i <= denom);
+
+	const int round = 32, shift = 6;
+	const int *weights;
+
+	switch(denom)
+	{
+	case 3:		denom *= 5; i *= 5;	// fall through to case 15
+	case 15:	weights = denom15_weights_64; break;
+	case 7:		weights = denom7_weights_64; break;
+	default:	nvUnreachable(); return a;
+	}
+
+	return (a*weights[denom-i] +b*weights[i] + round) >> shift;
+}
+
+// Vector interpolation between a and b at step i of denom (denom must be 3,
+// 7 or 15 and 0 <= i <= denom), using the same 1/64-unit weight tables as the
+// integer overload but dividing in floating point instead of rounding.
+Vector3 Utils::lerp(const Vector3& a, const Vector3 &b, int i, int denom)
+{
+	nvDebugCheck (denom == 3 || denom == 7 || denom == 15);
+	nvDebugCheck (i >= 0 && i <= denom);
+
+	const int *table;
+
+	if (denom == 3)
+	{
+		// thirds are every fifth entry of the fifteenths table
+		denom = 15;
+		i *= 5;
+		table = denom15_weights_64;
+	}
+	else if (denom == 15)
+	{
+		table = denom15_weights_64;
+	}
+	else if (denom == 7)
+	{
+		table = denom7_weights_64;
+	}
+	else
+	{
+		nvUnreachable();
+	}
+
+	// no need to round these as this is an exact division
+	const float weight_a = float(table[denom-i]);
+	const float weight_b = float(table[i]);
+	return (a*weight_a +b*weight_b) / float(1 << 6);
+}
+
+
+/*
+	For unsigned f16, clamp the input to [0,F16MAX]. Thus u15.
+	For signed f16, clamp the input to [-F16MAX,F16MAX]. Thus s16.
+
+	The conversions proceed as follows:
+
+	unsigned f16: get bits. if high bit set, clamp to 0, else clamp to F16MAX.
+	signed f16: get bits. extract exp+mantissa and clamp to F16MAX. return -value if sign bit was set, else value
+	unsigned int: get bits. return as a positive value.
+	signed int. get bits. return as a value in -32768..32767.
+
+	The inverse conversions are just the inverse of the above.
+*/
+
+// clamp the 3 channels of the input vector to the allowable range based on FORMAT
+// note that each channel is a float storing the allowable range as a bit pattern converted to float
+// that is, for unsigned f16 say, we would clamp each channel to the range [0, F16MAX]
+
+// Clamps each of the three components of v in place to the range allowed by
+// Utils::FORMAT: [0, F16MAX] for UNSIGNED_F16, [-F16MAX, F16MAX] for
+// SIGNED_F16. Any other FORMAT value is treated as unreachable.
+void Utils::clamp(Vector3 &v)
+{
+	for (int c=0; c<3; ++c)
+	{
+		if (Utils::FORMAT == UNSIGNED_F16)
+		{
+			if (v.component[c] < 0.0)
+				v.component[c] = 0;
+			else if (v.component[c] > F16MAX)
+				v.component[c] = F16MAX;
+		}
+		else if (Utils::FORMAT == SIGNED_F16)
+		{
+			if (v.component[c] < -F16MAX)
+				v.component[c] = -F16MAX;
+			else if (v.component[c] > F16MAX)
+				v.component[c] = F16MAX;
+		}
+		else
+		{
+			nvUnreachable();
+		}
+	}
+}
+
+// convert a u16 value to s17 (represented as an int) based on the format expected
+// convert a u16 value to s17 (represented as an int) based on the format expected
+//
+// UNSIGNED_F16: inputs with the sign bit set map to 0, anything above F16MAX
+//               maps to F16MAX, everything else is returned unchanged.
+// SIGNED_F16:   the exponent+mantissa bits are clamped to F16MAX and the
+//               result is negated when the sign bit was set.
+// The result is initialized and unknown formats go through nvUnreachable(),
+// matching clamp(); previously 'out' was returned uninitialized in that case.
+int Utils::ushort_to_format(unsigned short input)
+{
+	int out = 0;
+
+	// clamp to the valid range we are expecting
+	switch (Utils::FORMAT)
+	{
+	case UNSIGNED_F16:
+		if (input & F16S_MASK) out = 0;
+		else if (input > F16MAX) out = F16MAX;
+		else out = input;
+		break;
+
+	case SIGNED_F16:
+	{
+		const int s = input & F16S_MASK;
+		input &= F16EM_MASK;
+		if (input > F16MAX) out = F16MAX;
+		else out = input;
+		out = s ? -out : out;
+		break;
+	}
+
+	default:
+		nvUnreachable();
+	}
+	return out;
+}
+
+// convert a s17 value to u16 based on the format expected
+// convert a s17 value to u16 based on the format expected
+//
+// The input is expected to be in range already (checked only by
+// nvDebugCheck): [0, F16MAX] for UNSIGNED_F16, [-F16MAX, F16MAX] for
+// SIGNED_F16. Unsigned values pass through; signed values are rewritten as
+// sign-magnitude with F16S_MASK as the sign bit.
+// The result is initialized and unknown formats go through nvUnreachable(),
+// matching clamp(); previously 'out' was returned uninitialized in that case.
+unsigned short Utils::format_to_ushort(int input)
+{
+	unsigned short out = 0;
+
+	switch (Utils::FORMAT)
+	{
+	case UNSIGNED_F16:
+		nvDebugCheck (input >= 0 && input <= F16MAX);
+		out = (unsigned short)input;
+		break;
+
+	case SIGNED_F16:
+	{
+		nvDebugCheck (input >= -F16MAX && input <= F16MAX);
+		// convert to sign-magnitude
+		int s = 0;
+		if (input < 0) { s = F16S_MASK; input = -input; }
+		out = (unsigned short)(s | input);
+		break;
+	}
+
+	default:
+		nvUnreachable();
+	}
+	return out;
+}
+
+// quantize the input range into equal-sized bins
+// quantize the input range into equal-sized bins
+//
+// value is rounded to the nearest integer and scaled from the F16MAX+1 input
+// steps down to prec bits (prec > 1). For SIGNED_F16 the magnitude is
+// quantized to prec-1 bits and the sign is re-applied afterwards.
+// The result is initialized and unknown formats go through nvUnreachable(),
+// matching clamp(); previously 'q' was returned uninitialized in that case.
+int Utils::quantize(float value, int prec)
+{
+	int q = 0, ivalue, s;
+
+	nvDebugCheck (prec > 1);	// didn't bother to make it work for 1
+
+	value = (float)floor(value + 0.5);
+
+	int bias = (prec > 10) ? ((1<<(prec-1))-1) : 0;	// bias precisions 11..16 to get a more accurate quantization
+
+	switch (Utils::FORMAT)
+	{
+	case UNSIGNED_F16:
+		nvDebugCheck (value >= 0 && value <= F16MAX);
+		ivalue = (int)value;
+		q = ((ivalue << prec) + bias) / (F16MAX+1);
+		nvDebugCheck (q >= 0 && q < (1 << prec));
+		break;
+
+	case SIGNED_F16:
+		nvDebugCheck (value >= -F16MAX && value <= F16MAX);
+		// convert to sign-magnitude
+		ivalue = (int)value;
+		if (ivalue < 0) { s = 1; ivalue = -ivalue; } else s = 0;
+
+		q = ((ivalue << (prec-1)) + bias) / (F16MAX+1);
+		if (s)
+			q = -q;
+		nvDebugCheck (q > -(1 << (prec-1)) && q < (1 << (prec-1)));
+		break;
+
+	default:
+		nvUnreachable();
+	}
+
+	return q;
+}
+
+// Final scaling step applied to an unquantized value: 31/64 of q for
+// UNSIGNED_F16, 31/32 of the magnitude (sign preserved) for SIGNED_F16, and q
+// unchanged for any other format. prec is not used.
+int Utils::finish_unquantize(int q, int prec)
+{
+	switch (Utils::FORMAT)
+	{
+	case UNSIGNED_F16:
+		// scale the magnitude by 31/64
+		return (q * 31) >> 6;
+
+	case SIGNED_F16:
+		// scale the magnitude by 31/32
+		if (q < 0)
+			return -(((-q) * 31) >> 5);
+		return (q * 31) >> 5;
+
+	default:
+		return q;
+	}
+}
+
+// unquantize each bin to midpoint of original bin range, except
+// for the end bins which we push to an endpoint of the bin range.
+// we do this to ensure we can represent all possible original values.
+// the asymmetric end bins do not affect PSNR for the test images.
+//
+// code this function assuming an arbitrary bit pattern as the encoded block
+// Expands a prec-bit quantized value q (prec > 1) to the 16-bit working
+// range: zero stays zero, the top bin maps to the range maximum, and every
+// other bin maps to the midpoint of the range it covers.
+// The result is initialized and unknown formats go through nvUnreachable(),
+// matching clamp(); previously 'unq' was returned uninitialized in that case.
+int Utils::unquantize(int q, int prec)
+{
+	int unq = 0, s;
+
+	nvDebugCheck (prec > 1);	// not implemented for prec 1
+
+	switch (Utils::FORMAT)
+	{
+	// modify this case to move the multiplication by 31 after interpolation.
+	// Need to use finish_unquantize.
+
+	// since we have 16 bits available, let's unquantize this to 16 bits unsigned
+	// thus the scale factor is [0-7c00)/[0-10000) = 31/64
+	case UNSIGNED_F16:
+		if (prec >= 15) 
+			unq = q;
+		else if (q == 0) 
+			unq = 0;
+		else if (q == ((1<<prec)-1)) 
+			unq = U16MAX;
+		else
+			unq = (q * (U16MAX+1) + (U16MAX+1)/2) >> prec;
+		break;
+
+	// here, let's stick with S16 (no apparent quality benefit from going to S17)
+	// range is (-7c00..7c00)/(-8000..8000) = 31/32
+	case SIGNED_F16:
+		// don't remove this test even though it appears equivalent to the code below
+		// as it isn't -- the code below can overflow for prec = 16
+		if (prec >= 16)
+			unq = q;
+		else
+		{
+			if (q < 0) { s = 1; q = -q; } else s = 0;
+
+			if (q == 0)
+				unq = 0;
+			else if (q >= ((1<<(prec-1))-1))
+				unq = s ? -S16MAX : S16MAX;
+			else
+			{
+				unq = (q * (S16MAX+1) + (S16MAX+1)/2) >> (prec-1);
+				if (s)
+					unq = -unq;
+			}
+		}
+		break;
+
+	default:
+		nvUnreachable();
+	}
+	return unq;
+}
+
+
+
+// pick a norm!
+#define	NORM_EUCLIDEAN 1
+
+// Error metric between two colors, selected at compile time:
+//   NORM_EUCLIDEAN - squared length of (a - b) (note: squared, no sqrt)
+//   NORM_ABS       - sum of the absolute per-channel differences
+// Exactly one branch is compiled. Building with neither macro defined is now
+// a compile error instead of a function that falls off its end with no return.
+float Utils::norm(const Vector3 &a, const Vector3 &b)
+{
+#if defined(NORM_EUCLIDEAN)
+	return lengthSquared(a - b);
+#elif defined(NORM_ABS)
+	Vector3 err = a - b;
+	return fabs(err.x) + fabs(err.y) + fabs(err.z);
+#else
+#error "select a norm: define NORM_EUCLIDEAN or NORM_ABS"
+#endif
+}
+
+// parse <name>[<start>{:<end>}]{,}	
+// the pointer starts here         ^
+// name is 1 or 2 chars and matches field names. start and end are decimal numbers
+//
+// The encoding string is consumed BACKWARDS, one field per call. On entry ptr
+// is the index just past the field to parse; on return it is the index of the
+// first character of that field's name, which is the right starting point for
+// parsing the preceding field. When ptr <= 0 there is nothing left and the
+// function returns without touching any output.
+//
+// Outputs:
+//   field  - FIELD_M, FIELD_D, or the channel/endpoint value built from the
+//            two-letter name (r/g/b followed by w/x/y/z)
+//   endbit - the number written last inside the brackets; for "[n]" it is n
+//   len    - start - endbit + 1, i.e. 1 for the single-bit form "[n]"
+// A malformed string is only caught by nvDebugCheck.
+void Utils::parse(const char *encoding, int &ptr, Field &field, int &endbit, int &len)
+{
+	if (ptr <= 0) return;
+	// step onto the last character of the field, skipping a trailing comma
+	--ptr;
+	if (encoding[ptr] == ',') --ptr;
+	nvDebugCheck (encoding[ptr] == ']');
+	--ptr;
+	// read the end bit number right-to-left, so digits arrive least significant first
+	endbit = 0;
+	int scale = 1;
+	while (encoding[ptr] != ':' && encoding[ptr] != '[')
+	{
+		nvDebugCheck(encoding[ptr] >= '0' && encoding[ptr] <= '9');
+		endbit += (encoding[ptr--] - '0') * scale;
+		scale *= 10;
+	}
+	int startbit = 0; scale = 1;
+	if (encoding[ptr] == '[')
+		startbit = endbit;		// "[n]" form: a single bit
+	else  
+	{
+		// "[start:end]" form: skip the ':' and read the start bit number the same way
+		ptr--;
+		while (encoding[ptr] != '[')
+		{
+			nvDebugCheck(encoding[ptr] >= '0' && encoding[ptr] <= '9');
+			startbit += (encoding[ptr--] - '0') * scale;
+			scale *= 10;
+		}
+	}
+	len = startbit - endbit + 1;	// startbit>=endbit note
+	// step from '[' onto the last character of the name
+	--ptr;
+	if (encoding[ptr] == 'm')		field = FIELD_M;
+	else if (encoding[ptr] == 'd')	field = FIELD_D;
+	else {
+		// it's wxyz
+		nvDebugCheck (encoding[ptr] >= 'w' && encoding[ptr] <= 'z');
+		int foo = encoding[ptr--] - 'w';
+		// now it is r g or b
+		// (the offsets 10/20/30 match the numbering of the Field enum)
+		if (encoding[ptr] == 'r')		foo += 10;
+		else if (encoding[ptr] == 'g')	foo += 20;
+		else if (encoding[ptr] == 'b')	foo += 30;
+		else nvDebugCheck(0);
+		field = (Field) foo;
+	}
+}
+
+
--- a/3rdparty/nvtt/bc6h/zoh_utils.h
+++ b/3rdparty/nvtt/bc6h/zoh_utils.h
@ -0,0 +1,73 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// utility class holding common routines
+#pragma once
+#ifndef _ZOH_UTILS_H
+#define _ZOH_UTILS_H
+
+#include "nvmath/Vector.h"
+
+namespace ZOH {
+
+// Sign-extends the low nb bits of x (1 <= nb <= 31): when bit nb-1 is set,
+// every bit from nb upwards is set as well; otherwise x is returned unchanged.
+// The fill mask is built in unsigned arithmetic because left-shifting the
+// negative value ~0 is undefined behaviour before C++20.
+inline int SIGN_EXTEND(int x, int nb)
+{
+	const unsigned int ux = (unsigned int)x;
+	if (ux & (1u << (nb-1)))
+		return (int)(ux | (~0u << nb));
+	return x;
+}
+
+// Identifies which field of a block a bit range in a mode's encoding string
+// belongs to. The numeric values are load-bearing: Utils::parse() computes
+// the channel fields arithmetically as 10/20/30 for r/g/b plus 0..3 for
+// w/x/y/z, so the enumerators must keep exactly this numbering.
+enum Field {
+    FIELD_M = 1,	// mode
+    FIELD_D = 2,	// distribution/shape
+    FIELD_RW = 10+0, FIELD_RX = 10+1, FIELD_RY = 10+2, FIELD_RZ = 10+3,	// red channel endpoints or deltas
+    FIELD_GW = 20+0, FIELD_GX = 20+1, FIELD_GY = 20+2, FIELD_GZ = 20+3,	// green channel endpoints or deltas
+    FIELD_BW = 30+0, FIELD_BX = 30+1, FIELD_BY = 30+2, FIELD_BZ = 30+3,	// blue channel endpoints or deltas
+};
+
+// some constants
+// Bit masks and range limits for 16-bit values, all held as int so they mix
+// with the int arithmetic of the conversion/quantization routines.
+static const int F16S_MASK	=  0x8000;		// f16 sign mask
+static const int F16EM_MASK	=  0x7fff;		// f16 exp & mantissa mask
+static const int U16MAX		=  0xffff;		// largest unsigned 16-bit value
+static const int S16MIN		= -0x8000;		// smallest signed 16-bit value
+static const int S16MAX		=  0x7fff;		// largest signed 16-bit value
+static const int INT16_MASK	=  0xffff;		// low 16 bits
+static const int F16MAX		=  0x7bff;		// MAXFLT bit pattern for halfs
+
+// Which half-float flavour is being processed; selected through the global
+// Utils::FORMAT and switched on by the conversion/quantization routines.
+enum Format { UNSIGNED_F16, SIGNED_F16 };
+
+// Static-only helper class: conversions, quantization, error metrics and
+// interpolation shared by the one-region and two-region coders. Most routines
+// change behaviour according to the global FORMAT.
+class Utils
+{
+public:
+    static Format FORMAT;     // this is a global -- we're either handling signed or unsigned half values
+
+    // error metrics
+    // norm(): distance between two colors; metric chosen at compile time in zoh_utils.cpp
+    static float norm(const nv::Vector3 &a, const nv::Vector3 &b);
+    // NOTE(review): mpsnr_norm is declared here but zoh_utils.cpp in this
+    // change does not define it -- confirm a definition exists before calling.
+    static float mpsnr_norm(const nv::Vector3 &a, int exposure, const nv::Vector3 &b);
+
+    // conversion & clamp
+    // u16 half bit pattern -> int in the range allowed by FORMAT, and back
+    static int ushort_to_format(unsigned short input);
+    static unsigned short format_to_ushort(int input);
+
+    // clamp to format
+    // clamps all three components of v in place
+    static void clamp(nv::Vector3 &v);
+
+    // quantization and unquantization
+    // prec is the number of bits of the quantized value (must be > 1 for
+    // quantize/unquantize; ignored by finish_unquantize)
+    static int finish_unquantize(int q, int prec);
+    static int unquantize(int q, int prec);
+    static int quantize(float value, int prec);
+
+    // parses one field of a mode encoding string, walking backwards from ptr
+    static void parse(const char *encoding, int &ptr, Field & field, int &endbit, int &len);
+
+    // lerping
+    // interpolate between a and b at step i of denom (denom is 3, 7 or 15)
+    static int lerp(int a, int b, int i, int denom);
+    static nv::Vector3 lerp(const nv::Vector3 & a, const nv::Vector3 & b, int i, int denom);
+};
+
+}
+
+#endif // _ZOH_UTILS_H
--- a/3rdparty/nvtt/bc6h/zohone.cpp
+++ b/3rdparty/nvtt/bc6h/zohone.cpp
@ -0,0 +1,799 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// one region zoh compress/decompress code
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+#include "bits.h"
+#include "tile.h"
+#include "zoh.h"
+#include "zoh_utils.h"
+
+#include "nvmath/Vector.inl"
+#include "nvmath/Fitting.h"
+
+#include <string.h> // strlen
+#include <float.h> // FLT_MAX
+
+using namespace nv;
+using namespace ZOH;
+
+// Index palette parameters for one-region blocks:
+//   NINDICES      - number of palette entries per region
+//   INDEXBITS     - bits used to store one palette index
+//   HIGH_INDEXBIT - mask selecting the top bit of an index
+//   DENOM         - denominator passed to the lerp helpers when building the palette
+#define NINDICES	16
+#define	INDEXBITS	4
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
+
+// Number of partition shapes available to one-region blocks.
+#define	NSHAPES	1
+
+// One 16-bit mask per shape; the single mask is all zeros, so REGION() yields 0 for every pixel.
+static const int shapes[NSHAPES] =
+{
+    0x0000
+};	// only 1 shape
+
+// Region of pixel (x,y) under the given shape: tests bit (15 - x - 4*y) of the shape mask.
+#define	REGION(x,y,shapeindex)	((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)
+
+// Convert a linear pixel position (0..15) into tile coordinates: x is the low two bits, y the next two.
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+// Number of precision entries stored per channel in a Chanpat.
+#define	NDELTA	2
+
+// Precision pattern for one channel: prec[0] and prec[1] are the bit widths of the
+// two stored endpoint values.
+struct Chanpat
+{
+    int prec[NDELTA];
+};
+
+// Describes one block mode: the per-channel precisions, whether endpoints are delta-transformed,
+// the mode value, and the bit layout.
+struct Pattern
+{
+    // allow different bit patterns per channel -- but we still want constant precision per channel
+    Chanpat chan[NCHANNELS];
+
+    // if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+    int transformed;
+
+    // associated mode value
+    int mode;
+
+    // number of mode bits
+    int modebits;
+
+    // verilog description of encoding for this mode
+    const char *encoding;
+};
+
+// Mode values occupy at most MAXMODEBITS bits, giving MAXMODES possible values (size of mode_to_pat).
+#define MAXMODEBITS	5
+#define	MAXMODES (1<<MAXMODEBITS)
+
+// Number of entries in the patterns table.
+#define	NPATTERNS 4
+
+// One-region block modes, in the order refineone() tries them.
+// Columns: precision pairs (prec[0],prec[1]) for the channels, transformed flag, mode value, mode bits, encoding.
+// NOTE(review): the initializers rely on brace elision; presumably NCHANNELS is 3 so the first six
+// numbers fill chan[0..2] -- confirm against the NCHANNELS definition.
+static const Pattern patterns[NPATTERNS] =
+{
+    16,4,  16,4,  16,4,   1, 0x0f, 5, "bw[10],bw[11],bw[12],bw[13],bw[14],bw[15],bx[3:0],gw[10],gw[11],gw[12],gw[13],gw[14],gw[15],gx[3:0],rw[10],rw[11],rw[12],rw[13],rw[14],rw[15],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    12,8,  12,8,  12,8,   1, 0x0b, 5, "bw[10],bw[11],bx[7:0],gw[10],gw[11],gx[7:0],rw[10],rw[11],rx[7:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    11,9,  11,9,  11,9,   1, 0x07, 5, "bw[10],bx[8:0],gw[10],gx[8:0],rw[10],rx[8:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    10,10, 10,10, 10,10,  0, 0x03, 5, "bx[9:0],gx[9:0],rx[9:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+};
+
+// Lookup from a mode value to its index in patterns[]; -1 marks mode values that have
+// no entry in patterns[].
+static const int mode_to_pat[MAXMODES] = {
+    -1, -1, -1,  3,     // 0x00 - 0x03 (0x03 -> patterns[3])
+    -1, -1, -1,  2,     // 0x04 - 0x07 (0x07 -> patterns[2])
+    -1, -1, -1,  1,     // 0x08 - 0x0b (0x0b -> patterns[1])
+    -1, -1, -1,  0,     // 0x0c - 0x0f (0x0f -> patterns[0])
+    -1, -1, -1, -1,     // 0x10 - 0x13
+    -1, -1, -1, -1,     // 0x14 - 0x17
+    -1, -1, -1, -1,     // 0x18 - 0x1b
+    -1, -1, -1, -1,     // 0x1c - 0x1f
+};
+
+// Accessors for channel i of endpoint A (R_0) and endpoint B (R_1) of region 0.
+// Both expand to an expression that uses a variable named i from the surrounding scope.
+#define	R_0(ep)	(ep)[0].A[i]
+#define	R_1(ep)	(ep)[0].B[i]
+// Mask with the low n bits set.
+#define	MASK(n)	((1<<(n))-1)
+
+// compress endpoints
+// Packs the region-0 endpoints into the bit widths of pattern p. Endpoint A is masked to prec[0] bits.
+// Endpoint B is masked to prec[1] bits, stored as the delta (B - A) when p.transformed is nonzero
+// and as the plain value otherwise.
+static void compress_endpts(const IntEndpts in[NREGIONS_ONE], ComprEndpts out[NREGIONS_ONE], const Pattern &p)
+{
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        out[0].A[ch] = in[0].A[ch] & MASK(p.chan[ch].prec[0]);
+
+        if (p.transformed)
+            out[0].B[ch] = (in[0].B[ch] - in[0].A[ch]) & MASK(p.chan[ch].prec[1]);
+        else
+            out[0].B[ch] = in[0].B[ch] & MASK(p.chan[ch].prec[1]);
+    }
+}
+
+// decompress endpoints
+// Inverse of compress_endpts for region 0. Values are sign-extended only when the global format is
+// SIGNED_F16. In transformed patterns B is stored as a signed delta from A, which is added back and
+// wrapped to prec[0] bits.
+static void decompress_endpts(const ComprEndpts in[NREGIONS_ONE], IntEndpts out[NREGIONS_ONE], const Pattern &p)
+{
+    const bool issigned = (Utils::FORMAT == SIGNED_F16);
+
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        const int prec_a = p.chan[ch].prec[0];
+        const int prec_b = p.chan[ch].prec[1];
+
+        // endpoint A is stored the same way in both layouts
+        out[0].A[ch] = issigned ? SIGN_EXTEND(in[0].A[ch], prec_a) : in[0].A[ch];
+
+        if (p.transformed)
+        {
+            // the delta is always signed, whatever the format
+            int t = SIGN_EXTEND(in[0].B[ch], prec_b);
+            t = (t + in[0].A[ch]) & MASK(prec_a);
+            out[0].B[ch] = issigned ? SIGN_EXTEND(t, prec_a) : t;
+        }
+        else
+        {
+            out[0].B[ch] = issigned ? SIGN_EXTEND(in[0].B[ch], prec_b) : in[0].B[ch];
+        }
+    }
+}
+
+// Quantizes every component of each region's float endpoints to prec bits.
+static void quantize_endpts(const FltEndpts endpts[NREGIONS_ONE], int prec, IntEndpts q_endpts[NREGIONS_ONE])
+{
+    for (int r = 0; r < NREGIONS_ONE; ++r)
+    {
+        const FltEndpts &src = endpts[r];
+        IntEndpts &dst = q_endpts[r];
+
+        // endpoint A
+        dst.A[0] = Utils::quantize(src.A.x, prec);
+        dst.A[1] = Utils::quantize(src.A.y, prec);
+        dst.A[2] = Utils::quantize(src.A.z, prec);
+
+        // endpoint B
+        dst.B[0] = Utils::quantize(src.B.x, prec);
+        dst.B[1] = Utils::quantize(src.B.y, prec);
+        dst.B[2] = Utils::quantize(src.B.z, prec);
+    }
+}
+
+// swap endpoints as needed to ensure that the index at each region's anchor position has a 0 high-order bit
+// a position is 0 at x=0 y=0 and 15 at x=3 y=3 so y = (pos >> 2) & 3 and x = pos & 3
+// When the anchor's index has its high bit set, the region's endpoints A and B are exchanged and every
+// index in the region is replaced by NINDICES - 1 - index.
+static void swap_indices(IntEndpts endpts[NREGIONS_ONE], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+    int index_positions[NREGIONS_ONE];
+
+    index_positions[0] = 0;			// since WLOG we have the high bit of the shapes at 0
+
+    for (int region = 0; region < NREGIONS_ONE; ++region)
+    {
+        const int anchor_x = index_positions[region] & 3;
+        const int anchor_y = (index_positions[region] >> 2) & 3;
+        nvDebugCheck(REGION(anchor_x,anchor_y,shapeindex) == region);		// double check the table
+
+        if ((indices[anchor_y][anchor_x] & HIGH_INDEXBIT) == 0)
+            continue;
+
+        // high bit is set, swap the endpts and indices for this region
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            const int t = endpts[region].A[i];
+            endpts[region].A[i] = endpts[region].B[i];
+            endpts[region].B[i] = t;
+        }
+
+        // distinct loop names so the anchor coordinates above are not shadowed
+        for (int py = 0; py < Tile::TILE_H; py++)
+            for (int px = 0; px < Tile::TILE_W; px++)
+                if (REGION(px,py,shapeindex) == region)
+                    indices[py][px] = NINDICES - 1 - indices[py][px];
+    }
+}
+
+// endpoints fit only if the compression was lossless
+// Decompresses the compressed endpoints with pattern p and reports whether every channel of every
+// region matches the original values.
+static bool endpts_fit(const IntEndpts orig[NREGIONS_ONE], const ComprEndpts compressed[NREGIONS_ONE], const Pattern &p)
+{
+    IntEndpts roundtrip[NREGIONS_ONE];
+    decompress_endpts(compressed, roundtrip, p);
+
+    for (int r = 0; r < NREGIONS_ONE; ++r)
+    {
+        for (int ch = 0; ch < NCHANNELS; ++ch)
+        {
+            const bool same_a = !(orig[r].A[ch] != roundtrip[r].A[ch]);
+            if (!same_a)
+                return false;
+
+            const bool same_b = !(orig[r].B[ch] != roundtrip[r].B[ch]);
+            if (!same_b)
+                return false;
+        }
+    }
+    return true;
+}
+
+// Writes the mode and endpoint bits of a one-region block. The pattern's encoding string is parsed
+// from its end toward its start; each parsed field emits len bits of the matching value, starting at
+// bit endbit.
+static void write_header(const ComprEndpts endpts[NREGIONS_ONE], const Pattern &p, Bits &out)
+{
+    // interpret the verilog backwards and process it
+    const ComprEndpts &e = endpts[0];
+    int m = p.mode;
+    int rw = e.A[0], rx = e.B[0];
+    int gw = e.A[1], gx = e.B[1];
+    int bw = e.A[2], bx = e.B[2];
+
+    for (int ptr = int(strlen(p.encoding)); ptr != 0; )
+    {
+        Field field;
+        int endbit, len;
+
+        // !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+
+        switch(field)
+        {
+        case FIELD_M:
+            out.write( m >> endbit, len);
+            break;
+        case FIELD_RW:
+            out.write(rw >> endbit, len);
+            break;
+        case FIELD_RX:
+            out.write(rx >> endbit, len);
+            break;
+        case FIELD_GW:
+            out.write(gw >> endbit, len);
+            break;
+        case FIELD_GX:
+            out.write(gx >> endbit, len);
+            break;
+        case FIELD_BW:
+            out.write(bw >> endbit, len);
+            break;
+        case FIELD_BX:
+            out.write(bx >> endbit, len);
+            break;
+
+        // fields that are not expected in a one-region encoding
+        case FIELD_D:
+        case FIELD_RY:
+        case FIELD_RZ:
+        case FIELD_GY:
+        case FIELD_GZ:
+        case FIELD_BY:
+        case FIELD_BZ:
+        default:
+            nvUnreachable();
+        }
+    }
+}
+
+// Reads the mode and endpoint bits of a one-region block.
+// in     - bit reader positioned at the start of the block
+// endpts - receives the compressed endpoints of region 0
+// p      - receives a copy of the pattern selected by the decoded mode
+// The mode must map to an entry of patterns[]; this is only verified by nvDebugCheck.
+static void read_header(Bits &in, ComprEndpts endpts[NREGIONS_ONE], Pattern &p)
+{
+    // reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode
+    int mode = in.read(2);
+    if (mode != 0x00 && mode != 0x01)
+        mode = (in.read(3) << 2) | mode;
+
+    int pat_index = mode_to_pat[mode];
+
+    nvDebugCheck (pat_index >= 0 && pat_index < NPATTERNS);
+    nvDebugCheck (in.getptr() == patterns[pat_index].modebits);
+
+    p = patterns[pat_index];
+
+    // endpoint values are assembled piece by piece, so start them at zero
+    int rw = 0, rx = 0;
+    int gw = 0, gx = 0;
+    int bw = 0, bx = 0;
+
+    int ptr = int(strlen(p.encoding));
+
+    while (ptr)
+    {
+        Field field;
+        int endbit, len;
+
+        // !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+
+        switch(field)
+        {
+        case FIELD_M:	break;	// already processed so ignore
+        case FIELD_RW:	rw |= in.read(len) << endbit; break;
+        case FIELD_RX:	rx |= in.read(len) << endbit; break;
+        case FIELD_GW:	gw |= in.read(len) << endbit; break;
+        case FIELD_GX:	gx |= in.read(len) << endbit; break;
+        case FIELD_BW:	bw |= in.read(len) << endbit; break;
+        case FIELD_BX:	bx |= in.read(len) << endbit; break;
+
+        // fields that are not expected in a one-region encoding
+        case FIELD_D:
+        case FIELD_RY:
+        case FIELD_RZ:
+        case FIELD_GY:
+        case FIELD_GZ:
+        case FIELD_BY:
+        case FIELD_BZ:
+        default: nvUnreachable();
+        }
+    }
+
+    // 63 bits remain for the indices
+    nvDebugCheck (in.getptr() == 128 - 63);
+
+    endpts[0].A[0] = rw; endpts[0].B[0] = rx;
+    endpts[0].A[1] = gw; endpts[0].B[1] = gx;
+    endpts[0].A[2] = bw; endpts[0].B[2] = bx;
+}
+
+// Writes the palette indices in position order. Position 0 is written with one bit fewer than the
+// others; every other position uses INDEXBITS bits.
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        const int nbits = (pos == 0) ? (INDEXBITS - 1) : INDEXBITS;
+        out.write(indices[POS_TO_Y(pos)][POS_TO_X(pos)], nbits);
+    }
+}
+
+// Serializes one compressed block into the caller's buffer: header first, then indices.
+// A debug check confirms that exactly ZOH::BITSIZE bits were produced.
+static void emit_block(const ComprEndpts endpts[NREGIONS_ONE], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+    Bits bitstream(block, ZOH::BITSIZE);
+
+    write_header(endpts, p, bitstream);
+    write_indices(indices, shapeindex, bitstream);
+
+    nvDebugCheck(bitstream.getptr() == ZOH::BITSIZE);
+}
+
+// Builds the NINDICES-entry palette for one region from quantized endpoints: each channel's endpoints
+// are unquantized, interpolated in DENOM steps, and each interpolated value is finished with
+// Utils::finish_unquantize.
+static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vector3 palette[NINDICES])
+{
+    // scale endpoints, one pair per channel
+    const int x0 = Utils::unquantize(endpts.A[0], prec);
+    const int x1 = Utils::unquantize(endpts.B[0], prec);
+    const int y0 = Utils::unquantize(endpts.A[1], prec);
+    const int y1 = Utils::unquantize(endpts.B[1], prec);
+    const int z0 = Utils::unquantize(endpts.A[2], prec);
+    const int z1 = Utils::unquantize(endpts.B[2], prec);
+
+    // interpolate all three channels of each palette entry
+    for (int i = 0; i < NINDICES; ++i)
+    {
+        palette[i].x = float(Utils::finish_unquantize(Utils::lerp(x0, x1, i, DENOM), prec));
+        palette[i].y = float(Utils::finish_unquantize(Utils::lerp(y0, y1, i, DENOM), prec));
+        palette[i].z = float(Utils::finish_unquantize(Utils::lerp(z0, z1, i, DENOM), prec));
+    }
+}
+
+// Reads the palette indices in position order. Position 0 was compressed, so it is read with one
+// bit fewer than the others.
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        const int nbits = (pos == 0) ? (INDEXBITS - 1) : INDEXBITS;
+        indices[POS_TO_Y(pos)][POS_TO_X(pos)] = in.read(nbits);
+    }
+}
+
+// Decodes a one-region block into tile t: reads the header, rebuilds the endpoints and the palette,
+// reads the indices, and writes one palette color per pixel.
+void ZOH::decompressone(const char *block, Tile &t)
+{
+    Bits in(block, ZOH::BITSIZE);
+
+    // header: mode + compressed endpoints
+    Pattern p;
+    ComprEndpts compr_endpts[NREGIONS_ONE];
+    read_header(in, compr_endpts, p);
+
+    const int shapeindex = 0;		// only one shape
+
+    IntEndpts endpts[NREGIONS_ONE];
+    decompress_endpts(compr_endpts, endpts, p);
+
+    // one palette per region
+    Vector3 palette[NREGIONS_ONE][NINDICES];
+    for (int region = 0; region < NREGIONS_ONE; ++region)
+        generate_palette_quantized(endpts[region], p.chan[0].prec[0], &palette[region][0]);
+
+    // read indices
+    int indices[Tile::TILE_H][Tile::TILE_W];
+    read_indices(in, shapeindex, indices);
+
+    nvDebugCheck(in.getptr() == ZOH::BITSIZE);
+
+    // lookup
+    for (int y = 0; y < Tile::TILE_H; y++)
+    {
+        for (int x = 0; x < Tile::TILE_W; x++)
+        {
+            const int region = REGION(x,y,shapeindex);
+            t.data[y][x] = palette[region][indices[y][x]];
+        }
+    }
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+// colors, importance - np colors and their per-color weights
+// endpts, prec       - quantized endpoints and their precision
+// Each color's error is its importance-weighted distance to the chosen palette entry. The palette is
+// scanned from entry 0 and the scan stops as soon as the error starts to increase.
+static float map_colors(const Vector3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec)
+{
+    Vector3 palette[NINDICES];
+    float toterr = 0;
+
+    generate_palette_quantized(endpts, prec, palette);
+
+    for (int i = 0; i < np; ++i)
+    {
+        float besterr = Utils::norm(colors[i], palette[0]) * importance[i];
+
+        for (int j = 1; j < NINDICES && besterr > 0; ++j)
+        {
+            const float err = Utils::norm(colors[i], palette[j]) * importance[i];
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr)
+                besterr = err;
+        }
+        toterr += besterr;
+    }
+    return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+// indices receives the chosen palette index of every pixel; toterr[region] accumulates the
+// (unweighted) error of the pixels in that region. The palette is scanned from entry 0 and the scan
+// stops as soon as the error starts to increase.
+static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_ONE], int prec, 
+                           int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_ONE])
+{
+    // build list of possibles
+    Vector3 palette[NREGIONS_ONE][NINDICES];
+
+    for (int region = 0; region < NREGIONS_ONE; ++region)
+    {
+        generate_palette_quantized(endpts[region], prec, &palette[region][0]);
+        toterr[region] = 0;
+    }
+
+    for (int y = 0; y < tile.size_y; y++)
+    {
+        for (int x = 0; x < tile.size_x; x++)
+        {
+            const int region = REGION(x,y,shapeindex);
+
+            float besterr = Utils::norm(tile.data[y][x], palette[region][0]);
+            indices[y][x] = 0;
+
+            for (int i = 1; i < NINDICES && besterr > 0; ++i)
+            {
+                const float err = Utils::norm(tile.data[y][x], palette[region][i]);
+
+                if (err > besterr)	// error increased, so we're done searching
+                    break;
+                if (err < besterr)
+                {
+                    besterr = err;
+                    indices[y][x] = i;
+                }
+            }
+            toterr[region] += besterr;
+        }
+    }
+}
+
+// Searches for a better value of one endpoint component (channel ch of endpoint A when do_b is 0,
+// of endpoint B otherwise). Steps start at 1 << (prec-1) and halve each round; both directions are
+// tried and the endpoint moves whenever the error improves.
+// old_endpts - endpoints to start from
+// new_endpts - receives the perturbed endpoints (a copy of old_endpts apart from the searched component)
+// old_err    - error of old_endpts; only strictly smaller errors count as improvements
+// Returns the smallest error found, or old_err when nothing improved.
+static float perturb_one(const Vector3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts,
+                          float old_err, int do_b)
+{
+    // we have the old endpoints: old_endpts
+    // we have the perturbed endpoints: new_endpts
+    // we have the temporary endpoints: temp_endpts
+
+    IntEndpts temp_endpts;
+    float min_err = old_err;		// start with the best current error
+    int beststep = 0;				// only read after an improvement has set it
+
+    // copy real endpoints so we can perturb them
+    for (int i=0; i<NCHANNELS; ++i) { temp_endpts.A[i] = new_endpts.A[i] = old_endpts.A[i]; temp_endpts.B[i] = new_endpts.B[i] = old_endpts.B[i]; }
+
+    // do a logarithmic search for the best error for this endpoint
+    for (int step = 1 << (prec-1); step; step >>= 1)
+    {
+        bool improved = false;
+        for (int sign = -1; sign <= 1; sign += 2)
+        {
+            // candidates outside [0, 1 << prec) are skipped
+            if (do_b == 0)
+            {
+                temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+                if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+                    continue;
+            }
+            else
+            {
+                temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+                if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+                    continue;
+            }
+
+            float err = map_colors(colors, importance, np, temp_endpts, prec);
+
+            if (err < min_err)
+            {
+                improved = true;
+                min_err = err;
+                beststep = sign * step;
+            }
+        }
+        // if this was an improvement, move the endpoint and continue search from there
+        if (improved)
+        {
+            if (do_b == 0)
+                new_endpts.A[ch] += beststep;
+            else
+                new_endpts.B[ch] += beststep;
+        }
+    }
+    return min_err;
+}
+
+// Refines one region's quantized endpoints channel by channel.
+// For each channel, both endpoints are perturbed independently and the one with the lower error is
+// moved first; the search then alternates between the two endpoints until a perturbation no longer
+// lowers the error. opt_endpts starts as a copy of orig_endpts.
+static void optimize_one(const Vector3 colors[], const float importance[], int np, float orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts)
+{
+    float best_err = orig_err;
+
+    for (int c = 0; c < NCHANNELS; ++c)
+    {
+        opt_endpts.A[c] = orig_endpts.A[c];
+        opt_endpts.B[c] = orig_endpts.B[c];
+    }
+
+    IntEndpts trial_a, trial_b;
+    IntEndpts trial;
+
+    // now optimize each channel separately
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        // figure out which endpoint when perturbed gives the most improvement and start there
+        // if we just alternate, we can easily end up in a local minima
+        const float err_a = perturb_one(colors, importance, np, ch, prec, opt_endpts, trial_a, best_err, 0);	// perturb endpt A
+        const float err_b = perturb_one(colors, importance, np, ch, prec, opt_endpts, trial_b, best_err, 1);	// perturb endpt B
+
+        const bool a_first = err_a < err_b;
+        const float first_err = a_first ? err_a : err_b;
+
+        // neither endpoint improved on the current error: leave this channel alone
+        if (first_err >= best_err)
+            continue;
+
+        int do_b;
+        if (a_first)
+        {
+            opt_endpts.A[ch] = trial_a.A[ch];
+            do_b = 1;		// do B next
+        }
+        else
+        {
+            opt_endpts.B[ch] = trial_b.B[ch];
+            do_b = 0;		// do A next
+        }
+        best_err = first_err;
+
+        // now alternate endpoints and keep trying until there is no improvement
+        while (true)
+        {
+            const float err = perturb_one(colors, importance, np, ch, prec, opt_endpts, trial, best_err, do_b);
+            if (err >= best_err)
+                break;
+
+            if (do_b == 0)
+                opt_endpts.A[ch] = trial.A[ch];
+            else
+                opt_endpts.B[ch] = trial.B[ch];
+
+            best_err = err;
+            do_b = 1 - do_b;	// now move the other endpoint
+        }
+    }
+}
+
+// Optimizes the endpoints of every region: the pixels that fall in a region and their importance
+// weights are gathered, then handed to optimize_one.
+// orig_err / orig_endpts - per-region starting error and endpoints
+// opt_endpts             - receives the optimized endpoints per region
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_ONE], 
+                            const IntEndpts orig_endpts[NREGIONS_ONE], int prec, IntEndpts opt_endpts[NREGIONS_ONE])
+{
+    Vector3 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+
+    for (int region=0; region<NREGIONS_ONE; ++region)
+    {
+        // collect the pixels in the region
+        int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    ++np;
+                }
+            }
+        }
+
+        optimize_one(pixels, importance, np, orig_err[region], orig_endpts[region], prec, opt_endpts[region]);
+    }
+}
+
+/* optimization algorithm
+    for each pattern
+        convert endpoints using pattern precision
+        assign indices and get initial error
+        compress indices (and possibly reorder endpoints)
+        transform endpoints
+        if transformed endpoints fit pattern
+            get original endpoints back
+            optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+            compress new indices
+            transform new endpoints
+            if new endpoints fit pattern AND if error is improved
+                emit compressed block with new data
+            else
+                emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+// Encodes the tile as a one-region block, trying the patterns in table order and emitting the first
+// one whose endpoints survive compression losslessly. For that pattern the optimized endpoints are
+// used when they also fit and lower the total error; otherwise the unoptimized ones are emitted.
+// Returns the total error of the emitted block (FLT_MAX if no pattern fits, which is asserted
+// never to happen).
+float ZOH::refineone(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_ONE], char *block)
+{
+    float orig_err[NREGIONS_ONE], opt_err[NREGIONS_ONE];
+    IntEndpts orig_endpts[NREGIONS_ONE], opt_endpts[NREGIONS_ONE];
+    ComprEndpts compr_orig[NREGIONS_ONE], compr_opt[NREGIONS_ONE];
+    int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+    for (int sp = 0; sp < NPATTERNS; ++sp)
+    {
+        const Pattern &pat = patterns[sp];
+        const int prec = pat.chan[0].prec[0];
+
+        // precisions for all channels need to be the same
+        for (int i=1; i<NCHANNELS; ++i) nvDebugCheck (prec == pat.chan[i].prec[0]);
+
+        // unoptimized candidate
+        quantize_endpts(endpts, prec, orig_endpts);
+        assign_indices(tile, shapeindex_best, orig_endpts, prec, orig_indices, orig_err);
+        swap_indices(orig_endpts, orig_indices, shapeindex_best);
+        compress_endpts(orig_endpts, compr_orig, pat);
+
+        if (!endpts_fit(orig_endpts, compr_orig, pat))
+            continue;
+
+        // optimized candidate
+        optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, prec, opt_endpts);
+        assign_indices(tile, shapeindex_best, opt_endpts, prec, opt_indices, opt_err);
+        swap_indices(opt_endpts, opt_indices, shapeindex_best);
+        compress_endpts(opt_endpts, compr_opt, pat);
+
+        float orig_toterr = 0;
+        float opt_toterr = 0;
+        for (int i=0; i < NREGIONS_ONE; ++i)
+        {
+            orig_toterr += orig_err[i];
+            opt_toterr += opt_err[i];
+        }
+
+        if (endpts_fit(opt_endpts, compr_opt, pat) && opt_toterr < orig_toterr)
+        {
+            emit_block(compr_opt, shapeindex_best, pat, opt_indices, block);
+            return opt_toterr;
+        }
+
+        // either it stopped fitting when we optimized it, or there was no improvement
+        // so go back to the unoptimized endpoints which we know will fit
+        emit_block(compr_orig, shapeindex_best, pat, orig_indices, block);
+        return orig_toterr;
+    }
+
+    nvAssert (false); // "No candidate found, should never happen (refineone.)";
+    return FLT_MAX;
+}
+
+// Builds each region's palette by interpolating its float endpoints in DENOM steps.
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_ONE], Vector3 palette[NREGIONS_ONE][NINDICES])
+{
+    for (int r = 0; r < NREGIONS_ONE; ++r)
+    {
+        const FltEndpts &ep = endpts[r];
+
+        for (int step = 0; step < NINDICES; ++step)
+            palette[r][step] = Utils::lerp(ep.A, ep.B, step, DENOM);
+    }
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+// Each pixel's error is weighted by its entry in the tile's importance map. The palette is scanned
+// from entry 0 and the scan stops as soon as the error starts to increase.
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_ONE])
+{
+    // build list of possibles
+    Vector3 palette[NREGIONS_ONE][NINDICES];
+
+    generate_palette_unquantized(endpts, palette);
+
+    float toterr = 0;
+
+    for (int y = 0; y < tile.size_y; y++)
+    {
+        for (int x = 0; x < tile.size_x; x++)
+        {
+            const int region = REGION(x,y,shapeindex);
+
+            float besterr = Utils::norm(tile.data[y][x], palette[region][0]) * tile.importance_map[y][x];
+
+            for (int i = 1; i < NINDICES && besterr > 0; ++i)
+            {
+                const float err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x];
+
+                if (err > besterr)	// error increased, so we're done searching
+                    break;
+                if (err < besterr)
+                    besterr = err;
+            }
+            toterr += besterr;
+        }
+    }
+    return toterr;
+}
+
+float ZOH::roughone(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_ONE])
+{
+    for (int region=0; region<NREGIONS_ONE; ++region)
+    {
+        int np = 0;
+        Vector3 colors[Tile::TILE_TOTAL];
+        Vector3 mean(0,0,0);
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x,y,shapeindex) == region)
+                {
+                    colors[np] = tile.data[y][x];
+                    mean += tile.data[y][x];
+                    ++np;
+                }
+            }
+        }
+
+        // handle simple cases
+        if (np == 0)
+        {
+            Vector3 zero(0,0,0);
+            endpts[region].A = zero;
+            endpts[region].B = zero;
+            continue;
+        }
+        else if (np == 1)
+        {
+            endpts[region].A = colors[0];
+            endpts[region].B = colors[0];
+            continue;
+        }
+        else if (np == 2)
+        {
+            endpts[region].A = colors[0];
+            endpts[region].B = colors[1];
+            continue;
+        }
+
+        mean /= float(np);
+
+        Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+        // project each pixel value along the principal direction
+        float minp = FLT_MAX, maxp = -FLT_MAX;
+        for (int i = 0; i < np; i++)
+        {
+            float dp = dot(colors[i]-mean, direction);
+            if (dp < minp) minp = dp;
+            if (dp > maxp) maxp = dp;
+        }
+
+        // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+        endpts[region].A = mean + minp*direction;
+        endpts[region].B = mean + maxp*direction;
+
+        // clamp endpoints
+        // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+        // shape based on endpoints being clamped
+        Utils::clamp(endpts[region].A);
+        Utils::clamp(endpts[region].B);
+    }
+
+    return map_colors(tile, shapeindex, endpts);
+}
+
+float ZOH::compressone(const Tile &t, char *block)
+{
+    int shapeindex_best = 0;
+    FltEndpts endptsbest[NREGIONS_ONE], tempendpts[NREGIONS_ONE];
+    float msebest = FLT_MAX;
+
+    /*
+		collect the mse values that are within 5% of the best values
+		optimize each one and choose the best
+	*/
+    // hack for now -- just use the best value WORK
+    for (int i=0; i<NSHAPES && msebest>0.0; ++i)
+    {
+        float mse = roughone(t, i, tempendpts);
+        if (mse < msebest)
+        {
+            msebest = mse;
+            shapeindex_best = i;
+            memcpy(endptsbest, tempendpts, sizeof(endptsbest));
+        }
+
+    }
+    return refineone(t, shapeindex_best, endptsbest, block);
+}
--- a/3rdparty/nvtt/bc6h/zohtwo.cpp
+++ b/3rdparty/nvtt/bc6h/zohtwo.cpp
@ -0,0 +1,883 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// two regions zoh compress/decompress code
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+/* optimization algorithm
+
+	get initial float endpoints
+	convert endpoints using 16 bit precision, transform, and get bit delta. choose likely endpoint compression candidates.
+		note that there will be 1 or 2 candidates; 2 will be chosen when the delta values are close to the max possible.
+	for each EC candidate in order from max precision to smaller precision
+		convert endpoints using the appropriate precision.
+		optimize the endpoints and minimize square error. save the error and index assignments. apply index compression as well.
+			(thus the endpoints and indices are in final form.)
+		transform and get bit delta.
+		if the bit delta fits, exit
+	if we somehow ended up with no candidates, choose the tail set of EC candidates and retry. this should hardly ever happen.
+		add a state variable so that nvDebugCheck can verify we only do this once.
+	convert to bit stream.
+	return the error.
+
+	Global optimization
+		order all tiles based on their errors
+		do something special for high-error tiles
+			the goal here is to try to avoid tiling artifacts. but I think this is a research problem. let's just generate an error image...
+
+	display an image that shows partitioning and precision selected for each tile
+*/
+
+#include "bits.h"
+#include "tile.h"
+#include "zoh.h"
+#include "zoh_utils.h"
+
+#include "nvmath/Fitting.h"
+#include "nvmath/Vector.inl"
+
+#include <string.h> // strlen
+#include <float.h> // FLT_MAX
+
+using namespace nv;
+using namespace ZOH;
+
+#define NINDICES	8	// palette entries per region
+#define	INDEXBITS	3	// bits written per index (one fewer at each region's compressed-index position)
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))	// mask of the most significant index bit
+#define	DENOM		(NINDICES-1)	// interpolation denominator passed to Utils::lerp
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minimum easily and
+// stop without having to touch all shapes?
+
+#include "shapes_two.h"
+// use only the first 32 available shapes
+#undef NSHAPES
+#undef SHAPEBITS
+#define NSHAPES 32
+#define SHAPEBITS 5
+
+#define	POS_TO_X(pos)	((pos)&3)	// x coordinate of a linear pixel position (low two bits)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)	// y coordinate of a linear pixel position (next two bits)
+
+#define	NDELTA	4	// endpoint values per channel: A and B for each of the two regions
+
+struct Chanpat
+{
+    int prec[NDELTA];		// precision pattern for one channel: bit widths used for region 0 A, region 0 B, region 1 A, region 1 B (in that order)
+};
+
+struct Pattern
+{
+    Chanpat chan[NCHANNELS];    // allow different bit patterns per channel -- but we still want constant precision per channel (prec[0] must match across channels, see refinetwo)
+    int transformed;            // if 0, deltas are unsigned and no transform; otherwise, signed and transformed (stored relative to region 0's A endpoint)
+    int mode;                   // associated mode value, as written to and read from the block header
+    int modebits;               // number of mode bits (2 or 5 in the table below)
+    const char *encoding;       // verilog description of encoding for this mode; parsed from the end of the string by write_header/read_header
+};
+
+#define MAXMODEBITS	5	// widest mode field, in bits
+#define	MAXMODES (1<<MAXMODEBITS)	// size of the mode_to_pat lookup table
+
+#define	NPATTERNS 10	// number of entries in patterns[]
+
+static const Pattern patterns[NPATTERNS] =	// each row: 4 precisions per channel (3 channels), transformed flag, mode, modebits, encoding; refinetwo tries rows in this order and emits the first one that fits
+{
+    11,5,5,5,	11,4,4,4,	11,4,4,4,	1,	0x02, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bw[10],bx[3:0],gz[3:0],bz[0],gw[10],gx[3:0],gy[3:0],rw[10],rx[4:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    11,4,4,4,	11,5,5,5,	11,4,4,4,	1,	0x06, 5, "d[4:0],bz[3],gy[4],rz[3:0],bz[2],bz[0],ry[3:0],by[3:0],bz[1],bw[10],bx[3:0],gz[3:0],gw[10],gx[4:0],gy[3:0],gz[4],rw[10],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    11,4,4,4,	11,4,4,4,	11,5,5,5,	1,	0x0a, 5, "d[4:0],bz[3],bz[4],rz[3:0],bz[2:1],ry[3:0],by[3:0],bw[10],bx[4:0],gz[3:0],bz[0],gw[10],gx[3:0],gy[3:0],by[4],rw[10],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    10,5,5,5,	10,5,5,5,	10,5,5,5,	1,	0x00, 2, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bw[9:0],gw[9:0],rw[9:0],bz[4],by[4],gy[4],m[1:0]",
+    9,5,5,5,	9,5,5,5,	9,5,5,5,	1,	0x0e, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bz[4],bw[8:0],gy[4],gw[8:0],by[4],rw[8:0],m[4:0]",
+    8,6,6,6,	8,5,5,5,	8,5,5,5,	1,	0x12, 5, "d[4:0],rz[5:0],ry[5:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],rx[5:0],bz[4:3],bw[7:0],gy[4],bz[2],gw[7:0],by[4],gz[4],rw[7:0],m[4:0]",
+    8,5,5,5,	8,6,6,6,	8,5,5,5,	1,	0x16, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],gx[5:0],gy[3:0],gz[4],rx[4:0],bz[4],gz[5],bw[7:0],gy[4],gy[5],gw[7:0],by[4],bz[0],rw[7:0],m[4:0]",
+    8,5,5,5,	8,5,5,5,	8,6,6,6,	1,	0x1a, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bx[5:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bz[4],bz[5],bw[7:0],gy[4],by[5],gw[7:0],by[4],bz[1],rw[7:0],m[4:0]",
+    7,6,6,6,	7,6,6,6,	7,6,6,6,	1,	0x01, 2, "d[4:0],rz[5:0],ry[5:0],by[3:0],bx[5:0],gz[3:0],gx[5:0],gy[3:0],rx[5:0],bz[4],bz[5],bz[3],bw[6:0],gy[4],bz[2],by[5],gw[6:0],by[4],bz[1:0],rw[6:0],gz[5:4],gy[5],m[1:0]",
+    6,6,6,6,	6,6,6,6,	6,6,6,6,	0,	0x1e, 5, "d[4:0],rz[5:0],ry[5:0],by[3:0],bx[5:0],gz[3:0],gx[5:0],gy[3:0],rx[5:0],bz[4],bz[5],bz[3],gz[5],bw[5:0],gy[4],bz[2],by[5],gy[5],gw[5:0],by[4],bz[1:0],gz[4],rw[5:0],m[4:0]",
+};
+
+// mapping of a 5-bit mode value (the table index) to the corresponding index in patterns[]
+// UNUSED ZOH MODES are 0x13, 0x17, 0x1b, 0x1f -- the table holds -2 for these, which read_header reports as a reserved mode
+static const int mode_to_pat[MAXMODES] = {	
+    3,	// 0x00
+    8,	// 0x01
+    0,	// 0x02
+    -1,-1,-1,	// -1: mode has no entry in patterns[] (NOTE(review): presumably modes owned by the one-region codec -- confirm)
+    1,	// 0x06
+    -1,-1,-1,
+    2,	// 0x0a
+    -1,-1,-1,
+    4,	// 0x0e
+    -1,-1,-1,
+    5,	// 0x12
+    -2,-1,-1,
+    6,	// 0x16
+    -2,-1,-1,
+    7,	// 0x1a
+    -2,-1,-1,
+    9,	// 0x1e
+    -2
+};
+
+#define	R_0(ep)	(ep)[0].A[i]	// region 0, endpoint A; all R_n macros require a channel index named i in scope
+#define	R_1(ep)	(ep)[0].B[i]	// region 0, endpoint B
+#define	R_2(ep)	(ep)[1].A[i]	// region 1, endpoint A
+#define	R_3(ep)	(ep)[1].B[i]	// region 1, endpoint B
+#define	MASK(n)	((1<<(n))-1)	// mask with the low n bits set
+
+// Compress endpoints: reduce each quantized endpoint to the bit widths given by pattern p.
+// For a transformed pattern, region 0's A endpoint is stored directly and the other three
+// endpoints are stored as differences from it; otherwise all four are stored directly.
+// Every stored value is masked to its field width, so a value that does not fit is
+// truncated here (endpts_fit detects that afterwards).
+static void compress_endpts(const IntEndpts in[NREGIONS_TWO], ComprEndpts out[NREGIONS_TWO], const Pattern &p)
+{
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        const int *width = p.chan[ch].prec;
+
+        // amount subtracted from the three non-base endpoints (zero when untransformed)
+        const int base = p.transformed ? in[0].A[ch] : 0;
+
+        out[0].A[ch] = in[0].A[ch] & MASK(width[0]);
+        out[0].B[ch] = (in[0].B[ch] - base) & MASK(width[1]);
+        out[1].A[ch] = (in[1].A[ch] - base) & MASK(width[2]);
+        out[1].B[ch] = (in[1].B[ch] - base) & MASK(width[3]);
+    }
+}
+
+// Decompress endpoints: the inverse of compress_endpts for pattern p.
+// Transformed patterns store the last three endpoints as signed deltas from region 0's
+// A endpoint; each delta is sign extended, added to the base and wrapped to the base
+// precision. When the format is SIGNED_F16 every result is sign extended as well.
+static void decompress_endpts(const ComprEndpts in[NREGIONS_TWO], IntEndpts out[NREGIONS_TWO], const Pattern &p)
+{
+    const bool issigned = (Utils::FORMAT == SIGNED_F16);
+
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        const int *width = p.chan[ch].prec;
+
+        // the base endpoint is handled the same way in both layouts
+        out[0].A[ch] = issigned ? SIGN_EXTEND(in[0].A[ch],width[0]) : in[0].A[ch];
+
+        if (!p.transformed)
+        {
+            // plain layout: every endpoint is stored as-is
+            out[0].B[ch] = issigned ? SIGN_EXTEND(in[0].B[ch],width[1]) : in[0].B[ch];
+            out[1].A[ch] = issigned ? SIGN_EXTEND(in[1].A[ch],width[2]) : in[1].A[ch];
+            out[1].B[ch] = issigned ? SIGN_EXTEND(in[1].B[ch],width[3]) : in[1].B[ch];
+            continue;
+        }
+
+        // transformed layout: delta + base, wrapped to the base precision
+        int sum;
+
+        sum = SIGN_EXTEND(in[0].B[ch], width[1]);
+        sum = (sum + in[0].A[ch]) & MASK(width[0]);
+        out[0].B[ch] = issigned ? SIGN_EXTEND(sum,width[0]) : sum;
+
+        sum = SIGN_EXTEND(in[1].A[ch], width[2]);
+        sum = (sum + in[0].A[ch]) & MASK(width[0]);
+        out[1].A[ch] = issigned ? SIGN_EXTEND(sum,width[0]) : sum;
+
+        sum = SIGN_EXTEND(in[1].B[ch], width[3]);
+        sum = (sum + in[0].A[ch]) & MASK(width[0]);
+        out[1].B[ch] = issigned ? SIGN_EXTEND(sum,width[0]) : sum;
+    }
+}
+
+// Quantize the x/y/z components of both float endpoints of every region to prec bits.
+static void quantize_endpts(const FltEndpts endpts[NREGIONS_TWO], int prec, IntEndpts q_endpts[NREGIONS_TWO])
+{
+    for (int r = 0; r < NREGIONS_TWO; ++r)
+    {
+        const FltEndpts &src = endpts[r];
+        IntEndpts &dst = q_endpts[r];
+
+        // endpoint A
+        dst.A[0] = Utils::quantize(src.A.x, prec);
+        dst.A[1] = Utils::quantize(src.A.y, prec);
+        dst.A[2] = Utils::quantize(src.A.z, prec);
+
+        // endpoint B
+        dst.B[0] = Utils::quantize(src.B.x, prec);
+        dst.B[1] = Utils::quantize(src.B.y, prec);
+        dst.B[2] = Utils::quantize(src.B.z, prec);
+    }
+}
+
+// Swap endpoints as needed so that the index at each region's compressed-index position
+// (SHAPEINDEX_TO_COMPRESSED_INDICES) has a zero high-order bit. When a region's endpoints
+// are exchanged, every index of that region is replaced by NINDICES-1-index.
+static void swap_indices(IntEndpts endpts[NREGIONS_TWO], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+    {
+        const int anchor = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+        const int ax = POS_TO_X(anchor);
+        const int ay = POS_TO_Y(anchor);
+        nvDebugCheck(REGION(ax,ay,shapeindex) == region);		// double check the table
+
+        if ((indices[ay][ax] & HIGH_INDEXBIT) == 0)
+            continue;	// high bit already clear, nothing to do for this region
+
+        // exchange A and B in every channel of this region
+        for (int ch = 0; ch < NCHANNELS; ++ch)
+        {
+            const int saved = endpts[region].A[ch];
+            endpts[region].A[ch] = endpts[region].B[ch];
+            endpts[region].B[ch] = saved;
+        }
+
+        // reverse the indices of the pixels that belong to this region
+        for (int row = 0; row < Tile::TILE_H; row++)
+        {
+            for (int col = 0; col < Tile::TILE_W; col++)
+            {
+                if (REGION(col,row,shapeindex) != region)
+                    continue;
+                indices[row][col] = NINDICES - 1 - indices[row][col];
+            }
+        }
+    }
+}
+
+// Endpoints fit a pattern only if compression was lossless: decompress the compressed
+// endpoints and require every A and B value to match the originals exactly.
+static bool endpts_fit(const IntEndpts orig[NREGIONS_TWO], const ComprEndpts compressed[NREGIONS_TWO], const Pattern &p)
+{
+    IntEndpts roundtrip[NREGIONS_TWO];
+    decompress_endpts(compressed, roundtrip, p);
+
+    for (int r = 0; r < NREGIONS_TWO; ++r)
+    {
+        for (int ch = 0; ch < NCHANNELS; ++ch)
+        {
+            const bool same = orig[r].A[ch] == roundtrip[r].A[ch]
+                           && orig[r].B[ch] == roundtrip[r].B[ch];
+            if (!same)
+                return false;
+        }
+    }
+    return true;
+}
+
+// Write the block header (mode, shape index and compressed endpoints) to out, laid out
+// as described by the pattern's verilog-style encoding string.
+static void write_header(const ComprEndpts endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, Bits &out)
+{
+    // field values, named after the identifiers used in the encoding strings:
+    // first letter = channel (r/g/b), second letter = endpoint (w,x = region 0 A,B; y,z = region 1 A,B)
+    const int m = p.mode;
+    const int d = shapeindex;
+    const int rw = endpts[0].A[0], gw = endpts[0].A[1], bw = endpts[0].A[2];
+    const int rx = endpts[0].B[0], gx = endpts[0].B[1], bx = endpts[0].B[2];
+    const int ry = endpts[1].A[0], gy = endpts[1].A[1], by = endpts[1].A[2];
+    const int rz = endpts[1].B[0], gz = endpts[1].B[1], bz = endpts[1].B[2];
+
+    // interpret the verilog backwards: start at the end of the string and let
+    // Utils::parse move ptr until it reaches zero
+    for (int ptr = int(strlen(p.encoding)); ptr; )
+    {
+        Field field;
+        int endbit, len;
+
+        // !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+
+        int value;
+        switch (field)
+        {
+        case FIELD_M:	value = m;  break;
+        case FIELD_D:	value = d;  break;
+        case FIELD_RW:	value = rw; break;
+        case FIELD_RX:	value = rx; break;
+        case FIELD_RY:	value = ry; break;
+        case FIELD_RZ:	value = rz; break;
+        case FIELD_GW:	value = gw; break;
+        case FIELD_GX:	value = gx; break;
+        case FIELD_GY:	value = gy; break;
+        case FIELD_GZ:	value = gz; break;
+        case FIELD_BW:	value = bw; break;
+        case FIELD_BX:	value = bx; break;
+        case FIELD_BY:	value = by; break;
+        case FIELD_BZ:	value = bz; break;
+        default: nvUnreachable(); continue;	// nothing is written for an unknown field
+        }
+
+        out.write(value >> endbit, len);
+    }
+}
+
+// Read the block header from in: decode the mode, look up its pattern, then extract the
+// shape index and the compressed endpoints according to the pattern's encoding string.
+//   in         - bit stream positioned at the start of the block
+//   endpts     - receives the compressed endpoints (only written on success)
+//   shapeindex - receives the shape index (only written on success)
+//   p          - receives a copy of the pattern selected by the mode
+// Returns false when the mode is reserved or has no two-region pattern; true otherwise.
+static bool read_header(Bits &in, ComprEndpts endpts[NREGIONS_TWO], int &shapeindex, Pattern &p)
+{
+    // reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode
+    int mode = in.read(2);
+    if (mode != 0x00 && mode != 0x01)
+        mode = (in.read(3) << 2) | mode;
+
+    int pat_index = mode_to_pat[mode];
+
+    if (pat_index == -2)
+        return false;		// reserved mode found
+
+    nvDebugCheck (pat_index >= 0 && pat_index < NPATTERNS);
+
+    // Guard for builds where the check above does not stop execution: a mode without a
+    // two-region pattern (-1 in mode_to_pat) must not be used to index patterns[].
+    if (pat_index < 0 || pat_index >= NPATTERNS)
+        return false;
+
+    nvDebugCheck (in.getptr() == patterns[pat_index].modebits);
+
+    p = patterns[pat_index];
+
+    // fields are assembled piecewise, so they must start out as zero
+    int d = 0;
+    int rw = 0, rx = 0, ry = 0, rz = 0;
+    int gw = 0, gx = 0, gy = 0, gz = 0;
+    int bw = 0, bx = 0, by = 0, bz = 0;
+
+    // like write_header, walk the encoding string from its end
+    int ptr = int(strlen(p.encoding));
+
+    while (ptr)
+    {
+        Field field;
+        int endbit, len;
+
+        // !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+
+        switch(field)
+        {
+        case FIELD_M:	break;	// already processed so ignore
+        case FIELD_D:	 d |= in.read(len) << endbit; break;
+        case FIELD_RW:	rw |= in.read(len) << endbit; break;
+        case FIELD_RX:	rx |= in.read(len) << endbit; break;
+        case FIELD_RY:	ry |= in.read(len) << endbit; break;
+        case FIELD_RZ:	rz |= in.read(len) << endbit; break;
+        case FIELD_GW:	gw |= in.read(len) << endbit; break;
+        case FIELD_GX:	gx |= in.read(len) << endbit; break;
+        case FIELD_GY:	gy |= in.read(len) << endbit; break;
+        case FIELD_GZ:	gz |= in.read(len) << endbit; break;
+        case FIELD_BW:	bw |= in.read(len) << endbit; break;
+        case FIELD_BX:	bx |= in.read(len) << endbit; break;
+        case FIELD_BY:	by |= in.read(len) << endbit; break;
+        case FIELD_BZ:	bz |= in.read(len) << endbit; break;
+        default: nvUnreachable();
+        }
+    }
+
+    // the header must end exactly where the 46 index bits begin
+    nvDebugCheck (in.getptr() == 128 - 46);
+
+    shapeindex = d;
+    endpts[0].A[0] = rw; endpts[0].B[0] = rx; endpts[1].A[0] = ry; endpts[1].B[0] = rz;
+    endpts[0].A[1] = gw; endpts[0].B[1] = gx; endpts[1].A[1] = gy; endpts[1].B[1] = gz;
+    endpts[0].A[2] = bw; endpts[0].B[2] = bx; endpts[1].A[2] = by; endpts[1].B[2] = bz;
+
+    return true;
+}
+
+// Write the per-pixel palette indices in linear position order. The index at each
+// region's compressed-index position is written with one bit fewer than the others.
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+    int anchors[NREGIONS_TWO];
+    for (int r = 0; r < NREGIONS_TWO; ++r)
+        anchors[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        // full width unless this position is one of the anchors
+        int nbits = INDEXBITS;
+        for (int r = 0; r < NREGIONS_TWO; ++r)
+        {
+            if (anchors[r] == pos)
+            {
+                nbits = INDEXBITS - 1;
+                break;
+            }
+        }
+
+        out.write(indices[POS_TO_Y(pos)][POS_TO_X(pos)], nbits);
+    }
+}
+
+// Serialize one compressed block: the header first, then the index bits.
+// The two parts together must fill exactly ZOH::BITSIZE bits.
+static void emit_block(const ComprEndpts compr_endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+    Bits stream(block, ZOH::BITSIZE);
+
+    write_header(compr_endpts, shapeindex, p, stream);
+    write_indices(indices, shapeindex, stream);
+
+    nvDebugCheck(stream.getptr() == ZOH::BITSIZE);
+}
+
+// Build the NINDICES-entry palette for one region from its quantized endpoints.
+// Each channel is unquantized, interpolated between A and B in DENOM steps, and
+// finished with Utils::finish_unquantize before being stored as a float.
+static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vector3 palette[NINDICES])
+{
+    int lo, hi;			// unquantized A and B of the channel being processed
+
+    // channel 0 -> x
+    lo = Utils::unquantize(endpts.A[0], prec);
+    hi = Utils::unquantize(endpts.B[0], prec);
+    for (int k = 0; k < NINDICES; ++k)
+    {
+        palette[k].x = float(Utils::finish_unquantize(Utils::lerp(lo, hi, k, DENOM), prec));
+    }
+
+    // channel 1 -> y
+    lo = Utils::unquantize(endpts.A[1], prec);
+    hi = Utils::unquantize(endpts.B[1], prec);
+    for (int k = 0; k < NINDICES; ++k)
+    {
+        palette[k].y = float(Utils::finish_unquantize(Utils::lerp(lo, hi, k, DENOM), prec));
+    }
+
+    // channel 2 -> z
+    lo = Utils::unquantize(endpts.A[2], prec);
+    hi = Utils::unquantize(endpts.B[2], prec);
+    for (int k = 0; k < NINDICES; ++k)
+    {
+        palette[k].z = float(Utils::finish_unquantize(Utils::lerp(lo, hi, k, DENOM), prec));
+    }
+}
+
+// Read the per-pixel palette indices in linear position order. The index at each
+// region's compressed-index position is stored with one bit fewer than the others.
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+    int anchors[NREGIONS_TWO];
+    for (int r = 0; r < NREGIONS_TWO; ++r)
+        anchors[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        // full width unless this position is one of the anchors
+        int nbits = INDEXBITS;
+        for (int r = 0; r < NREGIONS_TWO; ++r)
+        {
+            if (anchors[r] == pos)
+            {
+                nbits = INDEXBITS - 1;
+                break;
+            }
+        }
+
+        indices[POS_TO_Y(pos)][POS_TO_X(pos)] = in.read(nbits);
+    }
+}
+
+// Decode one two-region block into tile t. A block whose header cannot be decoded
+// (read_header returns false) yields an all-zero tile.
+void ZOH::decompresstwo(const char *block, Tile &t)
+{
+    Bits in(block, ZOH::BITSIZE);
+
+    Pattern p;
+    ComprEndpts compr_endpts[NREGIONS_TWO];
+    int shapeindex;
+
+    const bool header_ok = read_header(in, compr_endpts, shapeindex, p);
+    if (!header_ok)
+    {
+        // reserved mode, return all zeroes
+        for (int row = 0; row < Tile::TILE_H; row++)
+        {
+            for (int col = 0; col < Tile::TILE_W; col++)
+                t.data[row][col] = Vector3(0.0f);
+        }
+        return;
+    }
+
+    // recover the quantized endpoints and expand them to one palette per region
+    IntEndpts endpts[NREGIONS_TWO];
+    decompress_endpts(compr_endpts, endpts, p);
+
+    Vector3 palette[NREGIONS_TWO][NINDICES];
+    for (int r = 0; r < NREGIONS_TWO; ++r)
+    {
+        generate_palette_quantized(endpts[r], p.chan[0].prec[0], &palette[r][0]);
+    }
+
+    int indices[Tile::TILE_H][Tile::TILE_W];
+    read_indices(in, shapeindex, indices);
+
+    // the whole block must have been consumed
+    nvDebugCheck(in.getptr() == ZOH::BITSIZE);
+
+    // lookup: each pixel takes the palette entry of its region selected by its index
+    for (int row = 0; row < Tile::TILE_H; row++)
+    {
+        for (int col = 0; col < Tile::TILE_W; col++)
+        {
+            t.data[row][col] = palette[REGION(col,row,shapeindex)][indices[row][col]];
+        }
+    }
+}
+
+// Given a collection of colors and quantized endpoints, generate a palette, choose the
+// best entry for every color, and return the single total error.
+//   colors, importance - np colors and their per-color error weights
+//   endpts, prec       - quantized endpoints of the region and their precision
+// Returns the sum over all colors of the smallest importance-weighted Utils::norm found.
+static float map_colors(const Vector3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec)
+{
+    Vector3 palette[NINDICES];
+    float toterr = 0;
+
+    generate_palette_quantized(endpts, prec, palette);
+
+    for (int i = 0; i < np; ++i)
+    {
+        float besterr = Utils::norm(colors[i], palette[0]) * importance[i];
+
+        // walk the palette in order; stop on an exact match or once the error starts to grow
+        for (int j = 1; j < NINDICES && besterr > 0; ++j)
+        {
+            const float err = Utils::norm(colors[i], palette[j]) * importance[i];
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr)
+                besterr = err;
+        }
+        toterr += besterr;
+    }
+    return toterr;
+}
+
+// Assign indices given a tile, shape, and quantized endpoints; return toterr for each region.
+//   tile, shapeindex - pixels to encode and the shape that assigns each pixel to a region
+//   endpts, prec     - quantized endpoints per region and their precision
+//   indices          - receives the chosen palette index of every pixel inside the tile's size
+//   toterr           - receives, per region, the summed Utils::norm error of its pixels
+// Note that the error accumulated here is not weighted by the tile's importance map.
+static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_TWO], int prec, 
+                           int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_TWO])
+{
+    // build list of possibles
+    Vector3 palette[NREGIONS_TWO][NINDICES];
+
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+    {
+        generate_palette_quantized(endpts[region], prec, &palette[region][0]);
+        toterr[region] = 0;
+    }
+
+    for (int y = 0; y < tile.size_y; y++)
+    {
+        for (int x = 0; x < tile.size_x; x++)
+        {
+            const int region = REGION(x,y,shapeindex);
+
+            float besterr = Utils::norm(tile.data[y][x], palette[region][0]);
+            indices[y][x] = 0;
+
+            // walk the palette in order; stop on an exact match or once the error starts to grow
+            for (int i = 1; i < NINDICES && besterr > 0; ++i)
+            {
+                const float err = Utils::norm(tile.data[y][x], palette[region][i]);
+
+                if (err > besterr)	// error increased, so we're done searching
+                    break;
+                if (err < besterr)
+                {
+                    besterr = err;
+                    indices[y][x] = i;
+                }
+            }
+            toterr[region] += besterr;
+        }
+    }
+}
+
+// Search for a better value of one endpoint component.
+//   colors, importance, np - the region's pixels and their error weights
+//   ch                     - channel to perturb
+//   prec                   - endpoint precision in bits; candidates outside [0, 1<<prec) are skipped
+//   old_endpts             - endpoints to start from
+//   new_endpts             - receives old_endpts with the selected component moved to the best value found
+//   old_err                - error of old_endpts; only strictly smaller errors count as an improvement
+//   do_b                   - 0 to perturb endpoint A, anything else to perturb endpoint B
+// Returns the smallest error found (old_err if nothing improved).
+static float perturb_one(const Vector3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts,
+                          float old_err, int do_b)
+{
+    // we have the old endpoints: old_endpts
+    // we have the perturbed endpoints: new_endpts
+    // we have the temporary endpoints: temp_endpts
+
+    IntEndpts temp_endpts;
+    float min_err = old_err;		// start with the best current error
+    int beststep = 0;			// only read after an improvement has set it; initialized so it is never indeterminate
+
+    // copy real endpoints so we can perturb them
+    for (int i=0; i<NCHANNELS; ++i) { temp_endpts.A[i] = new_endpts.A[i] = old_endpts.A[i]; temp_endpts.B[i] = new_endpts.B[i] = old_endpts.B[i]; }
+
+    // do a logarithmic search for the best error for the selected endpoint:
+    // the step starts at half the value range and is halved every round
+    for (int step = 1 << (prec-1); step; step >>= 1)
+    {
+        bool improved = false;
+        for (int sign = -1; sign <= 1; sign += 2)
+        {
+            if (do_b == 0)
+            {
+                temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+                if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+                    continue;
+            }
+            else
+            {
+                temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+                if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+                    continue;
+            }
+
+            float err = map_colors(colors, importance, np, temp_endpts, prec);
+
+            if (err < min_err)
+            {
+                improved = true;
+                min_err = err;
+                beststep = sign * step;
+            }
+        }
+        // if this was an improvement, move the endpoint and continue search from there
+        if (improved)
+        {
+            if (do_b == 0)
+                new_endpts.A[ch] += beststep;
+            else
+                new_endpts.B[ch] += beststep;
+        }
+    }
+    return min_err;
+}
+
+// Optimize the endpoints of one region, one channel at a time.
+// For each channel: perturb A and B independently, accept whichever gives the lower
+// error (provided it beats the current error), then keep alternating between the two
+// endpoints until a perturbation no longer improves the error.
+// opt_endpts starts as a copy of orig_endpts and receives the optimized values.
+static void optimize_one(const Vector3 colors[], const float importance[], int np, float orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts)
+{
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        opt_endpts.A[ch] = orig_endpts.A[ch];
+        opt_endpts.B[ch] = orig_endpts.B[ch];
+    }
+
+    float opt_err = orig_err;
+
+    // now optimize each channel separately
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        // figure out which endpoint when perturbed gives the most improvement and start there
+        // if we just alternate, we can easily end up in a local minimum
+        IntEndpts moved_a, moved_b;
+        const float err_a = perturb_one(colors, importance, np, ch, prec, opt_endpts, moved_a, opt_err, 0);	// perturb endpt A
+        const float err_b = perturb_one(colors, importance, np, ch, prec, opt_endpts, moved_b, opt_err, 1);	// perturb endpt B
+
+        const bool a_wins = err_a < err_b;
+        const float first_err = a_wins ? err_a : err_b;
+
+        // neither endpoint improved on the current error: leave this channel alone
+        if (first_err >= opt_err)
+            continue;
+
+        if (a_wins)
+            opt_endpts.A[ch] = moved_a.A[ch];
+        else
+            opt_endpts.B[ch] = moved_b.B[ch];
+        opt_err = first_err;
+
+        // the endpoint that was not just moved goes next
+        int do_b = a_wins ? 1 : 0;
+
+        // now alternate endpoints and keep trying until there is no improvement
+        while (true)
+        {
+            IntEndpts moved;
+            const float err = perturb_one(colors, importance, np, ch, prec, opt_endpts, moved, opt_err, do_b);
+            if (err >= opt_err)
+                break;
+
+            if (do_b == 0)
+                opt_endpts.A[ch] = moved.A[ch];
+            else
+                opt_endpts.B[ch] = moved.B[ch];
+
+            opt_err = err;
+            do_b = 1 - do_b;	// now move the other endpoint
+        }
+    }
+}
+
+// Optimize the endpoints of every region of a tile.
+// For each region, gather the pixels (and their importance weights) that the shape
+// assigns to it and hand them to optimize_one together with the region's starting
+// error and endpoints. The optimized endpoints are returned in opt_endpts.
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_TWO], 
+                            const IntEndpts orig_endpts[NREGIONS_TWO], int prec, IntEndpts opt_endpts[NREGIONS_TWO])
+{
+    Vector3 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+
+    for (int region=0; region<NREGIONS_TWO; ++region)
+    {
+        // collect the pixels in the region
+        int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++)
+        {
+            for (int x = 0; x < tile.size_x; x++)
+            {
+                if (REGION(x,y,shapeindex) == region)
+                {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    ++np;
+                }
+            }
+        }
+
+        optimize_one(pixels, importance, np, orig_err[region], orig_endpts[region], prec, opt_endpts[region]);
+    }
+}
+
+/* optimization algorithm
+    for each pattern
+        convert endpoints using pattern precision
+        assign indices and get initial error
+        compress indices (and possibly reorder endpoints)
+        transform endpoints
+        if transformed endpoints fit pattern
+            get original endpoints back
+            optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+            compress new indices
+            transform new endpoints
+            if new endpoints fit pattern AND if error is improved
+                emit compressed block with new data
+            else
+                emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+// Turn rough float endpoints for the chosen shape into a final compressed block.
+// Patterns are tried in table order; the first one whose quantized endpoints survive a
+// compress/decompress round trip is used. For that pattern the endpoints are optimized,
+// and the optimized version is emitted only if it still fits and lowers the total error;
+// otherwise the unoptimized endpoints are emitted. Returns the total error of the
+// emitted block, or FLT_MAX if no pattern fits.
+float ZOH::refinetwo(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_TWO], char *block)
+{
+    float orig_err[NREGIONS_TWO], opt_err[NREGIONS_TWO];
+    IntEndpts orig_endpts[NREGIONS_TWO], opt_endpts[NREGIONS_TWO];
+    ComprEndpts compr_orig[NREGIONS_TWO], compr_opt[NREGIONS_TWO];
+    int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+    for (int sp = 0; sp < NPATTERNS; ++sp)
+    {
+        const Pattern &pat = patterns[sp];
+        const int prec = pat.chan[0].prec[0];
+
+        // precisions for all channels need to be the same
+        for (int ch = 1; ch < NCHANNELS; ++ch)
+            nvDebugCheck (prec == pat.chan[ch].prec[0]);
+
+        // baseline: quantize the rough endpoints and see whether this pattern can hold them
+        quantize_endpts(endpts, prec, orig_endpts);
+        assign_indices(tile, shapeindex_best, orig_endpts, prec, orig_indices, orig_err);
+        swap_indices(orig_endpts, orig_indices, shapeindex_best);
+        compress_endpts(orig_endpts, compr_orig, pat);
+        if (!endpts_fit(orig_endpts, compr_orig, pat))
+            continue;
+
+        // the pattern fits: try to improve on the baseline
+        optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, prec, opt_endpts);
+        assign_indices(tile, shapeindex_best, opt_endpts, prec, opt_indices, opt_err);
+        swap_indices(opt_endpts, opt_indices, shapeindex_best);
+        compress_endpts(opt_endpts, compr_opt, pat);
+
+        float orig_toterr = 0;
+        float opt_toterr = 0;
+        for (int r = 0; r < NREGIONS_TWO; ++r)
+        {
+            orig_toterr += orig_err[r];
+            opt_toterr += opt_err[r];
+        }
+
+        if (endpts_fit(opt_endpts, compr_opt, pat) && opt_toterr < orig_toterr)
+        {
+            emit_block(compr_opt, shapeindex_best, pat, opt_indices, block);
+            return opt_toterr;
+        }
+
+        // either it stopped fitting when we optimized it, or there was no improvement
+        // so go back to the unoptimized endpoints which we know will fit
+        emit_block(compr_orig, shapeindex_best, pat, orig_indices, block);
+        return orig_toterr;
+    }
+
+    nvAssert(false); //throw "No candidate found, should never happen (refinetwo.)";
+    return FLT_MAX;
+}
+
+// Build one NINDICES-entry palette per region by interpolating between the region's
+// unquantized A and B endpoints in DENOM steps.
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_TWO], Vector3 palette[NREGIONS_TWO][NINDICES])
+{
+    for (int r = 0; r < NREGIONS_TWO; ++r)
+    {
+        const FltEndpts &ep = endpts[r];
+        for (int k = 0; k < NINDICES; ++k)
+        {
+            palette[r][k] = Utils::lerp(ep.A, ep.B, k, DENOM);
+        }
+    }
+}
+
+// Generate a palette from unquantized endpoints, then pick the best palette color for all
+// pixels in each region; return toterr for all regions combined.
+// Each pixel's error is its Utils::norm to the chosen palette entry, weighted by the
+// tile's importance map.
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_TWO])
+{
+    // build list of possibles
+    Vector3 palette[NREGIONS_TWO][NINDICES];
+
+    generate_palette_unquantized(endpts, palette);
+
+    float toterr = 0;
+
+    for (int y = 0; y < tile.size_y; y++)
+    {
+        for (int x = 0; x < tile.size_x; x++)
+        {
+            const int region = REGION(x,y,shapeindex);
+
+            float besterr = Utils::norm(tile.data[y][x], palette[region][0]) * tile.importance_map[y][x];
+
+            // walk the palette in order; stop on an exact match or once the error starts to grow
+            for (int i = 1; i < NINDICES && besterr > 0; ++i)
+            {
+                const float err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x];
+
+                if (err > besterr)	// error increased, so we're done searching
+                    break;
+                if (err < besterr)
+                    besterr = err;
+            }
+            toterr += besterr;
+        }
+    }
+    return toterr;
+}
+
+// Compute rough (unquantized) endpoints for every region of the given shape and return
+// the resulting total mapping error.
+// Regions with at most two pixels use those pixels directly (zero endpoints when empty).
+// Larger regions get the extremes of the pixels' projections onto the principal
+// direction through the mean color; those endpoints are then clamped.
+float ZOH::roughtwo(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_TWO])
+{
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+    {
+        Vector3 colors[Tile::TILE_TOTAL];
+        Vector3 mean(0,0,0);
+        int np = 0;
+
+        // gather this region's pixels and accumulate their sum
+        for (int row = 0; row < tile.size_y; row++)
+        {
+            for (int col = 0; col < tile.size_x; col++)
+            {
+                if (REGION(col,row,shapeindex) != region)
+                    continue;
+
+                colors[np] = tile.data[row][col];
+                mean += tile.data[row][col];
+                ++np;
+            }
+        }
+
+        // handle simple cases: no fitting is needed for zero, one or two pixels
+        if (np <= 2)
+        {
+            if (np == 0)
+            {
+                Vector3 zero(0,0,0);
+                endpts[region].A = zero;
+                endpts[region].B = zero;
+            }
+            else
+            {
+                // one pixel: both endpoints are that pixel; two pixels: one endpoint each
+                endpts[region].A = colors[0];
+                endpts[region].B = colors[np - 1];
+            }
+            continue;
+        }
+
+        mean /= float(np);
+
+        Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+        // project each pixel value along the principal direction and track the extremes
+        float lowest = FLT_MAX;
+        float highest = -FLT_MAX;
+        for (int k = 0; k < np; k++)
+        {
+            const float proj = dot(colors[k]-mean, direction);
+            if (proj < lowest) lowest = proj;
+            if (proj > highest) highest = proj;
+        }
+
+        // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+        endpts[region].A = mean + lowest*direction;
+        endpts[region].B = mean + highest*direction;
+
+        // clamp endpoints
+        // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+        // shape based on endpoints being clamped
+        Utils::clamp(endpts[region].A);
+        Utils::clamp(endpts[region].B);
+    }
+
+    return map_colors(tile, shapeindex, endpts);
+}
+
+float ZOH::compresstwo(const Tile &t, char *block)
+{
+    // Score every candidate shape with the rough (unquantized) fit and remember
+    // the endpoints of the lowest-error one. The scan stops early once the best
+    // error is no longer positive.
+    //
+    // WORK: the intended scheme is to collect all shapes whose error is within
+    // 5% of the best, refine each of them and keep the best refined result.
+    // For now only the single best rough candidate is refined.
+    FltEndpts best_endpts[NREGIONS_TWO];
+    FltEndpts trial_endpts[NREGIONS_TWO];
+    float best_mse = FLT_MAX;
+    int best_shape = 0;
+
+    for (int shape = 0; shape < NSHAPES; ++shape)
+    {
+        if (!(best_mse > 0.0))
+            break;
+
+        const float trial_mse = roughtwo(t, shape, trial_endpts);
+        if (!(trial_mse < best_mse))
+            continue;
+
+        best_mse = trial_mse;
+        best_shape = shape;
+        memcpy(best_endpts, trial_endpts, sizeof(best_endpts));
+    }
+
+    return refinetwo(t, best_shape, best_endpts, block);
+}
+
--- a/3rdparty/nvtt/bc7/avpcl.cpp
+++ b/3rdparty/nvtt/bc7/avpcl.cpp
@ -0,0 +1,264 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// the avpcl compressor and decompressor
+
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.inl"
+#include <cstring>
+#include <float.h>
+
+using namespace nv;
+using namespace AVPCL;
+
+// global flags
+// Definitions of the encoder-wide switches declared extern in avpcl.h; all default to off.
+// NOTE(review): flag_premult presumably selects the premultiplied-alpha error metric -- confirm in the per-mode encoders.
+bool AVPCL::flag_premult = false;
+// When either of the next two is set, the Utils::metric* routines scale the per-channel
+// error by a fixed weight triple before squaring; flag_nonuniform wins if both are set.
+bool AVPCL::flag_nonuniform = false;
+bool AVPCL::flag_nonuniform_ati = false;
+
+// global mode
+bool AVPCL::mode_rgb = false;		// true if image had constant alpha = 255
+
+// Compress tile t by running every mode encoder in turn and keeping the
+// output of the one that reports the smallest error. `block` must have room
+// for AVPCL::BLOCKSIZE bytes; it is only written when an encoder reports an
+// error strictly below FLT_MAX.
+void AVPCL::compress(const Tile &t, char *block)
+{
+	typedef float (*ModeEncoder)(const Tile &, char *);
+	static const ModeEncoder encoders[] =
+	{
+		AVPCL::compress_mode0,
+		AVPCL::compress_mode1,
+		AVPCL::compress_mode2,
+		AVPCL::compress_mode3,
+		AVPCL::compress_mode4,
+		AVPCL::compress_mode5,
+		AVPCL::compress_mode6,
+		AVPCL::compress_mode7,
+	};
+	const int nencoders = int(sizeof(encoders) / sizeof(encoders[0]));
+
+	char candidate[AVPCL::BLOCKSIZE];
+	float lowest = FLT_MAX;
+
+	// modes are tried in ascending order; on a tie the earlier mode is kept
+	for (int mode = 0; mode < nencoders; ++mode)
+	{
+		const float mse = encoders[mode](t, candidate);
+		if (mse < lowest)
+		{
+			lowest = mse;
+			memcpy(block, candidate, AVPCL::BLOCKSIZE);
+		}
+	}
+}
+
+/*
+static int getbit(char *b, int start)
+{
+	if (start < 0 || start >= 128) return 0; // out of range
+
+	int ix = start >> 3;
+	return (b[ix] & (1 << (start & 7))) != 0;
+}
+
+static int getbits(char *b, int start, int len)
+{
+	int out = 0;
+	for (int i=0; i<len; ++i)
+		out |= getbit(b, start+i) << i;
+	return out;
+}
+
+static void setbit(char *b, int start, int bit)
+{
+	if (start < 0 || start >= 128) return; // out of range
+
+	int ix = start >> 3;
+
+	if (bit & 1)
+		b[ix] |= (1 << (start & 7));
+	else
+		b[ix] &= ~(1 << (start & 7));
+}
+
+static void setbits(char *b, int start, int len, int bits)
+{
+	for (int i=0; i<len; ++i)
+		setbit(b, start+i, bits >> i);
+}
+*/
+
+// Decode one compressed block into tile t. The mode is read from the first
+// byte of the block (see getmode) and the matching per-mode decoder is
+// invoked; a reserved mode yields an all-zero tile.
+void AVPCL::decompress(const char *cblock, Tile &t)
+{
+	typedef void (*ModeDecoder)(const char *, Tile &);
+	static const ModeDecoder decoders[8] =
+	{
+		AVPCL::decompress_mode0,
+		AVPCL::decompress_mode1,
+		AVPCL::decompress_mode2,
+		AVPCL::decompress_mode3,
+		AVPCL::decompress_mode4,
+		AVPCL::decompress_mode5,
+		AVPCL::decompress_mode6,
+		AVPCL::decompress_mode7,
+	};
+
+	// the decoders are handed a private copy of the caller's block
+	char block[AVPCL::BLOCKSIZE];
+	memcpy(block, cblock, AVPCL::BLOCKSIZE);
+
+	const int mode = getmode(block);
+
+	if (mode >= 0 && mode <= 7)
+	{
+		decoders[mode](block, t);
+	}
+	else if (mode == 8)
+	{
+		// return a black tile if you get a reserved mode
+		for (int y=0; y<Tile::TILE_H; ++y)
+			for (int x=0; x<Tile::TILE_W; ++x)
+				t.data[y][x].set(0, 0, 0, 0);
+	}
+	else
+	{
+		nvUnreachable();
+	}
+}
+
+/*
+void AVPCL::compress(string inf, string avpclf, string errf)
+{
+	Array2D<RGBA> pixels;
+	int w, h;
+	char block[AVPCL::BLOCKSIZE];
+
+	Targa::read(inf, pixels, w, h);
+	FILE *avpclfile = fopen(avpclf.c_str(), "wb");
+	if (avpclfile == NULL) throw "Unable to open .avpcl file for write";
+	FILE *errfile = NULL;
+	if (errf != "")
+	{
+		errfile = fopen(errf.c_str(), "wb");
+		if (errfile == NULL) throw "Unable to open error file for write";
+	}
+
+	// Look at alpha channel and override the premult flag if alpha is constant (but only if premult is set)
+	if (AVPCL::flag_premult)
+	{
+		if (AVPCL::mode_rgb)
+		{
+			AVPCL::flag_premult = false;
+			cout << endl << "NOTE: Source image alpha is constant 255, turning off premultiplied-alpha error metric." << endl << endl;
+		}
+	}
+
+	// stuff for progress bar O.o
+	int ntiles = ((h+Tile::TILE_H-1)/Tile::TILE_H)*((w+Tile::TILE_W-1)/Tile::TILE_W);
+	int tilecnt = 0;
+	clock_t start, prev, cur;
+
+	start = prev = clock();
+
+	// convert to tiles and compress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = min(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			if ((tilecnt%100) == 0) { cur = clock(); printf("Progress %d of %d, %5.2f seconds per 100 tiles\r", tilecnt, ntiles, float(cur-prev)/CLOCKS_PER_SEC); fflush(stdout); prev = cur; }
+
+			int xsize = min(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			t.insert(pixels, x, y);
+
+			AVPCL::compress(t, block, errfile);
+			if (fwrite(block, sizeof(char), AVPCL::BLOCKSIZE, avpclfile) != AVPCL::BLOCKSIZE)
+				throw "File error on write";
+
+			// progress bar
+			++tilecnt;
+		}
+	}
+
+	cur = clock();
+	printf("\nTotal time to compress: %.2f seconds\n\n", float(cur-start)/CLOCKS_PER_SEC);		// advance to next line finally
+
+	if (fclose(avpclfile)) throw "Close failed on .avpcl file";
+	if (errfile && fclose(errfile)) throw "Close failed on error file";
+}
+
+static int str2int(std::string s) 
+{
+	int thing;
+	std::stringstream str (stringstream::in | stringstream::out);
+	str << s;
+	str >> thing;
+	return thing;
+}
+
+// avpcl file name is ...-w-h-RGB[A].avpcl, extract width and height
+static void extract(string avpclf, int &w, int &h, bool &mode_rgb)
+{
+	size_t n = avpclf.rfind('.', avpclf.length()-1);
+	size_t n1 = avpclf.rfind('-', n-1);
+	size_t n2 = avpclf.rfind('-', n1-1);
+	size_t n3 = avpclf.rfind('-', n2-1);
+	//	...-wwww-hhhh-RGB[A].avpcl
+	//     ^    ^    ^      ^
+	//     n3   n2   n1     n n3<n2<n1<n
+	string width = avpclf.substr(n3+1, n2-n3-1);
+	w = str2int(width);
+	string height = avpclf.substr(n2+1, n1-n2-1);
+	h = str2int(height);
+	string mode = avpclf.substr(n1+1, n-n1-1);
+	mode_rgb = mode == "RGB";
+}
+
+static int modehist[8];
+
+static void stats(char block[AVPCL::BLOCKSIZE])
+{
+	int m = AVPCL::getmode(block);
+	modehist[m]++;
+}
+
+static void printstats()
+{
+	printf("\nMode histogram: "); for (int i=0; i<8; ++i) { printf("%d,", modehist[i]); }
+	printf("\n");
+}
+
+void AVPCL::decompress(string avpclf, string outf)
+{
+	Array2D<RGBA> pixels;
+	int w, h;
+	char block[AVPCL::BLOCKSIZE];
+
+	extract(avpclf, w, h, AVPCL::mode_rgb);
+	FILE *avpclfile = fopen(avpclf.c_str(), "rb");
+	if (avpclfile == NULL) throw "Unable to open .avpcl file for read";
+	pixels.resizeErase(h, w);
+
+	// convert to tiles and decompress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = min(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			int xsize = min(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			if (fread(block, sizeof(char), AVPCL::BLOCKSIZE, avpclfile) != AVPCL::BLOCKSIZE)
+				throw "File error on read";
+
+			stats(block);	// collect statistics
+		
+			AVPCL::decompress(block, t);
+
+			t.extract(pixels, x, y);
+		}
+	}
+	if (fclose(avpclfile)) throw "Close failed on .avpcl file";
+
+	Targa::write(outf, pixels, w, h);
+
+	printstats();	// print statistics
+}
+*/
--- a/3rdparty/nvtt/bc7/avpcl.h
+++ b/3rdparty/nvtt/bc7/avpcl.h
@ -0,0 +1,99 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_H
+#define _AVPCL_H
+
+#include "tile.h"
+#include "bits.h"
+
+#define	DISABLE_EXHAUSTIVE	1	// define this if you don't want to spend a lot of time on exhaustive compression
+#define	USE_ZOH_INTERP		1	// use zoh interpolator, otherwise use exact avpcl interpolators
+#define	USE_ZOH_INTERP_ROUNDED 1	// use the rounded versions!
+
+namespace AVPCL {
+
+// number of regions a tile is split into for two- and three-region shapes
+static const int NREGIONS_TWO	= 2;
+static const int NREGIONS_THREE	= 3;
+
+// size of one compressed block, in bytes and in bits
+static const int BLOCKSIZE=16;
+static const int BITSIZE=128;
+
+// global flags
+// (defined in avpcl.cpp; the nonuniform flags are read by the Utils::metric* routines)
+extern bool flag_premult;
+extern bool flag_nonuniform;
+extern bool flag_nonuniform_ati;
+
+// global mode
+extern bool mode_rgb;		// true if image had constant alpha = 255
+
+// Top-level entry points: compress() tries every mode and keeps the lowest-error
+// block; decompress() dispatches on the mode encoded in the block.
+void compress(const Tile &t, char *block);
+void decompress(const char *block, Tile &t);
+
+// Per-mode encoders and decoders. compress() compares the float returned by each
+// compress_modeN as an error value (lower is better).
+float compress_mode0(const Tile &t, char *block);
+void decompress_mode0(const char *block, Tile &t);
+
+float compress_mode1(const Tile &t, char *block);
+void decompress_mode1(const char *block, Tile &t);
+
+float compress_mode2(const Tile &t, char *block);
+void decompress_mode2(const char *block, Tile &t);
+
+float compress_mode3(const Tile &t, char *block);
+void decompress_mode3(const char *block, Tile &t);
+
+float compress_mode4(const Tile &t, char *block);
+void decompress_mode4(const char *block, Tile &t);
+
+float compress_mode5(const Tile &t, char *block);
+void decompress_mode5(const char *block, Tile &t);
+
+float compress_mode6(const Tile &t, char *block);
+void decompress_mode6(const char *block, Tile &t);
+
+float compress_mode7(const Tile &t, char *block);
+void decompress_mode7(const char *block, Tile &t);
+
+// Read the mode prefix from a bit stream: the mode is the number of zero bits
+// consumed before the first one bit. At most eight bits are read; if all eight
+// are zero the result is 8 (reserved). The stream is left positioned just past
+// the last bit read.
+inline int getmode(Bits &in)
+{
+	for (int mode = 0; mode < 8; ++mode)
+	{
+		if (in.read(1))
+			return mode;
+	}
+	return 8;	// reserved
+}
+// Determine the mode of a compressed block without a bit stream: the mode is
+// the position of the lowest set bit in the block's first byte, or 8
+// (reserved) when that byte is zero.
+inline int getmode(const char *block)
+{
+	const int first = block[0];
+
+	for (int mode = 0; mode < 8; ++mode)
+	{
+		if (first & (1 << mode))
+			return mode;
+	}
+	return 8;	// reserved
+}
+
+}
+
+#endif
--- a/3rdparty/nvtt/bc7/avpcl_mode0.cpp
+++ b/3rdparty/nvtt/bc7/avpcl_mode0.cpp
--- a/3rdparty/nvtt/bc7/avpcl_mode1.cpp
+++ b/3rdparty/nvtt/bc7/avpcl_mode1.cpp
--- a/3rdparty/nvtt/bc7/avpcl_mode2.cpp
+++ b/3rdparty/nvtt/bc7/avpcl_mode2.cpp
--- a/3rdparty/nvtt/bc7/avpcl_mode3.cpp
+++ b/3rdparty/nvtt/bc7/avpcl_mode3.cpp
--- a/3rdparty/nvtt/bc7/avpcl_mode4.cpp
+++ b/3rdparty/nvtt/bc7/avpcl_mode4.cpp
--- a/3rdparty/nvtt/bc7/avpcl_mode5.cpp
+++ b/3rdparty/nvtt/bc7/avpcl_mode5.cpp
--- a/3rdparty/nvtt/bc7/avpcl_mode6.cpp
+++ b/3rdparty/nvtt/bc7/avpcl_mode6.cpp
--- a/3rdparty/nvtt/bc7/avpcl_mode7.cpp
+++ b/3rdparty/nvtt/bc7/avpcl_mode7.cpp
--- a/3rdparty/nvtt/bc7/avpcl_utils.cpp
+++ b/3rdparty/nvtt/bc7/avpcl_utils.cpp
@ -0,0 +1,389 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Utility and common routines
+
+#include "avpcl_utils.h"
+#include "avpcl.h"
+#include "nvmath/Vector.inl"
+#include <math.h>
+
+using namespace nv;
+using namespace AVPCL;
+
+// Fixed-point interpolation weights used by Utils::lerp: entry k is the weight for k/7
+// (resp. k/15) scaled so that 64 represents 1.0. lerp indexes them with i and denom-i.
+static const int denom7_weights[] = {0, 9, 18, 27, 37, 46, 55, 64};										// divided by 64
+static const int denom15_weights[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64};		// divided by 64
+
+// Integer interpolation between a and b at step i of denom.
+// With USE_ZOH_INTERP the result is a table-weighted sum in 6-bit fixed point
+// (bias is only range-checked); otherwise it is the plain biased integer lerp.
+int Utils::lerp(int a, int b, int i, int bias, int denom)
+{
+#ifdef	USE_ZOH_INTERP
+	nvAssert (denom == 3 || denom == 7 || denom == 15);
+	nvAssert (i >= 0 && i <= denom);
+	nvAssert (bias >= 0 && bias <= denom/2);
+	nvAssert (a >= 0 && b >= 0);
+
+#ifdef	USE_ZOH_INTERP_ROUNDED
+	const int round = 32;	// half of the 64 scale, i.e. round to nearest
+#else
+	const int round = 0;
+#endif
+
+	// thirds are looked up in the fifteenths table: k/3 == 5k/15
+	if (denom == 3)
+	{
+		denom = 15;
+		i *= 5;
+	}
+
+	const int *weights;
+	if (denom == 15)
+		weights = denom15_weights;
+	else if (denom == 7)
+		weights = denom7_weights;
+	else
+	{
+		nvUnreachable();
+		return 0;
+	}
+
+	return (a*weights[denom-i] + b*weights[i] + round) >> 6;
+#else
+	return (((a)*((denom)-i)+(b)*(i)+(bias))/(denom));		// simple exact interpolation
+#endif
+}
+
+// Floating-point counterpart of the integer lerp: blends vectors a and b at
+// step i of denom using the same weight tables, divided by 64 in floating point.
+Vector4 Utils::lerp(Vector4::Arg a, Vector4::Arg b, int i, int bias, int denom)
+{
+#ifdef	USE_ZOH_INTERP
+	nvAssert (denom == 3 || denom == 7 || denom == 15);
+	nvAssert (i >= 0 && i <= denom);
+	nvAssert (bias >= 0 && bias <= denom/2);
+//	nvAssert (a >= 0 && b >= 0);
+
+	// thirds are looked up in the fifteenths table: k/3 == 5k/15
+	if (denom == 3)
+	{
+		denom = 15;
+		i *= 5;
+	}
+
+	const int *weights;
+	if (denom == 15)
+		weights = denom15_weights;
+	else if (denom == 7)
+		weights = denom7_weights;
+	else
+	{
+		nvUnreachable();
+		return Vector4(0);
+	}
+
+	// no rounding term here: the division is done in floating point
+	const float wa = float(weights[denom-i]);
+	const float wb = float(weights[i]);
+	return (a*wa + b*wb) / 64.0f;
+#else
+	return (((a)*((denom)-i)+(b)*(i)+(bias))/(denom));		// simple exact interpolation
+#endif
+}
+
+
+// Expand a prec-bit quantized value q to 8 bits.
+int Utils::unquantize(int q, int prec)
+{
+	nvAssert (prec > 3);	// we only want to do one replicate
+
+#ifdef USE_ZOH_QUANT
+	if (prec >= 8)
+		return q;
+	if (q == 0)
+		return 0;
+	if (q == ((1<<prec)-1))
+		return 255;
+	return (q * 256 + 128) >> prec;
+#else
+	// avpcl unquantizer -- bit replicate: q fills the top prec bits and its
+	// own top (8-prec) bits are copied into the low bits that remain
+	const int upshift = 8-prec;
+	const int downshift = 2*prec-8;
+	return (q << upshift) | (q >> downshift);
+#endif
+}
+
+// Quantize an 8-bit-range value to prec bits, choosing the code whose
+// unquantized value lies closest to the input.
+int Utils::quantize(float value, int prec)
+{
+	nvAssert (prec > 3);	// we only want to do one replicate
+
+	// round to the nearest integer first
+	const int rounded = (int)floor(value + 0.5f);
+	nvAssert (rounded <= 255);
+
+#ifdef USE_ZOH_QUANT
+	const int code = (prec >= 8) ? rounded : (rounded << prec) / 256;
+#else
+	// avpcl quantizer -- scale properly for best possible bit-replicated result
+	const int maxcode = (1<<prec)-1;
+	const int code = (rounded * maxcode + 127)/255;
+#endif
+
+	nvAssert (code >= 0 && code < (1 << prec));
+
+	return code;
+}
+
+// Squared error between two RGBA vectors. When a nonuniform flag is set the
+// x/y/z differences are scaled by per-channel weights first; w is never scaled.
+float Utils::metric4(Vector4::Arg a, Vector4::Arg b)
+{
+	Vector4 diff = a - b;
+
+	if (!(AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati))
+		return lengthSquared(diff);
+
+	// flag_nonuniform takes precedence over flag_nonuniform_ati
+	const bool first = AVPCL::flag_nonuniform;
+	const float wx = first ? 0.299f : 0.3086f;
+	const float wy = first ? 0.587f : 0.6094f;
+	const float wz = first ? 0.114f : 0.0820f;
+
+	diff.x *= wx;
+	diff.y *= wy;
+	diff.z *= wz;
+
+	return lengthSquared(diff);
+}
+
+// Squared error between two 3-channel vectors. When a nonuniform flag is set
+// the differences are scaled per channel; the channel named by rotatemode as
+// holding alpha gets weight 1 instead of its color weight.
+float Utils::metric3(Vector3::Arg a, Vector3::Arg b, int rotatemode)
+{
+	Vector3 diff = a - b;
+
+	if (!(AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati))
+		return lengthSquared(diff);
+
+	// flag_nonuniform takes precedence over flag_nonuniform_ati
+	const bool first = AVPCL::flag_nonuniform;
+	float wx = first ? 0.299f : 0.3086f;
+	float wy = first ? 0.587f : 0.6094f;
+	float wz = first ? 0.114f : 0.0820f;
+
+	// the slot that carries alpha after rotation is not down-weighted
+	switch(rotatemode)
+	{
+	case ROTATEMODE_RGBA_RGBA:
+		break;
+	case ROTATEMODE_RGBA_AGBR:
+		wx = 1.0f;
+		break;
+	case ROTATEMODE_RGBA_RABG:
+		wy = 1.0f;
+		break;
+	case ROTATEMODE_RGBA_RGAB:
+		wz = 1.0f;
+		break;
+	default:
+		nvUnreachable();
+	}
+
+	diff.x *= wx;
+	diff.y *= wy;
+	diff.z *= wz;
+
+	return lengthSquared(diff);
+}
+
+// Squared error between two scalar channel values.
+// a, b       -- the two values to compare
+// rotatemode -- one of the ROTATEMODE_RGBA_* constants; names the color
+//               channel the scalar was swapped with, which decides the weight
+//               used when a nonuniform flag is set (1.0 for no rotation)
+// Returns (weight * (a - b))^2, with weight 1 when no nonuniform flag is set.
+float Utils::metric1(const float a, const float b, int rotatemode)
+{
+	float err = a - b;
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else /*if (AVPCL::flag_nonuniform_ati)*/
+		{
+			// plain else (as in metric4) so the weights are assigned on every path
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// adjust weights based on rotatemode
+		// awt starts at 1 so an unexpected rotatemode never reads an
+		// uninitialized weight should nvUnreachable() fall through
+		float awt = 1.0f;
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: awt = 1.0f; break;
+		case ROTATEMODE_RGBA_AGBR: awt = rwt; break;
+		case ROTATEMODE_RGBA_RABG: awt = gwt; break;
+		case ROTATEMODE_RGBA_RGAB: awt = bwt; break;
+		default: nvUnreachable();
+		}
+
+		// weigh the components
+		err *= awt;
+	}
+
+	return err * err;
+}
+
+// Multiply channel value r by alpha a, both on a 0..255 scale, with rounding:
+// (r*a + 127) / 255 in integer arithmetic.
+float Utils::premult(float r, float a)
+{
+	// note that the args are really integers stored in floats
+	const int ri = int(r);
+	const int ai = int(a);
+
+	nvAssert ((ri==r) && (ai==a));
+
+	const int scaled = (ri*ai + 127)/255;
+	return float(scaled);
+}
+
+// Premultiply the x, y and z components of rgba in place by its w component.
+static void premult4(Vector4& rgba)
+{
+	const float alpha = rgba.w;
+
+	rgba.x = Utils::premult(rgba.x, alpha);
+	rgba.y = Utils::premult(rgba.y, alpha);
+	rgba.z = Utils::premult(rgba.z, alpha);
+}
+
+// Premultiply all three components of rgb in place by the separate alpha a.
+static void premult3(Vector3& rgb, float a)
+{
+	const float px = Utils::premult(rgb.x, a);
+	const float py = Utils::premult(rgb.y, a);
+	const float pz = Utils::premult(rgb.z, a);
+
+	rgb.x = px;
+	rgb.y = py;
+	rgb.z = pz;
+}
+
+// Like metric4, but the color channels of both inputs are premultiplied by
+// their own w component before the difference is taken.
+float Utils::metric4premult(Vector4::Arg a, Vector4::Arg b)
+{
+	Vector4 pm0 = a;
+	Vector4 pm1 = b;
+	premult4(pm0);
+	premult4(pm1);
+
+	Vector4 diff = pm0 - pm1;
+
+	if (!(AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati))
+		return lengthSquared(diff);
+
+	// flag_nonuniform takes precedence over flag_nonuniform_ati
+	const bool first = AVPCL::flag_nonuniform;
+	const float wx = first ? 0.299f : 0.3086f;
+	const float wy = first ? 0.587f : 0.6094f;
+	const float wz = first ? 0.114f : 0.0820f;
+
+	diff.x *= wx;
+	diff.y *= wy;
+	diff.z *= wz;
+
+	return lengthSquared(diff);
+}
+
+// Squared error between two RGB vectors after each has been premultiplied by
+// its separately supplied alpha (a0 for rgb0, a1 for rgb1).
+float Utils::metric3premult_alphaout(Vector3::Arg rgb0, float a0, Vector3::Arg rgb1, float a1)
+{
+	Vector3 pm0 = rgb0;
+	Vector3 pm1 = rgb1;
+	premult3(pm0, a0);
+	premult3(pm1, a1);
+
+	Vector3 diff = pm0 - pm1;
+
+	if (!(AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati))
+		return lengthSquared(diff);
+
+	// flag_nonuniform takes precedence over flag_nonuniform_ati
+	const bool first = AVPCL::flag_nonuniform;
+	const float wx = first ? 0.299f : 0.3086f;
+	const float wy = first ? 0.587f : 0.6094f;
+	const float wz = first ? 0.114f : 0.0820f;
+
+	diff.x *= wx;
+	diff.y *= wy;
+	diff.z *= wz;
+
+	return lengthSquared(diff);
+}
+
+// Squared error between two 3-channel vectors in which one component holds
+// alpha (chosen by rotatemode). The other two components of each vector are
+// premultiplied by that vector's alpha component before differencing.
+float Utils::metric3premult_alphain(Vector3::Arg rgb0, Vector3::Arg rgb1, int rotatemode)
+{
+	Vector3 pm0 = rgb0;
+	Vector3 pm1 = rgb1;
+
+	if (rotatemode == ROTATEMODE_RGBA_AGBR)
+	{
+		// alpha is carried in x
+		pm0.y = premult(pm0.y, pm0.x);
+		pm0.z = premult(pm0.z, pm0.x);
+		pm1.y = premult(pm1.y, pm1.x);
+		pm1.z = premult(pm1.z, pm1.x);
+	}
+	else if (rotatemode == ROTATEMODE_RGBA_RABG)
+	{
+		// alpha is carried in y
+		pm0.x = premult(pm0.x, pm0.y);
+		pm0.z = premult(pm0.z, pm0.y);
+		pm1.x = premult(pm1.x, pm1.y);
+		pm1.z = premult(pm1.z, pm1.y);
+	}
+	else if (rotatemode == ROTATEMODE_RGBA_RGAB)
+	{
+		// alpha is carried in z
+		pm0.x = premult(pm0.x, pm0.z);
+		pm0.y = premult(pm0.y, pm0.z);
+		pm1.x = premult(pm1.x, pm1.z);
+		pm1.y = premult(pm1.y, pm1.z);
+	}
+	else
+	{
+		// ROTATEMODE_RGBA_RGBA lands here too: this function isn't supposed
+		// to be called for that rotatemode
+		nvUnreachable();
+	}
+
+	Vector3 diff = pm0 - pm1;
+
+	if (!(AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati))
+		return lengthSquared(diff);
+
+	// flag_nonuniform takes precedence over flag_nonuniform_ati
+	const bool first = AVPCL::flag_nonuniform;
+	const float wx = first ? 0.299f : 0.3086f;
+	const float wy = first ? 0.587f : 0.6094f;
+	const float wz = first ? 0.114f : 0.0820f;
+
+	diff.x *= wx;
+	diff.y *= wy;
+	diff.z *= wz;
+
+	return lengthSquared(diff);
+}
+
+// Squared error between two scalar channel values after each is premultiplied
+// by its own alpha.
+// rgb0, a0   -- first channel value and its alpha
+// rgb1, a1   -- second channel value and its alpha
+// rotatemode -- one of the ROTATEMODE_RGBA_* constants; selects the weight
+//               applied when a nonuniform flag is set (1.0 for no rotation)
+// Returns (weight * (premult(rgb0,a0) - premult(rgb1,a1)))^2.
+float Utils::metric1premult(float rgb0, float a0, float rgb1, float a1, int rotatemode)
+{
+	float err = premult(rgb0, a0) - premult(rgb1, a1);
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else /*if (AVPCL::flag_nonuniform_ati)*/
+		{
+			// plain else (as in metric4premult) so the weights are assigned on every path
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// adjust weights based on rotatemode
+		// awt starts at 1 so an unexpected rotatemode never reads an
+		// uninitialized weight should nvUnreachable() fall through
+		float awt = 1.0f;
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: awt = 1.0f; break;
+		case ROTATEMODE_RGBA_AGBR: awt = rwt; break;
+		case ROTATEMODE_RGBA_RABG: awt = gwt; break;
+		case ROTATEMODE_RGBA_RGAB: awt = bwt; break;
+		default: nvUnreachable();
+		}
+
+		// weigh the components
+		err *= awt;
+	}
+
+	return err * err;
+}
--- a/3rdparty/nvtt/bc7/avpcl_utils.h
+++ b/3rdparty/nvtt/bc7/avpcl_utils.h
@ -0,0 +1,61 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// utility class holding common routines
+#ifndef _AVPCL_UTILS_H
+#define _AVPCL_UTILS_H
+
+#include "nvmath/Vector.h"
+
+namespace AVPCL {
+
+// Sign-extend the low nb bits of x: if bit nb-1 is set, every bit from nb
+// upward is set as well; otherwise x is returned unchanged.
+// nb must be in 1..31. The mask is built with unsigned shifts because
+// left-shifting a negative int (the former (~0)<<nb) is undefined before C++20.
+inline int SIGN_EXTEND(int x, int nb)
+{
+	const unsigned int signbit = 1u << (nb - 1);
+	if ((unsigned int)(x) & signbit)
+		return x | int(~0u << nb);
+	return x;
+}
+
+// Index mode: a 1-bit selector.
+// NOTE(review): the names suggest it chooses whether alpha gets the 3-bit or the 2-bit indices -- confirm in the mode encoders.
+static const int INDEXMODE_BITS				= 1;		// 2 different index modes
+static const int NINDEXMODES				= (1<<(INDEXMODE_BITS));
+static const int INDEXMODE_ALPHA_IS_3BITS	= 0;
+static const int INDEXMODE_ALPHA_IS_2BITS	= 1;
+
+// Rotate mode: a 2-bit selector naming which color channel alpha has been swapped with.
+// Utils::metric3premult_alphain treats x, y or z as the alpha slot for AGBR, RABG and RGAB respectively.
+static const int ROTATEMODE_BITS		= 2;		// 4 different rotate modes
+static const int NROTATEMODES			= (1<<(ROTATEMODE_BITS));
+static const int ROTATEMODE_RGBA_RGBA	= 0;
+static const int ROTATEMODE_RGBA_AGBR	= 1;
+static const int ROTATEMODE_RGBA_RABG	= 2;
+static const int ROTATEMODE_RGBA_RGAB	= 3;
+
+// Static helper routines shared by the per-mode encoders: error metrics,
+// endpoint quantization and index interpolation. Holds no state of its own;
+// the metrics read the global AVPCL::flag_nonuniform* switches.
+class Utils
+{
+public:
+	// error metrics
+	// Each returns a squared error. When a nonuniform flag is set, color channels are
+	// scaled by fixed weights first; rotatemode (a ROTATEMODE_RGBA_* value) tells the
+	// 3- and 1-channel variants which slot holds alpha.
+	static float metric4(nv::Vector4::Arg a, nv::Vector4::Arg b);
+	static float metric3(nv::Vector3::Arg a, nv::Vector3::Arg b, int rotatemode);
+	static float metric1(float a, float b, int rotatemode);
+
+	// Same metrics, computed on color values premultiplied by alpha.
+	// "alphaout": alpha is passed separately; "alphain": alpha is one of the three components.
+	static float metric4premult(nv::Vector4::Arg rgba0, nv::Vector4::Arg rgba1);
+	static float metric3premult_alphaout(nv::Vector3::Arg rgb0, float a0, nv::Vector3::Arg rgb1, float a1);
+	static float metric3premult_alphain(nv::Vector3::Arg rgb0, nv::Vector3::Arg rgb1, int rotatemode);
+	static float metric1premult(float rgb0, float a0, float rgb1, float a1, int rotatemode);
+
+	// (r*a + 127) / 255 on integer-valued floats
+	static float premult(float r, float a);
+
+	// quantization and unquantization
+	// prec is the number of bits of the quantized value; both routines assert prec > 3
+	static int unquantize(int q, int prec);
+	static int quantize(float value, int prec);
+
+	// lerping
+	// interpolate between a and b at step i of denom, where denom is 3, 7 or 15
+	static int lerp(int a, int b, int i, int bias, int denom);
+	static nv::Vector4 lerp(nv::Vector4::Arg a, nv::Vector4::Arg b, int i, int bias, int denom);
+};
+
+}
+
+#endif
--- a/3rdparty/nvtt/bc7/bits.h
+++ b/3rdparty/nvtt/bc7/bits.h
@ -0,0 +1,76 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_BITS_H
+#define _AVPCL_BITS_H
+
+// read/write a bitstream
+
+#include "nvcore/Debug.h"
+
+namespace AVPCL {
+
+// Sequential bit-level reader/writer over a caller-owned byte buffer.
+// Bits are addressed LSB-first within each byte. A stream is either writable
+// (built from char*) or read-only (built from const char*); the buffer is
+// never copied or freed by this class.
+class Bits
+{
+public:
+
+	// Writable stream over `data` holding at most `maxdatabits` bits; starts empty.
+	Bits(char *data, int maxdatabits)
+		: bptr(0), bend(0), bits(data), cbits(data), maxbits(maxdatabits), readonly(0)
+	{
+		nvAssert (data && maxdatabits > 0);
+	}
+
+	// Read-only stream over `data`, of which `availdatabits` bits are readable.
+	Bits(const char *data, int availdatabits)
+		: bptr(0), bend(availdatabits), bits(0), cbits(data), maxbits(availdatabits), readonly(1)
+	{
+		nvAssert (data && availdatabits > 0);
+	}
+
+	// Append the low `nbits` bits of `value`, least significant bit first.
+	void write(int value, int nbits) {
+		nvAssert (nbits >= 0 && nbits < 32);
+		nvAssert (sizeof(int)>= 4);
+		for (int i=0; i<nbits; ++i)
+			writeone(value>>i);
+	}
+
+	// Read `nbits` bits, least significant bit first; bits past the end read as 0.
+	int read(int nbits) { 
+		nvAssert (nbits >= 0 && nbits < 32);
+		nvAssert (sizeof(int)>= 4);
+		int out = 0;
+		for (int i=0; i<nbits; ++i)
+			out |= readone() << i;
+		return out;
+	}
+
+	// Current bit position, reposition, and number of valid bits in the stream.
+	int getptr() const { return bptr; }
+	void setptr(int ptr) { nvAssert (ptr >= 0 && ptr < maxbits); bptr = ptr; }
+	int getsize() const { return bend; }
+
+private:
+	int	bptr;		// next bit to read
+	int bend;		// last written bit + 1
+	char *bits;		// ptr to user bit stream (null for a read-only stream)
+	const char *cbits;	// ptr to const user bit stream (always valid, used for reads)
+	int maxbits;	// max size of user bit stream
+	char readonly;	// 1 if this is a read-only stream
+
+	// Fetch the bit at bptr and advance; returns 0 without advancing at end of stream.
+	int readone() {
+		nvAssert (bptr < bend);
+		if (bptr >= bend) return 0;
+		int bit = cbits[bptr>>3] & (1 << (bptr & 7));
+		++bptr;
+		return bit != 0;
+	}
+
+	// Store the low bit of `bit` at bptr and advance, growing bend when writing past it.
+	// Writes to a read-only stream or beyond maxbits are dropped.
+	void writeone(int bit) {
+		nvAssert (!readonly); // "Writing a read-only bit stream"
+		nvAssert (bptr < maxbits);
+		if (readonly || bptr >= maxbits) return;
+		if (bit&1)
+			bits[bptr>>3] |= 1 << (bptr & 7);
+		else
+			bits[bptr>>3] &= ~(1 << (bptr & 7));
+		if (bptr++ >= bend) bend = bptr;
+	}
+};
+
+}
+
+#endif
--- a/3rdparty/nvtt/bc7/endpts.h
+++ b/3rdparty/nvtt/bc7/endpts.h
@ -0,0 +1,81 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_ENDPTS_H
+#define _AVPCL_ENDPTS_H
+
+// endpoint definitions and routines to search through endpoint space
+
+#include "nvmath/Vector.h"
+
+namespace AVPCL {
+
+// channel counts and the index of each channel within the A[] / B[] arrays below
+static const int NCHANNELS_RGB	= 3;
+static const int NCHANNELS_RGBA	= 4;
+static const int CHANNEL_R		= 0;
+static const int CHANNEL_G		= 1;
+static const int CHANNEL_B		= 2;
+static const int CHANNEL_A		= 3;
+
+// A pair of unquantized (floating-point) endpoints, A and B.
+struct FltEndpts
+{
+	nv::Vector4	A;
+	nv::Vector4	B;
+};
+
+// Integer RGB endpoint pair.
+struct IntEndptsRGB
+{
+	int		A[NCHANNELS_RGB];
+	int		B[NCHANNELS_RGB];
+};
+
+// Integer RGB endpoint pair plus a single extra low bit shared by both endpoints.
+struct IntEndptsRGB_1
+{
+	int		A[NCHANNELS_RGB];
+	int		B[NCHANNELS_RGB];
+	int		lsb;				// shared lsb for A and B
+};
+
+// Integer RGB endpoint pair plus one extra low bit per endpoint.
+struct IntEndptsRGB_2
+{
+	int		A[NCHANNELS_RGB];
+	int		B[NCHANNELS_RGB];
+	int		a_lsb;				// lsb for A
+	int		b_lsb;				// lsb for B
+};
+
+
+// Integer RGBA endpoint pair.
+struct IntEndptsRGBA
+{
+	int		A[NCHANNELS_RGBA];
+	int		B[NCHANNELS_RGBA];
+};
+
+// Integer RGBA endpoint pair plus one extra low bit per endpoint.
+struct IntEndptsRGBA_2
+{
+	int		A[NCHANNELS_RGBA];
+	int		B[NCHANNELS_RGBA];
+	int		a_lsb;				// lsb for A
+	int		b_lsb;				// lsb for B
+};
+
+// Integer RGBA endpoint pair where the per-endpoint extra low bit covers only the RGB channels.
+struct IntEndptsRGBA_2a
+{
+	int		A[NCHANNELS_RGBA];
+	int		B[NCHANNELS_RGBA];
+	int		a_lsb;				// lsb for RGB channels of A
+	int		b_lsb;				// lsb for RGB channels of B
+};
+
+}
+
+#endif
--- a/3rdparty/nvtt/bc7/shapes_three.h
+++ b/3rdparty/nvtt/bc7/shapes_three.h
@ -0,0 +1,132 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef	_AVPCL_SHAPES_THREE_H
+#define _AVPCL_SHAPES_THREE_H
+
+// shapes for 3 regions
+
+#define NREGIONS 3
+#define NSHAPES 64
+#define SHAPEBITS 6
+
+// Region map for every 3-region partition shape. Each shape is a 4x4 grid of
+// region numbers (0, 1 or 2). The grids are laid out as drawn: each text row
+// holds one row of four neighbouring shapes (16 values), and each group of
+// four text rows (64 values) therefore holds four complete shapes. Use the
+// REGION macro below rather than indexing directly.
+static int shapes[NSHAPES*16] = 
+{
+0, 0, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   0, 2, 2, 2,   
+0, 0, 1, 1,   0, 0, 1, 1,   2, 0, 0, 1,   0, 0, 2, 2,   
+0, 2, 2, 1,   2, 2, 1, 1,   2, 2, 1, 1,   0, 0, 1, 1,   
+2, 2, 2, 2,   2, 2, 2, 1,   2, 2, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 2, 2,   0, 0, 1, 1,   
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 2, 2,   0, 0, 1, 1,   
+1, 1, 2, 2,   0, 0, 2, 2,   1, 1, 1, 1,   2, 2, 1, 1,   
+1, 1, 2, 2,   0, 0, 2, 2,   1, 1, 1, 1,   2, 2, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 2,   
+0, 0, 0, 0,   1, 1, 1, 1,   1, 1, 1, 1,   0, 0, 1, 2,   
+1, 1, 1, 1,   1, 1, 1, 1,   2, 2, 2, 2,   0, 0, 1, 2,   
+2, 2, 2, 2,   2, 2, 2, 2,   2, 2, 2, 2,   0, 0, 1, 2,   
+
+0, 1, 1, 2,   0, 1, 2, 2,   0, 0, 1, 1,   0, 0, 1, 1,   
+0, 1, 1, 2,   0, 1, 2, 2,   0, 1, 1, 2,   2, 0, 0, 1,   
+0, 1, 1, 2,   0, 1, 2, 2,   1, 1, 2, 2,   2, 2, 0, 0,   
+0, 1, 1, 2,   0, 1, 2, 2,   1, 2, 2, 2,   2, 2, 2, 0,   
+
+0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 0, 0,   0, 0, 2, 2,   
+0, 0, 1, 1,   0, 0, 1, 1,   1, 1, 2, 2,   0, 0, 2, 2,   
+0, 1, 1, 2,   2, 0, 0, 1,   1, 1, 2, 2,   0, 0, 2, 2,   
+1, 1, 2, 2,   2, 2, 0, 0,   1, 1, 2, 2,   1, 1, 1, 1,   
+
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 1, 1,   1, 1, 0, 0,   
+0, 2, 2, 2,   2, 2, 2, 1,   0, 1, 2, 2,   2, 2, 1, 0,   
+0, 2, 2, 2,   2, 2, 2, 1,   0, 1, 2, 2,   2, 2, 1, 0,   
+
+0, 1, 2, 2,   0, 0, 1, 2,   0, 1, 1, 0,   0, 0, 0, 0,   
+0, 1, 2, 2,   0, 0, 1, 2,   1, 2, 2, 1,   0, 1, 1, 0,   
+0, 0, 1, 1,   1, 1, 2, 2,   1, 2, 2, 1,   1, 2, 2, 1,   
+0, 0, 0, 0,   2, 2, 2, 2,   0, 1, 1, 0,   1, 2, 2, 1,   
+
+0, 0, 2, 2,   0, 1, 1, 0,   0, 0, 1, 1,   0, 0, 0, 0,   
+1, 1, 0, 2,   0, 1, 1, 0,   0, 1, 2, 2,   2, 0, 0, 0,   
+1, 1, 0, 2,   2, 0, 0, 2,   0, 1, 2, 2,   2, 2, 1, 1,   
+0, 0, 2, 2,   2, 2, 2, 2,   0, 0, 1, 1,   2, 2, 2, 1,   
+
+0, 0, 0, 0,   0, 2, 2, 2,   0, 0, 1, 1,   0, 1, 2, 0,   
+0, 0, 0, 2,   0, 0, 2, 2,   0, 0, 1, 2,   0, 1, 2, 0,   
+1, 1, 2, 2,   0, 0, 1, 2,   0, 0, 2, 2,   0, 1, 2, 0,   
+1, 2, 2, 2,   0, 0, 1, 1,   0, 2, 2, 2,   0, 1, 2, 0,   
+
+0, 0, 0, 0,   0, 1, 2, 0,   0, 1, 2, 0,   0, 0, 1, 1,   
+1, 1, 1, 1,   1, 2, 0, 1,   2, 0, 1, 2,   2, 2, 0, 0,   
+2, 2, 2, 2,   2, 0, 1, 2,   1, 2, 0, 1,   1, 1, 2, 2,   
+0, 0, 0, 0,   0, 1, 2, 0,   0, 1, 2, 0,   0, 0, 1, 1,   
+
+0, 0, 1, 1,   0, 1, 0, 1,   0, 0, 0, 0,   0, 0, 2, 2,   
+1, 1, 2, 2,   0, 1, 0, 1,   0, 0, 0, 0,   1, 1, 2, 2,   
+2, 2, 0, 0,   2, 2, 2, 2,   2, 1, 2, 1,   0, 0, 2, 2,   
+0, 0, 1, 1,   2, 2, 2, 2,   2, 1, 2, 1,   1, 1, 2, 2,   
+
+0, 0, 2, 2,   0, 2, 2, 0,   0, 1, 0, 1,   0, 0, 0, 0,   
+0, 0, 1, 1,   1, 2, 2, 1,   2, 2, 2, 2,   2, 1, 2, 1,   
+0, 0, 2, 2,   0, 2, 2, 0,   2, 2, 2, 2,   2, 1, 2, 1,   
+0, 0, 1, 1,   1, 2, 2, 1,   0, 1, 0, 1,   2, 1, 2, 1,   
+
+0, 1, 0, 1,   0, 2, 2, 2,   0, 0, 0, 2,   0, 0, 0, 0,   
+0, 1, 0, 1,   0, 1, 1, 1,   1, 1, 1, 2,   2, 1, 1, 2,   
+0, 1, 0, 1,   0, 2, 2, 2,   0, 0, 0, 2,   2, 1, 1, 2,   
+2, 2, 2, 2,   0, 1, 1, 1,   1, 1, 1, 2,   2, 1, 1, 2,   
+
+0, 2, 2, 2,   0, 0, 0, 2,   0, 1, 1, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   1, 1, 1, 2,   0, 1, 1, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   1, 1, 1, 2,   0, 1, 1, 0,   2, 1, 1, 2,   
+0, 2, 2, 2,   0, 0, 0, 2,   2, 2, 2, 2,   2, 1, 1, 2,   
+
+0, 1, 1, 0,   0, 0, 2, 2,   0, 0, 2, 2,   0, 0, 0, 0,   
+0, 1, 1, 0,   0, 0, 1, 1,   1, 1, 2, 2,   0, 0, 0, 0,   
+2, 2, 2, 2,   0, 0, 1, 1,   1, 1, 2, 2,   0, 0, 0, 0,   
+2, 2, 2, 2,   0, 0, 2, 2,   0, 0, 2, 2,   2, 1, 1, 2,   
+
+0, 0, 0, 2,   0, 2, 2, 2,   0, 1, 0, 1,   0, 1, 1, 1,   
+0, 0, 0, 1,   1, 2, 2, 2,   2, 2, 2, 2,   2, 0, 1, 1,   
+0, 0, 0, 2,   0, 2, 2, 2,   2, 2, 2, 2,   2, 2, 0, 1,   
+0, 0, 0, 1,   1, 2, 2, 2,   2, 2, 2, 2,   2, 2, 2, 0,
+};
+
+// Region number of pixel (x,y) in shape si: (si>>2) picks the group of four
+// text rows, (si&3) picks which of the four side-by-side grids, y picks the
+// text row within the group and x the column within the grid.
+#define	REGION(x,y,si)	shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16]
+
+// Three entries per shape, one per region, each a value in 0..15.
+// NOTE(review): presumably the pixel position within the 4x4 tile whose index is
+// stored with one bit fewer for that region -- confirm against the mode encoders.
+static int shapeindex_to_compressed_indices[NSHAPES*3] = 
+{
+	0, 3,15,  0, 3, 8,  0,15, 8,  0,15, 3,
+	0, 8,15,  0, 3,15,  0,15, 3,  0,15, 8,
+	0, 8,15,  0, 8,15,  0, 6,15,  0, 6,15,
+	0, 6,15,  0, 5,15,  0, 3,15,  0, 3, 8,
+
+	0, 3,15,  0, 3, 8,  0, 8,15,  0,15, 3,
+	0, 3,15,  0, 3, 8,  0, 6,15,  0,10, 8,
+	0, 5, 3,  0, 8,15,  0, 8, 6,  0, 6,10,
+	0, 8,15,  0, 5,15,  0,15,10,  0,15, 8,
+
+	0, 8,15,  0,15, 3,  0, 3,15,  0, 5,10,
+	0, 6,10,  0,10, 8,  0, 8, 9,  0,15,10,
+	0,15, 6,  0, 3,15,  0,15, 8,  0, 5,15,
+	0,15, 3,  0,15, 6,  0,15, 6,  0,15, 8,
+
+	0, 3,15,  0,15, 3,  0, 5,15,  0, 5,15,
+	0, 5,15,  0, 8,15,  0, 5,15,  0,10,15,
+	0, 5,15,  0,10,15,  0, 8,15,  0,13,15,
+	0,15, 3,  0,12,15,  0, 3,15,  0, 3, 8
+
+};
+// Entry for region `region` (0..2) of shape `si`.
+#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region)  shapeindex_to_compressed_indices[(si)*3+(region)]
+
+#endif
--- a/3rdparty/nvtt/bc7/shapes_two.h
+++ b/3rdparty/nvtt/bc7/shapes_two.h
@ -0,0 +1,133 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_SHAPES_TWO_H
+#define _AVPCL_SHAPES_TWO_H
+
+// shapes for two regions
+
+#define NREGIONS 2
+#define NSHAPES 64
+#define SHAPEBITS 6
+
+// Region id (0 or 1) of every texel of every two-region shape: NSHAPES shapes of 16 texels.
+// Layout (see REGION below): shapes are stored in groups of four placed side by side, so
+// each source row holds one 4-texel row of four consecutive shapes, and every blank-line
+// separated group of four rows covers four shape indices (0-3, 4-7, ...).
+static int shapes[NSHAPES*16] = 
+{
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   0, 0, 0, 1,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   1, 1, 1, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 0,   0, 1, 1, 1,   
+1, 0, 0, 0,   0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 1, 1,   
+1, 1, 1, 0,   0, 0, 0, 0,   1, 0, 0, 0,   0, 0, 0, 1,   
+1, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 0, 0, 0,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 1, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 0, 0,   1, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   0, 0, 0, 1,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 0,   1, 1, 0, 0,   0, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 1,   1, 0, 0, 1,   
+1, 1, 1, 0,   1, 1, 1, 1,   1, 0, 0, 0,   1, 0, 0, 1,   
+1, 0, 0, 0,   0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   0, 0, 1, 1,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   0, 0, 1, 1,   
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   1, 1, 0, 0,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 1, 1,   0, 1, 0, 1,   0, 1, 1, 0,   0, 1, 0, 1,   
+1, 1, 0, 0,   0, 1, 0, 1,   1, 0, 0, 1,   1, 0, 1, 0,   
+0, 0, 1, 1,   1, 0, 1, 0,   0, 1, 1, 0,   1, 0, 1, 0,   
+1, 1, 0, 0,   1, 0, 1, 0,   1, 0, 0, 1,   0, 1, 0, 1,   
+
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   1, 0, 1, 1,   
+1, 1, 0, 0,   1, 1, 0, 0,   0, 1, 0, 0,   1, 1, 0, 1,   
+1, 1, 1, 0,   1, 0, 0, 0,   1, 1, 0, 0,   1, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 0, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   0, 1, 1, 0,   
+0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   0, 0, 0, 0,   
+
+0, 1, 0, 0,   0, 0, 1, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+1, 1, 1, 0,   0, 1, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 1, 0, 0,   0, 0, 1, 0,   0, 1, 1, 1,   1, 1, 1, 0,   
+0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 0,   0, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 1, 1,   
+1, 1, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   1, 1, 0, 0,   
+0, 0, 1, 1,   1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   
+
+0, 1, 1, 0,   0, 1, 1, 0,   0, 1, 1, 1,   0, 0, 0, 1,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 1, 1, 0,   1, 0, 0, 0,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 0, 0, 0,   1, 1, 1, 0,   
+1, 0, 0, 1,   1, 0, 0, 1,   0, 0, 0, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+1, 1, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 0,   0, 1, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 1, 1, 1,   
+
+};
+
+// REGION(x,y,si): region id of texel (x,y) in shape index si.
+// ((si)&3)*4 picks the shape's 4-wide column inside its group of four, ((si)>>2)*64 skips
+// whole groups (4 rows of 16 ints each), and (x)+(y)*16 addresses the texel within the group.
+#define	REGION(x,y,si)	shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16]
+
+// Two ints per shape index (one per region), laid out four shapes per source row.
+// Read through SHAPEINDEX_TO_COMPRESSED_INDICES(si,region) below; the entry for region 0
+// is 0 for every shape.
+// NOTE(review): the values appear to be the per-region anchor texel indices of the BC7
+// two-subset partitions -- confirm against the format specification before editing.
+static int shapeindex_to_compressed_indices[NSHAPES*2] = 
+{
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+
+	0,15,  0, 2,  0, 8,  0, 2,
+	0, 2,  0, 8,  0, 8,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 8,  0, 8,  0, 2,  0, 2,
+
+	0,15,  0,15,  0, 6,  0, 8,
+	0, 2,  0, 8,  0,15,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 2,  0,15,  0,15,  0, 6,
+
+	0, 6,  0, 2,  0, 6,  0, 8,
+	0,15,  0,15,  0, 2,  0, 2,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0, 2,  0, 2,  0,15
+
+};
+// Entry for `region` (0..1) of shape index `si`.
+#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region)  shapeindex_to_compressed_indices[(si)*2+(region)]
+
+#endif
--- a/3rdparty/nvtt/bc7/tile.h
+++ b/3rdparty/nvtt/bc7/tile.h
@ -0,0 +1,41 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_TILE_H
+#define _AVPCL_TILE_H
+
+#include "nvmath/Vector.h"
+#include <math.h>
+#include "avpcl_utils.h"
+
+namespace AVPCL {
+
+// A tile of pixels extracted from an array: fixed TILE_H x TILE_W storage plus the
+// extent (size_x, size_y) that is actually in use.
+
+class Tile
+{
+public:
+	static const int TILE_H = 4;
+	static const int TILE_W = 4;
+	static const int TILE_TOTAL = TILE_H * TILE_W;
+	nv::Vector4 data[TILE_H][TILE_W];			// pixel values, indexed [row][column]
+	float importance_map[TILE_H][TILE_W];		// one weight per pixel -- NOTE(review): semantics defined by the callers
+	int	size_x, size_y;			// actual size of tile
+
+	// The pixel storage is left for the caller to fill in. The extent starts at zero so a
+	// default-constructed tile never exposes indeterminate sizes.
+	Tile() : size_x(0), size_y(0) {}
+	~Tile() {}
+	// Create a tile whose used extent is xs by ys.
+	Tile(int xs, int ys) : size_x(xs), size_y(ys) {}
+};
+
+}
+
+#endif
--- a/3rdparty/nvtt/nvcore/Array.inl
+++ b/3rdparty/nvtt/nvcore/Array.inl
@ -0,0 +1,437 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_ARRAY_INL
+#define NV_CORE_ARRAY_INL
+
+#include "array.h"
+
+#include "stream.h"
+#include "utils.h" // swap
+
+#include <string.h>	// memmove
+#include <new> // for placement new
+
+
+
+namespace nv 
+{
+    // Grow the array by one default-constructed element and return a reference to it.
+    template <typename T>
+    NV_FORCEINLINE T & Array<T>::append()
+    {
+        const uint tail = m_size;       // slot the new element will occupy
+
+        setArraySize(tail + 1);
+        construct_range(m_buffer, tail + 1, tail);
+
+        return m_buffer[tail];
+    }
+
+    // Push an element at the end of the vector.
+    // val must not refer to an element stored in this array (checked in debug builds only):
+    // growing may reallocate the buffer, which would leave the reference dangling.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::push_back( const T & val )
+    {
+#if 1
+        // Active implementation: forbid aliasing instead of paying for a defensive copy.
+        nvDebugCheck(&val < m_buffer || &val >= m_buffer+m_size);
+
+        uint old_size = m_size;
+        uint new_size = m_size + 1;
+
+        setArraySize(new_size);
+
+        // Constructs the element(s) in [old_size, new_size) from val.
+        construct_range(m_buffer, new_size, old_size, val);
+#else
+        // Disabled alternative kept for reference: tolerates aliasing by copying val
+        // before a reallocation.
+        uint new_size = m_size + 1;
+
+        if (new_size > m_capacity)
+        {
+            // @@ Is there any way to avoid this copy?
+            // @@ Can we create a copy without side effects? Ie. without calls to constructor/destructor. Use alloca + memcpy?
+            // @@ Assert instead of copy?
+            const T copy(val);	// create a copy in case value is inside of this array.
+
+            setArraySize(new_size);
+
+            new (m_buffer+new_size-1) T(copy);
+        }
+        else
+        {
+            m_size = new_size;
+            new(m_buffer+new_size-1) T(val);
+        }
+#endif // 0/1
+    }
+    // Alias of push_back(); the same aliasing restriction on val applies.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::pushBack( const T & val )
+    {
+        push_back(val);
+    }
+    // push_back() that returns the array itself so calls can be chained.
+    template <typename T>
+    NV_FORCEINLINE Array<T> & Array<T>::append( const T & val )
+    {
+        push_back(val);
+        return *this;
+    }
+
+    // Qt like push operator.
+    // Takes a non-const reference, so it cannot be used with temporaries or const objects.
+    template <typename T>
+    NV_FORCEINLINE Array<T> & Array<T>::operator<< ( T & t )
+    {
+        push_back(t);
+        return *this;
+    }
+
+    // Pop the element at the end of the vector.
+    // The array must not be empty; this is only checked in debug builds.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::pop_back()
+    {
+        nvDebugCheck( m_size > 0 );
+        resize( m_size - 1 );
+    }
+    // Pop the last `count` elements. count must not exceed the current size
+    // (debug-only check); shrinking is delegated to resize().
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::popBack(uint count)
+    {
+        nvDebugCheck(m_size >= count);
+        resize(m_size - count);
+    }
+
+    // Drop the first `count` elements and slide the remaining ones to the front.
+    // count must not exceed the current size (debug-only check).
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::popFront(uint count)
+    {
+        nvDebugCheck(m_size >= count);
+
+        // Removing everything is just a clear; nothing is left to move.
+        if (m_size == count) {
+            clear();
+            return;
+        }
+
+        const uint remaining = m_size - count;
+
+        destroy_range(m_buffer, 0, count);
+
+        // The survivors are relocated bitwise.
+        memmove(m_buffer, m_buffer + count, sizeof(T) * remaining);
+
+        m_size = remaining;
+    }
+
+
+    // Get the last element (const). The array must not be empty (debug-only check).
+    template <typename T>
+    NV_FORCEINLINE const T & Array<T>::back() const
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[m_size-1];
+    }
+
+    // Get the last element. The array must not be empty (debug-only check).
+    template <typename T>
+    NV_FORCEINLINE T & Array<T>::back()
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[m_size-1];
+    }
+
+    // Get the first element (const). The array must not be empty (debug-only check).
+    template <typename T>
+    NV_FORCEINLINE const T & Array<T>::front() const
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[0];
+    }
+
+    // Get the first element. The array must not be empty (debug-only check).
+    template <typename T>
+    NV_FORCEINLINE T & Array<T>::front()
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[0];
+    }
+
+    // Check if the given element is contained in the array.
+    // Forwards to find() with a NULL index pointer, i.e. the position is not requested.
+    template <typename T>
+    NV_FORCEINLINE bool Array<T>::contains(const T & e) const
+    {
+        return find(e, NULL);
+    }
+
+    // Return true if element found. Searches the whole array, [0, m_size).
+    template <typename T>
+    NV_FORCEINLINE bool Array<T>::find(const T & element, uint * indexPtr) const
+    {
+        return find(element, 0, m_size, indexPtr);
+    }
+
+    // Return true if element found within the given range.
+    // The search itself is done by the free function nv::find; the call is qualified so
+    // that it does not resolve to these member overloads.
+    // NOTE(review): presumably *indexPtr receives the position when it is non-NULL --
+    // confirm against nv::find.
+    template <typename T>
+    NV_FORCEINLINE bool Array<T>::find(const T & element, uint begin, uint end, uint * indexPtr) const
+    {
+        return ::nv::find(element, m_buffer, begin, end, indexPtr);
+    }
+
+
+    // Remove the element at the given index, closing the gap by moving every later
+    // element down one slot. This is an expensive operation!
+    template <typename T>
+    void Array<T>::removeAt(uint index)
+    {
+        nvDebugCheck(index >= 0 && index < m_size);
+
+        // A single-element array simply becomes empty.
+        if (m_size == 1) {
+            clear();
+            return;
+        }
+
+        T * const slot = m_buffer + index;
+        slot->~T();
+
+        // Elements after the removed one are relocated bitwise.
+        const uint trailing = m_size - 1 - index;
+        memmove(slot, slot + 1, sizeof(T) * trailing);
+
+        --m_size;
+    }
+
+    // Remove the first instance of the given element.
+    // Returns true when an element was found and removed, false otherwise.
+    template <typename T>
+    bool Array<T>::remove(const T & element)
+    {
+        uint where;
+        const bool found = find(element, &where);
+
+        if (found) {
+            removeAt(where);
+        }
+
+        return found;
+    }
+
+    // Insert the given element at the given index shifting all the elements up.
+    // index may equal the current size, in which case the element is appended.
+    template <typename T>
+    void Array<T>::insertAt(uint index, const T & val/*=T()*/)
+    {
+        nvDebugCheck( index >= 0 && index <= m_size );
+
+        const uint old_count = m_size;
+
+        setArraySize(old_count + 1);
+
+        // Open a hole at `index` by relocating the tail one slot up (bitwise move).
+        if (index < old_count) {
+            memmove(m_buffer+index+1, m_buffer+index, sizeof(T) * (old_count - index));
+        }
+
+        // Copy-construct into the newly opened slot.
+        new(m_buffer+index) T(val);
+    }
+
+    // Append every element of another array to this one.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::append(const Array<T> & other)
+    {
+        append(other.m_buffer, other.m_size);
+    }
+
+    // Append `count` elements copied from the plain array `other`.
+    template <typename T>
+    void Array<T>::append(const T other[], uint count)
+    {
+        if (count == 0) {
+            return;
+        }
+
+        const uint first_new = m_size;
+
+        setArraySize(first_new + count);
+
+        // Copy-construct each source element into the freshly reserved tail.
+        T * dst = m_buffer + first_new;
+        for (uint i = 0; i < count; i++, dst++) {
+            new(dst) T(other[i]);
+        }
+    }
+
+
+    // Remove the element at `index` without preserving order: it is swapped with the
+    // last element, which is then destroyed.
+    template <typename T> 
+    void Array<T>::replaceWithLast(uint index)
+    {
+        nvDebugCheck( index < m_size );
+
+        T & last = back();
+        nv::swap(m_buffer[index], last);        // @@ Is this OK when index == size-1?
+
+        last.~T();
+        --m_size;
+    }
+
+    // Resize the vector preserving existing elements.
+    // The three steps are order dependent: elements beyond new_size are destroyed while the
+    // old size is still known, then the size/capacity is changed, then any newly exposed
+    // slots are constructed.
+    template <typename T> 
+    void Array<T>::resize(uint new_size)
+    {
+        uint old_size = m_size;
+
+        // Destruct old elements (if we're shrinking).
+        destroy_range(m_buffer, new_size, old_size);
+
+        setArraySize(new_size);
+
+        // Call default constructors
+        construct_range(m_buffer, new_size, old_size);
+    }
+
+
+    // Resize the vector preserving existing elements and initializing the
+    // new ones with the given value.
+    // elem must not refer to an element stored in this array: growing may reallocate the
+    // buffer, which would leave the reference dangling (checked in debug builds only).
+    template <typename T>
+    void Array<T>::resize(uint new_size, const T & elem)
+    {
+        // Live elements occupy [m_buffer, m_buffer+m_size), so the one-past-the-end address
+        // is already outside the array; this is the same test push_back uses.
+        nvDebugCheck(&elem < m_buffer || &elem >= m_buffer+m_size);
+
+        uint old_size = m_size;
+
+        // Destruct old elements (if we're shrinking).
+        destroy_range(m_buffer, new_size, old_size);
+
+        setArraySize(new_size);
+
+        // Call copy constructors
+        construct_range(m_buffer, new_size, old_size, elem);
+    }
+
+    // Fill array with the given value.
+    template <typename T>
+    void Array<T>::fill(const T & elem)
+    {
+        // The call must be qualified: inside the class an unqualified `fill` names this
+        // one-argument member, which hides the free helper and suppresses ADL, so the
+        // three-argument call could never resolve. Same approach as find().
+        ::nv::fill(m_buffer, m_size, elem);
+    }
+
+    // Clear the buffer: destroy every element and set the size to zero.
+    // The allocation and m_capacity are left untouched.
+    template <typename T> 
+    NV_FORCEINLINE void Array<T>::clear()
+    {
+        // Debug-only sanity check on the buffer pointer.
+        nvDebugCheck(isValidPtr(m_buffer));
+
+        // Destruct old elements
+        destroy_range(m_buffer, 0, m_size);
+
+        m_size = 0;
+    }
+
+    // Shrink the allocated vector so that the capacity matches the current size.
+    // When the array is empty this frees the buffer (see setArrayCapacity).
+    template <typename T> 
+    NV_FORCEINLINE void Array<T>::shrink()
+    {
+        if (m_size < m_capacity) {
+            setArrayCapacity(m_size);
+        }
+    }
+
+    // Preallocate space for at least desired_size elements. Never reduces the capacity
+    // and never changes the size.
+    template <typename T> 
+    NV_FORCEINLINE void Array<T>::reserve(uint desired_size)
+    {
+        if (desired_size > m_capacity) {
+            setArrayCapacity(desired_size);
+        }
+    }
+
+    // Copy elements to this array. Resizes it if needed.
+    // The previous contents are destroyed before anything is read from `data`, so `data`
+    // must not point into this array's own storage.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::copy(const T * data, uint count)
+    {
+#if 1   // More simple, but maybe not be as efficient?
+        // Active implementation: destroy everything, resize, then construct from data.
+        destroy_range(m_buffer, 0, m_size);
+
+        setArraySize(count);
+
+        construct_range(m_buffer, count, 0, data);
+#else
+        // Disabled alternative kept for reference: assigns over the elements that already
+        // exist and only constructs the additional ones.
+        const uint old_size = m_size;
+
+        destroy_range(m_buffer, count, old_size);
+
+        setArraySize(count);
+
+        copy_range(m_buffer, data, old_size);
+
+        construct_range(m_buffer, count, old_size, data);
+#endif
+    }
+
+    // Assignment operator: replaces the contents with a copy of a's elements.
+    template <typename T>
+    NV_FORCEINLINE Array<T> & Array<T>::operator=( const Array<T> & a )
+    {
+        // copy() destroys the current elements before reading the source, so on
+        // self-assignment it would copy-construct from already destroyed objects.
+        if (this != &a) {
+            copy(a.m_buffer, a.m_size);
+        }
+        return *this;
+    }
+
+    // Hand the allocation over to the caller: the array becomes empty with no buffer,
+    // and the returned pointer is no longer released by this object.
+    template <typename T>
+    T * Array<T>::release() {
+        T * const detached = m_buffer;
+
+        m_size = 0;
+        m_capacity = 0;
+        m_buffer = NULL;
+
+        return detached;
+    }
+
+
+
+    // Change array size, growing the allocation when the new size does not fit.
+    // Only bookkeeping and storage are touched; no element is constructed or destroyed.
+    template <typename T> 
+    inline void Array<T>::setArraySize(uint new_size) {
+        m_size = new_size;
+
+        if (new_size <= m_capacity) {
+            return;
+        }
+
+        // The first allocation is exact; later ones add 25% of headroom.
+        const uint headroom = (m_capacity == 0) ? 0 : (new_size >> 2);
+
+        setArrayCapacity( new_size + headroom );
+    }
+
+    // Change array capacity. The capacity may never drop below the current size
+    // (debug-only check). A capacity of zero releases the buffer entirely.
+    template <typename T> 
+    inline void Array<T>::setArrayCapacity(uint new_capacity) {
+        nvDebugCheck(new_capacity >= m_size);
+
+        if (new_capacity != 0) {
+            // realloc the buffer
+            m_buffer = realloc<T>(m_buffer, new_capacity);
+        }
+        else if (m_buffer != NULL) {
+            // free the buffer.
+            free<T>(m_buffer);
+            m_buffer = NULL;
+        }
+
+        m_capacity = new_capacity;
+    }
+
+    // Array serialization: the element count followed by every element.
+    // The same operator handles both directions, depending on s.isLoading().
+    template <typename Typ> 
+    inline Stream & operator<< ( Stream & s, Array<Typ> & p )
+    {
+        if (!s.isLoading()) {
+            // Saving: the count goes first.
+            s << p.m_size;
+        }
+        else {
+            // Loading: read the count, then size the array so the loop below fills it.
+            uint size;
+            s << size;
+            p.resize( size );
+        }
+
+        for (uint i = 0; i < p.m_size; i++) {
+            s << p.m_buffer[i];
+        }
+
+        return s;
+    }
+
+    // Swap the members of the two given vectors.
+    // Only the buffer pointer, capacity and size are exchanged; no element is copied,
+    // constructed or destroyed.
+    template <typename Typ>
+    inline void swap(Array<Typ> & a, Array<Typ> & b)
+    {
+        nv::swap(a.m_buffer, b.m_buffer);
+        nv::swap(a.m_capacity, b.m_capacity);
+        nv::swap(a.m_size, b.m_size);
+    }
+
+
+} // nv namespace
+
+#endif // NV_CORE_ARRAY_INL
--- a/3rdparty/nvtt/nvcore/Debug.h
+++ b/3rdparty/nvtt/nvcore/Debug.h
@ -0,0 +1,216 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_DEBUG_H
+#define NV_CORE_DEBUG_H
+
+#include "nvcore.h"
+
+#include <stdarg.h> // va_list
+
+
+// Make sure we are using our assert.
+#undef assert
+
+#define NV_ABORT_DEBUG      1
+#define NV_ABORT_IGNORE     2
+#define NV_ABORT_EXIT       3
+
+// Disabled assertion. The expression is only used as the operand of sizeof, so it is
+// never evaluated (no side effects, no runtime cost) but must still compile, and
+// variables that appear only in assertions still count as used.
+#define nvNoAssert(exp) \
+    NV_MULTI_LINE_MACRO_BEGIN \
+    (void)sizeof(exp); \
+    NV_MULTI_LINE_MACRO_END
+
+#if NV_NO_ASSERT
+
+#   define nvAssert(exp) nvNoAssert(exp)
+#   define nvCheck(exp) nvNoAssert(exp)
+#   define nvDebugAssert(exp) nvNoAssert(exp)
+#   define nvDebugCheck(exp) nvNoAssert(exp)
+#   define nvDebugBreak() nvNoAssert(0)
+
+#else // NV_NO_ASSERT
+
+#   if NV_CC_MSVC
+        // @@ Does this work in msvc-6 and earlier?
+#       define nvDebugBreak()       __debugbreak()
+//#       define nvDebugBreak()        __asm { int 3 }
+#   elif NV_OS_ORBIS
+#       define nvDebugBreak()       __debugbreak()
+#   elif NV_CC_GNUC
+#       define nvDebugBreak()       __builtin_trap()
+#   else
+#       error "No nvDebugBreak()!"
+#   endif
+
+/*
+#   elif NV_CC_GNUC || NV_CPU_PPC && NV_OS_DARWIN
+        // @@ Use __builtin_trap() on GCC
+#       define nvDebugBreak()       __asm__ volatile ("trap")
+#   elif (NV_CC_GNUC || NV_CPU_X86 || NV_CPU_X86_64) && NV_OS_DARWIN
+#       define nvDebugBreak()       __asm__ volatile ("int3")
+#   elif NV_CC_GNUC || NV_CPU_X86 || NV_CPU_X86_64
+#       define nvDebugBreak()       __asm__ ( "int %0" : :"I"(3) )
+#   else
+#       include <signal.h>
+#       define nvDebugBreak()       raise(SIGTRAP)
+#   endif
+*/
+
+#define nvDebugBreakOnce() \
+    NV_MULTI_LINE_MACRO_BEGIN \
+    static bool firstTime = true; \
+    if (firstTime) { firstTime = false; nvDebugBreak(); } \
+    NV_MULTI_LINE_MACRO_END
+
+#define nvAssertMacro(exp) \
+    NV_MULTI_LINE_MACRO_BEGIN \
+    if (!(exp)) { \
+        if (nvAbort(#exp, __FILE__, __LINE__, __FUNC__) == NV_ABORT_DEBUG) { \
+            nvDebugBreak(); \
+        } \
+    } \
+    NV_MULTI_LINE_MACRO_END
+
+// GCC, LLVM need "##" before the __VA_ARGS__, MSVC doesn't care
+#define nvAssertMacroWithIgnoreAll(exp,...) \
+    NV_MULTI_LINE_MACRO_BEGIN \
+        static bool ignoreAll = false; \
+        if (!ignoreAll && !(exp)) { \
+            int result = nvAbort(#exp, __FILE__, __LINE__, __FUNC__, ##__VA_ARGS__); \
+            if (result == NV_ABORT_DEBUG) { \
+                nvDebugBreak(); \
+            } else if (result == NV_ABORT_IGNORE) { \
+                ignoreAll = true; \
+            } \
+        } \
+    NV_MULTI_LINE_MACRO_END
+
+// Interesting assert macro from Insomniac:
+// http://www.gdcvault.com/play/1015319/Developing-Imperfect-Software-How-to
+// Used as follows:
+// if (nvCheck(i < count)) {
+//     normal path
+// } else {
+//     fixup code.
+// }
+// This style of macro could be combined with __builtin_expect to let the compiler know failure is unlikely.
+//
+// The macro is an expression: it yields true when exp holds, or when exp fails and nvAbort
+// asks for the debugger (after breaking); it yields false for any other nvAbort result.
+// NOTE(review): in this header nvCheck is defined as nvAssertMacro, not nvCheckMacro, so
+// the `if (nvCheck(...))` usage shown above applies to nvCheckMacro itself.
+#define nvCheckMacro(exp) \
+    (\
+        (exp) ? true : ( \
+            (nvAbort(#exp, __FILE__, __LINE__, __FUNC__) == NV_ABORT_DEBUG) ? (nvDebugBreak(), true) : ( false ) \
+        ) \
+    )
+
+
+#define nvAssert(exp)    nvAssertMacro(exp)
+#define nvCheck(exp)     nvAssertMacro(exp)
+
+#if defined(_DEBUG)
+#   define nvDebugAssert(exp)   nvAssertMacro(exp)
+#   define nvDebugCheck(exp)    nvAssertMacro(exp)
+#else // _DEBUG
+#   define nvDebugAssert(exp)   nvNoAssert(exp)
+#   define nvDebugCheck(exp)    nvNoAssert(exp)
+#endif // _DEBUG
+
+#endif // NV_NO_ASSERT
+
+// Use nvAssume for very simple expressions only: nvAssume(0), nvAssume(value == true), etc.
+/*#if !defined(_DEBUG)
+#   if NV_CC_MSVC
+#       define nvAssume(exp)    __assume(exp)
+#   else
+#       define nvAssume(exp)    nvCheck(exp)
+#   endif
+#else
+#   define nvAssume(exp)    nvCheck(exp)
+#endif*/
+
+// nvUnreachable(): marks a code path that must never execute. Debug builds assert first;
+// every build then gives the compiler its "unreachable" hint.
+// The debug variants are wrapped in NV_MULTI_LINE_MACRO_BEGIN/END, like the other
+// multi-statement macros in this header, so the macro expands to a single statement.
+// Without the wrapper, `if (x) nvUnreachable();` would guard only the assert and leave
+// the unreachable hint unconditional.
+#if defined(_DEBUG)
+#  if NV_CC_MSVC
+#   define nvUnreachable() \
+        NV_MULTI_LINE_MACRO_BEGIN \
+        nvAssert(0 && "unreachable"); \
+        __assume(0); \
+        NV_MULTI_LINE_MACRO_END
+#  else
+#   define nvUnreachable() \
+        NV_MULTI_LINE_MACRO_BEGIN \
+        nvAssert(0 && "unreachable"); \
+        __builtin_unreachable(); \
+        NV_MULTI_LINE_MACRO_END
+#  endif
+#else
+#  if NV_CC_MSVC
+#   define nvUnreachable() __assume(0)
+#  else
+#   define nvUnreachable() __builtin_unreachable()
+#  endif
+#endif
+
+
+#define nvError(x)      nvAbort(x, __FILE__, __LINE__, __FUNC__)
+#define nvWarning(x)    nvDebugPrint("*** Warning %s/%d: %s\n", __FILE__, __LINE__, (x))
+
+#ifndef NV_DEBUG_PRINT
+#define NV_DEBUG_PRINT 1 //defined(_DEBUG)
+#endif
+
+#if NV_DEBUG_PRINT
+#define nvDebug(...)    nvDebugPrint(__VA_ARGS__)
+#else
+#if NV_CC_MSVC
+#define nvDebug(...)    __noop(__VA_ARGS__)
+#else
+#define nvDebug(...)    ((void)0) // Non-msvc platforms do not evaluate arguments?
+#endif
+#endif
+
+
+NVCORE_API int nvAbort(const char *exp, const char *file, int line, const char * func = NULL, const char * msg = NULL, ...) __attribute__((format (printf, 5, 6)));
+NVCORE_API void NV_CDECL nvDebugPrint( const char *msg, ... ) __attribute__((format (printf, 1, 2)));
+
+namespace nv
+{
+    // Heuristic sanity check for a pointer value, used by debug assertions.
+    // Returns false only for values that look like garbage; true does not prove that the
+    // pointer can be dereferenced. NULL is accepted on every path.
+    inline bool isValidPtr(const void * ptr) {
+    #if NV_CPU_X86_64
+        if (ptr == NULL) return true;
+        // Reject addresses in the first 64 KiB.
+        if (reinterpret_cast<uint64>(ptr) < 0x10000ULL) return false;
+        // NOTE(review): this upper bound looks like a Windows user-space limit; x86-64
+        // user-space addresses on other systems can be larger and would be rejected --
+        // confirm before relying on this check outside Windows.
+        if (reinterpret_cast<uint64>(ptr) >= 0x000007FFFFFEFFFFULL) return false;
+    #else
+        // Reject a handful of 32-bit fill patterns (presumably debug-runtime markers for
+        // uninitialized or freed memory -- TODO confirm).
+        // NOTE(review): this branch casts the pointer to a 32-bit integer, which compilers
+        // reject when pointers are 64 bits wide; verify on 64-bit non-x86 targets.
+	    if (reinterpret_cast<uint32>(ptr) == 0xcccccccc) return false;
+	    if (reinterpret_cast<uint32>(ptr) == 0xcdcdcdcd) return false;
+	    if (reinterpret_cast<uint32>(ptr) == 0xdddddddd) return false;
+	    if (reinterpret_cast<uint32>(ptr) == 0xffffffff) return false;
+    #endif
+        return true;
+    }
+
+    // Message handler interface.
+    // log() receives a printf-style format string together with its argument list.
+    // Installed through debug::setMessageHandler() declared below.
+    struct MessageHandler {
+        virtual void log(const char * str, va_list arg) = 0;
+        virtual ~MessageHandler() {}
+    };
+
+    // Assert handler interface.
+    // assertion() receives the failed expression text, its source location and an optional
+    // printf-style message with its argument list.
+    // NOTE(review): the return value is presumably one of the NV_ABORT_* codes that the
+    // assert macros compare the nvAbort() result against -- confirm in the implementation.
+    struct AssertHandler {
+        virtual int assertion(const char *exp, const char *file, int line, const char *func, const char *msg, va_list arg) = 0;
+        virtual ~AssertHandler() {}
+    };
+
+
+    // Debugging services exported by nvcore. Only declarations are visible here; the
+    // behaviour of each function is defined by the implementation file.
+    namespace debug
+    {
+        NVCORE_API void dumpInfo();
+        NVCORE_API void dumpCallstack( MessageHandler *messageHandler, int callstackLevelsToSkip = 0 );
+
+        // Install / remove the handlers declared above.
+        // NOTE(review): ownership of the handler objects is not visible here -- presumably
+        // they stay owned by the caller; confirm before deleting one that is installed.
+        NVCORE_API void setMessageHandler( MessageHandler * messageHandler );
+        NVCORE_API void resetMessageHandler();
+
+        NVCORE_API void setAssertHandler( AssertHandler * assertHanlder );
+        NVCORE_API void resetAssertHandler();
+
+        NVCORE_API void enableSigHandler(bool interactive);
+        NVCORE_API void disableSigHandler();
+
+        NVCORE_API bool isDebuggerPresent();
+        NVCORE_API bool attachToDebugger();
+
+        NVCORE_API void terminate(int code);
+    }
+
+} // nv namespace
+
+#endif // NV_CORE_DEBUG_H
--- a/3rdparty/nvtt/nvcore/array.h
+++ b/3rdparty/nvtt/nvcore/array.h
@ -0,0 +1,181 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_ARRAY_H
+#define NV_CORE_ARRAY_H
+
+/*
+This array class requires the elements to be relocatable; it uses memmove and realloc. Ideally I should be
+using swap, but I honestly don't care. The only thing that you should be aware of is that internal pointers
+(objects that store pointers into themselves) are not supported.
+
+Note also that push_back and resize do not support being passed a reference to an element that lives in
+the same container. This is forbidden to avoid an extra copy.
+*/
+
+
+#include "memory.h"
+#include "Debug.h"   // spelled exactly like the file name (nvcore/Debug.h) so it resolves on case-sensitive filesystems
+#include "foreach.h" // pseudoindex
+
+
+namespace nv 
+{
+    class Stream;
+
+    /**
+    * Replacement for std::vector that is easier to debug and provides
+    * some nice foreach enumerators.
+    *
+    * Elements live in one contiguous buffer (m_buffer) holding m_size constructed
+    * elements out of m_capacity allocated slots. Index and emptiness preconditions are
+    * verified with nvDebugCheck only, i.e. in debug builds.
+    */
+    template<typename T>
+    class NVCORE_CLASS Array {
+    public:
+        typedef uint size_type;
+
+        // Default constructor: empty array, nothing allocated.
+        NV_FORCEINLINE Array() : m_buffer(NULL), m_capacity(0), m_size(0) {}
+
+        // Copy constructor: copies a's elements into a new buffer.
+        NV_FORCEINLINE Array(const Array & a) : m_buffer(NULL), m_capacity(0), m_size(0) {
+            copy(a.m_buffer, a.m_size);
+        }
+
+        // Constructor that initializes the vector with the given elements.
+        NV_FORCEINLINE Array(const T * ptr, uint num) : m_buffer(NULL), m_capacity(0), m_size(0) {
+            copy(ptr, num);
+        }
+
+        // Allocate array: reserves storage for `capacity` elements; the size stays 0.
+        NV_FORCEINLINE explicit Array(uint capacity) : m_buffer(NULL), m_capacity(0), m_size(0) {
+            setArrayCapacity(capacity);
+        }
+
+        // Destructor: destroys the elements, then releases the buffer.
+        NV_FORCEINLINE ~Array() {
+            clear();
+            free<T>(m_buffer);
+        }
+
+
+        /// Const element access.
+        NV_FORCEINLINE const T & operator[]( uint index ) const
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+        /// Same as operator[] (bounds are checked in debug builds only).
+        NV_FORCEINLINE const T & at( uint index ) const
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+
+        /// Element access.
+        NV_FORCEINLINE T & operator[] ( uint index )
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+        /// Same as operator[] (bounds are checked in debug builds only).
+        NV_FORCEINLINE T & at( uint index )
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+
+        /// Get vector size.
+        NV_FORCEINLINE uint size() const { return m_size; }
+
+        /// Get vector size (alias of size()).
+        NV_FORCEINLINE uint count() const { return m_size; }
+
+        /// Get vector capacity.
+        NV_FORCEINLINE uint capacity() const { return m_capacity; }
+
+        /// Get const vector pointer.
+        NV_FORCEINLINE const T * buffer() const { return m_buffer; }
+
+        /// Get vector pointer.
+        NV_FORCEINLINE T * buffer() { return m_buffer; }
+
+        /// Provide begin/end pointers for C++11 range-based for loops.
+        NV_FORCEINLINE T * begin() { return m_buffer; }
+        NV_FORCEINLINE T * end() { return m_buffer + m_size; }
+        NV_FORCEINLINE const T * begin() const { return m_buffer; }
+        NV_FORCEINLINE const T * end() const { return m_buffer + m_size; }
+
+        /// Is vector empty.
+        NV_FORCEINLINE bool isEmpty() const { return m_size == 0; }
+
+        /// Is a null vector (no buffer allocated).
+        NV_FORCEINLINE bool isNull() const { return m_buffer == NULL; }
+
+
+        // Out-of-line members; the definitions are in Array.inl.
+        T & append();
+        void push_back( const T & val );
+        void pushBack( const T & val );
+        Array<T> & append( const T & val );
+        Array<T> & operator<< ( T & t );
+        void pop_back();
+        void popBack(uint count = 1);
+        void popFront(uint count = 1);
+        const T & back() const;
+        T & back();
+        const T & front() const;
+        T & front();
+        bool contains(const T & e) const;
+        bool find(const T & element, uint * indexPtr) const;
+        bool find(const T & element, uint begin, uint end, uint * indexPtr) const;
+        void removeAt(uint index);
+        bool remove(const T & element);
+        void insertAt(uint index, const T & val = T());
+        void append(const Array<T> & other);
+        void append(const T other[], uint count);
+        void replaceWithLast(uint index);
+        void resize(uint new_size);
+        void resize(uint new_size, const T & elem);
+        void fill(const T & elem);
+        void clear();
+        void shrink();
+        void reserve(uint desired_size);
+        void copy(const T * data, uint count);
+        Array<T> & operator=( const Array<T> & a );
+        T * release();
+
+
+        // Array enumerator: start() yields the first index, isDone() is true once the
+        // index equals the size, advance() steps to the next index.
+        typedef uint PseudoIndex;
+
+        NV_FORCEINLINE PseudoIndex start() const { return 0; }
+        NV_FORCEINLINE bool isDone(const PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); return i == this->m_size; }
+        NV_FORCEINLINE void advance(PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); i++; }
+
+#if NV_CC_MSVC
+        // MSVC only: element access through the PseudoIndexWrapper declared in foreach.h.
+        NV_FORCEINLINE T & operator[]( const PseudoIndexWrapper & i ) {
+            return m_buffer[i(this)];
+        }
+        NV_FORCEINLINE const T & operator[]( const PseudoIndexWrapper & i ) const {
+            return m_buffer[i(this)];
+        }
+#endif
+
+        // Friends (both access the members directly).
+        template <typename Typ> 
+        friend Stream & operator<< ( Stream & s, Array<Typ> & p );
+
+        template <typename Typ>
+        friend void swap(Array<Typ> & a, Array<Typ> & b);
+
+
+    protected:
+
+        // Low-level bookkeeping: these change the size / allocation only and never
+        // construct or destroy elements (see Array.inl).
+        void setArraySize(uint new_size);
+        void setArrayCapacity(uint new_capacity);
+
+        T * m_buffer;       // element storage, NULL when nothing is allocated
+        uint m_capacity;    // number of elements the buffer can hold
+        uint m_size;        // number of constructed elements
+
+    };
+
+
+} // nv namespace
+
+#endif // NV_CORE_ARRAY_H
--- a/3rdparty/nvtt/nvcore/defsgnucdarwin.h
+++ b/3rdparty/nvtt/nvcore/defsgnucdarwin.h
@ -0,0 +1,53 @@
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+#include <stdint.h> // uint8_t, int8_t, ... uintptr_t
+#include <stddef.h> // operator new, size_t, NULL
+
+// Function linkage
+#define DLL_IMPORT
+#if __GNUC__ >= 4
+#	define DLL_EXPORT __attribute__((visibility("default")))
+#	define DLL_EXPORT_CLASS DLL_EXPORT
+#else
+#	define DLL_EXPORT
+#	define DLL_EXPORT_CLASS
+#endif
+
+// Function calling modes
+#if NV_CPU_X86
+#	define NV_CDECL 	__attribute__((cdecl))
+#	define NV_STDCALL	__attribute__((stdcall))
+#else
+#	define NV_CDECL 
+#	define NV_STDCALL
+#endif
+
+#define NV_FASTCALL		__attribute__((fastcall))
+#define NV_FORCEINLINE	__attribute__((always_inline)) inline
+#define NV_DEPRECATED   __attribute__((deprecated))
+#define NV_THREAD_LOCAL //ACS: there's no "__thread" or equivalent on iOS/OSX
+
+#if __GNUC__ > 2
+#define NV_PURE     __attribute__((pure))
+#define NV_CONST    __attribute__((const))
+#else
+#define NV_PURE
+#define NV_CONST
+#endif
+
+#define NV_NOINLINE __attribute__((noinline))
+
+// Define __FUNC__ properly.
+// __FUNC__ is the function-name string handed to nvAbort by the assert macros.
+// An undefined __STDC_VERSION__ evaluates to 0 inside #if, so when the macro is absent the
+// first branch is taken. With GCC 2 or newer both branches select __PRETTY_FUNCTION__;
+// "<unknown>" is only used by older compilers.
+#if __STDC_VERSION__ < 199901L
+#	if __GNUC__ >= 2
+#		define __FUNC__ __PRETTY_FUNCTION__	// __FUNCTION__
+#	else
+#		define __FUNC__ "<unknown>"
+#	endif
+#else
+#	define __FUNC__ __PRETTY_FUNCTION__
+#endif
+
+#define restrict    __restrict__
--- a/3rdparty/nvtt/nvcore/defsgnuclinux.h
+++ b/3rdparty/nvtt/nvcore/defsgnuclinux.h
@ -0,0 +1,59 @@
+// GCC-compatible compiler settings used for Linux builds.
+// This header is only meant to be pulled in by nvcore.h, which defines NV_CORE_H first.
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+#include <stdint.h> // uint8_t, int8_t, ... uintptr_t
+#include <stddef.h> // operator new, size_t, NULL
+
+// Function linkage
+// Imports need no decoration here; exports get default visibility when the
+// compiler reports GCC 4 or newer, and no decoration otherwise.
+#define DLL_IMPORT
+#if __GNUC__ >= 4
+#   define DLL_EXPORT   __attribute__((visibility("default")))
+#   define DLL_EXPORT_CLASS DLL_EXPORT
+#else
+#   define DLL_EXPORT
+#   define DLL_EXPORT_CLASS
+#endif
+
+// Function calling modes
+// cdecl/stdcall attributes are only emitted when NV_CPU_X86 is set; on other
+// CPUs the macros expand to nothing.
+#if NV_CPU_X86
+#   define NV_CDECL     __attribute__((cdecl))
+#   define NV_STDCALL   __attribute__((stdcall))
+#else
+#   define NV_CDECL 
+#   define NV_STDCALL
+#endif
+
+// NOTE(review): unlike NV_CDECL/NV_STDCALL, NV_FASTCALL is defined outside the
+// NV_CPU_X86 guard, so the fastcall attribute is emitted for every CPU.
+#define NV_FASTCALL     __attribute__((fastcall))
+//#if __GNUC__ > 3
+// It seems that GCC does not assume always_inline implies inline. I think this depends on the GCC version :(
+#define NV_FORCEINLINE  inline __attribute__((always_inline))
+//#else
+// Some compilers complain that inline and always_inline are redundant.
+//#define NV_FORCEINLINE  __attribute__((always_inline))
+//#endif
+#define NV_DEPRECATED   __attribute__((deprecated))
+// Thread-local storage uses the GNU __thread qualifier on this platform.
+#define NV_THREAD_LOCAL __thread 
+
+// Optimizer hints, only emitted when __GNUC__ is greater than 2.
+#if __GNUC__ > 2
+#define NV_PURE     __attribute__((pure))
+#define NV_CONST    __attribute__((const))
+#else
+#define NV_PURE
+#define NV_CONST
+#endif
+
+#define NV_NOINLINE __attribute__((noinline))
+
+// Define __FUNC__ properly.
+// Every branch taken with __GNUC__ >= 2 resolves to __PRETTY_FUNCTION__; the
+// "<unknown>" literal is only used by older compilers in pre-C99 mode.
+#if __STDC_VERSION__ < 199901L
+#   if __GNUC__ >= 2
+#       define __FUNC__ __PRETTY_FUNCTION__ // __FUNCTION__
+#   else
+#       define __FUNC__ "<unknown>"
+#   endif
+#else
+#   define __FUNC__ __PRETTY_FUNCTION__
+#endif
+
+// Let code use the C99 spelling 'restrict'; it is mapped to the GNU keyword.
+#define restrict    __restrict__
--- a/3rdparty/nvtt/nvcore/defsgnucwin32.h
+++ b/3rdparty/nvtt/nvcore/defsgnucwin32.h
@ -0,0 +1,65 @@
+// GCC (MinGW) compiler settings used for Win32 builds.
+// This header is only meant to be pulled in by nvcore.h, which defines NV_CORE_H first.
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+//#include <cstddef> // size_t, NULL
+
+// Function linkage
+#define DLL_IMPORT	__declspec(dllimport)
+#define DLL_EXPORT	__declspec(dllexport)
+#define DLL_EXPORT_CLASS DLL_EXPORT
+
+// Function calling modes
+// cdecl/stdcall attributes are only emitted when NV_CPU_X86 is set; on other
+// CPUs the macros expand to nothing.
+#if NV_CPU_X86
+#	define NV_CDECL 	__attribute__((cdecl))
+#	define NV_STDCALL	__attribute__((stdcall))
+#else
+#	define NV_CDECL 
+#	define NV_STDCALL
+#endif
+
+#define NV_FASTCALL		__attribute__((fastcall))
+// 'inline' is spelled out explicitly, matching the Linux and Darwin headers:
+// GCC does not treat always_inline as implying inline, and without it a
+// non-template NV_FORCEINLINE function defined in a header gets external
+// linkage in every translation unit that includes it.
+#define NV_FORCEINLINE	inline __attribute__((always_inline))
+#define NV_DEPRECATED   __attribute__((deprecated))
+// nvcore.h advertises NV_OS_HAS_TLS_QUALIFIER == 1 for this platform, so the
+// qualifier has to exist here as it does in the other compiler headers.
+#define NV_THREAD_LOCAL __thread
+
+// Optimizer hints, only emitted when __GNUC__ is greater than 2.
+#if __GNUC__ > 2
+#define NV_PURE		__attribute__((pure))
+#define NV_CONST	__attribute__((const))
+#else
+#define NV_PURE
+#define NV_CONST
+#endif
+
+#define NV_NOINLINE __attribute__((noinline))
+
+// Define __FUNC__ properly.
+// Every branch taken with __GNUC__ >= 2 resolves to __PRETTY_FUNCTION__; the
+// "<unknown>" literal is only used by older compilers in pre-C99 mode.
+#if __STDC_VERSION__ < 199901L
+#	if __GNUC__ >= 2
+#		define __FUNC__ __PRETTY_FUNCTION__	// __FUNCTION__
+#	else
+#		define __FUNC__ "<unknown>"
+#	endif
+#else
+#	define __FUNC__ __PRETTY_FUNCTION__
+#endif
+
+// Let code use the C99 spelling 'restrict'; it is mapped to the GNU keyword.
+#define restrict	__restrict__
+
+/*
+// Type definitions
+typedef unsigned char		uint8;
+typedef signed char			int8;
+
+typedef unsigned short		uint16;
+typedef signed short		int16;
+
+typedef unsigned int		uint32;
+typedef signed int			int32;
+
+typedef unsigned long long	uint64;
+typedef signed long long	int64;
+
+// Aliases
+typedef uint32				uint;
+*/
+
--- a/3rdparty/nvtt/nvcore/defsvcwin32.h
+++ b/3rdparty/nvtt/nvcore/defsvcwin32.h
@ -0,0 +1,94 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+// Visual C++ compiler settings used for Win32 builds.
+// This header is only meant to be pulled in by nvcore.h, which defines NV_CORE_H first.
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+// Function linkage
+#define DLL_IMPORT __declspec(dllimport)
+#define DLL_EXPORT __declspec(dllexport)
+#define DLL_EXPORT_CLASS DLL_EXPORT
+
+// Function calling modes
+#define NV_CDECL        __cdecl
+#define NV_STDCALL      __stdcall
+#define NV_FASTCALL     __fastcall
+// The deprecation and pure/const hints expand to nothing with this compiler.
+#define NV_DEPRECATED
+
+#define NV_PURE
+#define NV_CONST
+
+// Set standard function names.
+// Older CRT versions only provide the underscore-prefixed spellings, so the
+// standard names are mapped onto them below the listed _MSC_VER thresholds.
+#if _MSC_VER < 1900
+#   define snprintf _snprintf
+#endif
+#if _MSC_VER < 1500
+#   define vsnprintf _vsnprintf
+#endif
+#if _MSC_VER < 1700
+#   define strtoll _strtoi64
+#   define strtoull _strtoui64
+#endif
+#define chdir _chdir
+#define getcwd _getcwd 
+
+// Fallback va_copy implemented as a plain assignment of the va_list.
+#if _MSC_VER < 1800 // Not sure what version introduced this.
+#define va_copy(a, b) (a) = (b)
+#endif
+
+// 'restrict' is not a keyword here; make it vanish unless already defined.
+#if !defined restrict
+#define restrict
+#endif
+
+// Ignore gcc attributes.
+// Any __attribute__((...)) written in shared code is stripped by this macro.
+#define __attribute__(X)
+
+#if !defined __FUNC__
+#define __FUNC__ __FUNCTION__ 
+#endif
+
+#define NV_NOINLINE __declspec(noinline)
+#define NV_FORCEINLINE __forceinline
+
+#define NV_THREAD_LOCAL __declspec(thread)
+
+/*
+// Type definitions
+typedef unsigned char       uint8;
+typedef signed char         int8;
+
+typedef unsigned short      uint16;
+typedef signed short        int16;
+
+typedef unsigned int        uint32;
+typedef signed int          int32;
+
+typedef unsigned __int64    uint64;
+typedef signed __int64      int64;
+
+// Aliases
+typedef uint32              uint;
+*/
+
+// Unwanted VC++ warnings to disable.
+/*
+#pragma warning(disable : 4244)     // conversion to float, possible loss of data
+#pragma warning(disable : 4245)     // conversion from 'enum ' to 'unsigned long', signed/unsigned mismatch
+#pragma warning(disable : 4100)     // unreferenced formal parameter
+#pragma warning(disable : 4514)     // unreferenced inline function has been removed
+#pragma warning(disable : 4710)     // inline function not expanded
+#pragma warning(disable : 4127)     // Conditional expression is constant
+#pragma warning(disable : 4305)     // truncation from 'const double' to 'float'
+#pragma warning(disable : 4505)     // unreferenced local function has been removed
+
+#pragma warning(disable : 4702)     // unreachable code in inline expanded function
+#pragma warning(disable : 4711)     // function selected for automatic inlining
+#pragma warning(disable : 4725)     // Pentium fdiv bug
+
+#pragma warning(disable : 4786)     // Identifier was truncated and cannot be debugged.
+
+#pragma warning(disable : 4675)     // resolved overload was found by argument-dependent lookup
+*/
+
+#pragma warning(1 : 4705)     // Report unused local variables.
+#pragma warning(1 : 4555)     // Expression has no effect.
--- a/3rdparty/nvtt/nvcore/foreach.h
+++ b/3rdparty/nvtt/nvcore/foreach.h
@ -0,0 +1,68 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_CORE_FOREACH_H
+#define NV_CORE_FOREACH_H
+
+/*
+These foreach macros are very non-standard and somewhat confusing, but I like them.
+*/
+
+#include "nvcore.h"
+
+#if NV_CC_GNUC // If typeof or decltype is available:
+#if !NV_CC_CPP11
+#   define NV_DECLTYPE typeof // Using a non-standard extension over typeof that behaves as C++11 decltype
+#else
+#   define NV_DECLTYPE decltype
+#endif
+
+/*
+Ideally we would like to write this:
+
+#define NV_FOREACH(i, container) \
+    for(NV_DECLTYPE(container)::PseudoIndex i((container).start()); !(container).isDone(i); (container).advance(i))
+
+But gcc versions prior to 4.7 required an intermediate type. See:
+https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6709
+*/
+
+#define NV_FOREACH(i, container) \
+    typedef NV_DECLTYPE(container) NV_STRING_JOIN2(cont,__LINE__); \
+    for(NV_STRING_JOIN2(cont,__LINE__)::PseudoIndex i((container).start()); !(container).isDone(i); (container).advance(i))
+
+#else // If typeof not available:
+
+#include <new> // placement new
+
+// Type-erased holder for a container's PseudoIndex, used by NV_FOREACH when
+// neither typeof nor decltype is available. The index object is constructed
+// in place inside a small fixed-size buffer and is never destroyed.
+struct PseudoIndexWrapper {
+    // Builds the container's PseudoIndex from container.start() inside the
+    // internal buffer. Fails to compile if the index type does not fit.
+    template <typename T>
+    PseudoIndexWrapper(const T & container) {
+        nvStaticCheck(sizeof(typename T::PseudoIndex) <= sizeof(memory));
+        new (memory) typename T::PseudoIndex(container.start());
+    }
+    // PseudoIndex cannot have a dtor!
+
+    // Recovers the stored index. The pointer argument is only used to deduce
+    // the container type; it is never dereferenced.
+    template <typename T> typename T::PseudoIndex & operator()(const T * /*container*/) {
+        return *reinterpret_cast<typename T::PseudoIndex *>(memory);
+    }
+    template <typename T> const typename T::PseudoIndex & operator()(const T * /*container*/) const {
+        return *reinterpret_cast<const typename T::PseudoIndex *>(memory);
+    }
+
+    // The byte buffer shares storage with a 32-bit integer so that it is
+    // suitably aligned for index types of up to 4 bytes; a bare uint8 array
+    // only guarantees 1-byte alignment for the placement new above.
+    union {
+        uint8 memory[4];	// Increase the size if we have bigger enumerators.
+        uint32 alignment_;	// Never read; present only to align 'memory'.
+    };
+};
+
+#define NV_FOREACH(i, container) \
+    for(PseudoIndexWrapper i(container); !(container).isDone(i(&(container))); (container).advance(i(&(container))))
+
+#endif
+
+// Declare foreach keyword.
+#if !defined NV_NO_USE_KEYWORDS
+#   define foreach NV_FOREACH
+#   define foreach_index NV_FOREACH
+#endif
+
+
+#endif // NV_CORE_FOREACH_H
--- a/3rdparty/nvtt/nvcore/hash.h
+++ b/3rdparty/nvtt/nvcore/hash.h
@ -0,0 +1,83 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_CORE_HASH_H
+#define NV_CORE_HASH_H
+
+#include "nvcore.h"
+
+namespace nv
+{
+    // Hashes 'size' bytes starting at 'data_in', continuing from the running
+    // value 'h'. Each byte updates the state as h * 65599 + byte (written with
+    // shifts), using unsigned wrap-around arithmetic.
+    inline uint sdbmHash(const void * data_in, uint size, uint h = 5381)
+    {
+        const uint8 * bytes = (const uint8 *) data_in;
+        for (uint n = 0; n < size; n++) {
+            const uint byte = (uint) bytes[n];
+            h = (h << 16) + (h << 6) - h + byte;
+        }
+        return h;
+    }
+
+    // Hashes 'count' floats by their bit patterns, continuing from 'h'.
+    // Negative zero is folded onto positive zero so both hash identically.
+    // Note that this hash does not handle NaN properly.
+    inline uint sdbmFloatHash(const float * f, uint count, uint h = 5381)
+    {
+        uint n = 0;
+        while (n < count) {
+            //nvDebugCheck(nv::isFinite(*f));
+            union { float f; uint32 i; } bits = { f[n] };
+            if (bits.i == 0x80000000) {
+                bits.i = 0;     // -0.0f -> +0.0f
+            }
+            h = sdbmHash(&bits, 4, h);
+            n++;
+        }
+        return h;
+    }
+
+
+    // Generic hash: feeds the object representation of 't' to sdbmHash,
+    // continuing from the running value 'h'.
+    template <typename T>
+    inline uint hash(const T & t, uint h = 5381)
+    {
+        const void * raw = &t;
+        return sdbmHash(raw, sizeof(T), h);
+    }
+
+    // Float specialisation: goes through sdbmFloatHash so that the value is
+    // hashed with that function's zero handling.
+    template <>
+    inline uint hash<float>(const float & f, uint h)
+    {
+        const float * single = &f;
+        return sdbmFloatHash(single, 1, h);
+    }
+
+
+    // Functors for hash table:
+    // Hash<Key> forwards to the hash() overload selected for Key, using that
+    // overload's default seed.
+    template <typename Key> struct Hash 
+    {
+        uint operator()(const Key & key) const
+        {
+            const uint value = hash(key);
+            return value;
+        }
+    };
+
+    // Equal<Key> compares two keys with the key type's operator==.
+    template <typename Key> struct Equal
+    {
+        bool operator()(const Key & lhs, const Key & rhs) const
+        {
+            const bool same = (lhs == rhs);
+            return same;
+        }
+    };
+
+
+    // @@ Move to Utils.h?
+    // Minimal two-member aggregate.
+    template <typename T1, typename T2>
+    struct Pair {
+        T1 first;
+        T2 second;
+    };
+
+    // Two pairs are equal when both members compare equal.
+    template <typename T1, typename T2>
+    bool operator==(const Pair<T1,T2> & p0, const Pair<T1,T2> & p1) {
+        return p0.first == p1.first && p0.second == p1.second;
+    }
+
+    // Hashes a pair by chaining: 'first' is hashed starting from 'h', and the
+    // result seeds the hash of 'second'. The incoming seed used to be dropped
+    // (hash(p.first) always restarted from the default), so chained calls that
+    // passed a running hash lost it. With the default seed the result is the
+    // same as before.
+    template <typename T1, typename T2>
+    uint hash(const Pair<T1,T2> & p, uint h = 5381) {
+        return hash(p.second, hash(p.first, h));
+    }
+
+
+} // nv namespace
+
+#endif // NV_CORE_HASH_H
--- a/3rdparty/nvtt/nvcore/memory.h
+++ b/3rdparty/nvtt/nvcore/memory.h
@ -0,0 +1,29 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_MEMORY_H
+#define NV_CORE_MEMORY_H
+
+#include "nvcore.h"
+
+namespace nv {
+
+    // C++ helpers.
+
+    // Allocates uninitialized storage for 'count' objects of type T with
+    // ::malloc. Returns NULL if the allocation fails or if the byte count
+    // sizeof(T) * count would not fit in size_t.
+    template <typename T> NV_FORCEINLINE T * malloc(size_t count) {
+        if (count > size_t(-1) / sizeof(T)) return NULL;    // multiplication would wrap
+        return (T *)::malloc(sizeof(T) * count);
+    }
+
+    // Resizes 'ptr' to hold 'count' objects of type T with ::realloc.
+    // Returns NULL, leaving 'ptr' untouched, if the byte count would not fit
+    // in size_t; otherwise returns whatever ::realloc returns.
+    template <typename T> NV_FORCEINLINE T * realloc(T * ptr, size_t count) {
+        if (count > size_t(-1) / sizeof(T)) return NULL;    // multiplication would wrap
+        return (T *)::realloc(ptr, sizeof(T) * count);
+    }
+
+    // Releases memory through ::free. Accepts pointers to const so callers do
+    // not have to cast the constness away themselves.
+    template <typename T> NV_FORCEINLINE void free(const T * ptr) {
+        ::free((void *)ptr);
+    }
+
+    // Fills the object representation of 'data' with zero bytes.
+    template <typename T> NV_FORCEINLINE void zero(T & data) {
+        memset(&data, 0, sizeof(T));
+    }
+
+} // nv namespace
+
+#endif // NV_CORE_MEMORY_H
--- a/3rdparty/nvtt/nvcore/nvcore.h
+++ b/3rdparty/nvtt/nvcore/nvcore.h
@ -0,0 +1,299 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_H
+#define NV_CORE_H
+
+// Function linkage
+#if NVCORE_SHARED
+#ifdef NVCORE_EXPORTS
+#define NVCORE_API DLL_EXPORT
+#define NVCORE_CLASS DLL_EXPORT_CLASS
+#else
+#define NVCORE_API DLL_IMPORT
+#define NVCORE_CLASS DLL_IMPORT
+#endif
+#else // NVCORE_SHARED
+#define NVCORE_API
+#define NVCORE_CLASS
+#endif // NVCORE_SHARED
+
+
+// Platform definitions
+#include "posh.h"
+
+// OS:
+// NV_OS_WIN32
+// NV_OS_WIN64
+// NV_OS_MINGW
+// NV_OS_CYGWIN
+// NV_OS_LINUX
+// NV_OS_UNIX
+// NV_OS_DARWIN
+// NV_OS_XBOX
+// NV_OS_ORBIS
+// NV_OS_IOS
+
+#define NV_OS_STRING POSH_OS_STRING
+
+#if defined POSH_OS_LINUX
+#   define NV_OS_LINUX 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_ORBIS
+#   define NV_OS_ORBIS 1
+#elif defined POSH_OS_FREEBSD
+#   define NV_OS_FREEBSD 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_OPENBSD
+#   define NV_OS_OPENBSD 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_CYGWIN32
+#   define NV_OS_CYGWIN 1
+#elif defined POSH_OS_MINGW
+#   define NV_OS_MINGW 1
+#   define NV_OS_WIN32 1
+#elif defined POSH_OS_OSX
+#   define NV_OS_DARWIN 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_IOS
+#   define NV_OS_DARWIN 1 //ACS should we keep this on IOS?
+#   define NV_OS_UNIX 1
+#   define NV_OS_IOS 1
+#elif defined POSH_OS_UNIX
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_WIN64
+#   define NV_OS_WIN32 1
+#   define NV_OS_WIN64 1
+#elif defined POSH_OS_WIN32
+#   define NV_OS_WIN32 1
+#elif defined POSH_OS_XBOX
+#   define NV_OS_XBOX 1
+#else
+#   error "Unsupported OS"
+#endif
+
+
+// Threading:
+// some platforms don't implement __thread or similar for thread-local-storage
+#if NV_OS_UNIX || NV_OS_ORBIS || NV_OS_IOS //ACStodoIOS darwin instead of ios?
+#   define NV_OS_USE_PTHREAD 1
+#   if NV_OS_DARWIN || NV_OS_IOS
+#       define NV_OS_HAS_TLS_QUALIFIER 0
+#   else
+#       define NV_OS_HAS_TLS_QUALIFIER 1
+#   endif
+#else
+#   define NV_OS_USE_PTHREAD 0
+#   define NV_OS_HAS_TLS_QUALIFIER 1
+#endif
+
+
+// CPUs:
+// NV_CPU_X86
+// NV_CPU_X86_64
+// NV_CPU_PPC
+// NV_CPU_ARM
+// NV_CPU_AARCH64
+
+#define NV_CPU_STRING   POSH_CPU_STRING
+
+#if defined POSH_CPU_X86_64
+//#   define NV_CPU_X86 1
+#   define NV_CPU_X86_64 1
+#elif defined POSH_CPU_X86
+#   define NV_CPU_X86 1
+#elif defined POSH_CPU_PPC
+#   define NV_CPU_PPC 1
+#elif defined POSH_CPU_STRONGARM
+#   define NV_CPU_ARM 1
+#elif defined POSH_CPU_AARCH64
+#   define NV_CPU_AARCH64 1
+#else
+#   error "Unsupported CPU"
+#endif
+
+
+// Compiler:
+// NV_CC_GNUC
+// NV_CC_MSVC
+// NV_CC_CLANG
+
+#if defined POSH_COMPILER_CLANG
+#   define NV_CC_CLANG  1
+#   define NV_CC_GNUC   1    // Clang is compatible with GCC.
+#   define NV_CC_STRING "clang"
+#elif defined POSH_COMPILER_GCC
+#   define NV_CC_GNUC   1
+#   define NV_CC_STRING "gcc"
+#elif defined POSH_COMPILER_MSVC
+#   define NV_CC_MSVC   1
+#   define NV_CC_STRING "msvc"
+#else
+#   error "Unsupported compiler"
+#endif
+
+#if NV_CC_MSVC
+#define NV_CC_CPP11 (__cplusplus > 199711L || _MSC_VER >= 1800) // Visual Studio 2013 has all the features we use, but doesn't advertise full C++11 support yet.
+#else
+// @@ IC: This works in CLANG, about GCC?
+// @@ ES: Doesn't work in gcc. These 3 features are available in GCC >= 4.4.
+#ifdef __clang__
+#define NV_CC_CPP11 (__has_feature(cxx_deleted_functions) && __has_feature(cxx_rvalue_references) && __has_feature(cxx_static_assert))
+#elif defined __GNUC__ 
+#define NV_CC_CPP11 ( __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
+#endif
+#endif
+
+// Endianness:
+#define NV_LITTLE_ENDIAN    POSH_LITTLE_ENDIAN
+#define NV_BIG_ENDIAN       POSH_BIG_ENDIAN
+#define NV_ENDIAN_STRING    POSH_ENDIAN_STRING
+
+
+// Define the right printf prefix for size_t arguments:
+#if POSH_64BIT_POINTER
+#  define NV_SIZET_PRINTF_PREFIX POSH_I64_PRINTF_PREFIX
+#else
+#  define NV_SIZET_PRINTF_PREFIX
+#endif
+
+
+// Type definitions:
+typedef posh_u8_t   uint8;
+typedef posh_i8_t   int8;
+
+typedef posh_u16_t  uint16;
+typedef posh_i16_t  int16;
+
+typedef posh_u32_t  uint32;
+typedef posh_i32_t  int32;
+
+typedef posh_u64_t  uint64;
+typedef posh_i64_t  int64;
+
+// Aliases
+typedef uint32      uint;
+
+
+// Version string:
+#define NV_VERSION_STRING \
+    NV_OS_STRING "/" NV_CC_STRING "/" NV_CPU_STRING"/" \
+    NV_ENDIAN_STRING"-endian - " __DATE__ "-" __TIME__
+
+
+// Disable copy constructor and assignment operator. 
+#if NV_CC_CPP11
+#define NV_FORBID_COPY(C) \
+    C( const C & ) = delete; \
+    C &operator=( const C & ) = delete
+#else
+#define NV_FORBID_COPY(C) \
+    private: \
+    C( const C & ); \
+    C &operator=( const C & )
+#endif
+
+// Disable dynamic allocation on the heap. 
+// See Prohibiting Heap-Based Objects in More Effective C++.
+#define NV_FORBID_HEAPALLOC() \
+    private: \
+    void *operator new(size_t size); \
+    void *operator new[](size_t size)
+    //static void *operator new(size_t size); \
+    //static void *operator new[](size_t size);
+
+// String concatenation macros.
+#define NV_STRING_JOIN2(arg1, arg2) NV_DO_STRING_JOIN2(arg1, arg2)
+#define NV_DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2
+#define NV_STRING_JOIN3(arg1, arg2, arg3) NV_DO_STRING_JOIN3(arg1, arg2, arg3)
+#define NV_DO_STRING_JOIN3(arg1, arg2, arg3) arg1 ## arg2 ## arg3
+#define NV_STRING2(x) #x
+#define NV_STRING(x) NV_STRING2(x)
+
+#if NV_CC_MSVC
+#define NV_MULTI_LINE_MACRO_BEGIN do {  
+#define NV_MULTI_LINE_MACRO_END \
+    __pragma(warning(push)) \
+    __pragma(warning(disable:4127)) \
+    } while(false) \
+    __pragma(warning(pop))  
+#else
+#define NV_MULTI_LINE_MACRO_BEGIN do {
+#define NV_MULTI_LINE_MACRO_END } while(false)
+#endif
+
+// Compile-time assertion. With C++11 support this is a real static_assert;
+// otherwise it declares an array typedef whose size is 1 when the condition
+// holds and -1 when it does not. A negative size is rejected by every
+// compiler, whereas the size 0 produced by using the condition directly is
+// accepted by GCC and Clang as an extension, which let failed checks pass.
+#if NV_CC_CPP11
+#define nvStaticCheck(x) static_assert((x), "Static assert "#x" failed")
+#else
+#define nvStaticCheck(x) typedef char NV_STRING_JOIN2(__static_assert_,__LINE__)[(x) ? 1 : -1]
+#endif
+#define NV_COMPILER_CHECK(x) nvStaticCheck(x)   // I like this name best.
+
+// Make sure type definitions are fine.
+// Every fixed-width alias declared above is verified once, including the
+// 64-bit ones (the 32-bit checks used to be listed twice in their place).
+NV_COMPILER_CHECK(sizeof(int8) == 1);
+NV_COMPILER_CHECK(sizeof(uint8) == 1);
+NV_COMPILER_CHECK(sizeof(int16) == 2);
+NV_COMPILER_CHECK(sizeof(uint16) == 2);
+NV_COMPILER_CHECK(sizeof(int32) == 4);
+NV_COMPILER_CHECK(sizeof(uint32) == 4);
+NV_COMPILER_CHECK(sizeof(int64) == 8);
+NV_COMPILER_CHECK(sizeof(uint64) == 8);
+
+
+#define NV_ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0]))
+
+#if 0 // Disabled in The Witness.
+#if NV_CC_MSVC
+#define NV_MESSAGE(x) message(__FILE__ "(" NV_STRING(__LINE__) ") : " x)
+#else
+#define NV_MESSAGE(x) message(x)
+#endif
+#else
+#define NV_MESSAGE(x) 
+#endif
+
+
+// Startup initialization macro.
+#define NV_AT_STARTUP(some_code) \
+    namespace { \
+        static struct NV_STRING_JOIN2(AtStartup_, __LINE__) { \
+            NV_STRING_JOIN2(AtStartup_, __LINE__)() { some_code; } \
+        } \
+        NV_STRING_JOIN3(AtStartup_, __LINE__, Instance); \
+    }
+
+// Tell the compiler that the parameter is intentionally unused, to suppress compiler warnings.
+// A cast to void is used instead of a self-assignment so the macro also works
+// for const or non-assignable objects and does not trigger self-assignment warnings.
+#define NV_UNUSED(a) ((void)(a))
+
+// Null index. @@ Move this somewhere else... it's only used by nvmesh.
+//const unsigned int NIL = unsigned int(~0);
+//#define NIL uint(~0)
+
+// Null pointer.
+#ifndef NULL
+#define NULL 0
+#endif
+
+// Platform includes
+// Pull in the compiler/OS specific definitions. The nvcore headers in this
+// tree have all-lowercase file names, so the include directives use the same
+// spelling; mixed-case names do not resolve on case-sensitive file systems.
+#if NV_CC_MSVC
+#   if NV_OS_WIN32
+#       include "defsvcwin32.h"
+#   elif NV_OS_XBOX
+#       include "defsvcxbox.h" // NOTE(review): header not present alongside the others here; name assumed to follow the lowercase convention.
+#   else
+#       error "MSVC: Platform not supported"
+#   endif
+#elif NV_CC_GNUC
+#   if NV_OS_LINUX
+#       include "defsgnuclinux.h"
+#   elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD
+#       include "defsgnucdarwin.h"
+#   elif NV_OS_MINGW
+#       include "defsgnucwin32.h"
+#   elif NV_OS_CYGWIN
+#       error "GCC: Cygwin not supported"
+#   else
+#       error "GCC: Platform not supported"
+#   endif
+#endif
+
+#endif // NV_CORE_H
--- a/3rdparty/nvtt/nvcore/posh.h
+++ b/3rdparty/nvtt/nvcore/posh.h
--- a/3rdparty/nvtt/nvcore/stdstream.h
+++ b/3rdparty/nvtt/nvcore/stdstream.h
@ -0,0 +1,459 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#include "nvcore.h"
+#include "stream.h"
+#include "array.h"
+
+#include <stdio.h> // fopen
+#include <string.h> // memcpy
+
+namespace nv
+{
+
+    // Portable wrapper around fopen. Returns the opened handle, or NULL when
+    // the file could not be opened. 'fileName' must not be NULL.
+    inline FILE * fileOpen(const char * fileName, const char * mode)
+    {
+        nvCheck(fileName != NULL);
+#if NV_CC_MSVC && _MSC_VER >= 1400
+        // Newer Visual C++ runtimes: go through fopen_s, which reports
+        // success with a zero return value.
+        FILE * handle;
+        const bool opened = (fopen_s(&handle, fileName, mode) == 0);
+        return opened ? handle : NULL;
+#else
+        return fopen(fileName, mode);
+#endif
+    }
+
+
+    /// Base stdio stream.
+    class NVCORE_CLASS StdStream : public Stream
+    {
+        NV_FORBID_COPY(StdStream);
+    public:
+
+        /// Ctor.
+        StdStream( FILE * fp, bool autoclose ) : m_fp(fp), m_autoclose(autoclose) { }
+
+        /// Dtor. 
+        virtual ~StdStream()
+        {
+            if( m_fp != NULL && m_autoclose ) {
+#if NV_OS_WIN32
+                _fclose_nolock( m_fp );
+#else
+                fclose( m_fp );
+#endif
+            }
+        }
+
+
+        /** @name Stream implementation. */
+        //@{
+        virtual void seek( uint pos )
+        {
+            nvDebugCheck(m_fp != NULL);
+            nvDebugCheck(pos <= size());
+#if NV_OS_WIN32
+            _fseek_nolock(m_fp, pos, SEEK_SET);
+#else
+            fseek(m_fp, pos, SEEK_SET);
+#endif
+        }
+
+        virtual uint tell() const
+        {
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            return _ftell_nolock(m_fp);
+#else
+            return (uint)ftell(m_fp);
+#endif
+        }
+
+        virtual uint size() const
+        {
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            uint pos = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, 0, SEEK_END);
+            uint end = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, pos, SEEK_SET);
+#else
+            uint pos = (uint)ftell(m_fp);
+            fseek(m_fp, 0, SEEK_END);
+            uint end = (uint)ftell(m_fp);
+            fseek(m_fp, pos, SEEK_SET);
+#endif
+            return end;
+        }
+
+        virtual bool isError() const
+        {
+            return m_fp == NULL || ferror( m_fp ) != 0;
+        }
+
+        virtual void clearError()
+        {
+            nvDebugCheck(m_fp != NULL);
+            clearerr(m_fp);
+        }
+
+        // @@ The original implementation uses feof, which only returns true when we attempt to read *past* the end of the stream. 
+        // That is, if we read the last byte of a file, then isAtEnd would still return false, even though the stream pointer is at the file end. This is not the intent and was inconsistent with the implementation of the MemoryStream, a better 
+        // implementation uses use ftell and fseek to determine our location within the file.
+        virtual bool isAtEnd() const
+        {
+            if (m_fp == NULL) return true;
+            //nvDebugCheck(m_fp != NULL);
+            //return feof( m_fp ) != 0;
+#if NV_OS_WIN32
+            uint pos = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, 0, SEEK_END);
+            uint end = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, pos, SEEK_SET);
+#else
+            uint pos = (uint)ftell(m_fp);
+            fseek(m_fp, 0, SEEK_END);
+            uint end = (uint)ftell(m_fp);
+            fseek(m_fp, pos, SEEK_SET);
+#endif
+            return pos == end;
+        }
+
+        /// Always true.
+        virtual bool isSeekable() const { return true; }
+        //@}
+
+    protected:
+
+        FILE * m_fp;
+        bool m_autoclose;
+
+    };
+
+
+    /// Standard output stream: writes bytes to a stdio FILE handle.
+    class NVCORE_CLASS StdOutputStream : public StdStream
+    {
+        NV_FORBID_COPY(StdOutputStream);
+    public:
+
+        /// Construct stream by file name. The file is opened in "wb" mode and
+        /// closed by the destructor; if opening fails the stream holds a NULL handle.
+        StdOutputStream( const char * name ) : StdStream(fileOpen(name, "wb"), /*autoclose=*/true) { }
+
+        /// Construct stream by file handle.
+        StdOutputStream( FILE * fp, bool autoclose ) : StdStream(fp, autoclose)
+        {
+        }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Write @a len bytes from @a data.
+        /// @return The number of bytes actually written, which is smaller than
+        /// @a len when the write fails part-way.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            return (uint)_fwrite_nolock(data, 1, len, m_fp);
+#elif NV_OS_LINUX
+            return (uint)fwrite_unlocked(data, 1, len, m_fp);
+#elif NV_OS_DARWIN
+            // Bytes are written one at a time with putc_unlocked. Stop at the
+            // first failure so the return value is the count actually written,
+            // matching the fwrite-based branches (this branch used to ignore
+            // errors and always report len).
+            const uint8 * src = (const uint8 *)data;
+            for (uint i = 0; i < len; i++) {
+                if (putc_unlocked(src[i], m_fp) == EOF) {
+                    return i;
+                }
+            }
+            return len;
+#else
+            return (uint)fwrite(data, 1, len, m_fp);
+#endif
+        }
+
+        /// Output streams never load.
+        virtual bool isLoading() const
+        {
+            return false;
+        }
+
+        /// Output streams always save.
+        virtual bool isSaving() const
+        {
+            return true;
+        }
+        //@}
+
+    };
+
+
+    /// Standard input stream: reads bytes from a stdio FILE handle.
+    class NVCORE_CLASS StdInputStream : public StdStream
+    {
+        NV_FORBID_COPY(StdInputStream);
+    public:
+
+        /// Construct stream by file name. The file is opened in "rb" mode and
+        /// closed by the destructor; if opening fails the stream holds a NULL handle.
+        StdInputStream( const char * name ) : StdStream(fileOpen(name, "rb"), /*autoclose=*/true) { }
+
+        /// Construct stream by file handle.
+        StdInputStream( FILE * fp, bool autoclose=true ) : StdStream(fp, autoclose)
+        {
+        }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Read up to @a len bytes into @a data.
+        /// @return The number of bytes actually read, which is smaller than
+        /// @a len when the end of the file or an error is reached first.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            return (uint)_fread_nolock(data, 1, len, m_fp);
+#elif NV_OS_LINUX
+            return (uint)fread_unlocked(data, 1, len, m_fp);
+#elif NV_OS_DARWIN
+            // Bytes are read one at a time with getc_unlocked. Stop at EOF or
+            // error so the return value is the count actually read, matching
+            // the fread-based branches (this branch used to store the EOF
+            // marker as data and always report len).
+            uint8 * dst = (uint8 *)data;
+            for (uint i = 0; i < len; i++) {
+                const int c = getc_unlocked(m_fp);
+                if (c == EOF) {
+                    return i;
+                }
+                dst[i] = (uint8)c;
+            }
+            return len;
+#else
+            return (uint)fread(data, 1, len, m_fp);
+#endif
+            
+        }
+
+        /// Input streams always load.
+        virtual bool isLoading() const
+        {
+            return true;
+        }
+
+        /// Input streams never save.
+        virtual bool isSaving() const
+        {
+            return false;
+        }
+        //@}
+    };
+
+
+
+    /// Memory input stream: reads from a caller-owned block of bytes.
+    /// Only the pointer is stored; the memory is neither copied nor freed.
+    class NVCORE_CLASS MemoryInputStream : public Stream
+    {
+        NV_FORBID_COPY(MemoryInputStream);
+    public:
+
+        /// Ctor. Starts reading at the beginning of @a mem, which holds @a size bytes.
+        MemoryInputStream( const uint8 * mem, uint size ) : m_mem(mem), m_ptr(mem), m_size(size) { }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Read up to @a len bytes into @a data and advance the cursor.
+        /// @return The number of bytes copied; 0 when the stream is in the
+        /// error state.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            nvDebugCheck(!isError());
+
+            // Refuse to read when there is no buffer or the cursor lies outside
+            // it: the unsigned subtraction below would wrap around and the copy
+            // would run past the end of the buffer.
+            // NOTE(review): nvDebugCheck is declared elsewhere; presumably it is
+            // compiled out in release builds, hence the explicit guard - confirm.
+            if (isError()) return 0;
+
+            uint left = m_size - tell();
+            if (len > left) len = left;
+
+            memcpy( data, m_ptr, len );
+            m_ptr += len;
+
+            return len;
+        }
+
+        /// Move the cursor to @a pos bytes from the start of the buffer.
+        /// A position past the end puts the stream in the error state.
+        virtual void seek( uint pos )
+        {
+            nvDebugCheck(!isError());
+            m_ptr = m_mem + pos;
+            nvDebugCheck(!isError());
+        }
+
+        /// Offset of the cursor from the start of the buffer.
+        virtual uint tell() const
+        {
+            nvDebugCheck(m_ptr >= m_mem);
+            return uint(m_ptr - m_mem);
+        }
+
+        /// Total size of the buffer in bytes.
+        virtual uint size() const
+        {
+            return m_size;
+        }
+
+        /// True when there is no buffer or the cursor is outside [start, end].
+        virtual bool isError() const
+        {
+            return m_mem == NULL || m_ptr > m_mem + m_size || m_ptr < m_mem;
+        }
+
+        virtual void clearError()
+        {
+            // Nothing to do.
+        }
+
+        /// True when the cursor sits exactly at the end of the buffer.
+        virtual bool isAtEnd() const
+        {
+            return m_ptr == m_mem + m_size;
+        }
+
+        /// Always true.
+        virtual bool isSeekable() const
+        {
+            return true;
+        }
+
+        virtual bool isLoading() const
+        {
+            return true;
+        }
+
+        virtual bool isSaving() const
+        {
+            return false;
+        }
+        //@}
+
+        /// Pointer to the byte at the current cursor position.
+        const uint8 * ptr() const { return m_ptr; }
+
+
+    private:
+
+        const uint8 * m_mem;    ///< Start of the buffer.
+        const uint8 * m_ptr;    ///< Current read cursor.
+        uint m_size;            ///< Buffer size in bytes.
+
+    };
+
+
+    /// Output stream that appends everything written to a caller-owned byte array.
+    class NVCORE_CLASS BufferOutputStream : public Stream
+    {
+        NV_FORBID_COPY(BufferOutputStream);
+    public:
+
+        /// Binds the stream to @a buffer; only a reference to the array is kept.
+        BufferOutputStream(Array<uint8> & buffer) : m_buffer(buffer) { }
+
+        /// Appends @a len bytes from @a data to the array and reports @a len.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            uint8 * bytes = (uint8 *)data;
+            m_buffer.append(bytes, len);
+            return len;
+        }
+
+        /// Seeking is not implemented; the call is ignored.
+        virtual void seek( uint /*pos*/ )
+        {
+        }
+
+        /// Position and size are both the current size of the array.
+        virtual uint tell() const
+        {
+            return m_buffer.size();
+        }
+        virtual uint size() const
+        {
+            return m_buffer.size();
+        }
+
+        /// This stream never reports an error.
+        virtual bool isError() const
+        {
+            return false;
+        }
+        virtual void clearError()
+        {
+        }
+
+        /// Fixed properties: always at the end, not seekable, save-only.
+        virtual bool isAtEnd() const
+        {
+            return true;
+        }
+        virtual bool isSeekable() const
+        {
+            return false;
+        }
+        virtual bool isLoading() const
+        {
+            return false;
+        }
+        virtual bool isSaving() const
+        {
+            return true;
+        }
+
+    private:
+        Array<uint8> & m_buffer;    ///< Destination array, owned by the caller.
+    };
+
+
+    /// Stream wrapper that forwards every call to another stream and executes
+    /// a bare `throw;` when the wrapped stream reports an error after
+    /// serialize() or seek().
+    /// NOTE(review): a bare `throw;` rethrows the exception currently being
+    /// handled; when none is active the C++ runtime calls std::terminate().
+    /// Confirm which of the two is intended by the callers.
+    class NVCORE_CLASS ProtectedStream : public Stream
+    {
+        NV_FORBID_COPY(ProtectedStream);
+    public:
+
+        /// Wraps @a s without taking ownership.
+        ProtectedStream( Stream & s ) : m_s(&s), m_autodelete(false)
+        { 
+        }
+
+        /// Wraps @a s, which must not be NULL. When @a autodelete is true the
+        /// wrapped stream is deleted by the destructor.
+        ProtectedStream( Stream * s, bool autodelete = true ) : 
+        m_s(s), m_autodelete(autodelete) 
+        {
+            nvDebugCheck(m_s != NULL);
+        }
+
+        /// Deletes the wrapped stream if ownership was requested.
+        virtual ~ProtectedStream()
+        {
+            if( !m_autodelete ) {
+                return;
+            }
+            delete m_s;
+        }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Forwards to the wrapped stream and returns its byte count; executes
+        /// `throw;` if the wrapped stream is in the error state afterwards.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            const uint count = m_s->serialize( data, len );
+
+            const bool failed = m_s->isError();
+            if( failed ) {
+                throw;
+            }
+
+            return count;
+        }
+
+        /// Forwards to the wrapped stream; executes `throw;` if the wrapped
+        /// stream is in the error state afterwards.
+        virtual void seek( uint pos )
+        {
+            m_s->seek( pos );
+
+            const bool failed = m_s->isError();
+            if( failed ) {
+                throw;
+            }
+        }
+
+        // The remaining members are plain pass-throughs to the wrapped stream.
+
+        virtual uint tell() const
+        {
+            return m_s->tell();
+        }
+
+        virtual uint size() const
+        {
+            return m_s->size();
+        }
+
+        virtual bool isError() const
+        {
+            return m_s->isError();
+        }
+
+        virtual void clearError()
+        {
+            m_s->clearError();
+        }
+
+        virtual bool isAtEnd() const
+        {
+            return m_s->isAtEnd();
+        }
+
+        virtual bool isSeekable() const
+        {
+            return m_s->isSeekable();
+        }
+
+        virtual bool isLoading() const
+        {
+            return m_s->isLoading();
+        }
+
+        virtual bool isSaving() const
+        {
+            return m_s->isSaving();
+        }
+        //@}
+
+
+    private:
+
+        Stream * const m_s;         ///< Wrapped stream.
+        bool const m_autodelete;    ///< Whether the destructor deletes m_s.
+
+    };
+
+} // nv namespace
+
+
+//#endif // NV_CORE_STDSTREAM_H
--- a/3rdparty/nvtt/nvcore/stream.h
+++ b/3rdparty/nvtt/nvcore/stream.h
@ -0,0 +1,163 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_STREAM_H
+#define NV_CORE_STREAM_H
+
+#include "nvcore.h"
+#include "debug.h"
+
+namespace nv
+{
+
+    /// Abstract base class for serialization streams. Concrete streams implement
+    /// the pure virtual interface; the operator<< friends push built-in types
+    /// through serialize(), honouring the stream byte order for multi-byte values.
+    class NVCORE_CLASS Stream {
+    public:
+
+        /// Byte order applied to multi-byte values.
+        enum ByteOrder {
+            LittleEndian = false,
+            BigEndian = true,
+        };
+
+        /// Byte order of the host, chosen at compile time from NV_LITTLE_ENDIAN.
+        static ByteOrder getSystemByteOrder() { 
+#if NV_LITTLE_ENDIAN
+            return LittleEndian;
+#else
+            return BigEndian;
+#endif
+        }
+
+
+        /// Ctor. A new stream starts out little endian.
+        Stream() : m_byteOrder(LittleEndian) { }
+
+        /// Virtual destructor.
+        virtual ~Stream() {}
+
+        /// Select the byte order used by the multi-byte operator<< overloads.
+        void setByteOrder(ByteOrder bo) { m_byteOrder = bo; }
+
+        /// Byte order currently selected for this stream.
+        ByteOrder byteOrder() const { return m_byteOrder; }
+
+
+        /// Serialize @a len bytes starting at @a data.
+        virtual uint serialize( void * data, uint len ) = 0;
+
+        /// Move to the given position in the archive.
+        virtual void seek( uint pos ) = 0;
+
+        /// Return the current position in the archive.
+        virtual uint tell() const = 0;
+
+        /// Return the current size of the archive.
+        virtual uint size() const = 0;
+
+        /// Determine if there has been any error.
+        virtual bool isError() const = 0;
+
+        /// Clear errors.
+        virtual void clearError() = 0;
+
+        /// Return true if the stream is at the end.
+        virtual bool isAtEnd() const = 0;
+
+        /// Return true if the stream is seekable.
+        virtual bool isSeekable() const = 0;
+
+        /// Return true if this is an input stream.
+        virtual bool isLoading() const = 0;
+
+        /// Return true if this is an output stream.
+        virtual bool isSaving() const = 0;
+
+
+        /// Seek forward by @a offset relative to the current position.
+        void advance(uint offset) {
+            const uint target = tell() + offset;
+            seek(target);
+        }
+
+
+        // Serialization operators. Single-byte values go straight to serialize();
+        // wider values go through byteOrderSerialize().
+        friend Stream & operator<<( Stream & s, bool & c ) {
+#if NV_OS_DARWIN && !NV_CC_CPP11
+            // bool is four bytes wide here, so it travels through a one-byte proxy.
+            nvStaticCheck(sizeof(bool) == 4);
+            uint8 proxy = c ? 1 : 0;
+            s.serialize( &proxy, 1 );
+            c = (proxy == 1);
+#else
+            nvStaticCheck(sizeof(bool) == 1);
+            s.serialize( &c, 1 );
+#endif
+            return s;
+        }
+
+        friend Stream & operator<<( Stream & s, char & c ) {
+            nvStaticCheck(sizeof(char) == 1);
+            s.serialize( &c, 1 );
+            return s;
+        }
+
+        friend Stream & operator<<( Stream & s, uint8 & c ) {
+            nvStaticCheck(sizeof(uint8) == 1);
+            s.serialize( &c, 1 );
+            return s;
+        }
+
+        friend Stream & operator<<( Stream & s, int8 & c ) {
+            nvStaticCheck(sizeof(int8) == 1);
+            s.serialize( &c, 1 );
+            return s;
+        }
+
+        friend Stream & operator<<( Stream & s, uint16 & c ) {
+            nvStaticCheck(sizeof(uint16) == 2);
+            s.byteOrderSerialize( &c, 2 );
+            return s;
+        }
+
+        friend Stream & operator<<( Stream & s, int16 & c ) {
+            nvStaticCheck(sizeof(int16) == 2);
+            s.byteOrderSerialize( &c, 2 );
+            return s;
+        }
+
+        friend Stream & operator<<( Stream & s, uint32 & c ) {
+            nvStaticCheck(sizeof(uint32) == 4);
+            s.byteOrderSerialize( &c, 4 );
+            return s;
+        }
+
+        friend Stream & operator<<( Stream & s, int32 & c ) {
+            nvStaticCheck(sizeof(int32) == 4);
+            s.byteOrderSerialize( &c, 4 );
+            return s;
+        }
+
+        friend Stream & operator<<( Stream & s, uint64 & c ) {
+            nvStaticCheck(sizeof(uint64) == 8);
+            s.byteOrderSerialize( &c, 8 );
+            return s;
+        }
+
+        friend Stream & operator<<( Stream & s, int64 & c ) {
+            nvStaticCheck(sizeof(int64) == 8);
+            s.byteOrderSerialize( &c, 8 );
+            return s;
+        }
+
+        friend Stream & operator<<( Stream & s, float & c ) {
+            nvStaticCheck(sizeof(float) == 4);
+            s.byteOrderSerialize( &c, 4 );
+            return s;
+        }
+
+        friend Stream & operator<<( Stream & s, double & c ) {
+            nvStaticCheck(sizeof(double) == 8);
+            s.byteOrderSerialize( &c, 8 );
+            return s;
+        }
+
+    protected:
+
+        /// Serialize @a len bytes at @a v in the stream byte order. When that order
+        /// differs from the host order the bytes are serialized one at a time,
+        /// last byte first.
+        Stream & byteOrderSerialize( void * v, uint len ) {
+            if( m_byteOrder != getSystemByteOrder() ) {
+                uint8 * const bytes = (uint8 *)v;
+                uint remaining = len;
+                while( remaining > 0 ) {
+                    remaining--;
+                    serialize( bytes + remaining, 1 );
+                }
+                return *this;
+            }
+
+            serialize( v, len );
+            return *this;
+        }
+
+
+    private:
+
+        /// Byte order selected through setByteOrder().
+        ByteOrder m_byteOrder;
+
+    };
+
+} // nv namespace
+
+#endif // NV_CORE_STREAM_H
--- a/3rdparty/nvtt/nvcore/strlib.h
+++ b/3rdparty/nvtt/nvcore/strlib.h
@ -0,0 +1,429 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_STRING_H
+#define NV_CORE_STRING_H
+
+#include "debug.h"
+#include "hash.h" // hash
+
+//#include <string.h> // strlen, etc.
+
+#if NV_OS_WIN32
+#define NV_PATH_SEPARATOR '\\'
+#else
+#define NV_PATH_SEPARATOR '/'
+#endif
+
+namespace nv
+{
+
+    NVCORE_API uint strHash(const char * str, uint h) NV_PURE;
+
+    /// String hash based on Bernstein's hash: every character up to the
+    /// terminating zero is folded in as h = (33 * h) ^ c, starting from @a h.
+    inline uint strHash(const char * data, uint h = 5381)
+    {
+        for (const char * cursor = data; *cursor != 0; ++cursor) {
+            h = (33 * h) ^ uint(*cursor);
+        }
+        return h;
+    }
+
+    /// Hash functor for C strings: hashes the characters via strHash().
+    template <> struct Hash<const char *> {
+        uint operator()(const char * str) const
+        {
+            return strHash(str);
+        }
+    };
+
+    NVCORE_API uint strLen(const char * str) NV_PURE;                       // Asserts on NULL strings.
+
+    NVCORE_API int strDiff(const char * s1, const char * s2) NV_PURE;       // Asserts on NULL strings.
+    NVCORE_API int strCaseDiff(const char * s1, const char * s2) NV_PURE;   // Asserts on NULL strings.
+    NVCORE_API bool strEqual(const char * s1, const char * s2) NV_PURE;     // Accepts NULL strings.
+    NVCORE_API bool strCaseEqual(const char * s1, const char * s2) NV_PURE; // Accepts NULL strings.
+
+    /// Equality functor for C strings: compares through strEqual().
+    template <> struct Equal<const char *> {
+        bool operator()(const char * a, const char * b) const
+        {
+            return strEqual(a, b);
+        }
+    };
+
+    NVCORE_API bool strBeginsWith(const char * dst, const char * prefix) NV_PURE;
+    NVCORE_API bool strEndsWith(const char * dst, const char * suffix) NV_PURE;
+
+
+    NVCORE_API void strCpy(char * dst, uint size, const char * src);
+    NVCORE_API void strCpy(char * dst, uint size, const char * src, uint len);
+    NVCORE_API void strCat(char * dst, uint size, const char * src);
+
+    NVCORE_API const char * strSkipWhiteSpace(const char * str);
+    NVCORE_API char * strSkipWhiteSpace(char * str);
+
+    NVCORE_API bool strMatch(const char * str, const char * pat) NV_PURE;
+
+    NVCORE_API bool isNumber(const char * str) NV_PURE;
+
+    /* @@ Implement these two functions and modify StringBuilder to use them?
+    NVCORE_API void strFormat(const char * dst, const char * fmt, ...);
+    NVCORE_API void strFormatList(const char * dst, const char * fmt, va_list arg);
+
+    template <size_t count> void strFormatSafe(char (&buffer)[count], const char *fmt, ...) __attribute__((format (printf, 2, 3)));
+    template <size_t count> void strFormatSafe(char (&buffer)[count], const char *fmt, ...) {
+        va_list args;
+        va_start(args, fmt);
+        strFormatList(buffer, count, fmt, args);
+        va_end(args);
+    }
+    template <size_t count> void strFormatListSafe(char (&buffer)[count], const char *fmt, va_list arg) {
+        va_list tmp;
+        va_copy(tmp, args);
+        strFormatList(buffer, count, fmt, tmp);
+        va_end(tmp);
+    }*/
+
+    /// strCpy() into a fixed-size array; the destination size is deduced from the array type.
+    template <int count>
+    void strCpySafe(char (&buffer)[count], const char *src)
+    {
+        strCpy(buffer, count, src);
+    }
+
+    /// strCat() onto a fixed-size array; the destination size is deduced from the array type.
+    template <int count>
+    void strCatSafe(char (&buffer)[count], const char * src)
+    {
+        strCat(buffer, count, src);
+    }
+
+
+
+    /// String builder: a mutable character buffer (m_str) plus the size of that
+    /// buffer (m_size). The out-of-line members are implemented elsewhere.
+    class NVCORE_CLASS StringBuilder
+    {
+    public:
+
+        StringBuilder();
+        explicit StringBuilder( uint size_hint );
+        StringBuilder(const char * str);
+        StringBuilder(const char * str, uint len);
+        StringBuilder(const StringBuilder & other);
+
+        ~StringBuilder();
+
+        StringBuilder & format( const char * format, ... ) __attribute__((format (printf, 2, 3)));
+        StringBuilder & formatList( const char * format, va_list arg );
+
+        StringBuilder & append(const char * str);
+        StringBuilder & append(const char * str, uint len);
+        StringBuilder & appendFormat(const char * format, ...) __attribute__((format (printf, 2, 3)));
+        StringBuilder & appendFormatList(const char * format, va_list arg);
+
+        StringBuilder & appendSpace(uint n);
+
+        StringBuilder & number( int i, int base = 10 );
+        StringBuilder & number( uint i, int base = 10 );
+
+        StringBuilder & reserve(uint size_hint);
+        StringBuilder & copy(const char * str);
+        StringBuilder & copy(const char * str, uint len);
+        StringBuilder & copy(const StringBuilder & str);
+
+        StringBuilder & toLower();
+        StringBuilder & toUpper();
+
+        bool endsWith(const char * str) const;
+        bool beginsWith(const char * str) const;
+
+        char * reverseFind(char c);
+
+        void reset();
+
+        /// True when the container size is zero.
+        bool isNull() const { return m_size == 0; }
+
+        // const char * accessors
+        //operator const char * () const { return m_str; }
+        //operator char * () { return m_str; }
+        const char * str() const { return m_str; }
+        char * str() { return m_str; }
+
+        char * release();
+
+        /// Implement value semantics.
+        StringBuilder & operator=( const StringBuilder & s ) {
+            return copy(s);
+        }
+
+        /// Implement value semantics.
+        StringBuilder & operator=( const char * s ) {
+            return copy(s);
+        }
+
+        /// Equal operator. Compares with strEqual() (exact comparison that accepts
+        /// NULL strings) instead of strMatch(), whose second argument is a pattern
+        /// and therefore does not implement equality.
+        bool operator==( const StringBuilder & s ) const {
+            return strEqual(s.m_str, m_str);
+        }
+
+        /// Return the exact length.
+        uint length() const { return isNull() ? 0 : strLen(m_str); }
+
+        /// Return the size of the string container.
+        uint capacity() const { return m_size; }
+
+        /// Return the hash of the string.
+        uint hash() const { return isNull() ? 0 : strHash(m_str); }
+
+        // Swap strings.
+        friend void swap(StringBuilder & a, StringBuilder & b);
+
+    protected:
+
+        /// Size of the string container.
+        uint m_size;
+
+        /// String.
+        char * m_str;
+
+    };
+
+
+    /// Path string built on top of StringBuilder. @@ This should be called PathBuilder.
+    /// Only the forwarding constructors are defined here; the path helpers are
+    /// declared here and implemented elsewhere.
+    class NVCORE_CLASS Path : public StringBuilder
+    {
+    public:
+        // Constructors forward to the matching StringBuilder constructors.
+        Path() : StringBuilder() {}
+        explicit Path(int size_hint) : StringBuilder(size_hint) {}
+        Path(const char * str) : StringBuilder(str) {}
+        Path(const Path & path) : StringBuilder(path) {}
+
+        // Accessors for the file name / extension part of this path.
+        const char * fileName() const;
+        const char * extension() const;
+
+        // The separator defaults to the platform's NV_PATH_SEPARATOR.
+        void translatePath(char pathSeparator = NV_PATH_SEPARATOR);
+
+        void appendSeparator(char pathSeparator = NV_PATH_SEPARATOR);
+
+        void stripFileName();
+        void stripExtension();
+
+        // statics: the same helpers, operating on plain C strings.
+        NVCORE_API static char separator();
+        NVCORE_API static const char * fileName(const char *);
+        NVCORE_API static const char * extension(const char *);
+
+        NVCORE_API static void translatePath(char * path, char pathSeparator = NV_PATH_SEPARATOR);
+    };
+
+
+    /// Reference-counted string. Copies share one character buffer; a 16-bit
+    /// reference count lives in the two bytes immediately before the characters
+    /// (see getRefCount / setRefCount / setData).
+    class NVCORE_CLASS String
+    {
+    public:
+
+        /// Constructs a null string. @sa isNull()
+        String()
+        {
+            data = NULL;
+        }
+
+        /// Constructs a shared copy of str.
+        String(const String & str)
+        {
+            data = str.data;
+            if (data != NULL) addRef();
+        }
+
+        /// Constructs a shared string from a standard string.
+        String(const char * str)
+        {
+            setString(str);
+        }
+
+        /// Constructs a shared string from a standard string.
+        String(const char * str, int length)
+        {
+            setString(str, length);
+        }
+
+        /// Constructs a shared string from a StringBuilder.
+        String(const StringBuilder & str)
+        {
+            setString(str);
+        }
+
+        /// Dtor.
+        ~String()
+        {
+            release();
+        }
+
+        String clone() const;
+
+        /// Release the current string and allocate a new one.
+        const String & operator=( const char * str )
+        {
+            // Build the new string before letting go of the old buffer: str may
+            // point into the buffer this string currently owns.
+            String fresh( str );
+            return *this = fresh;
+        }
+
+        /// Release the current string and allocate a new one.
+        const String & operator=( const StringBuilder & str )
+        {
+            release();
+            setString( str );
+            return *this;
+        }
+
+        /// Implement value semantics.
+        String & operator=( const String & str )
+        {
+            if (str.data != data)
+            {
+                release();
+                data = str.data;
+                // Same guard as the copy constructor: a null string has no count.
+                if (data != NULL) addRef();
+            }
+            return *this;
+        }
+
+        /// Equal operator. Uses strEqual() (exact comparison that accepts NULL
+        /// strings) rather than strMatch(), whose second argument is a pattern.
+        bool operator==( const String & str ) const
+        {
+            return strEqual(str.data, data);
+        }
+
+        /// Equal operator.
+        bool operator==( const char * str ) const
+        {
+            return strEqual(str, data);
+        }
+
+        /// Not equal operator.
+        bool operator!=( const String & str ) const
+        {
+            return !strEqual(str.data, data);
+        }
+
+        /// Not equal operator.
+        bool operator!=( const char * str ) const
+        {
+            return !strEqual(str, data);
+        }
+
+        /// Returns true if this string is the null string.
+        bool isNull() const { return data == NULL; }
+
+        /// Return the exact length.
+        uint length() const { nvDebugCheck(data != NULL); return strLen(data); }
+
+        /// Return the hash of the string.
+        uint hash() const { nvDebugCheck(data != NULL); return strHash(data); }
+
+        /// const char * cast operator.
+        operator const char * () const { return data; }
+
+        /// Get string pointer.
+        const char * str() const { return data; }
+
+
+    private:
+
+        // Add reference count.
+        void addRef();
+
+        // Decrease reference count.
+        void release();
+
+        // Read the count stored in the two bytes before the characters.
+        uint16 getRefCount() const
+        {
+            nvDebugCheck(data != NULL);
+            return *reinterpret_cast<const uint16 *>(data - 2);
+        }
+
+        // Store the count in the two bytes before the characters.
+        void setRefCount(uint16 count) {
+            nvDebugCheck(data != NULL);
+            nvCheck(count < 0xFFFF);
+            *reinterpret_cast<uint16 *>(const_cast<char *>(data - 2)) = uint16(count);
+        }
+
+        // str points at the start of the allocation; the characters begin two
+        // bytes in, after the reference count.
+        void setData(const char * str) {
+            data = str + 2;
+        }
+
+        void allocString(const char * str)
+        {
+            allocString(str, strLen(str));
+        }
+
+        void allocString(const char * str, uint length);
+
+        void setString(const char * str);
+        void setString(const char * str, uint length);
+        void setString(const StringBuilder & str);
+
+        // Swap strings.
+        friend void swap(String & a, String & b);
+
+    private:
+
+        // Points at the characters; NULL for the null string.
+        const char * data;
+
+    };
+
+    /// Hash functor for String: forwards to String::hash().
+    template <> struct Hash<String> {
+        uint operator()(const String & str) const
+        {
+            return str.hash();
+        }
+    };
+
+
+    // Scoped owner of a `const char` array, in the spirit of AutoPtr: the held
+    // pointer is released with delete [] when the owner goes away.
+    class AutoString
+    {
+        NV_FORBID_COPY(AutoString);
+        NV_FORBID_HEAPALLOC();
+    public:
+
+        // Ctor. Takes ownership of p (may be NULL).
+        AutoString(const char * p = NULL) : m_ptr(p) { }
+
+#if NV_CC_CPP11
+        // Move ctor. Steals the pointer and leaves the source empty.
+        AutoString(AutoString && ap) : m_ptr(ap.m_ptr) { ap.m_ptr = NULL; }
+#endif
+        
+        // Dtor. Deletes owned pointer.
+        ~AutoString() {
+            delete [] m_ptr;
+            m_ptr = NULL;
+        }
+
+        // Delete owned pointer and assign new one. Re-assigning the pointer that
+        // is already held is a no-op.
+        void operator=(const char * p) {
+            if (p == m_ptr) {
+                return;
+            }
+            delete [] m_ptr;
+            m_ptr = p;
+        }
+
+        // Get pointer.
+        const char * ptr() const { return m_ptr; }
+        operator const char *() const { return m_ptr; }
+
+        // Relinquish ownership of the underlying pointer and returns that pointer.
+        const char * release() {
+            const char * const released = m_ptr;
+            m_ptr = NULL;
+            return released;
+        }
+
+        // Comparison operators: these compare pointer values, not characters.
+        friend bool operator == (const AutoString & ap, const char * const p) {
+            return (ap.m_ptr == p);
+        }
+        friend bool operator != (const AutoString & ap, const char * const p) {
+            return (ap.m_ptr != p);
+        }
+        friend bool operator == (const char * const p, const AutoString & ap) {
+            return (ap.m_ptr == p);
+        }
+        friend bool operator != (const char * const p, const AutoString & ap) {
+            return (ap.m_ptr != p);
+        }
+
+    private:
+        const char * m_ptr;
+    };
+
+} // nv namespace
+
+#endif // NV_CORE_STRING_H
--- a/3rdparty/nvtt/nvcore/utils.h
+++ b/3rdparty/nvtt/nvcore/utils.h
@ -0,0 +1,281 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_UTILS_H
+#define NV_CORE_UTILS_H
+
+#include "debug.h" // nvdebugcheck
+
+#include <new> // for placement new
+
+
+// Just in case. Grrr.
+#undef min
+#undef max
+
+// Limits of the fixed-width integer types.
+#define NV_INT8_MIN    (-128)
+#define NV_INT8_MAX    127
+#define NV_UINT8_MAX    255
+#define NV_INT16_MIN    (-32767-1)
+#define NV_INT16_MAX    32767
+#define NV_UINT16_MAX   0xffff
+#define NV_INT32_MIN    (-2147483647-1)
+#define NV_INT32_MAX    2147483647
+#define NV_UINT32_MAX   0xffffffff
+#define NV_INT64_MAX    POSH_I64(9223372036854775807)
+#define NV_INT64_MIN    (-POSH_I64(9223372036854775807)-1)
+#define NV_UINT64_MAX   POSH_U64(0xffffffffffffffff)
+
+#define NV_HALF_MAX     65504.0F
+#define NV_FLOAT_MAX    3.402823466e+38F
+
+// 2^24. Largest integer such that it and all smaller integers can be stored in a 32bit float
+// (a float has a 24 bit significand, so 16777217 is the first integer that is not representable).
+#define NV_INTEGER_TO_FLOAT_MAX  16777216
+
+
+namespace nv
+{
+    // Less error prone than casting. From CB:
+    // http://cbloomrants.blogspot.com/2011/06/06-17-11-c-casting-is-devil.html
+
+    // These intentionally look like casts.
+
+    // uint32 casts:
+    // The generic version relies on the implicit conversion; the specializations
+    // below range-check the value with nvDebugCheck before converting.
+    template <typename T> inline uint32 U32(T x) { return x; }
+    template <> inline uint32 U32<uint64>(uint64 x)
+    {
+        nvDebugCheck(x <= NV_UINT32_MAX);
+        return static_cast<uint32>(x);
+    }
+    template <> inline uint32 U32<int64>(int64 x)
+    {
+        nvDebugCheck(x >= 0 && x <= NV_UINT32_MAX);
+        return static_cast<uint32>(x);
+    }
+    template <> inline uint32 U32<int32>(int32 x)
+    {
+        nvDebugCheck(x >= 0);
+        return static_cast<uint32>(x);
+    }
+    template <> inline uint32 U32<int16>(int16 x)
+    {
+        nvDebugCheck(x >= 0);
+        return static_cast<uint32>(x);
+    }
+    template <> inline uint32 U32<int8>(int8 x)
+    {
+        nvDebugCheck(x >= 0);
+        return static_cast<uint32>(x);
+    }
+    // uint32, uint16 and uint8 always fit and use the generic version.
+
+    // int32 casts:
+    // The generic version relies on the implicit conversion; the specializations
+    // range-check the value with nvDebugCheck before converting.
+    template <typename T> inline int32 I32(T x) { return x; }
+    template <> inline int32 I32<uint64>(uint64 x) { nvDebugCheck(x <= NV_INT32_MAX); return (int32)x; }
+    // The upper bound is the signed maximum: values above NV_INT32_MAX do not fit in an int32.
+    template <> inline int32 I32<int64>(int64 x) { nvDebugCheck(x >= NV_INT32_MIN && x <= NV_INT32_MAX); return (int32)x; }
+    template <> inline int32 I32<uint32>(uint32 x) { nvDebugCheck(x <= NV_INT32_MAX); return (int32)x; }
+    //template <> inline int32 I32<int32>(int32 x) { return x; }
+    //template <> inline int32 I32<uint16>(uint16 x) { return x; }
+    //template <> inline int32 I32<int16>(int16 x) { return x; }
+    //template <> inline int32 I32<uint8>(uint8 x) { return x; }
+    //template <> inline int32 I32<int8>(int8 x) { return x; }
+
+    // uint16 casts:
+    // The generic version relies on the implicit conversion; the specializations
+    // below range-check the value with nvDebugCheck before converting.
+    template <typename T> inline uint16 U16(T x) { return x; }
+    template <> inline uint16 U16<uint64>(uint64 x)
+    {
+        nvDebugCheck(x <= NV_UINT16_MAX);
+        return static_cast<uint16>(x);
+    }
+    template <> inline uint16 U16<int64>(int64 x)
+    {
+        nvDebugCheck(x >= 0 && x <= NV_UINT16_MAX);
+        return static_cast<uint16>(x);
+    }
+    template <> inline uint16 U16<uint32>(uint32 x)
+    {
+        nvDebugCheck(x <= NV_UINT16_MAX);
+        return static_cast<uint16>(x);
+    }
+    template <> inline uint16 U16<int32>(int32 x)
+    {
+        nvDebugCheck(x >= 0 && x <= NV_UINT16_MAX);
+        return static_cast<uint16>(x);
+    }
+    template <> inline uint16 U16<int16>(int16 x)
+    {
+        nvDebugCheck(x >= 0);
+        return static_cast<uint16>(x);
+    }
+    template <> inline uint16 U16<int8>(int8 x)
+    {
+        nvDebugCheck(x >= 0);
+        return static_cast<uint16>(x);
+    }
+    // uint16 and uint8 always fit and use the generic version.
+
+    // int16 casts:
+    // The generic version relies on the implicit conversion; the specializations
+    // range-check the value with nvDebugCheck before converting. Signed sources are
+    // checked against the signed maximum: values above NV_INT16_MAX do not fit in an int16.
+    template <typename T> inline int16 I16(T x) { return x; }
+    template <> inline int16 I16<uint64>(uint64 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; }
+    template <> inline int16 I16<int64>(int64 x) { nvDebugCheck(x >= NV_INT16_MIN && x <= NV_INT16_MAX); return (int16)x; }
+    template <> inline int16 I16<uint32>(uint32 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; }
+    template <> inline int16 I16<int32>(int32 x) { nvDebugCheck(x >= NV_INT16_MIN && x <= NV_INT16_MAX); return (int16)x; }
+    template <> inline int16 I16<uint16>(uint16 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; }
+    //template <> inline int16 I16<int16>(int16 x) { return x; }
+    //template <> inline int16 I16<uint8>(uint8 x) { return x; }
+    //template <> inline int16 I16<int8>(int8 x) { return x; }
+
+    // uint8 casts:
+    // The generic version relies on the implicit conversion; the specializations
+    // below range-check the value with nvDebugCheck before converting.
+    template <typename T> inline uint8 U8(T x) { return x; }
+    template <> inline uint8 U8<uint64>(uint64 x)
+    {
+        nvDebugCheck(x <= NV_UINT8_MAX);
+        return static_cast<uint8>(x);
+    }
+    template <> inline uint8 U8<int64>(int64 x)
+    {
+        nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX);
+        return static_cast<uint8>(x);
+    }
+    template <> inline uint8 U8<uint32>(uint32 x)
+    {
+        nvDebugCheck(x <= NV_UINT8_MAX);
+        return static_cast<uint8>(x);
+    }
+    template <> inline uint8 U8<int32>(int32 x)
+    {
+        nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX);
+        return static_cast<uint8>(x);
+    }
+    template <> inline uint8 U8<uint16>(uint16 x)
+    {
+        nvDebugCheck(x <= NV_UINT8_MAX);
+        return static_cast<uint8>(x);
+    }
+    template <> inline uint8 U8<int16>(int16 x)
+    {
+        nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX);
+        return static_cast<uint8>(x);
+    }
+    template <> inline uint8 U8<int8>(int8 x)
+    {
+        nvDebugCheck(x >= 0);
+        return static_cast<uint8>(x);
+    }
+    // uint8 uses the generic version; there is no checked float specialization.
+
+    // int8 casts:
+    // The generic version relies on the implicit conversion; the specializations
+    // range-check the value with nvDebugCheck before converting. Signed sources are
+    // checked against the signed maximum: values above NV_INT8_MAX do not fit in an int8.
+    template <typename T> inline int8 I8(T x) { return x; }
+    template <> inline int8 I8<uint64>(uint64 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<int64>(int64 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<uint32>(uint32 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<int32>(int32 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<uint16>(uint16 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<int16>(int16 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<uint8>(uint8 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    //template <> inline int8 I8<int8>(int8 x) { return x; }
+
+    // float casts:
+    // The generic version relies on the implicit conversion; the 32 and 64 bit
+    // integer specializations check the value against NV_INTEGER_TO_FLOAT_MAX
+    // with nvDebugCheck before converting.
+    template <typename T> inline float F32(T x) { return x; }
+    template <> inline float F32<uint64>(uint64 x)
+    {
+        nvDebugCheck(x <= NV_INTEGER_TO_FLOAT_MAX);
+        return static_cast<float>(x);
+    }
+    template <> inline float F32<int64>(int64 x)
+    {
+        nvDebugCheck(x >= -NV_INTEGER_TO_FLOAT_MAX && x <= NV_INTEGER_TO_FLOAT_MAX);
+        return static_cast<float>(x);
+    }
+    template <> inline float F32<uint32>(uint32 x)
+    {
+        nvDebugCheck(x <= NV_INTEGER_TO_FLOAT_MAX);
+        return static_cast<float>(x);
+    }
+    template <> inline float F32<int32>(int32 x)
+    {
+        nvDebugCheck(x >= -NV_INTEGER_TO_FLOAT_MAX && x <= NV_INTEGER_TO_FLOAT_MAX);
+        return static_cast<float>(x);
+    }
+    // The compiler should not complain about 8 and 16 bit integer conversions,
+    // so those use the generic version.
+
+
+    /// Exchange the values of @a a and @a b through a temporary copy.
+    template <typename T> 
+    inline void swap(T & a, T & b)
+    {
+        T previous_a(a);
+        a = b;
+        b = previous_a;
+    }
+
+    /// Return the larger of the two arguments; @a b is returned unless `b < a` holds.
+    /// For floating point values this yields the second value when the first is NaN.
+    template <typename T> 
+    inline T max(const T & a, const T & b)
+    {
+        if (b < a) {
+            return a;
+        }
+        return b;
+    }
+
+	/// Return the largest of the four arguments: the max of (a, b) against the max of (c, d).
+	template <typename T> 
+	inline T max4(const T & a, const T & b, const T & c, const T & d)
+	{
+		const T first_pair = max(a, b);
+		const T second_pair = max(c, d);
+		return max(first_pair, second_pair);
+	}
+
+    /// Return the largest of the three arguments: @a a against the max of (b, c).
+    template <typename T> 
+    inline T max3(const T & a, const T & b, const T & c)
+    {
+        const T tail = max(b, c);
+        return max(a, tail);
+    }
+
+    /// Return the smaller of the two arguments; @a b is returned unless `a < b` holds.
+    template <typename T> 
+    inline T min(const T & a, const T & b)
+    {
+        if (a < b) {
+            return a;
+        }
+        return b;
+    }
+
+    /// Return the smallest of the three arguments: @a a against the min of (b, c).
+    template <typename T> 
+    inline T min3(const T & a, const T & b, const T & c)
+    {
+        const T tail = min(b, c);
+        return min(a, tail);
+    }
+
+    /// Clamp @a x between @a a and @a b: raise it to at least @a a, then cap it at @a b.
+    template <typename T> 
+    inline T clamp(const T & x, const T & a, const T & b)
+    {
+        const T raised = max(x, a);
+        return min(raised, b);
+    }
+
+    /** Round @a x up to the next power of two.
+    * @see http://graphics.stanford.edu/~seander/bithacks.html
+    * @warning Behaviour for 0 is undefined.
+    * @note isPowerOfTwo(x) == true -> nextPowerOfTwo(x) == x
+    * @note nextPowerOfTwo(x) = 2 << log2(x-1)
+    */
+    inline uint nextPowerOfTwo( uint x )
+    {
+        nvDebugCheck( x != 0 );
+
+        // Copy the highest set bit of x-1 into every lower bit (shifts of
+        // 1, 2, 4, 8 and 16), then add one to reach the power of two.
+        uint bits = x - 1;
+        for( uint shift = 1; shift <= 16; shift += shift ) {
+            bits |= bits >> shift;
+        }
+        return bits + 1;
+    }
+
+    /// Return true if @a n has at most one bit set, i.e. it is a power of two.
+    /// Note that 0 also yields true.
+    inline bool isPowerOfTwo( uint n )
+    {
+        const uint without_lowest_bit = n & (n - 1);
+        return without_lowest_bit == 0;
+    }
+
+
+    // @@ Move this to utils?
+    /// Call delete on every element of @a container, walking it with the
+    /// container's PseudoIndex protocol (start / isDone / advance).
+    template <typename T>
+    void deleteAll(T & container)
+    {
+        typename T::PseudoIndex cursor = container.start();
+        while (!container.isDone(cursor))
+        {
+            delete container[cursor];
+            container.advance(cursor);
+        }
+    }
+
+
+
+    // @@ Specialize these methods for numeric, pointer, and pod types.
+
+    /// Default-construct, with placement new, the elements [old_size, new_size) of @a ptr.
+    template <typename T>
+    void construct_range(T * restrict ptr, uint new_size, uint old_size) {
+        uint slot = old_size;
+        while (slot < new_size) {
+            new(ptr+slot) T; // placement new
+            slot++;
+        }
+    }
+
+    /// Copy-construct, with placement new, the elements [old_size, new_size) of @a ptr from @a elem.
+    template <typename T>
+    void construct_range(T * restrict ptr, uint new_size, uint old_size, const T & elem) {
+        uint slot = old_size;
+        while (slot < new_size) {
+            new(ptr+slot) T(elem); // placement new
+            slot++;
+        }
+    }
+
+    /// Copy-construct, with placement new, the elements [old_size, new_size) of @a ptr
+    /// from the elements of @a src at the same indices.
+    template <typename T>
+    void construct_range(T * restrict ptr, uint new_size, uint old_size, const T * src) {
+        uint slot = old_size;
+        while (slot < new_size) {
+            new(ptr+slot) T(src[slot]); // placement new
+            slot++;
+        }
+    }
+
+    /// Run the destructor of the elements [new_size, old_size) of @a ptr.
+    template <typename T>
+    void destroy_range(T * restrict ptr, uint new_size, uint old_size) {
+        uint slot = new_size;
+        while (slot < old_size) {
+            ptr[slot].~T(); // Explicit call to the destructor
+            slot++;
+        }
+    }
+
+    /// Assign @a value to the first @a count elements of @a dst.
+    template <typename T>
+    void fill(T * restrict dst, uint count, const T & value) {
+        uint slot = 0;
+        while (slot < count) {
+            dst[slot] = value;
+            slot++;
+        }
+    }
+
+    /// Assign the first @a count elements of @a src to @a dst, in increasing index order.
+    template <typename T>
+    void copy_range(T * restrict dst, const T * restrict src, uint count) {
+        uint slot = 0;
+        while (slot < count) {
+            dst[slot] = src[slot];
+            slot++;
+        }
+    }
+
+    /// Linear search for @a element in ptr[begin, end). On the first match the
+    /// position is written to @a index (when it is not NULL) and true is returned;
+    /// false is returned, with @a index untouched, when nothing matches.
+    template <typename T>
+    bool find(const T & element, const T * restrict ptr, uint begin, uint end, uint * index) {
+        for (uint pos = begin; pos < end; pos++) {
+            if (!(ptr[pos] == element)) {
+                continue;
+            }
+            if (index != NULL) {
+                *index = pos;
+            }
+            return true;
+        }
+        return false;
+    }
+
+} // nv namespace
+
+#endif // NV_CORE_UTILS_H
--- a/3rdparty/nvtt/nvmath/Vector.inl
+++ b/3rdparty/nvtt/nvmath/Vector.inl
@ -0,0 +1,921 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_MATH_VECTOR_INL
+#define NV_MATH_VECTOR_INL
+
+#include "vector.h"
+#include "nvcore/utils.h" // min, max
+#include "nvcore/hash.h" // hash
+
+namespace nv
+{
+
+    // Helpers to convert vector types. Assume T has x,y members and 2 argument constructor.
+    //template <typename T> T to(Vector2::Arg v) { return T(v.x, v.y); }
+
+    // Helpers to convert vector types. Assume T has x,y,z members and 3 argument constructor.
+    //template <typename T> T to(Vector3::Arg v) { return T(v.x, v.y, v.z); }
+
+    // Helpers to convert vector types. Assume T has x,y,z,w members and 4 argument constructor.
+    //template <typename T> T to(Vector4::Arg v) { return T(v.x, v.y, v.z, v.w); }
+
+
+    // Vector2
+    inline Vector2::Vector2() {}
+    inline Vector2::Vector2(float f) : x(f), y(f) {}
+    inline Vector2::Vector2(float x, float y) : x(x), y(y) {}
+    inline Vector2::Vector2(Vector2::Arg v) : x(v.x), y(v.y) {}
+
+    inline const Vector2 & Vector2::operator=(Vector2::Arg v)
+    {
+        x = v.x;
+        y = v.y;
+        return *this;
+    }
+
+    inline const float * Vector2::ptr() const
+    {
+        return &x;
+    }
+
+    inline void Vector2::set(float x, float y)
+    {
+        this->x = x;
+        this->y = y;
+    }
+
+    inline Vector2 Vector2::operator-() const
+    {
+        return Vector2(-x, -y);
+    }
+
+    inline void Vector2::operator+=(Vector2::Arg v)
+    {
+        x += v.x;
+        y += v.y;
+    }
+
+    inline void Vector2::operator-=(Vector2::Arg v)
+    {
+        x -= v.x;
+        y -= v.y;
+    }
+
+    inline void Vector2::operator*=(float s)
+    {
+        x *= s;
+        y *= s;
+    }
+
+    inline void Vector2::operator*=(Vector2::Arg v)
+    {
+        x *= v.x;
+        y *= v.y;
+    }
+
+    inline bool operator==(Vector2::Arg a, Vector2::Arg b)
+    {
+        return a.x == b.x && a.y == b.y; 
+    }
+    inline bool operator!=(Vector2::Arg a, Vector2::Arg b)
+    {
+        return a.x != b.x || a.y != b.y; 
+    }
+
+
+    // Vector3
+    inline Vector3::Vector3() {}
+    inline Vector3::Vector3(float f) : x(f), y(f), z(f) {}
+    inline Vector3::Vector3(float x, float y, float z) : x(x), y(y), z(z) {}
+    inline Vector3::Vector3(Vector2::Arg v, float z) : x(v.x), y(v.y), z(z) {}
+    inline Vector3::Vector3(Vector3::Arg v) : x(v.x), y(v.y), z(v.z) {}
+
+    inline const Vector3 & Vector3::operator=(Vector3::Arg v)
+    {
+        x = v.x;
+        y = v.y;
+        z = v.z;
+        return *this;
+    }
+
+
+    inline Vector2 Vector3::xy() const
+    {
+        return Vector2(x, y);
+    }
+
+    inline const float * Vector3::ptr() const
+    {
+        return &x;
+    }
+
+    inline void Vector3::set(float x, float y, float z)
+    {
+        this->x = x;
+        this->y = y;
+        this->z = z;
+    }
+
+    inline Vector3 Vector3::operator-() const
+    {
+        return Vector3(-x, -y, -z);
+    }
+
+    inline void Vector3::operator+=(Vector3::Arg v)
+    {
+        x += v.x;
+        y += v.y;
+        z += v.z;
+    }
+
+    inline void Vector3::operator-=(Vector3::Arg v)
+    {
+        x -= v.x;
+        y -= v.y;
+        z -= v.z;
+    }
+
+    inline void Vector3::operator*=(float s)
+    {
+        x *= s;
+        y *= s;
+        z *= s;
+    }
+
+    // Divides every component by s. The reciprocal is computed once and the
+    // components are multiplied by it (Vector4::operator/= divides each component instead).
+    inline void Vector3::operator/=(float s)
+    {
+        const float inv = 1.0f / s;
+        this->x *= inv;
+        this->y *= inv;
+        this->z *= inv;
+    }
+
+    inline void Vector3::operator*=(Vector3::Arg v)
+    {
+        x *= v.x;
+        y *= v.y;
+        z *= v.z;
+    }
+
+    inline void Vector3::operator/=(Vector3::Arg v)
+    {
+        x /= v.x;
+        y /= v.y;
+        z /= v.z;
+    }
+
+    inline bool operator==(Vector3::Arg a, Vector3::Arg b)
+    {
+        return a.x == b.x && a.y == b.y && a.z == b.z; 
+    }
+    inline bool operator!=(Vector3::Arg a, Vector3::Arg b)
+    {
+        return a.x != b.x || a.y != b.y || a.z != b.z; 
+    }
+
+
+    // Vector4
+    inline Vector4::Vector4() {}
+    inline Vector4::Vector4(float f) : x(f), y(f), z(f), w(f) {}
+    inline Vector4::Vector4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}
+    inline Vector4::Vector4(Vector2::Arg v, float z, float w) : x(v.x), y(v.y), z(z), w(w) {}
+    inline Vector4::Vector4(Vector2::Arg v, Vector2::Arg u) : x(v.x), y(v.y), z(u.x), w(u.y) {}
+    inline Vector4::Vector4(Vector3::Arg v, float w) : x(v.x), y(v.y), z(v.z), w(w) {}
+    inline Vector4::Vector4(Vector4::Arg v) : x(v.x), y(v.y), z(v.z), w(v.w) {}
+
+    inline const Vector4 & Vector4::operator=(const Vector4 & v)
+    {
+        x = v.x;
+        y = v.y;
+        z = v.z;
+        w = v.w;
+        return *this;
+    }
+
+    inline Vector2 Vector4::xy() const
+    {
+        return Vector2(x, y);
+    }
+
+    inline Vector2 Vector4::zw() const
+    {
+        return Vector2(z, w);
+    }
+
+    inline Vector3 Vector4::xyz() const
+    {
+        return Vector3(x, y, z);
+    }
+
+    inline const float * Vector4::ptr() const
+    {
+        return &x;
+    }
+
+    inline void Vector4::set(float x, float y, float z, float w)
+    {
+        this->x = x;
+        this->y = y;
+        this->z = z;
+        this->w = w;
+    }
+
+    inline Vector4 Vector4::operator-() const
+    {
+        return Vector4(-x, -y, -z, -w);
+    }
+
+    inline void Vector4::operator+=(Vector4::Arg v)
+    {
+        x += v.x;
+        y += v.y;
+        z += v.z;
+        w += v.w;
+    }
+
+    inline void Vector4::operator-=(Vector4::Arg v)
+    {
+        x -= v.x;
+        y -= v.y;
+        z -= v.z;
+        w -= v.w;
+    }
+
+    inline void Vector4::operator*=(float s)
+    {
+        x *= s;
+        y *= s;
+        z *= s;
+        w *= s;
+    }
+
+    inline void Vector4::operator/=(float s)
+    {
+        x /= s;
+        y /= s;
+        z /= s;
+        w /= s;
+    }
+
+    inline void Vector4::operator*=(Vector4::Arg v)
+    {
+        x *= v.x;
+        y *= v.y;
+        z *= v.z;
+        w *= v.w;
+    }
+
+    inline void Vector4::operator/=(Vector4::Arg v)
+    {
+        x /= v.x;
+        y /= v.y;
+        z /= v.z;
+        w /= v.w;
+    }
+
+    inline bool operator==(Vector4::Arg a, Vector4::Arg b)
+    {
+        return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; 
+    }
+    inline bool operator!=(Vector4::Arg a, Vector4::Arg b)
+    {
+        return a.x != b.x || a.y != b.y || a.z != b.z || a.w != b.w; 
+    }
+
+
+
+    // Functions
+
+
+    // Vector2
+
+    inline Vector2 add(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(a.x + b.x, a.y + b.y);
+    }
+    inline Vector2 operator+(Vector2::Arg a, Vector2::Arg b)
+    {
+        return add(a, b);
+    }
+
+    inline Vector2 sub(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(a.x - b.x, a.y - b.y);
+    }
+    inline Vector2 operator-(Vector2::Arg a, Vector2::Arg b)
+    {
+        return sub(a, b);
+    }
+
+    inline Vector2 scale(Vector2::Arg v, float s)
+    {
+        return Vector2(v.x * s, v.y * s);
+    }
+
+    inline Vector2 scale(Vector2::Arg v, Vector2::Arg s)
+    {
+        return Vector2(v.x * s.x, v.y * s.y);
+    }
+
+    inline Vector2 operator*(Vector2::Arg v, float s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator*(Vector2::Arg v1, Vector2::Arg v2)
+    {
+        return Vector2(v1.x*v2.x, v1.y*v2.y);
+    }
+
+    inline Vector2 operator*(float s, Vector2::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator/(Vector2::Arg v, float s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector2 lerp(Vector2::Arg v1, Vector2::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector2(v1.x * s + t * v2.x, v1.y * s + t * v2.y);
+    }
+
+    inline float dot(Vector2::Arg a, Vector2::Arg b)
+    {
+        return a.x * b.x + a.y * b.y;
+    }
+
+    inline float lengthSquared(Vector2::Arg v)
+    {
+        return v.x * v.x + v.y * v.y;
+    }
+
+    inline float length(Vector2::Arg v)
+    {
+        return sqrtf(lengthSquared(v));
+    }
+
+    inline float distance(Vector2::Arg a, Vector2::Arg b)
+    {
+        return length(a - b);
+    }
+
+    inline float inverseLength(Vector2::Arg v)
+    {
+        return 1.0f / sqrtf(lengthSquared(v));
+    }
+
+    inline bool isNormalized(Vector2::Arg v, float epsilon = NV_NORMAL_EPSILON)
+    {
+        return equal(length(v), 1, epsilon);
+    }
+
+    // Returns v scaled by 1/length(v).
+    // nvDebugCheck asserts that the length is not within `epsilon` of zero and that the
+    // result passes isNormalized(); NV_UNUSED keeps `epsilon` referenced when the check
+    // macro expands to nothing.
+    inline Vector2 normalize(Vector2::Arg v, float epsilon = NV_EPSILON)
+    {
+        const float len = length(v);
+        NV_UNUSED(epsilon);
+        nvDebugCheck(!isZero(len, epsilon));
+        const float invLen = 1.0f / len;
+        Vector2 unit = scale(v, invLen);
+        nvDebugCheck(isNormalized(unit));
+        return unit;
+    }
+
+    // Returns v scaled to unit length, or `fallback` when length(v) is within
+    // `epsilon` of zero.
+    inline Vector2 normalizeSafe(Vector2::Arg v, Vector2::Arg fallback, float epsilon = NV_EPSILON)
+    {
+        const float len = length(v);
+        if (!isZero(len, epsilon)) {
+            return scale(v, 1.0f / len);
+        }
+        return fallback;
+    }
+
+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector2 normalizeFast(Vector2::Arg v)
+    {
+        // A tiny bias is added to the length so the divisor is never exactly zero.
+        const float tiny = 1.0e-037f;
+        const float biasedLength = tiny + length(v);
+        const float invLength = 1.0f / biasedLength;
+        return scale(v, invLength);
+    }
+
+    inline bool equal(Vector2::Arg v1, Vector2::Arg v2, float epsilon = NV_EPSILON)
+    {
+        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon);
+    }
+
+    inline Vector2 min(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(min(a.x, b.x), min(a.y, b.y));
+    }
+
+    inline Vector2 max(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(max(a.x, b.x), max(a.y, b.y));
+    }
+
+    inline Vector2 clamp(Vector2::Arg v, float min, float max)
+    {
+        return Vector2(clamp(v.x, min, max), clamp(v.y, min, max));
+    }
+
+    inline Vector2 saturate(Vector2::Arg v)
+    {
+        return Vector2(saturate(v.x), saturate(v.y));
+    }
+
+    inline bool isFinite(Vector2::Arg v)
+    {
+        return isFinite(v.x) && isFinite(v.y);
+    }
+
+    // Returns `fallback` if any component of v is not finite; otherwise returns a
+    // copy of v passed through nv::floatCleanup (which turns -0.0f into 0.0f).
+    inline Vector2 validate(Vector2::Arg v, Vector2::Arg fallback = Vector2(0.0f))
+    {
+        if (isFinite(v)) {
+            Vector2 cleaned = v;
+            nv::floatCleanup(cleaned.component, 2);
+            return cleaned;
+        }
+        return fallback;
+    }
+
+    // 2D cross product of v0 and v1. This is TWICE the signed area of the triangle
+    // spanned by the two vectors; the 0.5 factor is deliberately left out.
+    inline float triangleArea(Vector2::Arg v0, Vector2::Arg v1)
+    {
+        const float doubledArea = v0.x * v1.y - v0.y * v1.x; // * 0.5f;
+        return doubledArea;
+    }
+    inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c)
+    {
+        // IC: The fully expanded expression
+        //   (c.x * a.y + a.x * b.y + b.x * c.y - b.x * a.y - c.x * b.y - a.x * c.y) // * 0.5f
+        // is tempting but is a terrible idea: small triangles far from the origin produce
+        // fairly large intermediate floating point numbers, and the result becomes very
+        // unstable and dependent on the order of the factors.
+        //
+        // It is preferable to subtract the vertices first and multiply the resulting small
+        // values together, i.e.
+        //   ((a.x - c.x) * (b.y - c.y) - (a.y - c.y) * (b.x - c.x)) // * 0.5f
+        // The result is then much more accurate (as long as the triangle is small) and
+        // less dependent on the location of the triangle.
+        const Vector2 ac = a - c;
+        const Vector2 bc = b - c;
+        return triangleArea(ac, bc);
+    }
+
+
+    template <>
+    inline uint hash(const Vector2 & v, uint h)
+    {
+        return sdbmFloatHash(v.component, 2, h);
+    }
+
+
+
+    // Vector3
+
+    inline Vector3 add(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(a.x + b.x, a.y + b.y, a.z + b.z);
+    }
+    inline Vector3 add(Vector3::Arg a, float b)
+    {
+        return Vector3(a.x + b, a.y + b, a.z + b);
+    }
+    inline Vector3 operator+(Vector3::Arg a, Vector3::Arg b)
+    {
+        return add(a, b);
+    }
+    inline Vector3 operator+(Vector3::Arg a, float b)
+    {
+        return add(a, b);
+    }
+
+    inline Vector3 sub(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
+    }
+    inline Vector3 sub(Vector3::Arg a, float b)
+    {
+        return Vector3(a.x - b, a.y - b, a.z - b);
+    }
+    inline Vector3 operator-(Vector3::Arg a, Vector3::Arg b)
+    {
+        return sub(a, b);
+    }
+    inline Vector3 operator-(Vector3::Arg a, float b)
+    {
+        return sub(a, b);
+    }
+
+    inline Vector3 cross(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+    }
+
+    inline Vector3 scale(Vector3::Arg v, float s)
+    {
+        return Vector3(v.x * s, v.y * s, v.z * s);
+    }
+
+    inline Vector3 scale(Vector3::Arg v, Vector3::Arg s)
+    {
+        return Vector3(v.x * s.x, v.y * s.y, v.z * s.z);
+    }
+
+    inline Vector3 operator*(Vector3::Arg v, float s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator*(float s, Vector3::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator*(Vector3::Arg v, Vector3::Arg s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, float s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    /*inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, float s)
+    {
+        return Vector3(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s);
+    }*/
+
+    inline Vector3 lerp(Vector3::Arg v1, Vector3::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector3(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z);
+    }
+
+    inline float dot(Vector3::Arg a, Vector3::Arg b)
+    {
+        return a.x * b.x + a.y * b.y + a.z * b.z;
+    }
+
+    inline float lengthSquared(Vector3::Arg v)
+    {
+        return v.x * v.x + v.y * v.y + v.z * v.z;
+    }
+
+    inline float length(Vector3::Arg v)
+    {
+        return sqrtf(lengthSquared(v));
+    }
+
+    inline float distance(Vector3::Arg a, Vector3::Arg b)
+    {
+        return length(a - b);
+    }
+
+    inline float distanceSquared(Vector3::Arg a, Vector3::Arg b)
+    {
+        return lengthSquared(a - b);
+    }
+
+    inline float inverseLength(Vector3::Arg v)
+    {
+        return 1.0f / sqrtf(lengthSquared(v));
+    }
+
+    inline bool isNormalized(Vector3::Arg v, float epsilon = NV_NORMAL_EPSILON)
+    {
+        return equal(length(v), 1, epsilon);
+    }
+
+    // Returns v scaled by 1/length(v).
+    // nvDebugCheck asserts that the length is not within `epsilon` of zero and that the
+    // result passes isNormalized(); NV_UNUSED keeps `epsilon` referenced when the check
+    // macro expands to nothing.
+    inline Vector3 normalize(Vector3::Arg v, float epsilon = NV_EPSILON)
+    {
+        const float len = length(v);
+        NV_UNUSED(epsilon);
+        nvDebugCheck(!isZero(len, epsilon));
+        const float invLen = 1.0f / len;
+        Vector3 unit = scale(v, invLen);
+        nvDebugCheck(isNormalized(unit));
+        return unit;
+    }
+
+    // Returns v scaled to unit length, or `fallback` when length(v) is within
+    // `epsilon` of zero.
+    inline Vector3 normalizeSafe(Vector3::Arg v, Vector3::Arg fallback, float epsilon = NV_EPSILON)
+    {
+        const float len = length(v);
+        if (!isZero(len, epsilon)) {
+            return scale(v, 1.0f / len);
+        }
+        return fallback;
+    }
+
+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector3 normalizeFast(Vector3::Arg v)
+    {
+        // A tiny bias is added to the length so the divisor is never exactly zero.
+        const float tiny = 1.0e-037f;
+        const float biasedLength = tiny + length(v);
+        const float invLength = 1.0f / biasedLength;
+        return scale(v, invLength);
+    }
+
+    inline bool equal(Vector3::Arg v1, Vector3::Arg v2, float epsilon = NV_EPSILON)
+    {
+        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon);
+    }
+
+    inline Vector3 min(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+    }
+
+    inline Vector3 max(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+    }
+
+    inline Vector3 clamp(Vector3::Arg v, float min, float max)
+    {
+        return Vector3(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max));
+    }
+
+    inline Vector3 saturate(Vector3::Arg v)
+    {
+        return Vector3(saturate(v.x), saturate(v.y), saturate(v.z));
+    }
+
+    inline Vector3 floor(Vector3::Arg v)
+    {
+        return Vector3(floorf(v.x), floorf(v.y), floorf(v.z));
+    }
+
+    inline Vector3 ceil(Vector3::Arg v)
+    {
+        return Vector3(ceilf(v.x), ceilf(v.y), ceilf(v.z));
+    }
+
+    inline bool isFinite(Vector3::Arg v)
+    {
+        return isFinite(v.x) && isFinite(v.y) && isFinite(v.z);
+    }
+
+    // Returns `fallback` if any component of v is not finite; otherwise returns a
+    // copy of v passed through nv::floatCleanup (which turns -0.0f into 0.0f).
+    inline Vector3 validate(Vector3::Arg v, Vector3::Arg fallback = Vector3(0.0f))
+    {
+        if (isFinite(v)) {
+            Vector3 cleaned = v;
+            nv::floatCleanup(cleaned.component, 3);
+            return cleaned;
+        }
+        return fallback;
+    }
+
+    // Computes v - 2 * dot(v, n) * n.
+    // NOTE(review): this is a mirror reflection only when n has unit length; nothing here checks that.
+    inline Vector3 reflect(Vector3::Arg v, Vector3::Arg n)
+    {
+        const float twiceProjection = 2 * dot(v, n);
+        return v - twiceProjection * n;
+    }
+
+    template <>
+    inline uint hash(const Vector3 & v, uint h)
+    {
+        return sdbmFloatHash(v.component, 3, h);
+    }
+
+
+    // Vector4
+
+    inline Vector4 add(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+    }
+    inline Vector4 operator+(Vector4::Arg a, Vector4::Arg b)
+    {
+        return add(a, b);
+    }
+
+    inline Vector4 sub(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+    }
+    inline Vector4 operator-(Vector4::Arg a, Vector4::Arg b)
+    {
+        return sub(a, b);
+    }
+
+    inline Vector4 scale(Vector4::Arg v, float s)
+    {
+        return Vector4(v.x * s, v.y * s, v.z * s, v.w * s);
+    }
+
+    inline Vector4 scale(Vector4::Arg v, Vector4::Arg s)
+    {
+        return Vector4(v.x * s.x, v.y * s.y, v.z * s.z, v.w * s.w);
+    }
+
+    inline Vector4 operator*(Vector4::Arg v, float s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator*(float s, Vector4::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator*(Vector4::Arg v, Vector4::Arg s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator/(Vector4::Arg v, float s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    /*inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s)
+    {
+        return Vector4(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s, a.w + b.w * s);
+    }*/
+
+    inline Vector4 lerp(Vector4::Arg v1, Vector4::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector4(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z, v1.w * s + t * v2.w);
+    }
+
+    inline float dot(Vector4::Arg a, Vector4::Arg b)
+    {
+        return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+    }
+
+    inline float lengthSquared(Vector4::Arg v)
+    {
+        return v.x * v.x + v.y * v.y + v.z * v.z + v.w * v.w;
+    }
+
+    inline float length(Vector4::Arg v)
+    {
+        return sqrtf(lengthSquared(v));
+    }
+
+    inline float inverseLength(Vector4::Arg v)
+    {
+        return 1.0f / sqrtf(lengthSquared(v));
+    }
+
+    inline bool isNormalized(Vector4::Arg v, float epsilon = NV_NORMAL_EPSILON)
+    {
+        return equal(length(v), 1, epsilon);
+    }
+
+    // Returns v scaled by 1/length(v).
+    // nvDebugCheck asserts that the length is not within `epsilon` of zero and that the
+    // result passes isNormalized(); NV_UNUSED keeps `epsilon` referenced when the check
+    // macro expands to nothing.
+    inline Vector4 normalize(Vector4::Arg v, float epsilon = NV_EPSILON)
+    {
+        const float len = length(v);
+        NV_UNUSED(epsilon);
+        nvDebugCheck(!isZero(len, epsilon));
+        const float invLen = 1.0f / len;
+        Vector4 unit = scale(v, invLen);
+        nvDebugCheck(isNormalized(unit));
+        return unit;
+    }
+
+    // Returns v scaled to unit length, or `fallback` when length(v) is within
+    // `epsilon` of zero.
+    inline Vector4 normalizeSafe(Vector4::Arg v, Vector4::Arg fallback, float epsilon = NV_EPSILON)
+    {
+        const float len = length(v);
+        if (!isZero(len, epsilon)) {
+            return scale(v, 1.0f / len);
+        }
+        return fallback;
+    }
+
+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector4 normalizeFast(Vector4::Arg v)
+    {
+        // A tiny bias is added to the length so the divisor is never exactly zero.
+        const float tiny = 1.0e-037f;
+        const float biasedLength = tiny + length(v);
+        const float invLength = 1.0f / biasedLength;
+        return scale(v, invLength);
+    }
+
+    inline bool equal(Vector4::Arg v1, Vector4::Arg v2, float epsilon = NV_EPSILON)
+    {
+        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon) && equal(v1.w, v2.w, epsilon);
+    }
+
+    inline Vector4 min(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+    }
+
+    inline Vector4 max(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+    }
+
+    inline Vector4 clamp(Vector4::Arg v, float min, float max)
+    {
+        return Vector4(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max), clamp(v.w, min, max));
+    }
+
+    inline Vector4 saturate(Vector4::Arg v)
+    {
+        return Vector4(saturate(v.x), saturate(v.y), saturate(v.z), saturate(v.w));
+    }
+
+    inline bool isFinite(Vector4::Arg v)
+    {
+        return isFinite(v.x) && isFinite(v.y) && isFinite(v.z) && isFinite(v.w);
+    }
+
+    // Returns `fallback` if any component of v is not finite; otherwise returns a
+    // copy of v passed through nv::floatCleanup (which turns -0.0f into 0.0f).
+    inline Vector4 validate(Vector4::Arg v, Vector4::Arg fallback = Vector4(0.0f))
+    {
+        if (isFinite(v)) {
+            Vector4 cleaned = v;
+            nv::floatCleanup(cleaned.component, 4);
+            return cleaned;
+        }
+        return fallback;
+    }
+
+    template <>
+    inline uint hash(const Vector4 & v, uint h)
+    {
+        return sdbmFloatHash(v.component, 4, h);
+    }
+
+
+#if NV_OS_IOS // LLVM is not happy with implicit conversion of immediate constants to float
+
+    //int:
+
+    inline Vector2 scale(Vector2::Arg v, int s)
+    {
+        return Vector2(v.x * s, v.y * s);
+    }
+
+    inline Vector2 operator*(Vector2::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator*(int s, Vector2::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator/(Vector2::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector3 scale(Vector3::Arg v, int s)
+    {
+        return Vector3(v.x * s, v.y * s, v.z * s);
+    }
+
+    inline Vector3 operator*(Vector3::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator*(int s, Vector3::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector4 scale(Vector4::Arg v, int s)
+    {
+        return Vector4(v.x * s, v.y * s, v.z * s, v.w * s);
+    }
+
+    inline Vector4 operator*(Vector4::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator*(int s, Vector4::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator/(Vector4::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    //double:
+
+    inline Vector3 operator*(Vector3::Arg v, double s)
+    {
+        return scale(v, (float)s);
+    }
+
+    inline Vector3 operator*(double s, Vector3::Arg v)
+    {
+        return scale(v, (float)s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, double s)
+    {
+        return scale(v, 1.f/((float)s));
+    }    
+        
+#endif //NV_OS_IOS
+
+} // nv namespace
+
+#endif // NV_MATH_VECTOR_INL
--- a/3rdparty/nvtt/nvmath/fitting.cpp
+++ b/3rdparty/nvtt/nvmath/fitting.cpp
--- a/3rdparty/nvtt/nvmath/fitting.h
+++ b/3rdparty/nvtt/nvmath/fitting.h
@ -0,0 +1,49 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_MATH_FITTING_H
+#define NV_MATH_FITTING_H
+
+#include "vector.h"
+#include "plane.h"
+
+namespace nv
+{
+    // Declarations of the point-set fitting routines. Only the signatures are
+    // visible in this header; the notes below are read off the names and
+    // parameter lists and should be confirmed against fitting.cpp.
+    // Common parameters: `n` is the number of entries in `points` (and in
+    // `weights`, when present); `metric` is passed per call alongside the weights.
+    namespace Fit
+    {
+        // Centroid of the point set (plain and weighted/metric overloads).
+        Vector3 computeCentroid(int n, const Vector3 * points);
+        Vector3 computeCentroid(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
+
+        Vector4 computeCentroid(int n, const Vector4 * points);
+        Vector4 computeCentroid(int n, const Vector4 * points, const float * weights, const Vector4 & metric);
+
+        // Covariance of the point set, written to the caller-provided `covariance` array.
+        // NOTE(review): the required array length and the meaning of the returned
+        // vector are not visible here (presumably 6/10 floats and the centroid) — confirm.
+        Vector3 computeCovariance(int n, const Vector3 * points, float * covariance);
+        Vector3 computeCovariance(int n, const Vector3 * points, const float * weights, const Vector3 & metric, float * covariance);
+
+        Vector4 computeCovariance(int n, const Vector4 * points, float * covariance);
+        Vector4 computeCovariance(int n, const Vector4 * points, const float * weights, const Vector4 & metric, float * covariance);
+
+        // Principal component of the point set; the suffix names the method used.
+        Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points);
+        Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
+
+        Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points);
+        Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
+
+		Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points);
+        Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points, const float * weights, const Vector4 & metric);
+
+        Vector3 computePrincipalComponent_SVD(int n, const Vector3 * points);
+        Vector4 computePrincipalComponent_SVD(int n, const Vector4 * points);
+
+        // Plane fitting helpers. isPlanar's tolerance defaults to NV_EPSILON.
+        Plane bestPlane(int n, const Vector3 * points);
+        bool isPlanar(int n, const Vector3 * points, float epsilon = NV_EPSILON);
+
+        // Symmetric eigen solvers. The input takes 6 (3x3) or 10 (4x4) floats —
+        // presumably the unique entries of the symmetric matrix; the storage order
+        // and the meaning of the bool result are not visible here — confirm.
+        bool eigenSolveSymmetric3(const float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]);
+        bool eigenSolveSymmetric4(const float matrix[10], float eigenValues[4], Vector4 eigenVectors[4]);
+
+        // Returns number of clusters [1-4].
+        // NOTE(review): `cluster` presumably must have room for 4 entries — confirm.
+        int compute4Means(int n, const Vector3 * points, const float * weights, const Vector3 & metric, Vector3 * cluster);
+    }
+
+} // nv namespace
+
+#endif // NV_MATH_FITTING_H
--- a/3rdparty/nvtt/nvmath/matrix.h
+++ b/3rdparty/nvtt/nvmath/matrix.h
@ -0,0 +1,112 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_MATH_MATRIX_H
+#define NV_MATH_MATRIX_H
+
+#include "vector.h"
+
+// - Matrices are stored in memory in *column major* order.
+// - Points are to be thought of as column vectors.
+// - Transformation of a point p by a matrix M is: p' = M * p
+
+namespace nv
+{
+    enum identity_t { identity };
+
+    // 3x3 matrix.
+    // Backed by 9 floats; per the note at the top of this header, matrices are
+    // stored in column major order. Member definitions are not in this header.
+    class NVMATH_CLASS Matrix3
+    {
+    public:
+        Matrix3();
+        explicit Matrix3(float f);
+        explicit Matrix3(identity_t);   // tag constructor, e.g. Matrix3(identity)
+        Matrix3(const Matrix3 & m);
+        // NOTE(review): whether v0..v2 are taken as rows or columns is not visible here — confirm.
+        Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2);
+
+        // Element access: data() takes a flat index into the 9-float storage,
+        // get()/operator() take a (row, col) pair.
+        float data(uint idx) const;
+        float & data(uint idx);
+        float get(uint row, uint col) const;
+        float operator()(uint row, uint col) const;
+        float & operator()(uint row, uint col);
+
+        Vector3 row(uint i) const;
+        Vector3 column(uint i) const;
+
+        void operator*=(float s);
+        void operator/=(float s);
+        void operator+=(const Matrix3 & m);
+        void operator-=(const Matrix3 & m);
+
+        void scale(float s);
+        void scale(Vector3::Arg s);
+        float determinant() const;
+
+    private:
+        float m_data[9];
+    };
+
+    // Solve equation system using LU decomposition and back-substitution.
+    extern bool solveLU(const Matrix3 & m, const Vector3 & b, Vector3 * x);
+
+    // Solve equation system using Cramer's inverse.
+    extern bool solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x);
+
+
+    // 4x4 matrix.
+    // Backed by 16 floats; per the note at the top of this header, matrices are
+    // stored in column major order. Member definitions are not in this header.
+    class NVMATH_CLASS Matrix
+    {
+    public:
+        // Parameter-passing alias: matrices are taken by const reference.
+        typedef Matrix const & Arg;
+
+        Matrix();
+        explicit Matrix(float f);
+        explicit Matrix(identity_t);    // tag constructor, e.g. Matrix(identity)
+        Matrix(const Matrix3 & m);      // implicit conversion from a 3x3 matrix
+        Matrix(const Matrix & m);
+        // NOTE(review): whether v0..v3 are taken as rows or columns is not visible here — confirm.
+        Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3);
+        //explicit Matrix(const float m[]);	// m is assumed to contain 16 elements
+
+        // Element access: data() takes a flat index into the 16-float storage,
+        // get()/operator() take a (row, col) pair.
+        float data(uint idx) const;
+        float & data(uint idx);
+        float get(uint row, uint col) const;
+        float operator()(uint row, uint col) const;
+        float & operator()(uint row, uint col);
+        const float * ptr() const;
+
+        Vector4 row(uint i) const;
+        Vector4 column(uint i) const;
+
+        void zero();
+        void identity();
+
+        // In-place transforms.
+        // NOTE(review): composition order and the unit of `theta` are not visible here — confirm.
+        void scale(float s);
+        void scale(Vector3::Arg s);
+        void translate(Vector3::Arg t);
+        void rotate(float theta, float v0, float v1, float v2);
+        float determinant() const;
+
+        void operator+=(const Matrix & m);
+        void operator-=(const Matrix & m);
+
+        void apply(Matrix::Arg m);
+
+    private:
+        float m_data[16];
+    };
+
+    // Solve equation system using LU decomposition and back-substitution.
+    extern bool solveLU(const Matrix & A, const Vector4 & b, Vector4 * x);
+
+    // Solve equation system using Cramer's inverse.
+    extern bool solveCramer(const Matrix & A, const Vector4 & b, Vector4 * x);
+
+    // Compute inverse using LU decomposition.
+    extern Matrix inverseLU(const Matrix & m);
+
+    // Compute inverse using Gaussian elimination and partial pivoting.
+    extern Matrix inverse(const Matrix & m);
+    extern Matrix3 inverse(const Matrix3 & m);
+
+} // nv namespace
+
+#endif // NV_MATH_MATRIX_H
--- a/3rdparty/nvtt/nvmath/matrix.inl
+++ b/3rdparty/nvtt/nvmath/matrix.inl
--- a/3rdparty/nvtt/nvmath/nvmath.h
+++ b/3rdparty/nvtt/nvmath/nvmath.h
@ -0,0 +1,56 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_MATH_H
+#define NV_MATH_H
+
+#include <math.h>
+#include <float.h>  // finite, isnan
+
+#include "nvcore/utils.h"   // max, clamp
+
+#define NVMATH_API
+#define NVMATH_CLASS
+
+#define PI                  float(3.1415926535897932384626433833)
+#define NV_EPSILON          (0.0001f)
+#define NV_NORMAL_EPSILON   (0.001f)
+
+namespace nv
+{
+    // Converts an angle from degrees to radians.
+    inline float toRadian(float degree)
+    {
+        const float radiansPerDegree = PI / 180.0f;
+        return degree * radiansPerDegree;
+    }
+    // Converts an angle from radians to degrees.
+    inline float toDegree(float radian)
+    {
+        const float degreesPerRadian = 180.0f / PI;
+        return radian * degreesPerRadian;
+    }
+
+    // Robust floating point comparisons:
+    // http://realtimecollisiondetection.net/blog/?p=89
+    //
+    // Returns true when |f0 - f1| <= epsilon * max3(1, |f0|, |f1|), i.e. the
+    // tolerance is `epsilon` itself for small operands and grows with the
+    // operands once either magnitude exceeds 1. Any NaN operand compares false.
+    inline bool equal(const float f0, const float f1, const float epsilon = NV_EPSILON)
+    {
+        //return fabs(f0-f1) <= epsilon;
+        // fabsf throughout: keeps the whole comparison in single precision and
+        // matches the other helpers in this header.
+        return fabsf(f0-f1) <= epsilon * max3(1.0f, fabsf(f0), fabsf(f1));
+    }
+
+    // True when the magnitude of f does not exceed `epsilon` (false for NaN).
+    inline bool isZero(const float f, const float epsilon = NV_EPSILON)
+    {
+        const float magnitude = fabsf(f);
+        return magnitude <= epsilon;
+    }
+
+    // True when f is neither infinite nor NaN.
+    // Written as a comparison against FLT_MAX rather than a call to _finite():
+    // _finite() is specific to the Microsoft CRT, while this form only needs
+    // <math.h> and <float.h>, which this header already includes.
+    // +/-infinity exceeds FLT_MAX in magnitude, and every ordered comparison
+    // with NaN is false, so both are rejected.
+    inline bool isFinite(const float f)
+    {
+        return fabsf(f) <= FLT_MAX;
+    }
+
+    // Eliminates negative zeros from a float array.
+    // Each of the first n entries whose bit pattern is exactly 0x80000000
+    // (-0.0f) is overwritten with 0.0f; every other value is left untouched.
+    inline void floatCleanup(float * fp, int n)
+    {
+        int idx = 0;
+        while (idx < n) {
+            //nvDebugCheck(isFinite(fp[idx]));
+            union { float f; uint32 i; } bits = { fp[idx] };
+            const bool isNegativeZero = (bits.i == 0x80000000);
+            if (isNegativeZero) fp[idx] = 0.0f;
+            ++idx;
+        }
+    }
+
+    // Clamps f to the range [0, 1] using clamp() from nvcore/utils.h.
+    inline float saturate(float f)
+    {
+        const float lo = 0.0f;
+        const float hi = 1.0f;
+        return clamp(f, lo, hi);
+    }
+}
+
+#endif // NV_MATH_H
--- a/3rdparty/nvtt/nvmath/plane.h
+++ b/3rdparty/nvtt/nvmath/plane.h
@ -0,0 +1,40 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_MATH_PLANE_H
+#define NV_MATH_PLANE_H
+
+#include "nvmath.h"
+#include "vector.h"
+
+namespace nv
+{
+    class Matrix;
+
+    // Plane held as four coefficients in the public Vector4 member `v`.
+    // Per the inline definitions in plane.inl, vector() returns v.xyz(),
+    // offset() returns v.w, and the signed distance of a point p is
+    // dot(vector(), p) + offset().
+    class NVMATH_CLASS Plane
+    {
+    public:
+        Plane();                                    // does not assign v explicitly
+        Plane(float x, float y, float z, float w);  // v = (x, y, z, w)
+        Plane(const Vector4 & v);                   // implicit conversion from Vector4
+        Plane(const Vector3 & v, float d);          // v = (v, d)
+        // Plane with the given normal through `point`: offset = -dot(normal, point).
+        Plane(const Vector3 & normal, const Vector3 & point);
+        // Plane through three points: normal = cross(v1-v0, v2-v0), not normalized.
+        Plane(const Vector3 & v0, const Vector3 & v1, const Vector3 & v2);
+
+        const Plane & operator=(const Plane & v);
+
+        Vector3 vector() const;
+        float offset() const;
+
+        // Scales all four coefficients by s.
+        void operator*=(float s);
+
+        Vector4 v;
+    };
+
+    Plane transformPlane(const Matrix &, const Plane &);
+
+    Vector3 planeIntersection(const Plane & a, const Plane & b, const Plane & c);
+
+
+} // nv namespace
+
+#endif // NV_MATH_PLANE_H
--- a/3rdparty/nvtt/nvmath/plane.inl
+++ b/3rdparty/nvtt/nvmath/plane.inl
@ -0,0 +1,49 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_MATH_PLANE_INL
+#define NV_MATH_PLANE_INL
+
+#include "Plane.h"
+#include "Vector.inl"
+
+namespace nv
+{
+    // Default constructor: the coefficient vector is default-constructed.
+    inline Plane::Plane()
+    {
+    }
+
+    // Plane from its four coefficients.
+    inline Plane::Plane(float x, float y, float z, float w)
+        : v(x, y, z, w)
+    {
+    }
+
+    // Plane from an existing coefficient vector.
+    inline Plane::Plane(const Vector4 & v)
+        : v(v)
+    {
+    }
+
+    // Plane from a direction vector and an offset.
+    inline Plane::Plane(const Vector3 & v, float d)
+        : v(v, d)
+    {
+    }
+
+    // Plane with the given normal passing through the given point.
+    inline Plane::Plane(const Vector3 & normal, const Vector3 & point)
+        : v(normal, -dot(normal, point))
+    {
+    }
+
+    // Plane through three points. The normal is the cross product of the two
+    // edges leaving v0 and is not normalized.
+    inline Plane::Plane(const Vector3 & v0, const Vector3 & v1, const Vector3 & v2)
+    {
+        const Vector3 normal = cross(v1 - v0, v2 - v0);
+        const float offset = -dot(normal, v0);
+        v = Vector4(normal, offset);
+    }
+
+    // Copies the coefficient vector of p and returns this plane.
+    inline const Plane & Plane::operator=(const Plane & p)
+    {
+        v = p.v;
+        return *this;
+    }
+
+    // The xyz part of the coefficient vector.
+    inline Vector3 Plane::vector() const
+    {
+        return v.xyz();
+    }
+    // The w part of the coefficient vector.
+    inline float Plane::offset() const
+    {
+        return v.w;
+    }
+
+    // Returns the plane with all four coefficients divided by the length of its
+    // xyz part. When that length is within epsilon of zero, the coefficients
+    // are multiplied by zero instead.
+    inline Plane normalize(const Plane & plane, float epsilon = NV_EPSILON)
+    {
+        const float len = length(plane.vector());
+
+        float scale = 0.0f;
+        if (!isZero(len, epsilon)) {
+            scale = 1.0f / len;
+        }
+
+        return Plane(plane.v * scale);
+    }
+
+    // Signed distance from the point to the plane: dot(vector, point) + offset.
+    inline float distance(const Plane & plane, const Vector3 & point)
+    {
+        const float projection = dot(plane.vector(), point);
+        return projection + plane.offset();
+    }
+
+    // Multiplies the coefficient vector by s.
+    inline void Plane::operator*=(float s) { v *= s; }
+
+} // nv namespace
+
+#endif // NV_MATH_PLANE_INL
--- a/3rdparty/nvtt/nvmath/vector.h
+++ b/3rdparty/nvtt/nvmath/vector.h
@ -0,0 +1,148 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_MATH_VECTOR_H
+#define NV_MATH_VECTOR_H
+
+#include "nvmath.h"
+
+namespace nv
+{
+    class NVMATH_CLASS Vector2  // Two-float vector; member functions are only declared in this class body.
+    {
+    public:
+        typedef Vector2 const & Arg;  // argument type used by the signatures below: const reference
+
+        Vector2();
+        explicit Vector2(float f);  // explicit: a lone float never converts implicitly (NOTE(review): presumably fills both components -- confirm in vector.inl)
+        Vector2(float x, float y);
+        Vector2(Vector2::Arg v);
+
+        //template <typename T> explicit Vector2(const T & v) : x(v.x), y(v.y) {}
+        //template <typename T> operator T() const { return T(x, y); }
+
+        const Vector2 & operator=(Vector2::Arg v);  // returns a const reference
+
+        const float * ptr() const;  // NOTE(review): presumably the address of the first component -- confirm in vector.inl
+
+        void set(float x, float y);
+
+        Vector2 operator-() const;
+        void operator+=(Vector2::Arg v);  // compound assignments return void, so they cannot be chained
+        void operator-=(Vector2::Arg v);
+        void operator*=(float s);
+        void operator*=(Vector2::Arg v);
+
+        friend bool operator==(Vector2::Arg a, Vector2::Arg b);
+        friend bool operator!=(Vector2::Arg a, Vector2::Arg b);
+
+        union {  // x, y and component[0..1] occupy the same storage
+            struct {  // anonymous struct: a compiler extension in C++
+                float x, y;
+            };
+            float component[2];
+        };
+    };
+
+    class NVMATH_CLASS Vector3  // Three-float vector; member functions are only declared in this class body.
+    {
+    public:
+        typedef Vector3 const & Arg;  // argument type used by the signatures below: const reference
+
+        Vector3();
+        explicit Vector3(float x);  // explicit: a lone float never converts implicitly (NOTE(review): presumably fills all components -- confirm in vector.inl)
+        //explicit Vector3(int x) : x(float(x)), y(float(x)), z(float(x)) {}
+        Vector3(float x, float y, float z);
+        Vector3(Vector2::Arg v, float z);  // 2D vector plus an explicit z
+        Vector3(Vector3::Arg v);
+
+        //template <typename T> explicit Vector3(const T & v) : x(v.x), y(v.y), z(v.z) {}
+        //template <typename T> operator T() const { return T(x, y, z); }
+
+        const Vector3 & operator=(Vector3::Arg v);  // returns a const reference
+
+        Vector2 xy() const;  // returned by value
+
+        const float * ptr() const;  // NOTE(review): presumably the address of the first component -- confirm in vector.inl
+
+        void set(float x, float y, float z);
+
+        Vector3 operator-() const;
+        void operator+=(Vector3::Arg v);  // compound assignments return void, so they cannot be chained
+        void operator-=(Vector3::Arg v);
+        void operator*=(float s);
+        void operator/=(float s);
+        void operator*=(Vector3::Arg v);
+        void operator/=(Vector3::Arg v);
+
+        friend bool operator==(Vector3::Arg a, Vector3::Arg b);
+        friend bool operator!=(Vector3::Arg a, Vector3::Arg b);
+
+        union {  // x, y, z and component[0..2] occupy the same storage
+            struct {  // anonymous struct: a compiler extension in C++
+                float x, y, z;
+            };
+            float component[3];
+        };
+    };
+
+    class NVMATH_CLASS Vector4  // Four-float vector; member functions are only declared in this class body.
+    {
+    public:
+        typedef Vector4 const & Arg;  // argument type used by the signatures below: const reference
+
+        Vector4();
+        explicit Vector4(float x);  // explicit: a lone float never converts implicitly (NOTE(review): presumably fills all components -- confirm in vector.inl)
+        Vector4(float x, float y, float z, float w);
+        Vector4(Vector2::Arg v, float z, float w);  // 2D vector plus explicit z and w
+        Vector4(Vector2::Arg v, Vector2::Arg u);  // two 2D vectors
+        Vector4(Vector3::Arg v, float w);  // 3D vector plus an explicit w
+        Vector4(Vector4::Arg v);
+        //	Vector4(const Quaternion & v);
+
+        //template <typename T> explicit Vector4(const T & v) : x(v.x), y(v.y), z(v.z), w(v.w) {}
+        //template <typename T> operator T() const { return T(x, y, z, w); }
+
+        const Vector4 & operator=(Vector4::Arg v);  // returns a const reference
+
+        Vector2 xy() const;  // sub-vectors are returned by value
+        Vector2 zw() const;
+        Vector3 xyz() const;
+
+        const float * ptr() const;  // NOTE(review): presumably the address of the first component -- confirm in vector.inl
+
+        void set(float x, float y, float z, float w);
+
+        Vector4 operator-() const;
+        void operator+=(Vector4::Arg v);  // compound assignments return void, so they cannot be chained
+        void operator-=(Vector4::Arg v);
+        void operator*=(float s);
+        void operator/=(float s);
+        void operator*=(Vector4::Arg v);
+        void operator/=(Vector4::Arg v);
+
+        friend bool operator==(Vector4::Arg a, Vector4::Arg b);
+        friend bool operator!=(Vector4::Arg a, Vector4::Arg b);
+
+        union {  // x, y, z, w and component[0..3] occupy the same storage
+            struct {  // anonymous struct: a compiler extension in C++
+                float x, y, z, w;
+            };
+            float component[4];
+        };
+    };
+
+} // nv namespace
+
+// If we had these functions, they would be ambiguous, the compiler would not know which one to pick:
+//template <typename T> Vector2 to(const T & v) { return Vector2(v.x, v.y); }
+//template <typename T> Vector3 to(const T & v) { return Vector3(v.x, v.y, v.z); }
+//template <typename T> Vector4 to(const T & v) { return Vector4(v.x, v.y, v.z, v.w); }
+
+// We could use a cast operator so that we could infer the expected type, but that doesn't work the same way in all compilers and produces horrible error messages.
+
+// Instead we simply have explicit casts:
+// Explicit cast from nv::Vector2 to any type T constructible from (x, y).
+// T must have the same size as nv::Vector2; this is checked at compile time.
+template <typename T>
+T to(const nv::Vector2 & v)
+{
+    NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector2));
+    return T(v.x, v.y);
+}
+
+// Explicit cast from nv::Vector3 to any type T constructible from (x, y, z).
+// T must have the same size as nv::Vector3; this is checked at compile time.
+template <typename T>
+T to(const nv::Vector3 & v)
+{
+    NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector3));
+    return T(v.x, v.y, v.z);
+}
+
+// Explicit cast from nv::Vector4 to any type T constructible from (x, y, z, w).
+// T must have the same size as nv::Vector4; this is checked at compile time.
+template <typename T>
+T to(const nv::Vector4 & v)
+{
+    NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector4));
+    return T(v.x, v.y, v.z, v.w);
+}
+
+#endif // NV_MATH_VECTOR_H
--- a/3rdparty/nvtt/nvtt.cpp
+++ b/3rdparty/nvtt/nvtt.cpp
@ -0,0 +1,95 @@
+/*
+ * Copyright 2011-2015 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#include "nvtt.h"
+
+#include <string.h>
+#include <bx/uint32_t.h>
+
+#include "bc6h/zoh.h"
+#include "bc7/avpcl.h"
+#include "nvmath/vector.inl"
+
+// Definition of nvAbort for this stripped-down build: every argument is
+// ignored and the process is terminated with abort().
+//
+// The printf-format attribute is intentionally not repeated here: GCC rejects
+// attributes written after the declarator of a function *definition*
+// ("attributes should be specified before the declarator in a function
+// definition"), and an attribute on the declaration is sufficient.
+// NOTE(review): confirm the declaration of nvAbort carries the format
+// attribute if format checking at call sites is wanted.
+NVCORE_API int nvAbort(const char *, const char *, int , const char *, const char *, ...)
+{
+	abort();
+	return 0; // not reached; keeps compilers that do not treat abort() as noreturn quiet
+}
+
+namespace nvtt
+{
+	using namespace nv;
+
+	// Compresses a float RGBA image into 16-byte blocks, one per 4x4 pixel
+	// tile, using ZOH::compress with the UNSIGNED_F16 format.
+	//
+	//   _input  - source pixels; every pixel is read as one nv::Vector4 (4 floats).
+	//   _width  - image width in pixels.
+	//   _height - image height in pixels.
+	//   _stride - distance in bytes between the starts of consecutive source rows.
+	//   _output - destination; receives ceil(_width/4) * ceil(_height/4) blocks
+	//             of 16 bytes, stored row by row.
+	//
+	// Only x, y and z of each pixel are used; each one is passed through
+	// bx::halfFromFloat and ZOH::Tile::half2float before it is stored in the tile.
+	// Tiles that extend past the right/bottom edge repeat the last valid
+	// column/row instead of reading outside the image.
+	void compressBC6H(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output)
+	{
+		const uint8_t* src = (const uint8_t*)_input;
+		char* dst = (char*)_output;
+
+		const uint32_t bytesPerPixel = uint32_t(sizeof(float)*4);
+		const uint32_t blocksPerRow  = (_width + 3)/4;
+
+		for (uint32_t yy = 0; yy < _height; yy += 4)
+		{
+			for (uint32_t xx = 0; xx < _width; xx += 4)
+			{
+				ZOH::Utils::FORMAT = ZOH::UNSIGNED_F16;
+				ZOH::Tile zohTile(4, 4);
+
+				memset(zohTile.data, 0, sizeof(zohTile.data) );
+				memset(zohTile.importance_map, 0, sizeof(zohTile.importance_map) );
+
+				for (uint32_t blockY = 0; blockY < 4; ++blockY)
+				{
+					// Rows inside the tile are _stride bytes apart in the source,
+					// not 4 pixels apart.
+					const uint32_t srcY = (yy + blockY < _height) ? yy + blockY : _height - 1;
+					const uint8_t* row = &src[size_t(srcY)*_stride];
+
+					for (uint32_t blockX = 0; blockX < 4; ++blockX)
+					{
+						const uint32_t srcX = (xx + blockX < _width) ? xx + blockX : _width - 1;
+
+						Vector4 color = *(const Vector4*)&row[size_t(srcX)*bytesPerPixel];
+						uint16 rHalf = bx::halfFromFloat(color.x);
+						uint16 gHalf = bx::halfFromFloat(color.y);
+						uint16 bHalf = bx::halfFromFloat(color.z);
+						zohTile.data[blockY][blockX].x = ZOH::Tile::half2float(rHalf);
+						zohTile.data[blockY][blockX].y = ZOH::Tile::half2float(gHalf);
+						zohTile.data[blockY][blockX].z = ZOH::Tile::half2float(bHalf);
+						zohTile.importance_map[blockY][blockX] = 1.0f;
+					}
+				}
+
+				// Block index = tile row * tiles per row + tile column; 16 bytes per block.
+				const size_t blockIndex = size_t(yy/4)*blocksPerRow + xx/4;
+				ZOH::compress(zohTile, &dst[blockIndex * 16]);
+			}
+		}
+	}
+
+	// Compresses a float RGBA image into 16-byte blocks, one per 4x4 pixel
+	// tile, using AVPCL::compress.
+	//
+	//   _input  - source pixels; every pixel is read as one nv::Vector4 (4 floats).
+	//   _width  - image width in pixels.
+	//   _height - image height in pixels.
+	//   _stride - distance in bytes between the starts of consecutive source rows.
+	//   _output - destination; receives ceil(_width/4) * ceil(_height/4) blocks
+	//             of 16 bytes, stored row by row.
+	//
+	// Each pixel is multiplied by 255 before it is stored in the tile.
+	// Tiles that extend past the right/bottom edge repeat the last valid
+	// column/row instead of reading outside the image.
+	void compressBC7(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output)
+	{
+		const uint8_t* src = (const uint8_t*)_input;
+		char* dst = (char*)_output;
+
+		const uint32_t bytesPerPixel = uint32_t(sizeof(float)*4);
+		const uint32_t blocksPerRow  = (_width + 3)/4;
+
+		for (uint32_t yy = 0; yy < _height; yy += 4)
+		{
+			for (uint32_t xx = 0; xx < _width; xx += 4)
+			{
+				AVPCL::mode_rgb     = false;
+				AVPCL::flag_premult = false;
+				AVPCL::flag_nonuniform     = false;
+				AVPCL::flag_nonuniform_ati = false;
+
+				AVPCL::Tile avpclTile(4, 4);
+				memset(avpclTile.data, 0, sizeof(avpclTile.data) );
+				for (uint32_t blockY = 0; blockY < 4; ++blockY)
+				{
+					// Rows inside the tile are _stride bytes apart in the source,
+					// not 4 pixels apart.
+					const uint32_t srcY = (yy + blockY < _height) ? yy + blockY : _height - 1;
+					const uint8_t* row = &src[size_t(srcY)*_stride];
+
+					for (uint32_t blockX = 0; blockX < 4; ++blockX)
+					{
+						const uint32_t srcX = (xx + blockX < _width) ? xx + blockX : _width - 1;
+
+						Vector4 color = *(const Vector4*)&row[size_t(srcX)*bytesPerPixel];
+						avpclTile.data[blockY][blockX] = color * 255.0f;
+						avpclTile.importance_map[blockY][blockX] = 1.0f;
+					}
+				}
+
+				// Block index = tile row * tiles per row + tile column; 16 bytes per block.
+				const size_t blockIndex = size_t(yy/4)*blocksPerRow + xx/4;
+				AVPCL::compress(avpclTile, &dst[blockIndex * 16]);
+			}
+		}
+	}
+
+} //namespace nvtt
--- a/3rdparty/nvtt/nvtt.h
+++ b/3rdparty/nvtt/nvtt.h
@ -0,0 +1,13 @@
+#ifndef NVTT_H
+#define NVTT_H
+
+#include <stdint.h>
+// Block compressors: each source pixel is read as four floats, _stride is the byte distance between source rows, and every 4x4 tile is written to _output as 16 bytes (see nvtt.cpp).
+namespace nvtt
+{
+void compressBC6H(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output);
+void compressBC7(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output);
+
+} // namespace nvtt
+
+#endif // NVTT_H
--- a/scripts/texturec.lua
+++ b/scripts/texturec.lua
@ -12,6 +12,7 @@ project "texturec"
 		path.join(BGFX_DIR, "include"),
 		path.join(BGFX_DIR, "src"),
 		path.join(BGFX_DIR, "3rdparty"),
+		path.join(BGFX_DIR, "3rdparty/nvtt"),
 	}

 	files {
@ -20,6 +21,8 @@ project "texturec"
 		path.join(BGFX_DIR, "3rdparty/libsquish/**.h"),
 		path.join(BGFX_DIR, "3rdparty/etc1/**.cpp"),
 		path.join(BGFX_DIR, "3rdparty/etc1/**.h"),
+		path.join(BGFX_DIR, "3rdparty/nvtt/**.cpp"),
+		path.join(BGFX_DIR, "3rdparty/nvtt/**.h"),
 		path.join(BGFX_DIR, "tools/texturec/**.cpp"),
 		path.join(BGFX_DIR, "tools/texturec/**.h"),
 	}
--- a/tools/texturec/texturec.cpp
+++ b/tools/texturec/texturec.cpp
@ -13,6 +13,7 @@
 #include "image.h"
 #include <libsquish/squish.h>
 #include <etc1/etc1.h>
+#include <nvtt/nvtt.h>

 #if 0
 #	define BX_TRACE(_format, ...) fprintf(stderr, "" _format "\n", ##__VA_ARGS__)
@ -113,6 +114,14 @@ int main(int _argc, const char* _argv[])
 		{
 			format = TextureFormat::ETC1;
 		}
+		else if (0 == bx::stricmp(type, "bc6h") )
+		{
+			format = TextureFormat::BC6H;
+		}
+		else if (0 == bx::stricmp(type, "bc7") )
+		{
+			format = TextureFormat::BC7;
+		}
 	}

 	uint32_t size = (uint32_t)bx::getSize(&reader);
@ -154,10 +163,33 @@ int main(int _argc, const char* _argv[])
 					);
 				break;

+			case TextureFormat::BC4:
+			case TextureFormat::BC5:
+				break;
+
+			case TextureFormat::BC6H:
+				nvtt::compressBC6H(rgba, mip.m_width, mip.m_height, 4, output);
+				break;
+
+			case TextureFormat::BC7:
+				nvtt::compressBC7(rgba, mip.m_width, mip.m_height, 4, output);
+				break;
+
 			case TextureFormat::ETC1:
 				etc1_encode_image(rgba, mip.m_width, mip.m_height, 4, mip.m_width*4, output);
 				break;

+			case TextureFormat::ETC2:
+			case TextureFormat::ETC2A:
+			case TextureFormat::ETC2A1:
+			case TextureFormat::PTC12:
+			case TextureFormat::PTC14:
+			case TextureFormat::PTC12A:
+			case TextureFormat::PTC14A:
+			case TextureFormat::PTC22:
+			case TextureFormat::PTC24:
+				break;
+
 			default:
 				break;
 			}