From 607b0d6d42583a199b28bfd7f3200ed3fa6a591d Mon Sep 17 00:00:00 2001
From: Bill Hollings <bill.hollings@brenwill.com>
Date: Sun, 11 Feb 2018 16:52:57 -0500
Subject: [PATCH] CompilerMSL support smaller offsets for 3-row row-major
 matrices.

Support MSL typedefs to declare 3-row row-major matrices as 3-column matrices.
Allow those matrices to be decorated as packed.
Support transposing those matrices when used.
Modify how member alignments are calculated.
---
 .../opt/shaders-msl/comp/struct-packing.comp  |   7 +-
 .../opt/shaders-msl/vert/packed_matrix.vert   |  56 ++++++
 .../shaders-msl/comp/struct-packing.comp      |   7 +-
 reference/shaders-msl/vert/packed_matrix.vert |  56 ++++++
 shaders-msl/comp/struct-packing.comp          |   1 +
 shaders-msl/vert/packed_matrix.vert           |  41 ++++
 spirv_glsl.cpp                                |  22 ++-
 spirv_glsl.hpp                                |   2 +-
 spirv_msl.cpp                                 | 187 ++++++++++--------
 spirv_msl.hpp                                 |   4 +-
 10 files changed, 293 insertions(+), 90 deletions(-)
 create mode 100644 reference/opt/shaders-msl/vert/packed_matrix.vert
 create mode 100644 reference/shaders-msl/vert/packed_matrix.vert
 create mode 100644 shaders-msl/vert/packed_matrix.vert

diff --git a/reference/opt/shaders-msl/comp/struct-packing.comp b/reference/opt/shaders-msl/comp/struct-packing.comp
index cf626ce..f59cba5 100644
--- a/reference/opt/shaders-msl/comp/struct-packing.comp
+++ b/reference/opt/shaders-msl/comp/struct-packing.comp
@@ -3,6 +3,8 @@
 
 using namespace metal;
 
+typedef float3x2 packed_float2x3;
+
 struct S0
 {
     float2 a[1];
@@ -58,8 +60,10 @@ struct SSBO1
     float3x2 m3;
     float2x2 m4;
     float2x2 m5[9];
-    float2x3 m6[4][2];
+    packed_float2x3 m6[4][2];
+    char pad10[8];
     float3x2 m7;
+    char pad11[8];
     float array[1];
 };
 
@@ -96,5 +100,6 @@ kernel void main0(device SSBO1& ssbo_430 [[buffer(0)]], device SSBO0& ssbo_140 [
     ssbo_430.content.m3s[5].c = ssbo_140.content.m3s[5].c;
     ssbo_430.content.m3s[6].c = ssbo_140.content.m3s[6].c;
     ssbo_430.content.m3s[7].c = ssbo_140.content.m3s[7].c;
+    ssbo_430.content.m1.a = ssbo_430.content.m3.a * ssbo_430.m6[1][1];
 }
 
diff --git a/reference/opt/shaders-msl/vert/packed_matrix.vert b/reference/opt/shaders-msl/vert/packed_matrix.vert
new file mode 100644
index 0000000..53d7d16
--- /dev/null
+++ b/reference/opt/shaders-msl/vert/packed_matrix.vert
@@ -0,0 +1,56 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+typedef float3x4 packed_float4x3;
+
+struct _15
+{
+    packed_float4x3 _m0;
+    packed_float4x3 _m1;
+};
+
+struct _42
+{
+    float4x4 _m0;
+    float4x4 _m1;
+    float _m2;
+    char pad3[12];
+    packed_float3 _m3;
+    float _m4;
+    packed_float3 _m5;
+    float _m6;
+    float _m7;
+    float _m8;
+    float2 _m9;
+};
+
+struct main0_in
+{
+    float4 m_25 [[attribute(0)]];
+};
+
+struct main0_out
+{
+    float3 m_72 [[user(locn0)]];
+    float4 gl_Position [[position]];
+};
+
+vertex main0_out main0(main0_in in [[stage_in]], constant _15& _17 [[buffer(0)]], constant _42& _44 [[buffer(1)]])
+{
+    main0_out out = {};
+    float3 _34;
+    do
+    {
+        _34 = normalize(float4(in.m_25.xyz, 0.0) * _17._m1);
+        break;
+    } while (false);
+    float4 _70 = _44._m0 * float4(_44._m3 + (in.m_25.xyz * (_44._m6 + _44._m7)), 1.0);
+    out.m_72 = _34;
+    float4 _95 = _70;
+    _95.y = -_70.y;
+    out.gl_Position = _95;
+    return out;
+}
+
diff --git a/reference/shaders-msl/comp/struct-packing.comp b/reference/shaders-msl/comp/struct-packing.comp
index cf626ce..f59cba5 100644
--- a/reference/shaders-msl/comp/struct-packing.comp
+++ b/reference/shaders-msl/comp/struct-packing.comp
@@ -3,6 +3,8 @@
 
 using namespace metal;
 
+typedef float3x2 packed_float2x3;
+
 struct S0
 {
     float2 a[1];
@@ -58,8 +60,10 @@ struct SSBO1
     float3x2 m3;
     float2x2 m4;
     float2x2 m5[9];
-    float2x3 m6[4][2];
+    packed_float2x3 m6[4][2];
+    char pad10[8];
     float3x2 m7;
+    char pad11[8];
     float array[1];
 };
 
@@ -96,5 +100,6 @@ kernel void main0(device SSBO1& ssbo_430 [[buffer(0)]], device SSBO0& ssbo_140 [
     ssbo_430.content.m3s[5].c = ssbo_140.content.m3s[5].c;
     ssbo_430.content.m3s[6].c = ssbo_140.content.m3s[6].c;
     ssbo_430.content.m3s[7].c = ssbo_140.content.m3s[7].c;
+    ssbo_430.content.m1.a = ssbo_430.content.m3.a * ssbo_430.m6[1][1];
 }
 
diff --git a/reference/shaders-msl/vert/packed_matrix.vert b/reference/shaders-msl/vert/packed_matrix.vert
new file mode 100644
index 0000000..5d025c4
--- /dev/null
+++ b/reference/shaders-msl/vert/packed_matrix.vert
@@ -0,0 +1,56 @@
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+typedef float3x4 packed_float4x3;
+
+struct _15
+{
+    packed_float4x3 _m0;
+    packed_float4x3 _m1;
+};
+
+struct _42
+{
+    float4x4 _m0;
+    float4x4 _m1;
+    float _m2;
+    char pad3[12];
+    packed_float3 _m3;
+    float _m4;
+    packed_float3 _m5;
+    float _m6;
+    float _m7;
+    float _m8;
+    float2 _m9;
+};
+
+struct main0_in
+{
+    float4 m_25 [[attribute(0)]];
+};
+
+struct main0_out
+{
+    float3 m_72 [[user(locn0)]];
+    float4 gl_Position [[position]];
+};
+
+vertex main0_out main0(main0_in in [[stage_in]], constant _15& _17 [[buffer(0)]], constant _42& _44 [[buffer(1)]])
+{
+    main0_out out = {};
+    float3 _13;
+    do
+    {
+        _13 = normalize(float4(in.m_25.xyz, 0.0) * _17._m1);
+        break;
+    } while (false);
+    float4 _39 = _44._m0 * float4(_44._m3 + (in.m_25.xyz * (_44._m6 + _44._m7)), 1.0);
+    out.m_72 = _13;
+    float4 _74 = _39;
+    _74.y = -_39.y;
+    out.gl_Position = _74;
+    return out;
+}
+
diff --git a/shaders-msl/comp/struct-packing.comp b/shaders-msl/comp/struct-packing.comp
index 04b933d..5baf45c 100644
--- a/shaders-msl/comp/struct-packing.comp
+++ b/shaders-msl/comp/struct-packing.comp
@@ -72,5 +72,6 @@ layout(binding = 0, std140) buffer SSBO0
 void main()
 {
     ssbo_430.content = ssbo_140.content;
+    ssbo_430.content.m1.a = ssbo_430.m6[1][1] * ssbo_430.content.m3.a;	// test packed matrix access
 }
 
diff --git a/shaders-msl/vert/packed_matrix.vert b/shaders-msl/vert/packed_matrix.vert
new file mode 100644
index 0000000..4d99d21
--- /dev/null
+++ b/shaders-msl/vert/packed_matrix.vert
@@ -0,0 +1,41 @@
+#version 450
+
+layout(binding = 13, std140) uniform _1365_18812
+{
+    layout(row_major) mat4x3 _m0;
+    layout(row_major) mat4x3 _m1;
+} _18812;
+
+layout(binding = 12, std140) uniform _1126_22044
+{
+    layout(row_major) mat4 _m0;
+    layout(row_major) mat4 _m1;
+    float _m9;
+    vec3 _m10;
+    float _m11;
+    vec3 _m12;
+    float _m17;
+    float _m18;
+    float _m19;
+    vec2 _m20;
+} _22044;
+
+layout(location = 0) out vec3 _3976;
+layout(location = 0) in vec4 _5275;
+
+vec3 _2;
+
+void main()
+{
+    vec3 _23783;
+    do
+    {
+        _23783 = normalize(_18812._m1 * vec4(_5275.xyz, 0.0));
+        break;
+    } while (false);
+    vec4 _14995 = vec4(_22044._m10 + (_5275.xyz * (_22044._m17 + _22044._m18)), 1.0) * _22044._m0;
+    _3976 = _23783;
+    vec4 _6282 = _14995;
+    _6282.y = -_14995.y;
+    gl_Position = _6282;
+}
diff --git a/spirv_glsl.cpp b/spirv_glsl.cpp
index 8bdc376..3dd641a 100644
--- a/spirv_glsl.cpp
+++ b/spirv_glsl.cpp
@@ -2238,7 +2238,10 @@ string CompilerGLSL::to_expression(uint32_t id)
 		if (e.base_expression)
 			return to_enclosed_expression(e.base_expression) + e.expression;
 		else if (e.need_transpose)
-			return convert_row_major_matrix(e.expression, get<SPIRType>(e.expression_type));
+		{
+			bool is_packed = has_decoration(id, DecorationCPacked);
+			return convert_row_major_matrix(e.expression, get<SPIRType>(e.expression_type), is_packed);
+		}
 		else
 		{
 			if (force_recompile)
@@ -4289,7 +4292,7 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice
 
 	bool access_chain_is_arrayed = false;
 	bool row_major_matrix_needs_conversion = is_non_native_row_major_matrix(base);
-	bool vector_is_packed = false;
+	bool is_packed = false;
 	bool pending_array_enclose = false;
 	bool dimension_flatten = false;
 
@@ -4421,7 +4424,7 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice
 				}
 			}
 
-			vector_is_packed = member_is_packed_type(*type, index);
+			is_packed = member_is_packed_type(*type, index);
 			row_major_matrix_needs_conversion = member_is_non_native_row_major_matrix(*type, index);
 			type = &get<SPIRType>(type->member_types[index]);
 		}
@@ -4430,8 +4433,9 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice
 		{
 			if (row_major_matrix_needs_conversion)
 			{
-				expr = convert_row_major_matrix(expr, *type);
+				expr = convert_row_major_matrix(expr, *type, is_packed);
 				row_major_matrix_needs_conversion = false;
+				is_packed = false;
 			}
 
 			expr += "[";
@@ -4447,10 +4451,10 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice
 		// Vector -> Scalar
 		else if (type->vecsize > 1)
 		{
-			if (vector_is_packed)
+			if (is_packed)
 			{
 				expr = unpack_expression_type(expr, *type);
-				vector_is_packed = false;
+				is_packed = false;
 			}
 
 			if (index_is_literal)
@@ -4489,7 +4493,7 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice
 		*need_transpose = row_major_matrix_needs_conversion;
 
 	if (result_is_packed)
-		*result_is_packed = vector_is_packed;
+		*result_is_packed = is_packed;
 
 	return expr;
 }
@@ -4620,7 +4624,7 @@ std::string CompilerGLSL::flattened_access_chain_struct(uint32_t base, const uin
 
 		// Cannot forward transpositions, so resolve them here.
 		if (need_transpose)
-			expr += convert_row_major_matrix(tmp, member_type);
+			expr += convert_row_major_matrix(tmp, member_type, false);
 		else
 			expr += tmp;
 	}
@@ -7050,7 +7054,7 @@ bool CompilerGLSL::member_is_packed_type(const SPIRType &type, uint32_t index) c
 // row_major matrix result of the expression to a column_major matrix.
 // Base implementation uses the standard library transpose() function.
 // Subclasses may override to use a different function.
-string CompilerGLSL::convert_row_major_matrix(string exp_str, const SPIRType & /*exp_type*/)
+string CompilerGLSL::convert_row_major_matrix(string exp_str, const SPIRType & /*exp_type*/, bool /*is_packed*/)
 {
 	strip_enclosed_expression(exp_str);
 	return join("transpose(", exp_str, ")");
diff --git a/spirv_glsl.hpp b/spirv_glsl.hpp
index 0cae70c..e811803 100644
--- a/spirv_glsl.hpp
+++ b/spirv_glsl.hpp
@@ -298,7 +298,7 @@ protected:
 	virtual bool is_non_native_row_major_matrix(uint32_t id);
 	virtual bool member_is_non_native_row_major_matrix(const SPIRType &type, uint32_t index);
 	bool member_is_packed_type(const SPIRType &type, uint32_t index) const;
-	virtual std::string convert_row_major_matrix(std::string exp_str, const SPIRType &exp_type);
+	virtual std::string convert_row_major_matrix(std::string exp_str, const SPIRType &exp_type, bool is_packed);
 
 	std::unordered_set<std::string> local_variable_names;
 	std::unordered_set<std::string> resource_names;
diff --git a/spirv_msl.cpp b/spirv_msl.cpp
index a85b38c..20dac98 100644
--- a/spirv_msl.cpp
+++ b/spirv_msl.cpp
@@ -744,29 +744,9 @@ void CompilerMSL::align_struct(SPIRType &ib_type)
 	curr_offset = 0;
 	for (uint32_t mbr_idx = 0; mbr_idx < mbr_cnt; mbr_idx++)
 	{
-		// Align current offset to the current member's default alignment.
-		size_t align_mask = get_declared_struct_member_alignment(ib_type, mbr_idx) - 1;
-		curr_offset = uint32_t((curr_offset + align_mask) & ~align_mask);
+		if (is_member_packable(ib_type, mbr_idx))
+			set_member_decoration(ib_type_id, mbr_idx, DecorationCPacked);
 
-		// Fetch the member offset as declared in the SPIRV.
-		uint32_t mbr_offset = get_member_decoration(ib_type_id, mbr_idx, DecorationOffset);
-		if (curr_offset > mbr_offset)
-		{
-			uint32_t prev_mbr_idx = mbr_idx - 1;
-			if (is_member_packable(ib_type, prev_mbr_idx))
-				set_member_decoration(ib_type_id, prev_mbr_idx, DecorationCPacked);
-		}
-
-		// Increment the current offset to be positioned immediately after the current member.
-		curr_offset = mbr_offset + uint32_t(get_declared_struct_member_size(ib_type, mbr_idx));
-	}
-
-	// Test the alignment of each member, and if a member is positioned farther than its
-	// alignment and the end of the previous member, add a dummy padding member that will
-	// be added before the current member when the delaration of this struct is emitted.
-	curr_offset = 0;
-	for (uint32_t mbr_idx = 0; mbr_idx < mbr_cnt; mbr_idx++)
-	{
 		// Align current offset to the current member's default alignment.
 		size_t align_mask = get_declared_struct_member_alignment(ib_type, mbr_idx) - 1;
 		curr_offset = uint32_t((curr_offset + align_mask) & ~align_mask);
@@ -791,14 +771,48 @@ void CompilerMSL::align_struct(SPIRType &ib_type)
 // variation that is smaller than the unpacked variation of that type.
 bool CompilerMSL::is_member_packable(SPIRType &ib_type, uint32_t index)
 {
-	uint32_t mbr_type_id = ib_type.member_types[index];
-	auto &mbr_type = get<SPIRType>(mbr_type_id);
-
-	// 3-element vectors (char3, uchar3, short3, ushort3, int3, uint3, half3, float3)
-	if (mbr_type.vecsize == 3 && mbr_type.columns == 1)
+	// We've already marked it as packable
+	if (has_member_decoration(ib_type.self, index, DecorationCPacked))
 		return true;
 
-	return false;
+	auto &mbr_type = get<SPIRType>(ib_type.member_types[index]);
+
+	// Only 3-element vectors or 3-row matrices need to be packed.
+	if (mbr_type.vecsize != 3)
+		return false;
+
+	// Only row-major matrices need to be packed.
+	if (is_matrix(mbr_type) && !has_member_decoration(ib_type.self, index, DecorationRowMajor))
+		return false;
+
+	uint32_t component_size = mbr_type.width / 8;
+	uint32_t unpacked_mbr_size = component_size * (mbr_type.vecsize + 1) * mbr_type.columns;
+	if (is_array(mbr_type))
+	{
+		// If member is an array, and the array stride is larger than the type needs, don't pack it.
+		// Take into consideration multi-dimentional arrays.
+		uint32_t md_elem_cnt = 1;
+		size_t last_elem_idx = mbr_type.array.size() - 1;
+		for (uint32_t i = 0; i < last_elem_idx; i++)
+			md_elem_cnt *= max(to_array_size_literal(mbr_type, i), 1U);
+
+		uint32_t unpacked_array_stride = unpacked_mbr_size * md_elem_cnt;
+		uint32_t array_stride = type_struct_member_array_stride(ib_type, index);
+		return unpacked_array_stride > array_stride;
+	}
+	else
+	{
+		// Pack if there is not enough space between this member and next.
+		// If last member, only pack if it's a row-major matrix.
+		if (index < ib_type.member_types.size() - 1)
+		{
+			uint32_t mbr_offset_curr = get_member_decoration(ib_type.self, index, DecorationOffset);
+			uint32_t mbr_offset_next = get_member_decoration(ib_type.self, index + 1, DecorationOffset);
+			return unpacked_mbr_size > mbr_offset_next - mbr_offset_curr;
+		}
+		else
+			return is_matrix(mbr_type);
+	}
 }
 
 // Returns a combination of type ID and member index for use as hash key
@@ -835,11 +849,26 @@ void CompilerMSL::emit_header()
 	statement("");
 	statement("using namespace metal;");
 	statement("");
+
+	for (auto &td : typedef_lines)
+		statement(td);
+
+	if (!typedef_lines.empty())
+		statement("");
 }
 
 void CompilerMSL::add_pragma_line(const string &line)
 {
-	pragma_lines.insert(line);
+	auto rslt = pragma_lines.insert(line);
+	if (rslt.second)
+		force_recompile = true;
+}
+
+void CompilerMSL::add_typedef_line(const string &line)
+{
+	auto rslt = typedef_lines.insert(line);
+	if (rslt.second)
+		force_recompile = true;
 }
 
 // Emits any needed custom function bodies.
@@ -1611,11 +1640,12 @@ void CompilerMSL::emit_instruction(const Instruction &instruction)
 	case OpVectorTimesMatrix:
 	case OpMatrixTimesVector:
 	{
-		// If the matrix needs transpose and it is square, just flip the multiply order.
+		// If the matrix needs transpose and it is square or packed, just flip the multiply order.
 		uint32_t mtx_id = ops[opcode == OpMatrixTimesVector ? 2 : 3];
 		auto *e = maybe_get<SPIRExpression>(mtx_id);
 		auto &t = expression_type(mtx_id);
-		if (e && e->need_transpose && t.columns == t.vecsize)
+		bool is_packed = has_decoration(mtx_id, DecorationCPacked);
+		if (e && e->need_transpose && (t.columns == t.vecsize || is_packed))
 		{
 			e->need_transpose = false;
 			emit_binary_op(ops[0], ops[1], ops[3], ops[2], "*");
@@ -2310,8 +2340,13 @@ bool CompilerMSL::is_non_native_row_major_matrix(uint32_t id)
 		return false;
 
 	// Generate a function that will swap matrix elements from row-major to column-major.
-	const auto type = expression_type(id);
-	add_convert_row_major_matrix_function(type.columns, type.vecsize);
+	// Packed row-matrix should just use transpose() function.
+	if (!has_decoration(id, DecorationCPacked))
+	{
+		const auto type = expression_type(id);
+		add_convert_row_major_matrix_function(type.columns, type.vecsize);
+	}
+
 	return true;
 }
 
@@ -2323,12 +2358,17 @@ bool CompilerMSL::member_is_non_native_row_major_matrix(const SPIRType &type, ui
 		return false;
 
 	// Non-matrix or column-major matrix types do not need to be converted.
-	if (!(combined_decoration_for_member(type, index) & (1ull << DecorationRowMajor)))
+	if (!has_member_decoration(type.self, index, DecorationRowMajor))
 		return false;
 
 	// Generate a function that will swap matrix elements from row-major to column-major.
-	const auto mbr_type = get<SPIRType>(type.member_types[index]);
-	add_convert_row_major_matrix_function(mbr_type.columns, mbr_type.vecsize);
+	// Packed row-matrix should just use transpose() function.
+	if (!has_member_decoration(type.self, index, DecorationCPacked))
+	{
+		const auto mbr_type = get<SPIRType>(type.member_types[index]);
+		add_convert_row_major_matrix_function(mbr_type.columns, mbr_type.vecsize);
+	}
+
 	return true;
 }
 
@@ -2355,20 +2395,19 @@ void CompilerMSL::add_convert_row_major_matrix_function(uint32_t cols, uint32_t
 
 	auto rslt = spv_function_implementations.insert(spv_func);
 	if (rslt.second)
-	{
 		add_pragma_line("#pragma clang diagnostic ignored \"-Wmissing-prototypes\"");
-		force_recompile = true;
-	}
 }
 
 // Wraps the expression string in a function call that converts the
 // row_major matrix result of the expression to a column_major matrix.
-string CompilerMSL::convert_row_major_matrix(string exp_str, const SPIRType &exp_type)
+string CompilerMSL::convert_row_major_matrix(string exp_str, const SPIRType &exp_type, bool is_packed)
 {
 	strip_enclosed_expression(exp_str);
 
 	string func_name;
-	if (exp_type.columns == exp_type.vecsize)
+
+	// Square and packed matrices can just use transpose
+	if (exp_type.columns == exp_type.vecsize || is_packed)
 		func_name = "transpose";
 	else
 		func_name = string("spvConvertFromRowMajor") + to_string(exp_type.columns) + "x" + to_string(exp_type.vecsize);
@@ -2405,7 +2444,23 @@ void CompilerMSL::emit_struct_member(const SPIRType &type, uint32_t member_type_
 		statement("char pad", to_string(index), "[", to_string(pad_len), "];");
 
 	// If this member is packed, mark it as so.
-	string pack_pfx = member_is_packed_type(type, index) ? "packed_" : "";
+	string pack_pfx = "";
+	if (member_is_packed_type(type, index))
+	{
+		pack_pfx = "packed_";
+
+		// If we're packing a matrix, output an appropriate typedef
+		if (membertype.vecsize > 1 && membertype.columns > 1)
+		{
+			string base_type = membertype.width == 16 ? "half" : "float";
+			string td_line = "typedef ";
+			td_line += base_type + to_string(membertype.vecsize) + "x" + to_string(membertype.columns);
+			td_line += " " + pack_pfx;
+			td_line += base_type + to_string(membertype.columns) + "x" + to_string(membertype.vecsize);
+			td_line += ";";
+			add_typedef_line(td_line);
+		}
+	}
 
 	statement(pack_pfx, type_to_glsl(membertype), " ", qualifier, to_member_name(type, index),
 	          member_attribute_qualifier(type, index), type_to_array_glsl(membertype), ";");
@@ -3369,7 +3424,6 @@ string CompilerMSL::built_in_func_arg(BuiltIn builtin, bool prefix_comma)
 // Returns the byte size of a struct member.
 size_t CompilerMSL::get_declared_struct_member_size(const SPIRType &struct_type, uint32_t index) const
 {
-	auto dec_mask = get_member_decoration_mask(struct_type.self, index);
 	auto &type = get<SPIRType>(struct_type.member_types[index]);
 
 	switch (type.basetype)
@@ -3384,10 +3438,6 @@ size_t CompilerMSL::get_declared_struct_member_size(const SPIRType &struct_type,
 
 	default:
 	{
-		size_t component_size = type.width / 8;
-		unsigned vecsize = type.vecsize;
-		unsigned columns = type.columns;
-
 		// For arrays, we can use ArrayStride to get an easy check.
 		// Runtime arrays will have zero size so force to min of one.
 		if (!type.array.empty())
@@ -3396,29 +3446,15 @@ size_t CompilerMSL::get_declared_struct_member_size(const SPIRType &struct_type,
 		if (type.basetype == SPIRType::Struct)
 			return get_declared_struct_size(type);
 
-		if (columns == 1) // An unpacked 3-element vector is the same size as a 4-element vector.
-		{
-			if (!(dec_mask & (1ull << DecorationCPacked)))
-			{
-				if (vecsize == 3)
-					vecsize = 4;
-			}
-		}
-		else // For matrices, a 3-element column is the same size as a 4-element column.
-		{
-			if (dec_mask & (1ull << DecorationColMajor))
-			{
-				if (vecsize == 3)
-					vecsize = 4;
-			}
-			else if (dec_mask & (1ull << DecorationRowMajor))
-			{
-				if (columns == 3)
-					columns = 4;
-			}
-		}
+		uint32_t component_size = type.width / 8;
+		uint32_t vecsize = type.vecsize;
+		uint32_t columns = type.columns;
 
-		return vecsize * columns * component_size;
+		// An unpacked 3-element vector or matrix column is the same memory size as a 4-element.
+		if (vecsize == 3 && !has_member_decoration(struct_type.self, index, DecorationCPacked))
+			vecsize = 4;
+
+		return component_size * vecsize * columns;
 	}
 	}
 }
@@ -3443,16 +3479,13 @@ size_t CompilerMSL::get_declared_struct_member_alignment(const SPIRType &struct_
 
 	default:
 	{
-		// Alignment of packed type is the same as the underlying component size.
-		// Alignment of unpacked type is the same as the type size (or one matrix column).
+		// Alignment of packed type is the same as the underlying component or column size.
+		// Alignment of unpacked type is the same as the vector size.
+		// Alignment of 3-elements vector is the same as 4-elements (including packed using column).
 		if (member_is_packed_type(struct_type, index))
-			return type.width / 8;
+			return (type.width / 8) * (type.columns == 3 ? 4 : type.columns);
 		else
-		{
-			// Divide by array size and colum count. Runtime arrays will have zero size so force to min of one.
-			uint32_t array_size = type.array.empty() ? 1 : max(type.array.back(), 1U);
-			return get_declared_struct_member_size(struct_type, index) / (type.columns * array_size);
-		}
+			return (type.width / 8) * (type.vecsize == 3 ? 4 : type.vecsize);
 	}
 	}
 }
diff --git a/spirv_msl.hpp b/spirv_msl.hpp
index 6f66f31..cfa16d1 100644
--- a/spirv_msl.hpp
+++ b/spirv_msl.hpp
@@ -210,7 +210,7 @@ protected:
 	void declare_undefined_values() override;
 	bool is_non_native_row_major_matrix(uint32_t id) override;
 	bool member_is_non_native_row_major_matrix(const SPIRType &type, uint32_t index) override;
-	std::string convert_row_major_matrix(std::string exp_str, const SPIRType &exp_type) override;
+	std::string convert_row_major_matrix(std::string exp_str, const SPIRType &exp_type, bool is_packed) override;
 
 	void preprocess_op_codes();
 	void localize_global_variables();
@@ -264,6 +264,7 @@ protected:
 	                         bool op1_is_pointer = false, uint32_t op2 = 0);
 	const char *get_memory_order(uint32_t spv_mem_sem);
 	void add_pragma_line(const std::string &line);
+	void add_typedef_line(const std::string &line);
 	void emit_barrier(uint32_t id_exe_scope, uint32_t id_mem_scope, uint32_t id_mem_sem);
 
 	Options options;
@@ -272,6 +273,7 @@ protected:
 	std::map<uint32_t, uint32_t> non_stage_in_input_var_ids;
 	std::unordered_map<MSLStructMemberKey, uint32_t> struct_member_padding;
 	std::set<std::string> pragma_lines;
+	std::set<std::string> typedef_lines;
 	std::vector<MSLResourceBinding *> resource_bindings;
 	MSLResourceBinding next_metal_resource_index;
 	uint32_t stage_in_var_id = 0;