From 9436cd30367540b9b0f5179cc4c8033c50bdbb34 Mon Sep 17 00:00:00 2001
From: Hans-Kristian Arntzen <post@arntzen-software.no>
Date: Tue, 27 Aug 2019 13:16:16 +0200
Subject: [PATCH] MSL: Deal with array copies from and to threadgroup.

---
 .../frag/lut-promotion-initializer.asm.frag   |  57 +++++-
 ...input-array-builtin-array.invalid.asm.tesc |  57 +++++-
 .../comp/composite-array-initialization.comp  |  57 +++++-
 .../shaders-msl/comp/composite-construct.comp |  57 +++++-
 .../opt/shaders-msl/frag/lut-promotion.frag   |  55 +++++-
 .../comp/array-copy-threadgroup-memory.comp   |  79 +++++++++
 .../vert/pass-array-by-value.vert             |  59 ++++++-
 .../frag/lut-promotion-initializer.asm.frag   |  57 +++++-
 .../frag/single-function-private-lut.asm.frag |  55 +++++-
 ...input-array-builtin-array.invalid.asm.tesc |  59 ++++++-
 .../comp/composite-array-initialization.comp  |  57 +++++-
 .../shaders-msl/comp/composite-construct.comp |  57 +++++-
 .../comp/copy-array-of-arrays.comp            | 163 +++++++++++++++---
 .../shaders-msl/frag/constant-composites.frag |  55 +++++-
 reference/shaders-msl/frag/lut-promotion.frag |  57 +++++-
 reference/shaders-msl/vert/return-array.vert  |  59 ++++++-
 .../comp/array-copy-threadgroup-memory.comp   |  18 ++
 spirv_cross.cpp                               |   9 +
 spirv_cross.hpp                               |   1 +
 spirv_glsl.cpp                                |   9 +-
 spirv_glsl.hpp                                |   3 +-
 spirv_msl.cpp                                 |  83 ++++++---
 spirv_msl.hpp                                 |   3 +-
 23 files changed, 998 insertions(+), 168 deletions(-)
 create mode 100644 reference/shaders-msl-no-opt/comp/array-copy-threadgroup-memory.comp
 create mode 100644 shaders-msl-no-opt/comp/array-copy-threadgroup-memory.comp
diff --git a/reference/opt/shaders-msl/asm/frag/lut-promotion-initializer.asm.frag b/reference/opt/shaders-msl/asm/frag/lut-promotion-initializer.asm.frag
index 610d447a..650887df 100644
--- a/reference/opt/shaders-msl/asm/frag/lut-promotion-initializer.asm.frag
+++ b/reference/opt/shaders-msl/asm/frag/lut-promotion-initializer.asm.frag
@@ -19,17 +19,58 @@ struct main0_in
     int index [[user(locn0)]];
 };
 
-// Implementation of an array copy function to cover GLSL's ability to copy an array via assignment.
-template<typename T, uint N>
-void spvArrayCopyFromStack1(thread T (&dst)[N], thread const T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
-template<typename T, uint N>
-void spvArrayCopyFromConstant1(thread T (&dst)[N], constant T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
 fragment main0_out main0(main0_in in [[stage_in]])
@@ -61,7 +102,7 @@ fragment main0_out main0(main0_in in [[stage_in]])
     }
     int _37 = in.index & 3;
     out.FragColor += foobar[_37].z;
-    spvArrayCopyFromConstant1(baz, _90);
+    spvArrayCopyFromConstantToStack1(baz, _90);
     out.FragColor += baz[_37].z;
     return out;
 }
diff --git a/reference/opt/shaders-msl/asm/tesc/tess-fixed-input-array-builtin-array.invalid.asm.tesc b/reference/opt/shaders-msl/asm/tesc/tess-fixed-input-array-builtin-array.invalid.asm.tesc
index bbda7be5..342172d3 100644
--- a/reference/opt/shaders-msl/asm/tesc/tess-fixed-input-array-builtin-array.invalid.asm.tesc
+++ b/reference/opt/shaders-msl/asm/tesc/tess-fixed-input-array-builtin-array.invalid.asm.tesc
@@ -33,17 +33,58 @@ struct main0_in
     float4 gl_Position [[attribute(1)]];
 };
 
-// Implementation of an array copy function to cover GLSL's ability to copy an array via assignment.
-template<typename T, uint N>
-void spvArrayCopyFromStack1(thread T (&dst)[N], thread const T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
-template<typename T, uint N>
-void spvArrayCopyFromConstant1(thread T (&dst)[N], constant T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
 kernel void main0(main0_in in [[stage_in]], uint gl_InvocationID [[thread_index_in_threadgroup]], uint gl_PrimitiveID [[threadgroup_position_in_grid]], device main0_out* spvOut [[buffer(28)]], constant uint* spvIndirectParams [[buffer(29)]], device MTLTriangleTessellationFactorsHalf* spvTessLevel [[buffer(26)]], threadgroup main0_in* gl_in [[threadgroup(0)]])
@@ -56,7 +97,7 @@ kernel void main0(main0_in in [[stage_in]], uint gl_InvocationID [[thread_index_
         return;
     VertexOutput _223[3] = { VertexOutput{ gl_in[0].gl_Position, gl_in[0].VertexOutput_uv }, VertexOutput{ gl_in[1].gl_Position, gl_in[1].VertexOutput_uv }, VertexOutput{ gl_in[2].gl_Position, gl_in[2].VertexOutput_uv } };
     VertexOutput param[3];
-    spvArrayCopyFromStack1(param, _223);
+    spvArrayCopyFromStackToStack1(param, _223);
     gl_out[gl_InvocationID].gl_Position = param[gl_InvocationID].pos;
     gl_out[gl_InvocationID]._entryPointOutput.uv = param[gl_InvocationID].uv;
     threadgroup_barrier(mem_flags::mem_device);
diff --git a/reference/opt/shaders-msl/comp/composite-array-initialization.comp b/reference/opt/shaders-msl/comp/composite-array-initialization.comp
index 8dec8bdd..bc60fbc2 100644
--- a/reference/opt/shaders-msl/comp/composite-array-initialization.comp
+++ b/reference/opt/shaders-msl/comp/composite-array-initialization.comp
@@ -29,24 +29,65 @@ constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(2u, 1u, 1u);
 
 constant Data _25[2] = { Data{ 1.0, 2.0 }, Data{ 3.0, 4.0 } };
 
-// Implementation of an array copy function to cover GLSL's ability to copy an array via assignment.
-template<typename T, uint N>
-void spvArrayCopyFromStack1(thread T (&dst)[N], thread const T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
-template<typename T, uint N>
-void spvArrayCopyFromConstant1(thread T (&dst)[N], constant T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
 kernel void main0(device SSBO& _53 [[buffer(0)]], uint3 gl_WorkGroupID [[threadgroup_position_in_grid]], uint3 gl_LocalInvocationID [[thread_position_in_threadgroup]])
 {
     Data _31[2] = { Data{ X, 2.0 }, Data{ 3.0, 5.0 } };
     Data data2[2];
-    spvArrayCopyFromStack1(data2, _31);
+    spvArrayCopyFromStackToStack1(data2, _31);
     _53.outdata[gl_WorkGroupID.x].a = _25[gl_LocalInvocationID.x].a + data2[gl_LocalInvocationID.x].a;
     _53.outdata[gl_WorkGroupID.x].b = _25[gl_LocalInvocationID.x].b + data2[gl_LocalInvocationID.x].b;
 }
diff --git a/reference/opt/shaders-msl/comp/composite-construct.comp b/reference/opt/shaders-msl/comp/composite-construct.comp
index 6d44fc57..3c8e0956 100644
--- a/reference/opt/shaders-msl/comp/composite-construct.comp
+++ b/reference/opt/shaders-msl/comp/composite-construct.comp
@@ -15,24 +15,65 @@ struct SSBO1
     float4 bs[1];
 };
 
-// Implementation of an array copy function to cover GLSL's ability to copy an array via assignment.
-template<typename T, uint N>
-void spvArrayCopyFromStack1(thread T (&dst)[N], thread const T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
-template<typename T, uint N>
-void spvArrayCopyFromConstant1(thread T (&dst)[N], constant T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
 kernel void main0(device SSBO0& _16 [[buffer(0)]], device SSBO1& _32 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]])
 {
     float4 _37[2] = { _16.as[gl_GlobalInvocationID.x], _32.bs[gl_GlobalInvocationID.x] };
     float4 values[2];
-    spvArrayCopyFromStack1(values, _37);
+    spvArrayCopyFromStackToStack1(values, _37);
     _16.as[0] = values[gl_LocalInvocationIndex];
     _32.bs[1] = float4(40.0);
 }
diff --git a/reference/opt/shaders-msl/frag/lut-promotion.frag b/reference/opt/shaders-msl/frag/lut-promotion.frag
index c9169b79..6c79a12e 100644
--- a/reference/opt/shaders-msl/frag/lut-promotion.frag
+++ b/reference/opt/shaders-msl/frag/lut-promotion.frag
@@ -19,17 +19,58 @@ struct main0_in
     int index [[user(locn0)]];
 };
 
-// Implementation of an array copy function to cover GLSL's ability to copy an array via assignment.
-template<typename T, uint N>
-void spvArrayCopyFromStack1(thread T (&dst)[N], thread const T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
-template<typename T, uint N>
-void spvArrayCopyFromConstant1(thread T (&dst)[N], constant T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
 fragment main0_out main0(main0_in in [[stage_in]])
diff --git a/reference/shaders-msl-no-opt/comp/array-copy-threadgroup-memory.comp b/reference/shaders-msl-no-opt/comp/array-copy-threadgroup-memory.comp
new file mode 100644
index 00000000..62ee1fef
--- /dev/null
+++ b/reference/shaders-msl-no-opt/comp/array-copy-threadgroup-memory.comp
@@ -0,0 +1,79 @@
+#pragma clang diagnostic ignored "-Wmissing-prototypes"
+
+#include <metal_stdlib>
+#include <simd/simd.h>
+
+using namespace metal;
+
+constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(8u, 1u, 1u);
+
+template<typename T, uint A>
+void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+kernel void main0(uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]])
+{
+    threadgroup float shared_group[8][8];
+    threadgroup float shared_group_alt[8][8];
+    float blob[8];
+    for (int i = 0; i < 8; i++)
+    {
+        blob[i] = float(i);
+    }
+    spvArrayCopyFromStackToThreadGroup1(shared_group[gl_LocalInvocationIndex], blob);
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    float copied_blob[8];
+    spvArrayCopyFromThreadGroupToStack1(copied_blob, shared_group[gl_LocalInvocationIndex ^ 1u]);
+    spvArrayCopyFromThreadGroupToThreadGroup1(shared_group_alt[gl_LocalInvocationIndex], shared_group[gl_LocalInvocationIndex]);
+}
+
diff --git a/reference/shaders-msl-no-opt/vert/pass-array-by-value.vert b/reference/shaders-msl-no-opt/vert/pass-array-by-value.vert
index ab563136..67d55bf2 100644
--- a/reference/shaders-msl-no-opt/vert/pass-array-by-value.vert
+++ b/reference/shaders-msl-no-opt/vert/pass-array-by-value.vert
@@ -18,25 +18,66 @@ struct main0_in
     int Index2 [[attribute(1)]];
 };
 
-// Implementation of an array copy function to cover GLSL's ability to copy an array via assignment.
-template<typename T, uint N>
-void spvArrayCopyFromStack1(thread T (&dst)[N], thread const T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
-template<typename T, uint N>
-void spvArrayCopyFromConstant1(thread T (&dst)[N], constant T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
 float4 consume_constant_arrays2(thread const float4 (&positions)[4], thread const float4 (&positions2)[4], thread int& Index1, thread int& Index2)
 {
     float4 indexable[4];
-    spvArrayCopyFromStack1(indexable, positions);
+    spvArrayCopyFromStackToStack1(indexable, positions);
     float4 indexable_1[4];
-    spvArrayCopyFromStack1(indexable_1, positions2);
+    spvArrayCopyFromStackToStack1(indexable_1, positions2);
     return indexable[Index1] + indexable_1[Index2];
 }
 
diff --git a/reference/shaders-msl/asm/frag/lut-promotion-initializer.asm.frag b/reference/shaders-msl/asm/frag/lut-promotion-initializer.asm.frag
index 48f3317d..cd87f5b0 100644
--- a/reference/shaders-msl/asm/frag/lut-promotion-initializer.asm.frag
+++ b/reference/shaders-msl/asm/frag/lut-promotion-initializer.asm.frag
@@ -19,17 +19,58 @@ struct main0_in
     int index [[user(locn0)]];
 };
 
-// Implementation of an array copy function to cover GLSL's ability to copy an array via assignment.
-template<typename T, uint N>
-void spvArrayCopyFromStack1(thread T (&dst)[N], thread const T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
-template<typename T, uint N>
-void spvArrayCopyFromConstant1(thread T (&dst)[N], constant T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
 fragment main0_out main0(main0_in in [[stage_in]])
@@ -59,7 +100,7 @@ fragment main0_out main0(main0_in in [[stage_in]])
         foobar[1].z = 20.0;
     }
     out.FragColor += foobar[in.index & 3].z;
-    spvArrayCopyFromConstant1(baz, _90);
+    spvArrayCopyFromConstantToStack1(baz, _90);
     out.FragColor += baz[in.index & 3].z;
     return out;
 }
diff --git a/reference/shaders-msl/asm/frag/single-function-private-lut.asm.frag b/reference/shaders-msl/asm/frag/single-function-private-lut.asm.frag
index 628d5c7c..15e0749e 100644
--- a/reference/shaders-msl/asm/frag/single-function-private-lut.asm.frag
+++ b/reference/shaders-msl/asm/frag/single-function-private-lut.asm.frag
@@ -24,17 +24,58 @@ Tx mod(Tx x, Ty y)
     return x - y * floor(x / y);
 }
 
-// Implementation of an array copy function to cover GLSL's ability to copy an array via assignment.
-template<typename T, uint N>
-void spvArrayCopyFromStack1(thread T (&dst)[N], thread const T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
-template<typename T, uint N>
-void spvArrayCopyFromConstant1(thread T (&dst)[N], constant T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
 fragment main0_out main0(float4 gl_FragCoord [[position]])
diff --git a/reference/shaders-msl/asm/tesc/tess-fixed-input-array-builtin-array.invalid.asm.tesc b/reference/shaders-msl/asm/tesc/tess-fixed-input-array-builtin-array.invalid.asm.tesc
index 6a765117..9beb7c00 100644
--- a/reference/shaders-msl/asm/tesc/tess-fixed-input-array-builtin-array.invalid.asm.tesc
+++ b/reference/shaders-msl/asm/tesc/tess-fixed-input-array-builtin-array.invalid.asm.tesc
@@ -45,17 +45,58 @@ struct main0_in
     float4 gl_Position [[attribute(1)]];
 };
 
-// Implementation of an array copy function to cover GLSL's ability to copy an array via assignment.
-template<typename T, uint N>
-void spvArrayCopyFromStack1(thread T (&dst)[N], thread const T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
-template<typename T, uint N>
-void spvArrayCopyFromConstant1(thread T (&dst)[N], constant T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
 HSOut _hs_main(thread const VertexOutput (&p)[3], thread const uint& i)
@@ -93,7 +134,7 @@ kernel void main0(main0_in in [[stage_in]], uint gl_InvocationID [[thread_index_
     p[2].uv = gl_in[2].VertexOutput_uv;
     uint i = gl_InvocationID;
     VertexOutput param[3];
-    spvArrayCopyFromStack1(param, p);
+    spvArrayCopyFromStackToStack1(param, p);
     uint param_1 = i;
     HSOut flattenTemp = _hs_main(param, param_1);
     gl_out[gl_InvocationID].gl_Position = flattenTemp.pos;
@@ -102,7 +143,7 @@ kernel void main0(main0_in in [[stage_in]], uint gl_InvocationID [[thread_index_
     if (int(gl_InvocationID) == 0)
     {
         VertexOutput param_2[3];
-        spvArrayCopyFromStack1(param_2, p);
+        spvArrayCopyFromStackToStack1(param_2, p);
         HSConstantOut _patchConstantResult = PatchHS(param_2);
         spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[0] = half(_patchConstantResult.EdgeTess[0]);
         spvTessLevel[gl_PrimitiveID].edgeTessellationFactor[1] = half(_patchConstantResult.EdgeTess[1]);
diff --git a/reference/shaders-msl/comp/composite-array-initialization.comp b/reference/shaders-msl/comp/composite-array-initialization.comp
index ac10e750..33ed8702 100644
--- a/reference/shaders-msl/comp/composite-array-initialization.comp
+++ b/reference/shaders-msl/comp/composite-array-initialization.comp
@@ -29,17 +29,58 @@ constant uint3 gl_WorkGroupSize [[maybe_unused]] = uint3(2u, 1u, 1u);
 
 constant Data _25[2] = { Data{ 1.0, 2.0 }, Data{ 3.0, 4.0 } };
 
-// Implementation of an array copy function to cover GLSL's ability to copy an array via assignment.
-template<typename T, uint N>
-void spvArrayCopyFromStack1(thread T (&dst)[N], thread const T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
-template<typename T, uint N>
-void spvArrayCopyFromConstant1(thread T (&dst)[N], constant T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
 Data combine(thread const Data& a, thread const Data& b)
@@ -52,7 +93,7 @@ kernel void main0(device SSBO& _53 [[buffer(0)]], uint3 gl_WorkGroupID [[threadg
     Data data[2] = { Data{ 1.0, 2.0 }, Data{ 3.0, 4.0 } };
     Data _31[2] = { Data{ X, 2.0 }, Data{ 3.0, 5.0 } };
     Data data2[2];
-    spvArrayCopyFromStack1(data2, _31);
+    spvArrayCopyFromStackToStack1(data2, _31);
     Data param = data[gl_LocalInvocationID.x];
     Data param_1 = data2[gl_LocalInvocationID.x];
     Data _73 = combine(param, param_1);
diff --git a/reference/shaders-msl/comp/composite-construct.comp b/reference/shaders-msl/comp/composite-construct.comp
index 4b5ea37e..bc1bf8a6 100644
--- a/reference/shaders-msl/comp/composite-construct.comp
+++ b/reference/shaders-msl/comp/composite-construct.comp
@@ -23,24 +23,65 @@ struct Composite
 
 constant float4 _43[2] = { float4(20.0), float4(40.0) };
 
-// Implementation of an array copy function to cover GLSL's ability to copy an array via assignment.
-template<typename T, uint N>
-void spvArrayCopyFromStack1(thread T (&dst)[N], thread const T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
-template<typename T, uint N>
-void spvArrayCopyFromConstant1(thread T (&dst)[N], constant T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
 kernel void main0(device SSBO0& _16 [[buffer(0)]], device SSBO1& _32 [[buffer(1)]], uint3 gl_GlobalInvocationID [[thread_position_in_grid]], uint gl_LocalInvocationIndex [[thread_index_in_threadgroup]])
 {
     float4 _37[2] = { _16.as[gl_GlobalInvocationID.x], _32.bs[gl_GlobalInvocationID.x] };
     float4 values[2];
-    spvArrayCopyFromStack1(values, _37);
+    spvArrayCopyFromStackToStack1(values, _37);
     Composite c = Composite{ values[0], _43[1] };
     _16.as[0] = values[gl_LocalInvocationIndex];
     _32.bs[1] = c.b;
diff --git a/reference/shaders-msl/comp/copy-array-of-arrays.comp b/reference/shaders-msl/comp/copy-array-of-arrays.comp
index 0fa6c6bf..b5f6fced 100644
--- a/reference/shaders-msl/comp/copy-array-of-arrays.comp
+++ b/reference/shaders-msl/comp/copy-array-of-arrays.comp
@@ -17,59 +17,172 @@ constant float _19[2] = { 3.0, 4.0 };
 constant float _20[2][2] = { { 1.0, 2.0 }, { 3.0, 4.0 } };
 constant float _21[2][2][2] = { { { 1.0, 2.0 }, { 3.0, 4.0 } }, { { 1.0, 2.0 }, { 3.0, 4.0 } } };
 
-// Implementation of an array copy function to cover GLSL's ability to copy an array via assignment.
-template<typename T, uint N>
-void spvArrayCopyFromStack1(thread T (&dst)[N], thread const T (&src)[N])
-{
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
-}
-
-template<typename T, uint N>
-void spvArrayCopyFromConstant1(thread T (&dst)[N], constant T (&src)[N])
-{
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
-}
-
-template<typename T, uint A, uint B>
-void spvArrayCopyFromStack2(thread T (&dst)[A][B], thread const T (&src)[A][B])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
 {
     for (uint i = 0; i < A; i++)
     {
-        spvArrayCopyFromStack1(dst[i], src[i]);
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
     }
 }
 
 template<typename T, uint A, uint B>
-void spvArrayCopyFromConstant2(thread T (&dst)[A][B], constant T (&src)[A][B])
+void spvArrayCopyFromConstantToStack2(thread T (&dst)[A][B], constant T (&src)[A][B])
 {
     for (uint i = 0; i < A; i++)
     {
-        spvArrayCopyFromConstant1(dst[i], src[i]);
+        spvArrayCopyFromConstantToStack1(dst[i], src[i]);
+    }
+}
+
+template<typename T, uint A, uint B>
+void spvArrayCopyFromConstantToThreadGroup2(threadgroup T (&dst)[A][B], constant T (&src)[A][B])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        spvArrayCopyFromConstantToThreadGroup1(dst[i], src[i]);
+    }
+}
+
+template<typename T, uint A, uint B>
+void spvArrayCopyFromStackToStack2(thread T (&dst)[A][B], thread const T (&src)[A][B])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        spvArrayCopyFromStackToStack1(dst[i], src[i]);
+    }
+}
+
+template<typename T, uint A, uint B>
+void spvArrayCopyFromStackToThreadGroup2(threadgroup T (&dst)[A][B], thread const T (&src)[A][B])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        spvArrayCopyFromStackToThreadGroup1(dst[i], src[i]);
+    }
+}
+
+template<typename T, uint A, uint B>
+void spvArrayCopyFromThreadGroupToStack2(thread T (&dst)[A][B], threadgroup const T (&src)[A][B])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        spvArrayCopyFromThreadGroupToStack1(dst[i], src[i]);
+    }
+}
+
+template<typename T, uint A, uint B>
+void spvArrayCopyFromThreadGroupToThreadGroup2(threadgroup T (&dst)[A][B], threadgroup const T (&src)[A][B])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        spvArrayCopyFromThreadGroupToThreadGroup1(dst[i], src[i]);
     }
 }
 
 template<typename T, uint A, uint B, uint C>
-void spvArrayCopyFromStack3(thread T (&dst)[A][B][C], thread const T (&src)[A][B][C])
+void spvArrayCopyFromConstantToStack3(thread T (&dst)[A][B][C], constant T (&src)[A][B][C])
 {
     for (uint i = 0; i < A; i++)
     {
-        spvArrayCopyFromStack2(dst[i], src[i]);
+        spvArrayCopyFromConstantToStack2(dst[i], src[i]);
     }
 }
 
 template<typename T, uint A, uint B, uint C>
-void spvArrayCopyFromConstant3(thread T (&dst)[A][B][C], constant T (&src)[A][B][C])
+void spvArrayCopyFromConstantToThreadGroup3(threadgroup T (&dst)[A][B][C], constant T (&src)[A][B][C])
 {
     for (uint i = 0; i < A; i++)
     {
-        spvArrayCopyFromConstant2(dst[i], src[i]);
+        spvArrayCopyFromConstantToThreadGroup2(dst[i], src[i]);
+    }
+}
+
+template<typename T, uint A, uint B, uint C>
+void spvArrayCopyFromStackToStack3(thread T (&dst)[A][B][C], thread const T (&src)[A][B][C])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        spvArrayCopyFromStackToStack2(dst[i], src[i]);
+    }
+}
+
+template<typename T, uint A, uint B, uint C>
+void spvArrayCopyFromStackToThreadGroup3(threadgroup T (&dst)[A][B][C], thread const T (&src)[A][B][C])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        spvArrayCopyFromStackToThreadGroup2(dst[i], src[i]);
+    }
+}
+
+template<typename T, uint A, uint B, uint C>
+void spvArrayCopyFromThreadGroupToStack3(thread T (&dst)[A][B][C], threadgroup const T (&src)[A][B][C])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        spvArrayCopyFromThreadGroupToStack2(dst[i], src[i]);
+    }
+}
+
+template<typename T, uint A, uint B, uint C>
+void spvArrayCopyFromThreadGroupToThreadGroup3(threadgroup T (&dst)[A][B][C], threadgroup const T (&src)[A][B][C])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        spvArrayCopyFromThreadGroupToThreadGroup2(dst[i], src[i]);
     }
 }
 
 kernel void main0(device BUF& o [[buffer(0)]])
 {
     float c[2][2][2];
-    spvArrayCopyFromConstant3(c, _21);
+    spvArrayCopyFromConstantToStack3(c, _21);
     o.a = int(c[1][1][1]);
     float _43[2] = { o.b, o.c };
     float _48[2] = { o.b, o.b };
@@ -79,9 +192,9 @@ kernel void main0(device BUF& o [[buffer(0)]])
     float _60[2][2] = { { _54[0], _54[1] }, { _59[0], _59[1] } };
     float _61[2][2][2] = { { { _49[0][0], _49[0][1] }, { _49[1][0], _49[1][1] } }, { { _60[0][0], _60[0][1] }, { _60[1][0], _60[1][1] } } };
     float d[2][2][2];
-    spvArrayCopyFromStack3(d, _61);
+    spvArrayCopyFromStackToStack3(d, _61);
     float e[2][2][2];
-    spvArrayCopyFromStack3(e, d);
+    spvArrayCopyFromStackToStack3(e, d);
     o.b = e[1][0][1];
 }
 
diff --git a/reference/shaders-msl/frag/constant-composites.frag b/reference/shaders-msl/frag/constant-composites.frag
index 504beaa0..e5b3574a 100644
--- a/reference/shaders-msl/frag/constant-composites.frag
+++ b/reference/shaders-msl/frag/constant-composites.frag
@@ -24,17 +24,58 @@ struct main0_in
     int line [[user(locn0)]];
 };
 
-// Implementation of an array copy function to cover GLSL's ability to copy an array via assignment.
-template<typename T, uint N>
-void spvArrayCopyFromStack1(thread T (&dst)[N], thread const T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
-template<typename T, uint N>
-void spvArrayCopyFromConstant1(thread T (&dst)[N], constant T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
 fragment main0_out main0(main0_in in [[stage_in]])
diff --git a/reference/shaders-msl/frag/lut-promotion.frag b/reference/shaders-msl/frag/lut-promotion.frag
index f7e51edb..d71214f2 100644
--- a/reference/shaders-msl/frag/lut-promotion.frag
+++ b/reference/shaders-msl/frag/lut-promotion.frag
@@ -19,17 +19,58 @@ struct main0_in
     int index [[user(locn0)]];
 };
 
-// Implementation of an array copy function to cover GLSL's ability to copy an array via assignment.
-template<typename T, uint N>
-void spvArrayCopyFromStack1(thread T (&dst)[N], thread const T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
-template<typename T, uint N>
-void spvArrayCopyFromConstant1(thread T (&dst)[N], constant T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
 fragment main0_out main0(main0_in in [[stage_in]])
@@ -59,7 +100,7 @@ fragment main0_out main0(main0_in in [[stage_in]])
     }
     out.FragColor += foobar[in.index & 3].z;
     float4 baz[4] = { float4(0.0), float4(1.0), float4(8.0), float4(5.0) };
-    spvArrayCopyFromConstant1(baz, _104);
+    spvArrayCopyFromConstantToStack1(baz, _104);
     out.FragColor += baz[in.index & 3].z;
     return out;
 }
diff --git a/reference/shaders-msl/vert/return-array.vert b/reference/shaders-msl/vert/return-array.vert
index cd06fdda..fa0b4c69 100644
--- a/reference/shaders-msl/vert/return-array.vert
+++ b/reference/shaders-msl/vert/return-array.vert
@@ -18,22 +18,63 @@ struct main0_in
     float4 vInput1 [[attribute(1)]];
 };
 
-// Implementation of an array copy function to cover GLSL's ability to copy an array via assignment.
-template<typename T, uint N>
-void spvArrayCopyFromStack1(thread T (&dst)[N], thread const T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToStack1(thread T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
-template<typename T, uint N>
-void spvArrayCopyFromConstant1(thread T (&dst)[N], constant T (&src)[N])
+template<typename T, uint A>
+void spvArrayCopyFromConstantToThreadGroup1(threadgroup T (&dst)[A], constant T (&src)[A])
 {
-    for (uint i = 0; i < N; dst[i] = src[i], i++);
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToStack1(thread T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromStackToThreadGroup1(threadgroup T (&dst)[A], thread const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToStack1(thread T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T, uint A>
+void spvArrayCopyFromThreadGroupToThreadGroup1(threadgroup T (&dst)[A], threadgroup const T (&src)[A])
+{
+    for (uint i = 0; i < A; i++)
+    {
+        dst[i] = src[i];
+    }
 }
 
 void test(thread float4 (&SPIRV_Cross_return_value)[2])
 {
-    spvArrayCopyFromConstant1(SPIRV_Cross_return_value, _20);
+    spvArrayCopyFromConstantToStack1(SPIRV_Cross_return_value, _20);
 }
 
 void test2(thread float4 (&SPIRV_Cross_return_value)[2], thread float4& vInput0, thread float4& vInput1)
@@ -41,7 +82,7 @@ void test2(thread float4 (&SPIRV_Cross_return_value)[2], thread float4& vInput0,
     float4 foobar[2];
     foobar[0] = vInput0;
     foobar[1] = vInput1;
-    spvArrayCopyFromStack1(SPIRV_Cross_return_value, foobar);
+    spvArrayCopyFromStackToStack1(SPIRV_Cross_return_value, foobar);
 }
 
 vertex main0_out main0(main0_in in [[stage_in]])
diff --git a/shaders-msl-no-opt/comp/array-copy-threadgroup-memory.comp b/shaders-msl-no-opt/comp/array-copy-threadgroup-memory.comp
new file mode 100644
index 00000000..081c3962
--- /dev/null
+++ b/shaders-msl-no-opt/comp/array-copy-threadgroup-memory.comp
@@ -0,0 +1,18 @@
+#version 450
+layout(local_size_x = 8) in;
+
+shared float shared_group[8][8];
+shared float shared_group_alt[8][8];
+
+void main()
+{
+	float blob[8];
+	for (int i = 0; i < 8; i++)
+		blob[i] = float(i);
+	shared_group[gl_LocalInvocationIndex] = blob;
+
+	barrier();
+
+	float copied_blob[8] = shared_group[gl_LocalInvocationIndex ^ 1u];
+	shared_group_alt[gl_LocalInvocationIndex] = shared_group[gl_LocalInvocationIndex];
+}
diff --git a/spirv_cross.cpp b/spirv_cross.cpp
index a04a9469..c8a6e5c8 100644
--- a/spirv_cross.cpp
+++ b/spirv_cross.cpp
@@ -265,6 +265,15 @@ SPIRVariable *Compiler::maybe_get_backing_variable(uint32_t chain)
 	return var;
 }
 
+StorageClass Compiler::get_backing_variable_storage(uint32_t ptr)
+{
+	auto *var = maybe_get_backing_variable(ptr);
+	if (var)
+		return var->storage;
+	else
+		return expression_type(ptr).storage;
+}
+
 void Compiler::register_read(uint32_t expr, uint32_t chain, bool forwarded)
 {
 	auto &e = get<SPIRExpression>(expr);
diff --git a/spirv_cross.hpp b/spirv_cross.hpp
index 9701d4c2..73de7a5d 100644
--- a/spirv_cross.hpp
+++ b/spirv_cross.hpp
@@ -605,6 +605,7 @@ protected:
 	bool expression_is_lvalue(uint32_t id) const;
 	bool variable_storage_is_aliased(const SPIRVariable &var);
 	SPIRVariable *maybe_get_backing_variable(uint32_t chain);
+	spv::StorageClass get_backing_variable_storage(uint32_t ptr);
 
 	void register_read(uint32_t expr, uint32_t chain, bool forwarded);
 	void register_write(uint32_t chain);
diff --git a/spirv_glsl.cpp b/spirv_glsl.cpp
index c390218d..941edb7f 100644
--- a/spirv_glsl.cpp
+++ b/spirv_glsl.cpp
@@ -7888,7 +7888,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction)
 			// it is an array, and our backend does not support arrays as value types.
 			// Emit the temporary, and copy it explicitly.
 			e = &emit_uninitialized_temporary_expression(result_type, id);
-			emit_array_copy(to_expression(id), ptr);
+			emit_array_copy(to_expression(id), ptr, StorageClassFunction, get_backing_variable_storage(ptr));
 		}
 		else
 			e = &emit_op(result_type, id, expr, forward, !usage_tracking);
@@ -12232,7 +12232,10 @@ void CompilerGLSL::emit_block_chain(SPIRBlock &block)
 				// If we cannot return arrays, we will have a special out argument we can write to instead.
 				// The backend is responsible for setting this up, and redirection the return values as appropriate.
 				if (ir.ids[block.return_value].get_type() != TypeUndef)
-					emit_array_copy("SPIRV_Cross_return_value", block.return_value);
+				{
+					emit_array_copy("SPIRV_Cross_return_value", block.return_value,
+					                StorageClassFunction, get_backing_variable_storage(block.return_value));
+				}
 
 				if (!cfg.node_terminates_control_flow_in_sub_graph(current_function->entry_block, block.self) ||
 				    block.loop_dominator != SPIRBlock::NoDominator)
@@ -12427,7 +12430,7 @@ uint32_t CompilerGLSL::mask_relevant_memory_semantics(uint32_t semantics)
 	                    MemorySemanticsCrossWorkgroupMemoryMask | MemorySemanticsSubgroupMemoryMask);
 }
 
-void CompilerGLSL::emit_array_copy(const string &lhs, uint32_t rhs_id)
+void CompilerGLSL::emit_array_copy(const string &lhs, uint32_t rhs_id, StorageClass, StorageClass)
 {
 	statement(lhs, " = ", to_expression(rhs_id), ";");
 }
diff --git a/spirv_glsl.hpp b/spirv_glsl.hpp
index a43cae54..7d40f551 100644
--- a/spirv_glsl.hpp
+++ b/spirv_glsl.hpp
@@ -546,7 +546,8 @@ protected:
 	std::string layout_for_variable(const SPIRVariable &variable);
 	std::string to_combined_image_sampler(uint32_t image_id, uint32_t samp_id);
 	virtual bool skip_argument(uint32_t id) const;
-	virtual void emit_array_copy(const std::string &lhs, uint32_t rhs_id);
+	virtual void emit_array_copy(const std::string &lhs, uint32_t rhs_id,
+	                             spv::StorageClass lhs_storage, spv::StorageClass rhs_storage);
 	virtual void emit_block_hints(const SPIRBlock &block);
 	virtual std::string to_initializer_expression(const SPIRVariable &var);
 
diff --git a/spirv_msl.cpp b/spirv_msl.cpp
index f3b2ded7..8d365dc9 100644
--- a/spirv_msl.cpp
+++ b/spirv_msl.cpp
@@ -3185,40 +3185,41 @@ void CompilerMSL::emit_custom_functions()
 			break;
 
 		case SPVFuncImplArrayCopy:
-			statement("// Implementation of an array copy function to cover GLSL's ability to copy an array via "
-			          "assignment.");
-			statement("template<typename T, uint N>");
-			statement("void spvArrayCopyFromStack1(thread T (&dst)[N], thread const T (&src)[N])");
-			begin_scope();
-			statement("for (uint i = 0; i < N; dst[i] = src[i], i++);");
-			end_scope();
-			statement("");
-
-			statement("template<typename T, uint N>");
-			statement("void spvArrayCopyFromConstant1(thread T (&dst)[N], constant T (&src)[N])");
-			begin_scope();
-			statement("for (uint i = 0; i < N; dst[i] = src[i], i++);");
-			end_scope();
-			statement("");
-			break;
-
 		case SPVFuncImplArrayOfArrayCopy2Dim:
 		case SPVFuncImplArrayOfArrayCopy3Dim:
 		case SPVFuncImplArrayOfArrayCopy4Dim:
 		case SPVFuncImplArrayOfArrayCopy5Dim:
 		case SPVFuncImplArrayOfArrayCopy6Dim:
 		{
+			// Unfortunately we cannot template on the address space, so combinatorial explosion it is.
 			static const char *function_name_tags[] = {
-				"FromStack",
-				"FromConstant",
+				"FromConstantToStack",
+				"FromConstantToThreadGroup",
+				"FromStackToStack",
+				"FromStackToThreadGroup",
+				"FromThreadGroupToStack",
+				"FromThreadGroupToThreadGroup",
 			};
 
 			static const char *src_address_space[] = {
-				"thread const",
 				"constant",
+				"constant",
+				"thread const",
+				"thread const",
+				"threadgroup const",
+				"threadgroup const",
 			};
 
-			for (uint32_t variant = 0; variant < 2; variant++)
+			static const char *dst_address_space[] = {
+				"thread",
+				"threadgroup",
+				"thread",
+				"threadgroup",
+				"thread",
+				"threadgroup",
+			};
+
+			for (uint32_t variant = 0; variant < 6; variant++)
 			{
 				uint32_t dimensions = spv_func - SPVFuncImplArrayCopyMultidimBase;
 				string tmp = "template<typename T";
@@ -3238,17 +3239,24 @@ void CompilerMSL::emit_custom_functions()
 					array_arg += "]";
 				}
 
-				statement("void spvArrayCopy", function_name_tags[variant], dimensions, "(thread T (&dst)", array_arg,
+				statement("void spvArrayCopy", function_name_tags[variant], dimensions, "(", dst_address_space[variant],
+				          " T (&dst)",
+				          array_arg,
 				          ", ", src_address_space[variant], " T (&src)", array_arg, ")");
 
 				begin_scope();
 				statement("for (uint i = 0; i < A; i++)");
 				begin_scope();
-				statement("spvArrayCopy", function_name_tags[variant], dimensions - 1, "(dst[i], src[i]);");
+
+				if (dimensions == 1)
+					statement("dst[i] = src[i];");
+				else
+					statement("spvArrayCopy", function_name_tags[variant], dimensions - 1, "(dst[i], src[i]);");
 				end_scope();
 				end_scope();
 				statement("");
 			}
+
 			break;
 		}
 
@@ -4890,7 +4898,8 @@ void CompilerMSL::emit_barrier(uint32_t id_exe_scope, uint32_t id_mem_scope, uin
 	flush_all_active_variables();
 }
 
-void CompilerMSL::emit_array_copy(const string &lhs, uint32_t rhs_id)
+void CompilerMSL::emit_array_copy(const string &lhs, uint32_t rhs_id,
+                                  StorageClass lhs_storage, StorageClass rhs_storage)
 {
 	// Assignment from an array initializer is fine.
 	auto &type = expression_type(rhs_id);
@@ -4932,7 +4941,29 @@ void CompilerMSL::emit_array_copy(const string &lhs, uint32_t rhs_id)
 		force_recompile();
 	}
 
-	const char *tag = is_constant ? "FromConstant" : "FromStack";
+	bool lhs_thread = lhs_storage == StorageClassFunction ||
+	                  lhs_storage == StorageClassGeneric ||
+	                  lhs_storage == StorageClassPrivate;
+	bool rhs_thread = rhs_storage == StorageClassFunction ||
+	                  rhs_storage == StorageClassGeneric ||
+	                  rhs_storage == StorageClassPrivate;
+
+	const char *tag = nullptr;
+	if (lhs_thread && is_constant)
+		tag = "FromConstantToStack";
+	else if (lhs_storage == StorageClassWorkgroup && is_constant)
+		tag = "FromConstantToThreadGroup";
+	else if (lhs_thread && rhs_thread)
+		tag = "FromStackToStack";
+	else if (lhs_storage == StorageClassWorkgroup && rhs_thread)
+		tag = "FromStackToThreadGroup";
+	else if (lhs_thread && rhs_storage == StorageClassWorkgroup)
+		tag = "FromThreadGroupToStack";
+	else if (lhs_storage == StorageClassWorkgroup && rhs_storage == StorageClassWorkgroup)
+		tag = "FromThreadGroupToThreadGroup";
+	else
+		SPIRV_CROSS_THROW("Unknown storage class used for copying arrays.");
+
 	statement("spvArrayCopy", tag, type.array.size(), "(", lhs, ", ", to_expression(rhs_id), ");");
 }
 
@@ -4969,7 +5000,7 @@ bool CompilerMSL::maybe_emit_array_assignment(uint32_t id_lhs, uint32_t id_rhs)
 	if (p_v_lhs)
 		flush_variable_declaration(p_v_lhs->self);
 
-	emit_array_copy(to_expression(id_lhs), id_rhs);
+	emit_array_copy(to_expression(id_lhs), id_rhs, get_backing_variable_storage(id_lhs), get_backing_variable_storage(id_rhs));
 	register_write(id_lhs);
 
 	return true;
diff --git a/spirv_msl.hpp b/spirv_msl.hpp
index f0858c9d..1ca9c782 100644
--- a/spirv_msl.hpp
+++ b/spirv_msl.hpp
@@ -555,7 +555,8 @@ protected:
 	void add_pragma_line(const std::string &line);
 	void add_typedef_line(const std::string &line);
 	void emit_barrier(uint32_t id_exe_scope, uint32_t id_mem_scope, uint32_t id_mem_sem);
-	void emit_array_copy(const std::string &lhs, uint32_t rhs_id) override;
+	void emit_array_copy(const std::string &lhs, uint32_t rhs_id,
+	                     spv::StorageClass lhs_storage, spv::StorageClass rhs_storage) override;
 	void build_implicit_builtins();
 	uint32_t build_constant_uint_array_pointer();
 	void emit_entry_point_declarations() override;