Merge pull request #152 from Microsoft/dxil-v1.0

Merge dxil-v1.0 into master
2017-03-22 17:20:18 -07:00 · 2017-03-22 17:20:18 -07:00 · 027a3dcf3d
--- a/README.md
+++ b/README.md
@ -1,5 +1,7 @@
 # DirectX Shader Compiler

+[![Build status](https://ci.appveyor.com/api/projects/status/2wsw8t8clpgt1kfm?svg=true)](https://ci.appveyor.com/project/dmpots/directxshadercompiler)
+
 The DirectX Shader Compiler project includes a compiler and related tools used to compile High-Level Shader Language (HLSL) programs into DirectX Intermediate Language (DXIL) representation. Applications that make use of DirectX for graphics, games, and computation can use it to generate shader programs.

 For more information, see the [Wiki](https://github.com/Microsoft/DirectXShaderCompiler/wiki).
--- a/appveyor.yml
+++ b/appveyor.yml
@ -0,0 +1,30 @@
+version: 1.0.{build}
+image: Visual Studio 2017
+platform: x64
+configuration: Debug
+clone_folder: c:\projects\DirectXShaderCompiler
+environment:
+  HLSL_SRC_DIR: c:\projects\DirectXShaderCompiler
+  HLSL_BLD_DIR: c:\projects\DirectXShaderCompiler.bin
+install:
+- ps: c:\projects\DirectXShaderCompiler\utils\appveyor\appveyor_setup.ps1
+build_script:
+- cmd: >-
+    cd %HLSL_SRC_DIR%
+
+    call utils\hct\hctstart %HLSL_SRC_DIR% %HLSL_BLD_DIR%
+
+    call utils\hct\hctbuild -%PLATFORM% -%CONFIGURATION% -vs2017
+test_script:
+- cmd: >-
+    cd %HLSL_SRC_DIR%
+
+    call utils\hct\hctstart %HLSL_SRC_DIR% %HLSL_BLD_DIR%
+
+    powershell utils\appveyor\appveyor_test.ps1
+
+notifications:
+- provider: GitHubPullRequest
+  on_build_success: true
+  on_build_failure: true
+  on_build_status_changed: true
--- a/cmake/modules/FindDiaSDK.cmake
+++ b/cmake/modules/FindDiaSDK.cmake
@ -3,12 +3,23 @@ get_filename_component(VS_PATH32 "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Visu
 get_filename_component(VS_PATH64 "[HKEY_LOCAL_MACHINE\\SOFTWARE\\WOW6432Node\\Microsoft\\VisualStudio\\14.0;InstallDir]" ABSOLUTE CACHE)
 # VS_PATH32 will be something like C:/Program Files (x86)/Microsoft Visual Studio 14.0/Common7/IDE

+# Also look in the vs15 install.
+# TODO: update this to be a non-hardcoded path. Registry keys were removed
+# in vs15 in favor of COM server dlls.
+# https://blogs.msdn.microsoft.com/heaths/2016/09/15/changes-to-visual-studio-15-setup/
+get_filename_component(VS15_C_PATH32 "C:/Program Files (x86)/Microsoft Visual Studio/2017/Community/Common7/IDE" ABSOLUTE CACHE)
+get_filename_component(VS15_P_PATH32 "C:/Program Files (x86)/Microsoft Visual Studio/2017/Professional/Common7/IDE" ABSOLUTE CACHE)
+get_filename_component(VS15_E_PATH32 "C:/Program Files (x86)/Microsoft Visual Studio/2017/Enterprise/Common7/IDE" ABSOLUTE CACHE)
+
 # Find the DIA SDK path, it will typically look something like this.
 # C:\Program Files (x86)\Microsoft Visual Studio 14.0\DIA SDK\include\dia2.h
 find_path(DIASDK_INCLUDE_DIR    # Set variable DIASDK_INCLUDE_DIR
          dia2.h                # Find a path with dia2.h
          HINTS "${VS_PATH64}/../../DIA SDK/include"
 		  HINTS "${VS_PATH32}/../../DIA SDK/include"
+		  HINTS "${VS15_C_PATH32}/../../DIA SDK/include"
+		  HINTS "${VS15_P_PATH32}/../../DIA SDK/include"
+		  HINTS "${VS15_E_PATH32}/../../DIA SDK/include"
          DOC "path to DIA SDK header files"
          HINTS
          )
@ -33,4 +44,4 @@ include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(DIASDK  DEFAULT_MSG
                                  DIASDK_LIBRARIES DIASDK_INCLUDE_DIR)

-mark_as_advanced(DIASDK_INCLUDE_DIRS DIASDK_LIBRARIES)
+mark_as_advanced(DIASDK_INCLUDE_DIRS DIASDK_LIBRARIES)
--- a/docs/DXIL.rst
+++ b/docs/DXIL.rst
@ -1919,79 +1919,79 @@ ID  Name                          Description
 17  Atan                          returns the Atan
 18  Hcos                          returns the Hcos
 19  Hsin                          returns the Hsin
-20  Exp                           returns the Exp
-21  Frc                           returns the Frc
-22  Log                           returns the Log
-23  Sqrt                          returns the Sqrt
-24  Rsqrt                         returns the Rsqrt
-25  Round_ne                      returns the Round_ne
-26  Round_ni                      returns the Round_ni
-27  Round_pi                      returns the Round_pi
-28  Round_z                       returns the Round_z
-29  Bfrev                         returns the reverse bit pattern of the input value
-30  Countbits                     returns the Countbits
-31  FirstbitLo                    returns the FirstbitLo
-32  FirstbitHi                    returns src != 0? (BitWidth-1 - FirstbitHi) : -1
-33  FirstbitSHi                   returns src != 0? (BitWidth-1 - FirstbitSHi) : -1
-34  FMax                          returns the FMax of the input values
-35  FMin                          returns the FMin of the input values
-36  IMax                          returns the IMax of the input values
-37  IMin                          returns the IMin of the input values
-38  UMax                          returns the UMax of the input values
-39  UMin                          returns the UMin of the input values
-40  IMul                          returns the IMul of the input values
-41  UMul                          returns the UMul of the input values
-42  UDiv                          returns the UDiv of the input values
-43  IAddc                         returns the IAddc of the input values
+20  Htan                          returns the Htan
+21  Exp                           returns the Exp
+22  Frc                           returns the Frc
+23  Log                           returns the Log
+24  Sqrt                          returns the Sqrt
+25  Rsqrt                         returns the Rsqrt
+26  Round_ne                      returns the Round_ne
+27  Round_ni                      returns the Round_ni
+28  Round_pi                      returns the Round_pi
+29  Round_z                       returns the Round_z
+30  Bfrev                         returns the reverse bit pattern of the input value
+31  Countbits                     returns the Countbits
+32  FirstbitLo                    returns the FirstbitLo
+33  FirstbitHi                    returns src != 0? (BitWidth-1 - FirstbitHi) : -1
+34  FirstbitSHi                   returns src != 0? (BitWidth-1 - FirstbitSHi) : -1
+35  FMax                          returns a if a >= b, else b
+36  FMin                          returns a if a < b, else b
+37  IMax                          returns the IMax of the input values
+38  IMin                          returns the IMin of the input values
+39  UMax                          returns the UMax of the input values
+40  UMin                          returns the UMin of the input values
+41  IMul                          returns the IMul of the input values
+42  UMul                          returns the UMul of the input values
+43  UDiv                          returns the UDiv of the input values
 44  UAddc                         returns the UAddc of the input values
-45  ISubc                         returns the ISubc of the input values
-46  USubc                         returns the USubc of the input values
-47  FMad                          performs a fused multiply add (FMA) of the form a * b + c
-48  Fma                           performs a fused multiply add (FMA) of the form a * b + c
-49  IMad                          performs an integral IMad
-50  UMad                          performs an integral UMad
-51  Msad                          performs an integral Msad
-52  Ibfe                          performs an integral Ibfe
-53  Ubfe                          performs an integral Ubfe
-54  Bfi                           given a bit range from the LSB of a number, places that number of bits in another number at any offset
-55  Dot2                          two-dimensional vector dot-product
-56  Dot3                          three-dimensional vector dot-product
-57  Dot4                          four-dimensional vector dot-product
-58  CreateHandle                  creates the handle to a resource
-59  CBufferLoad                   loads a value from a constant buffer resource
-60  CBufferLoadLegacy             loads a value from a constant buffer resource
-61  Sample                        samples a texture
-62  SampleBias                    samples a texture after applying the input bias to the mipmap level
-63  SampleLevel                   samples a texture using a mipmap-level offset
-64  SampleGrad                    samples a texture using a gradient to influence the way the sample location is calculated
-65  SampleCmp                     samples a texture and compares a single component against the specified comparison value
-66  SampleCmpLevelZero            samples a texture and compares a single component against the specified comparison value
-67  TextureLoad                   reads texel data without any filtering or sampling
-68  TextureStore                  reads texel data without any filtering or sampling
-69  BufferLoad                    reads from a TypedBuffer
-70  BufferStore                   writes to a RWTypedBuffer
-71  BufferUpdateCounter           atomically increments/decrements the hidden 32-bit counter stored with a Count or Append UAV
-72  CheckAccessFullyMapped        determines whether all values from a Sample, Gather, or Load operation accessed mapped tiles in a tiled resource
-73  GetDimensions                 gets texture size information
-74  TextureGather                 gathers the four texels that would be used in a bi-linear filtering operation
-75  TextureGatherCmp              same as TextureGather, except this instrution performs comparison on texels, similar to SampleCmp
-76  ToDelete5                     reserved
-77  ToDelete6                     reserved
-78  Texture2DMSGetSamplePosition  gets the position of the specified sample
-79  RenderTargetGetSamplePosition gets the position of the specified sample
-80  RenderTargetGetSampleCount    gets the number of samples for a render target
-81  AtomicBinOp                   performs an atomic operation on two operands
-82  AtomicCompareExchange         atomic compare and exchange to memory
-83  Barrier                       inserts a memory barrier in the shader
-84  CalculateLOD                  calculates the level of detail
-85  Discard                       discard the current pixel
-86  DerivCoarseX                  computes the rate of change of components per stamp
-87  DerivCoarseY                  computes the rate of change of components per stamp
-88  DerivFineX                    computes the rate of change of components per pixel
-89  DerivFineY                    computes the rate of change of components per pixel
-90  EvalSnapped                   evaluates an input attribute at pixel center with an offset
-91  EvalSampleIndex               evaluates an input attribute at a sample location
-92  EvalCentroid                  evaluates an input attribute at pixel center
+45  USubb                         returns the USubb of the input values
+46  FMad                          performs a fused multiply add (FMA) of the form a * b + c
+47  Fma                           performs a fused multiply add (FMA) of the form a * b + c
+48  IMad                          performs an integral IMad
+49  UMad                          performs an integral UMad
+50  Msad                          performs an integral Msad
+51  Ibfe                          performs an integral Ibfe
+52  Ubfe                          performs an integral Ubfe
+53  Bfi                           given a bit range from the LSB of a number, places that number of bits in another number at any offset
+54  Dot2                          two-dimensional vector dot-product
+55  Dot3                          three-dimensional vector dot-product
+56  Dot4                          four-dimensional vector dot-product
+57  CreateHandle                  creates the handle to a resource
+58  CBufferLoad                   loads a value from a constant buffer resource
+59  CBufferLoadLegacy             loads a value from a constant buffer resource
+60  Sample                        samples a texture
+61  SampleBias                    samples a texture after applying the input bias to the mipmap level
+62  SampleLevel                   samples a texture using a mipmap-level offset
+63  SampleGrad                    samples a texture using a gradient to influence the way the sample location is calculated
+64  SampleCmp                     samples a texture and compares a single component against the specified comparison value
+65  SampleCmpLevelZero            samples a texture and compares a single component against the specified comparison value
+66  TextureLoad                   reads texel data without any filtering or sampling
+67  TextureStore                  writes texel data to a texture
+68  BufferLoad                    reads from a TypedBuffer
+69  BufferStore                   writes to a RWTypedBuffer
+70  BufferUpdateCounter           atomically increments/decrements the hidden 32-bit counter stored with a Count or Append UAV
+71  CheckAccessFullyMapped        determines whether all values from a Sample, Gather, or Load operation accessed mapped tiles in a tiled resource
+72  GetDimensions                 gets texture size information
+73  TextureGather                 gathers the four texels that would be used in a bi-linear filtering operation
+74  TextureGatherCmp              same as TextureGather, except this instruction performs comparison on texels, similar to SampleCmp
+75  Texture2DMSGetSamplePosition  gets the position of the specified sample
+76  RenderTargetGetSamplePosition gets the position of the specified sample
+77  RenderTargetGetSampleCount    gets the number of samples for a render target
+78  AtomicBinOp                   performs an atomic operation on two operands
+79  AtomicCompareExchange         atomic compare and exchange to memory
+80  Barrier                       inserts a memory barrier in the shader
+81  CalculateLOD                  calculates the level of detail
+82  Discard                       discard the current pixel
+83  DerivCoarseX                  computes the rate of change of components per stamp
+84  DerivCoarseY                  computes the rate of change of components per stamp
+85  DerivFineX                    computes the rate of change of components per pixel
+86  DerivFineY                    computes the rate of change of components per pixel
+87  EvalSnapped                   evaluates an input attribute at pixel center with an offset
+88  EvalSampleIndex               evaluates an input attribute at a sample location
+89  EvalCentroid                  evaluates an input attribute at pixel center
+90  SampleIndex                   returns the sample index in a sample-frequency pixel shader
+91  Coverage                      returns the coverage mask input in a pixel shader
+92  InnerCoverage                 returns underestimated coverage input from conservative rasterization in a pixel shader
 93  ThreadId                      reads the thread ID
 94  GroupId                       reads the group ID (SV_GroupID)
 95  ThreadIdInGroup               reads the thread ID within the group (SV_GroupThreadID)
@ -1999,55 +1999,43 @@ ID  Name                          Description
 97  EmitStream                    emits a vertex to a given stream
 98  CutStream                     completes the current primitive topology at the specified stream
 99  EmitThenCutStream             equivalent to an EmitStream followed by a CutStream
-100 MakeDouble                    creates a double value
-101 ToDelete1                     reserved
-102 ToDelete2                     reserved
-103 SplitDouble                   splits a double into low and high parts
-104 ToDelete3                     reserved
-105 ToDelete4                     reserved
-106 LoadOutputControlPoint        LoadOutputControlPoint
-107 LoadPatchConstant             LoadPatchConstant
-108 DomainLocation                DomainLocation
-109 StorePatchConstant            StorePatchConstant
-110 OutputControlPointID          OutputControlPointID
-111 PrimitiveID                   PrimitiveID
-112 CycleCounterLegacy            CycleCounterLegacy
-113 Htan                          returns the hyperbolic tangent of the specified value
-114 WaveCaptureReserved           reserved
-115 WaveIsFirstLane               returns 1 for the first lane in the wave
-116 WaveGetLaneIndex              returns the index of the current lane in the wave
-117 WaveGetLaneCount              returns the number of lanes in the wave
-118 WaveIsHelperLaneReserved      reserved
-119 WaveAnyTrue                   returns 1 if any of the lane evaluates the value to true
-120 WaveAllTrue                   returns 1 if all the lanes evaluate the value to true
-121 WaveActiveAllEqual            returns 1 if all the lanes have the same value
-122 WaveActiveBallot              returns a struct with a bit set for each lane where the condition is true
-123 WaveReadLaneAt                returns the value from the specified lane
-124 WaveReadLaneFirst             returns the value from the first lane
-125 WaveActiveOp                  returns the result the operation across waves
-126 WaveActiveBit                 returns the result of the operation across all lanes
-127 WavePrefixOp                  returns the result of the operation on prior lanes
-128 WaveGetOrderedIndex           reserved
-129 GlobalOrderedCountIncReserved reserved
-130 QuadReadLaneAt                reads from a lane in the quad
-131 QuadOp                        returns the result of a quad-level operation
-132 BitcastI16toF16               bitcast between different sizes
-133 BitcastF16toI16               bitcast between different sizes
-134 BitcastI32toF32               bitcast between different sizes
-135 BitcastF32toI32               bitcast between different sizes
-136 BitcastI64toF64               bitcast between different sizes
-137 BitcastF64toI64               bitcast between different sizes
-138 GSInstanceID                  GSInstanceID
-139 LegacyF32ToF16                legacy fuction to convert float (f32) to half (f16) (this is not related to min-precision)
-140 LegacyF16ToF32                legacy fuction to convert half (f16) to float (f32) (this is not related to min-precision)
-141 LegacyDoubleToFloat           legacy fuction to convert double to float
-142 LegacyDoubleToSInt32          legacy fuction to convert double to int32
-143 LegacyDoubleToUInt32          legacy fuction to convert double to uint32
-144 WaveAllBitCount               returns the count of bits set to 1 across the wave
-145 WavePrefixBitCount            returns the count of bits set to 1 on prior lanes
-146 SampleIndex                   returns the sample index in a sample-frequency pixel shader
-147 Coverage                      returns the coverage mask input in a pixel shader
-148 InnerCoverage                 returns underestimated coverage input from conservative rasterization in a pixel shader
+100 GSInstanceID                  GSInstanceID
+101 MakeDouble                    creates a double value
+102 SplitDouble                   splits a double into low and high parts
+103 LoadOutputControlPoint        LoadOutputControlPoint
+104 LoadPatchConstant             LoadPatchConstant
+105 DomainLocation                DomainLocation
+106 StorePatchConstant            StorePatchConstant
+107 OutputControlPointID          OutputControlPointID
+108 PrimitiveID                   PrimitiveID
+109 CycleCounterLegacy            CycleCounterLegacy
+110 WaveIsFirstLane               returns 1 for the first lane in the wave
+111 WaveGetLaneIndex              returns the index of the current lane in the wave
+112 WaveGetLaneCount              returns the number of lanes in the wave
+113 WaveAnyTrue                   returns 1 if any of the lane evaluates the value to true
+114 WaveAllTrue                   returns 1 if all the lanes evaluate the value to true
+115 WaveActiveAllEqual            returns 1 if all the lanes have the same value
+116 WaveActiveBallot              returns a struct with a bit set for each lane where the condition is true
+117 WaveReadLaneAt                returns the value from the specified lane
+118 WaveReadLaneFirst             returns the value from the first lane
+119 WaveActiveOp                  returns the result of the operation across waves
+120 WaveActiveBit                 returns the result of the operation across all lanes
+121 WavePrefixOp                  returns the result of the operation on prior lanes
+122 QuadReadLaneAt                reads from a lane in the quad
+123 QuadOp                        returns the result of a quad-level operation
+124 BitcastI16toF16               bitcast between different sizes
+125 BitcastF16toI16               bitcast between different sizes
+126 BitcastI32toF32               bitcast between different sizes
+127 BitcastF32toI32               bitcast between different sizes
+128 BitcastI64toF64               bitcast between different sizes
+129 BitcastF64toI64               bitcast between different sizes
+130 LegacyF32ToF16                legacy function to convert float (f32) to half (f16) (this is not related to min-precision)
+131 LegacyF16ToF32                legacy function to convert half (f16) to float (f32) (this is not related to min-precision)
+132 LegacyDoubleToFloat           legacy function to convert double to float
+133 LegacyDoubleToSInt32          legacy function to convert double to int32
+134 LegacyDoubleToUInt32          legacy function to convert double to uint32
+135 WaveAllBitCount               returns the count of bits set to 1 across the wave
+136 WavePrefixBitCount            returns the count of bits set to 1 on prior lanes
 === ============================= ================================================================================================================


@ -2071,6 +2059,54 @@ FAbs
 The FAbs instruction simply forces the sign of the number(s) on the source operand positive, including on INF values.
 Applying FAbs on NaN preserves NaN, although the particular NaN bit pattern that results is not defined.

+FMax
+~~~~
+
+>= is used instead of > so that if min(x,y) = x then max(x,y) = y.
+
+NaN has special handling: If one source operand is NaN, then the other source operand is returned.
+If both are NaN, any NaN representation is returned.
+This conforms to new IEEE 754R rules.
+
+Denorms are flushed (sign preserved) before comparison, however the result written to dest may or may not be denorm flushed.
+
++------+-----------------------------+
+| a    | b                           |
+|      +------+--------+------+------+
+|      | -inf | F      | +inf | NaN  |
++------+------+--------+------+------+
+| -inf | -inf | b      | +inf | -inf |
++------+------+--------+------+------+
+| F    | a    | a or b | +inf | a    |
++------+------+--------+------+------+
+| +inf | +inf | +inf   | +inf | +inf |
++------+------+--------+------+------+
+| NaN  | -inf | b      | +inf | NaN  |
++------+------+--------+------+------+
+
+FMin
+~~~~
+
+NaN has special handling: If one source operand is NaN, then the other source operand is returned.
+If both are NaN, any NaN representation is returned.
+This conforms to new IEEE 754R rules.
+
+Denorms are flushed (sign preserved) before comparison, however the result written to dest may or may not be denorm flushed.
+
++------+-----------------------------+
+| a    | b                           |
+|      +------+--------+------+------+
+|      | -inf | F      | +inf | NaN  |
++------+------+--------+------+------+
+| -inf | -inf | -inf   | -inf | -inf |
++------+------+--------+------+------+
+| F    | -inf | a or b |    a |    a |
++------+------+--------+------+------+
+| +inf | -inf | b      | +inf | +inf |
++------+------+--------+------+------+
+| NaN  | -inf | b      | +inf | NaN  |
++------+------+--------+------+------+
+
 Saturate
 ~~~~~~~~

@ -2116,6 +2152,11 @@ The set of validation rules that are known to hold for a DXIL program is identif
 Rule Code                             Description
 ===================================== =======================================================================================================================================================================================================================================================================================================
 BITCODE.VALID                         TODO - Module must be bitcode-valid
+CONTAINER.PARTINVALID                 DXIL Container must not contain unknown parts
+CONTAINER.PARTMATCHES                 DXIL Container Parts must match Module
+CONTAINER.PARTMISSING                 DXIL Container requires certain parts, corresponding to module
+CONTAINER.PARTREPEATED                DXIL Container must have only one of each part type
+CONTAINER.ROOTSIGNATUREINCOMPATIBLE   Root Signature in DXIL Container must be compatible with shader
 DECL.DXILFNEXTERN                     External function must be a DXIL function
 DECL.DXILNSRESERVED                   The DXIL reserved prefixes must only be used by built-in functions and types
 DECL.FNFLATTENPARAM                   Function parameters must not use struct types
@ -2188,6 +2229,7 @@ INSTR.SAMPLERMODEFORSAMPLE            sample/_l/_d/_cl_s/gather instruction requ
 INSTR.SAMPLERMODEFORSAMPLEC           sample_c_*/gather_c instructions require sampler declared in comparison mode
 INSTR.STRUCTBITCAST                   Bitcast on struct types is not allowed
 INSTR.TEXTUREOFFSET                   offset texture instructions must take offset which can resolve to integer literal in the range -8 to 7
+INSTR.TGSMRACECOND                    Race condition writing to shared memory detected, consider making this write conditional
 INSTR.UNDEFRESULTFORGETDIMENSION      GetDimensions used undef dimension %0 on %1
 INSTR.WRITEMASKFORTYPEDUAVSTORE       store on typed uav must write to all four components of the UAV
 INSTR.WRITEMASKMATCHVALUEFORUAVSTORE  uav store write mask must match store value mask, write mask is %0 and store value mask is %1
@ -2247,7 +2289,7 @@ SM.GSOUTPUTVERTEXCOUNTRANGE           GS output vertex count must be [0..%0].  %
 SM.GSTOTALOUTPUTVERTEXDATARANGE       Declared output vertex count (%0) multiplied by the total number of declared scalar components of output data (%1) equals %2.  This value cannot be greater than %3
 SM.GSVALIDINPUTPRIMITIVE              GS input primitive unrecognized
 SM.GSVALIDOUTPUTPRIMITIVETOPOLOGY     GS output primitive topology unrecognized
-SM.HSINPUTCONTROLPOINTCOUNTRANGE      HS input control point count must be [1..%0].  %1 specified
+SM.HSINPUTCONTROLPOINTCOUNTRANGE      HS input control point count must be [0..%0].  %1 specified
 SM.HULLPASSTHRUCONTROLPOINTCOUNTMATCH For pass thru hull shader, input control point count must match output control point count
 SM.INSIDETESSFACTORSIZEMATCHDOMAIN    InsideTessFactor rows, columns (%0, %1) invalid for domain %2.  Expected %3 rows and 1 column.
 SM.INVALIDRESOURCECOMPTYPE            Invalid resource return type
@ -2283,7 +2325,9 @@ SM.THREADGROUPCHANNELRANGE            Declared Thread Group %0 size %1 outside v
 SM.TRIOUTPUTPRIMITIVEMISMATCH         Hull Shader declared with Tri Domain must specify output primitive point, triangle_cw or triangle_ccw. Line output is not compatible with the Tri domain
 SM.UNDEFINEDOUTPUT                    Not all elements of output %0 were written
 SM.VALIDDOMAIN                        Invalid Tessellator Domain specified. Must be isoline, tri or quad
+SM.ZEROHSINPUTCONTROLPOINTWITHINPUT   When HS input control point count is 0, no input signature should exist
 TYPES.DEFINED                         Type must be defined based on DXIL primitives
+TYPES.I8                              I8 can only be used as immediate value for intrinsic
 TYPES.INTWIDTH                        Int type must be of valid width
 TYPES.NOMULTIDIM                      Only one dimension allowed for array type
 TYPES.NOVECTOR                        Vector types must not be present
--- a/include/dxc/HLSL/DxilConstants.h
+++ b/include/dxc/HLSL/DxilConstants.h
@ -26,8 +26,8 @@ import hctdb_instrhelp

 namespace DXIL {
  // DXIL version.
-  const unsigned kDxilMajor = 0;
-  const unsigned kDxilMinor = 7;
+  const unsigned kDxilMajor = 1;
+  const unsigned kDxilMinor = 0;

  inline unsigned MakeDxilVersion(unsigned DxilMajor, unsigned DxilMinor) {
    return 0 | (DxilMajor << 8) | (DxilMinor);
@ -206,6 +206,13 @@ namespace DXIL {
  };
  // PackingKind-ENUM:END

+  enum class PackingStrategy : unsigned {
+    Default = 0, // Choose default packing algorithm based on target (currently PrefixStable)
+    PrefixStable, // Maintain assumption that all elements are packed in order and stable as new elements are added.
+    Optimized, // Optimize packing of all elements together (all elements must be present, in the same order, for identical placement of any individual element)
+    Invalid,
+  };
+
  enum class SamplerKind : unsigned {
    Default = 0,
    Comparison,
@ -246,43 +253,32 @@ namespace DXIL {
  // OPCODE-ENUM:BEGIN
  // Enumeration for operations specified by DXIL
  enum class OpCode : unsigned {
-    // 
-    GlobalOrderedCountIncReserved = 129, // reserved
-    ToDelete1 = 101, // reserved
-    ToDelete2 = 102, // reserved
-    ToDelete3 = 104, // reserved
-    ToDelete4 = 105, // reserved
-    ToDelete5 = 76, // reserved
-    ToDelete6 = 77, // reserved
-  
    // Binary float
-    FMax = 34, // returns the FMax of the input values
-    FMin = 35, // returns the FMin of the input values
-  
-    // Binary int with carry
-    IAddc = 43, // returns the IAddc of the input values
-    ISubc = 45, // returns the ISubc of the input values
-    UAddc = 44, // returns the UAddc of the input values
-    USubc = 46, // returns the USubc of the input values
+    FMax = 35, // returns a if a >= b, else b
+    FMin = 36, // returns a if a < b, else b
  
    // Binary int with two outputs
-    IMul = 40, // returns the IMul of the input values
-    UDiv = 42, // returns the UDiv of the input values
-    UMul = 41, // returns the UMul of the input values
+    IMul = 41, // returns the IMul of the input values
+    UDiv = 43, // returns the UDiv of the input values
+    UMul = 42, // returns the UMul of the input values
  
    // Binary int
-    IMax = 36, // returns the IMax of the input values
-    IMin = 37, // returns the IMin of the input values
-    UMax = 38, // returns the UMax of the input values
-    UMin = 39, // returns the UMin of the input values
+    IMax = 37, // returns the IMax of the input values
+    IMin = 38, // returns the IMin of the input values
+    UMax = 39, // returns the UMax of the input values
+    UMin = 40, // returns the UMin of the input values
+  
+    // Binary uint with carry or borrow
+    UAddc = 44, // returns the UAddc of the input values
+    USubb = 45, // returns the USubb of the input values
  
    // Bitcasts with different sizes
-    BitcastF16toI16 = 133, // bitcast between different sizes
-    BitcastF32toI32 = 135, // bitcast between different sizes
-    BitcastF64toI64 = 137, // bitcast between different sizes
-    BitcastI16toF16 = 132, // bitcast between different sizes
-    BitcastI32toF32 = 134, // bitcast between different sizes
-    BitcastI64toF64 = 136, // bitcast between different sizes
+    BitcastF16toI16 = 125, // bitcast between different sizes
+    BitcastF32toI32 = 127, // bitcast between different sizes
+    BitcastF64toI64 = 129, // bitcast between different sizes
+    BitcastI16toF16 = 124, // bitcast between different sizes
+    BitcastI32toF32 = 126, // bitcast between different sizes
+    BitcastI64toF64 = 128, // bitcast between different sizes
  
    // Compute shader
    FlattenedThreadIdInGroup = 96, // provides a flattened index for a given thread within a given group (SV_GroupIndex)
@ -291,92 +287,90 @@ namespace DXIL {
    ThreadIdInGroup = 95, // reads the thread ID within the group (SV_GroupThreadID)
  
    // Domain and hull shader
-    LoadOutputControlPoint = 106, // LoadOutputControlPoint
-    LoadPatchConstant = 107, // LoadPatchConstant
+    LoadOutputControlPoint = 103, // LoadOutputControlPoint
+    LoadPatchConstant = 104, // LoadPatchConstant
  
    // Domain shader
-    DomainLocation = 108, // DomainLocation
+    DomainLocation = 105, // DomainLocation
  
    // Dot
-    Dot2 = 55, // two-dimensional vector dot-product
-    Dot3 = 56, // three-dimensional vector dot-product
-    Dot4 = 57, // four-dimensional vector dot-product
+    Dot2 = 54, // two-dimensional vector dot-product
+    Dot3 = 55, // three-dimensional vector dot-product
+    Dot4 = 56, // four-dimensional vector dot-product
  
    // Double precision
-    LegacyDoubleToFloat = 141, // legacy fuction to convert double to float
-    LegacyDoubleToSInt32 = 142, // legacy fuction to convert double to int32
-    LegacyDoubleToUInt32 = 143, // legacy fuction to convert double to uint32
-    MakeDouble = 100, // creates a double value
-    SplitDouble = 103, // splits a double into low and high parts
-  
-    // GS
-    GSInstanceID = 138, // GSInstanceID
+    LegacyDoubleToFloat = 132, // legacy function to convert double to float
+    LegacyDoubleToSInt32 = 133, // legacy function to convert double to int32
+    LegacyDoubleToUInt32 = 134, // legacy function to convert double to uint32
+    MakeDouble = 101, // creates a double value
+    SplitDouble = 102, // splits a double into low and high parts
  
    // Geometry shader
    CutStream = 98, // completes the current primitive topology at the specified stream
    EmitStream = 97, // emits a vertex to a given stream
    EmitThenCutStream = 99, // equivalent to an EmitStream followed by a CutStream
+    GSInstanceID = 100, // GSInstanceID
  
    // Hull shader
-    OutputControlPointID = 110, // OutputControlPointID
-    PrimitiveID = 111, // PrimitiveID
-    StorePatchConstant = 109, // StorePatchConstant
+    OutputControlPointID = 107, // OutputControlPointID
+    PrimitiveID = 108, // PrimitiveID
+    StorePatchConstant = 106, // StorePatchConstant
  
    // Legacy floating-point
-    LegacyF16ToF32 = 140, // legacy fuction to convert half (f16) to float (f32) (this is not related to min-precision)
-    LegacyF32ToF16 = 139, // legacy fuction to convert float (f32) to half (f16) (this is not related to min-precision)
+    LegacyF16ToF32 = 131, // legacy function to convert half (f16) to float (f32) (this is not related to min-precision)
+    LegacyF32ToF16 = 130, // legacy function to convert float (f32) to half (f16) (this is not related to min-precision)
  
    // Other
-    CycleCounterLegacy = 112, // CycleCounterLegacy
+    CycleCounterLegacy = 109, // CycleCounterLegacy
  
    // Pixel shader
-    CalculateLOD = 84, // calculates the level of detail
-    Coverage = 147, // returns the coverage mask input in a pixel shader
-    DerivCoarseX = 86, // computes the rate of change of components per stamp
-    DerivCoarseY = 87, // computes the rate of change of components per stamp
-    DerivFineX = 88, // computes the rate of change of components per pixel
-    DerivFineY = 89, // computes the rate of change of components per pixel
-    Discard = 85, // discard the current pixel
-    EvalCentroid = 92, // evaluates an input attribute at pixel center
-    EvalSampleIndex = 91, // evaluates an input attribute at a sample location
-    EvalSnapped = 90, // evaluates an input attribute at pixel center with an offset
-    InnerCoverage = 148, // returns underestimated coverage input from conservative rasterization in a pixel shader
-    SampleIndex = 146, // returns the sample index in a sample-frequency pixel shader
+    CalculateLOD = 81, // calculates the level of detail
+    Coverage = 91, // returns the coverage mask input in a pixel shader
+    DerivCoarseX = 83, // computes the rate of change of components per stamp
+    DerivCoarseY = 84, // computes the rate of change of components per stamp
+    DerivFineX = 85, // computes the rate of change of components per pixel
+    DerivFineY = 86, // computes the rate of change of components per pixel
+    Discard = 82, // discard the current pixel
+    EvalCentroid = 89, // evaluates an input attribute at pixel center
+    EvalSampleIndex = 88, // evaluates an input attribute at a sample location
+    EvalSnapped = 87, // evaluates an input attribute at pixel center with an offset
+    InnerCoverage = 92, // returns underestimated coverage input from conservative rasterization in a pixel shader
+    SampleIndex = 90, // returns the sample index in a sample-frequency pixel shader
  
    // Quaternary
-    Bfi = 54, // given a bit range from the LSB of a number, places that number of bits in another number at any offset
+    Bfi = 53, // given a bit range from the LSB of a number, places that number of bits in another number at any offset
  
    // Resources - gather
-    TextureGather = 74, // gathers the four texels that would be used in a bi-linear filtering operation
-    TextureGatherCmp = 75, // same as TextureGather, except this instrution performs comparison on texels, similar to SampleCmp
+    TextureGather = 73, // gathers the four texels that would be used in a bi-linear filtering operation
+    TextureGatherCmp = 74, // same as TextureGather, except this instruction performs comparison on texels, similar to SampleCmp
  
    // Resources - sample
-    RenderTargetGetSampleCount = 80, // gets the number of samples for a render target
-    RenderTargetGetSamplePosition = 79, // gets the position of the specified sample
-    Sample = 61, // samples a texture
-    SampleBias = 62, // samples a texture after applying the input bias to the mipmap level
-    SampleCmp = 65, // samples a texture and compares a single component against the specified comparison value
-    SampleCmpLevelZero = 66, // samples a texture and compares a single component against the specified comparison value
-    SampleGrad = 64, // samples a texture using a gradient to influence the way the sample location is calculated
-    SampleLevel = 63, // samples a texture using a mipmap-level offset
-    Texture2DMSGetSamplePosition = 78, // gets the position of the specified sample
+    RenderTargetGetSampleCount = 77, // gets the number of samples for a render target
+    RenderTargetGetSamplePosition = 76, // gets the position of the specified sample
+    Sample = 60, // samples a texture
+    SampleBias = 61, // samples a texture after applying the input bias to the mipmap level
+    SampleCmp = 64, // samples a texture and compares a single component against the specified comparison value
+    SampleCmpLevelZero = 65, // samples a texture and compares a single component against the specified comparison value
+    SampleGrad = 63, // samples a texture using a gradient to influence the way the sample location is calculated
+    SampleLevel = 62, // samples a texture using a mipmap-level offset
+    Texture2DMSGetSamplePosition = 75, // gets the position of the specified sample
  
    // Resources
-    BufferLoad = 69, // reads from a TypedBuffer
-    BufferStore = 70, // writes to a RWTypedBuffer
-    BufferUpdateCounter = 71, // atomically increments/decrements the hidden 32-bit counter stored with a Count or Append UAV
-    CBufferLoad = 59, // loads a value from a constant buffer resource
-    CBufferLoadLegacy = 60, // loads a value from a constant buffer resource
-    CheckAccessFullyMapped = 72, // determines whether all values from a Sample, Gather, or Load operation accessed mapped tiles in a tiled resource
-    CreateHandle = 58, // creates the handle to a resource
-    GetDimensions = 73, // gets texture size information
-    TextureLoad = 67, // reads texel data without any filtering or sampling
-    TextureStore = 68, // reads texel data without any filtering or sampling
+    BufferLoad = 68, // reads from a TypedBuffer
+    BufferStore = 69, // writes to a RWTypedBuffer
+    BufferUpdateCounter = 70, // atomically increments/decrements the hidden 32-bit counter stored with a Count or Append UAV
+    CBufferLoad = 58, // loads a value from a constant buffer resource
+    CBufferLoadLegacy = 59, // loads a value from a constant buffer resource
+    CheckAccessFullyMapped = 71, // determines whether all values from a Sample, Gather, or Load operation accessed mapped tiles in a tiled resource
+    CreateHandle = 57, // creates the handle to a resource
+    GetDimensions = 72, // gets texture size information
+    TextureLoad = 66, // reads texel data without any filtering or sampling
+    TextureStore = 67, // writes texel data without any filtering or sampling
  
    // Synchronization
-    AtomicBinOp = 81, // performs an atomic operation on two operands
-    AtomicCompareExchange = 82, // atomic compare and exchange to memory
-    Barrier = 83, // inserts a memory barrier in the shader
+    AtomicBinOp = 78, // performs an atomic operation on two operands
+    AtomicCompareExchange = 79, // atomic compare and exchange to memory
+    Barrier = 80, // inserts a memory barrier in the shader
  
    // Temporary, indexable, input, output registers
    LoadInput = 4, // loads the value from shader input
@ -387,73 +381,70 @@ namespace DXIL {
    TempRegStore = 1, // helper store operation
  
    // Tertiary float
-    FMad = 47, // performs a fused multiply add (FMA) of the form a * b + c
-    Fma = 48, // performs a fused multiply add (FMA) of the form a * b + c
+    FMad = 46, // performs a fused multiply add (FMA) of the form a * b + c
+    Fma = 47, // performs a fused multiply add (FMA) of the form a * b + c
  
    // Tertiary int
-    IMad = 49, // performs an integral IMad
-    Ibfe = 52, // performs an integral Ibfe
-    Msad = 51, // performs an integral Msad
-    UMad = 50, // performs an integral UMad
-    Ubfe = 53, // performs an integral Ubfe
+    IMad = 48, // performs an integral IMad
+    Ibfe = 51, // performs an integral Ibfe
+    Msad = 50, // performs an integral Msad
+    UMad = 49, // performs an integral UMad
+    Ubfe = 52, // performs an integral Ubfe
  
    // Unary float - rounding
-    Round_ne = 25, // returns the Round_ne
-    Round_ni = 26, // returns the Round_ni
-    Round_pi = 27, // returns the Round_pi
-    Round_z = 28, // returns the Round_z
+    Round_ne = 26, // returns the Round_ne
+    Round_ni = 27, // returns the Round_ni
+    Round_pi = 28, // returns the Round_pi
+    Round_z = 29, // returns the Round_z
  
    // Unary float
    Acos = 15, // returns the Acos
    Asin = 16, // returns the Asin
    Atan = 17, // returns the Atan
    Cos = 12, // returns cosine(theta) for theta in radians.
-    Exp = 20, // returns the Exp
+    Exp = 21, // returns the Exp
    FAbs = 6, // returns the absolute value of the input value.
-    Frc = 21, // returns the Frc
+    Frc = 22, // returns the Frc
    Hcos = 18, // returns the Hcos
    Hsin = 19, // returns the Hsin
-    Htan = 113, // returns the hyperbolic tangent of the specified value
+    Htan = 20, // returns the hyperbolic tangent of the specified value
    IsFinite = 10, // returns the IsFinite
    IsInf = 9, // returns the IsInf
    IsNaN = 8, // returns the IsNaN
    IsNormal = 11, // returns the IsNormal
-    Log = 22, // returns the Log
-    Rsqrt = 24, // returns the Rsqrt
+    Log = 23, // returns the Log
+    Rsqrt = 25, // returns the Rsqrt
    Saturate = 7, // clamps the result of a single or double precision floating point value to [0.0f...1.0f]
    Sin = 13, // returns the Sin
-    Sqrt = 23, // returns the Sqrt
+    Sqrt = 24, // returns the Sqrt
    Tan = 14, // returns the Tan
  
    // Unary int
-    Bfrev = 29, // returns the reverse bit pattern of the input value
-    Countbits = 30, // returns the Countbits
-    FirstbitHi = 32, // returns src != 0? (BitWidth-1 - FirstbitHi) : -1
-    FirstbitLo = 31, // returns the FirstbitLo
-    FirstbitSHi = 33, // returns src != 0? (BitWidth-1 - FirstbitSHi) : -1
+    Bfrev = 30, // returns the reverse bit pattern of the input value
+    Countbits = 31, // returns the Countbits
+    FirstbitHi = 33, // returns src != 0? (BitWidth-1 - FirstbitHi) : -1
+    FirstbitLo = 32, // returns the FirstbitLo
+    FirstbitSHi = 34, // returns src != 0? (BitWidth-1 - FirstbitSHi) : -1
  
    // Wave
-    QuadOp = 131, // returns the result of a quad-level operation
-    QuadReadLaneAt = 130, // reads from a lane in the quad
-    WaveActiveAllEqual = 121, // returns 1 if all the lanes have the same value
-    WaveActiveBallot = 122, // returns a struct with a bit set for each lane where the condition is true
-    WaveActiveBit = 126, // returns the result of the operation across all lanes
-    WaveActiveOp = 125, // returns the result the operation across waves
-    WaveAllBitCount = 144, // returns the count of bits set to 1 across the wave
-    WaveAllTrue = 120, // returns 1 if all the lanes evaluate the value to true
-    WaveAnyTrue = 119, // returns 1 if any of the lane evaluates the value to true
-    WaveCaptureReserved = 114, // reserved
-    WaveGetLaneCount = 117, // returns the number of lanes in the wave
-    WaveGetLaneIndex = 116, // returns the index of the current lane in the wave
-    WaveGetOrderedIndex = 128, // reserved
-    WaveIsFirstLane = 115, // returns 1 for the first lane in the wave
-    WaveIsHelperLaneReserved = 118, // reserved
-    WavePrefixBitCount = 145, // returns the count of bits set to 1 on prior lanes
-    WavePrefixOp = 127, // returns the result of the operation on prior lanes
-    WaveReadLaneAt = 123, // returns the value from the specified lane
-    WaveReadLaneFirst = 124, // returns the value from the first lane
+    QuadOp = 123, // returns the result of a quad-level operation
+    QuadReadLaneAt = 122, // reads from a lane in the quad
+    WaveActiveAllEqual = 115, // returns 1 if all the lanes have the same value
+    WaveActiveBallot = 116, // returns a struct with a bit set for each lane where the condition is true
+    WaveActiveBit = 120, // returns the result of the operation across all lanes
+    WaveActiveOp = 119, // returns the result of the operation across waves
+    WaveAllBitCount = 135, // returns the count of bits set to 1 across the wave
+    WaveAllTrue = 114, // returns 1 if all the lanes evaluate the value to true
+    WaveAnyTrue = 113, // returns 1 if any of the lanes evaluates the value to true
+    WaveGetLaneCount = 112, // returns the number of lanes in the wave
+    WaveGetLaneIndex = 111, // returns the index of the current lane in the wave
+    WaveIsFirstLane = 110, // returns 1 for the first lane in the wave
+    WavePrefixBitCount = 136, // returns the count of bits set to 1 on prior lanes
+    WavePrefixOp = 121, // returns the result of the operation on prior lanes
+    WaveReadLaneAt = 117, // returns the value from the specified lane
+    WaveReadLaneFirst = 118, // returns the value from the first lane
  
-    NumOpCodes = 149 // exclusive last value of enumeration
+    NumOpCodes = 137 // exclusive last value of enumeration
  };
  // OPCODE-ENUM:END

@ -461,18 +452,15 @@ namespace DXIL {
  // OPCODECLASS-ENUM:BEGIN
  // Groups for DXIL operations with equivalent function templates
  enum class OpCodeClass : unsigned {
-    // 
-    Reserved,
-  
-    // Binary int with carry
-    BinaryWithCarry,
-  
    // Binary int with two outputs
    BinaryWithTwoOuts,
  
    // Binary int
    Binary,
  
+    // Binary uint with carry or borrow
+    BinaryWithCarryOrBorrow,
+  
    // Bitcasts with different sizes
    BitcastF16toI16,
    BitcastF32toI32,
@ -506,13 +494,11 @@ namespace DXIL {
    MakeDouble,
    SplitDouble,
  
-    // GS
-    GSInstanceID,
-  
    // Geometry shader
    CutStream,
    EmitStream,
    EmitThenCutStream,
+    GSInstanceID,
  
    // Hull shader
    OutputControlPointID,
@ -538,6 +524,7 @@ namespace DXIL {
    EvalSnapped,
    InnerCoverage,
    SampleIndex,
+    Unary,
  
    // Quaternary
    Quaternary,
@ -587,7 +574,6 @@ namespace DXIL {
  
    // Unary float
    IsSpecialFloat,
-    Unary,
  
    // Unary int
    UnaryBits,
@ -609,7 +595,7 @@ namespace DXIL {
    WaveReadLaneAt,
    WaveReadLaneFirst,
  
-    NumOpClasses = 94 // exclusive last value of enumeration
+    NumOpClasses = 93 // exclusive last value of enumeration
  };
  // OPCODECLASS-ENUM:END

@ -645,6 +631,11 @@ namespace DXIL {
    // DomainLocation.
    const unsigned kDomainLocationColOpIdx = 1;

+    // BufferLoad.
+    const unsigned kBufferLoadHandleOpIdx = 1;
+    const unsigned kBufferLoadCoord0OpIdx = 2;
+    const unsigned kBufferLoadCoord1OpIdx = 3;
+
    // BufferStore.
    const unsigned kBufferStoreHandleOpIdx = 1;
    const unsigned kBufferStoreCoord0OpIdx = 2;
@ -680,6 +671,18 @@ namespace DXIL {
    // TextureGatherCmp.
    const unsigned kTextureGatherCmpCmpValOpIdx = 11;

+    // TextureSample.
+    const unsigned kTextureSampleTexHandleOpIdx = 1;
+    const unsigned kTextureSampleSamplerHandleOpIdx = 2;
+    const unsigned kTextureSampleCoord0OpIdx = 3;
+    const unsigned kTextureSampleCoord1OpIdx = 4;
+    const unsigned kTextureSampleCoord2OpIdx = 5;
+    const unsigned kTextureSampleCoord3OpIdx = 6;
+    const unsigned kTextureSampleOffset0OpIdx = 7;
+    const unsigned kTextureSampleOffset1OpIdx = 8;
+    const unsigned kTextureSampleOffset2OpIdx = 9;
+    const unsigned kTextureSampleClampOpIdx = 10;
+
    // AtomicBinOp.
    const unsigned kAtomicBinOpCoord0OpIdx = 3;
    const unsigned kAtomicBinOpCoord1OpIdx = 4;
--- a/include/dxc/HLSL/DxilContainer.h
+++ b/include/dxc/HLSL/DxilContainer.h
@ -16,6 +16,7 @@

 #include <stdint.h>
 #include <iterator>
+#include <functional>
 #include "dxc/HLSL/DxilConstants.h"

 struct IDxcContainerReflection;
@ -23,6 +24,10 @@ namespace llvm { class Module; }

 namespace hlsl {

+class AbstractMemoryStream;
+class RootSignatureHandle;
+class DxilModule;
+
 #pragma pack(push, 1)

 static const size_t DxilContainerHashSize = 16;
@ -368,10 +373,41 @@ inline uint32_t EncodeVersion(DXIL::ShaderKind shaderType, uint32_t major,
  return ((unsigned)shaderType << 16) | (major << 4) | minor;
 }

-class AbstractMemoryStream;
-void SerializeDxilContainerForModule(llvm::Module *pModule,
+// Abstract interface for writing one part of a DXIL container to a stream.
+// NOTE(review): size() presumably reports the number of bytes write() will
+// emit -- confirm against the implementations.
+class DxilPartWriter {
+public:
+  // The New*Writer factories below hand out raw base-class pointers, so the
+  // destructor must be virtual for deletion through them to be well-defined.
+  virtual ~DxilPartWriter() {}
+  virtual uint32_t size() const = 0;
+  virtual void write(AbstractMemoryStream *pStream) = 0;
+};
+
+// Factories for the individual part writers.
+// NOTE(review): the caller presumably owns the returned object -- confirm.
+DxilPartWriter *NewProgramSignatureWriter(const DxilModule &M, DXIL::SignatureKind Kind);
+DxilPartWriter *NewRootSignatureWriter(const RootSignatureHandle &S);
+DxilPartWriter *NewFeatureInfoWriter(const DxilModule &M);
+DxilPartWriter *NewPSVWriter(const DxilModule &M);
+
+// Part writer that additionally collects parts via AddPart: a FourCC, a size
+// and a callback that writes the part to a stream.
+class DxilContainerWriter : public DxilPartWriter  {
+public:
+  typedef std::function<void(AbstractMemoryStream*)> WriteFn;
+  virtual void AddPart(uint32_t FourCC, uint32_t Size, WriteFn Write) = 0;
+};
+
+DxilContainerWriter *NewDxilContainerWriter();
+
+void SerializeDxilContainerForModule(hlsl::DxilModule *pModule,
                                     AbstractMemoryStream *pModuleBitcode,
                                     AbstractMemoryStream *pStream);
+void SerializeDxilContainerForRootSignature(hlsl::RootSignatureHandle *pRootSigHandle,
+                                     AbstractMemoryStream *pStream);
+
 void CreateDxcContainerReflection(IDxcContainerReflection **ppResult);

 // Converts uint32_t partKind to char array object.
@ -384,6 +420,16 @@ inline char * PartKindToCharArray(uint32_t partKind, _Out_writes_(5) char* pText
  return pText;
 }

+inline size_t GetOffsetTableSize(uint32_t partCount) {
+  return sizeof(uint32_t) * partCount;
+}
+// Compute total size of the dxil container from parts information.
+inline size_t GetDxilContainerSizeFromParts(uint32_t partCount, uint32_t partsSize) {
+  return (size_t)partsSize + sizeof(DxilContainerHeader) + // widen first: no 32-bit wrap
+         GetOffsetTableSize(partCount) +
+         sizeof(DxilPartHeader) * (size_t)partCount;
+}
+
 } // namespace hlsl

 #endif // __DXC_CONTAINER__
--- a/include/dxc/HLSL/DxilGenerationPass.h
+++ b/include/dxc/HLSL/DxilGenerationPass.h
@ -43,6 +43,7 @@ ModulePass *createHLEmitMetadataPass();
 ModulePass *createHLEnsureMetadataPass();
 ModulePass *createDxilEmitMetadataPass();
 ModulePass *createDxilPrecisePropagatePass();
+FunctionPass *createDxilLegalizeSampleOffsetPass();
 FunctionPass *createSimplifyInstPass();

 void initializeDxilCondenseResourcesPass(llvm::PassRegistry&);
@ -51,6 +52,7 @@ void initializeHLEnsureMetadataPass(llvm::PassRegistry&);
 void initializeHLEmitMetadataPass(llvm::PassRegistry&);
 void initializeDxilEmitMetadataPass(llvm::PassRegistry&);
 void initializeDxilPrecisePropagatePassPass(llvm::PassRegistry&);
+void initializeDxilLegalizeSampleOffsetPassPass(llvm::PassRegistry&);
 void initializeSimplifyInstPass(llvm::PassRegistry&);

 bool AreDxilResourcesDense(llvm::Module *M, hlsl::DxilResourceBase **ppNonDense);
--- a/include/dxc/HLSL/DxilInstructions.h
+++ b/include/dxc/HLSL/DxilInstructions.h
@ -1058,6 +1058,24 @@ struct DxilInst_Hsin {
  llvm::Value *get_value() const { return Instr->getOperand(1); }
 };

+/// This instruction returns the hyperbolic tangent of the specified value
+struct DxilInst_Htan {
+  const llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_Htan(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::Htan);
+  }
+  // Validation support
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (2 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+  // Accessors
+  llvm::Value *get_value() const { return Instr->getOperand(1); }
+};
+
 /// This instruction returns the Exp
 struct DxilInst_Exp {
  const llvm::Instruction *Instr;
@ -1310,7 +1328,7 @@ struct DxilInst_FirstbitSHi {
  llvm::Value *get_value() const { return Instr->getOperand(1); }
 };

-/// This instruction returns the FMax of the input values
+/// This instruction returns a if a >= b, else b
 struct DxilInst_FMax {
  const llvm::Instruction *Instr;
  // Construction and identification
@ -1329,7 +1347,7 @@ struct DxilInst_FMax {
  llvm::Value *get_b() const { return Instr->getOperand(2); }
 };

-/// This instruction returns the FMin of the input values
+/// This instruction returns a if a < b, else b
 struct DxilInst_FMin {
  const llvm::Instruction *Instr;
  // Construction and identification
@ -1481,25 +1499,6 @@ struct DxilInst_UDiv {
  llvm::Value *get_b() const { return Instr->getOperand(2); }
 };

-/// This instruction returns the IAddc of the input values
-struct DxilInst_IAddc {
-  const llvm::Instruction *Instr;
-  // Construction and identification
-  DxilInst_IAddc(llvm::Instruction *pInstr) : Instr(pInstr) {}
-  operator bool() const {
-    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::IAddc);
-  }
-  // Validation support
-  bool isAllowed() const { return true; }
-  bool isArgumentListValid() const {
-    if (3 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
-    return true;
-  }
-  // Accessors
-  llvm::Value *get_a() const { return Instr->getOperand(1); }
-  llvm::Value *get_b() const { return Instr->getOperand(2); }
-};
-
 /// This instruction returns the UAddc of the input values
 struct DxilInst_UAddc {
  const llvm::Instruction *Instr;
@ -1519,32 +1518,13 @@ struct DxilInst_UAddc {
  llvm::Value *get_b() const { return Instr->getOperand(2); }
 };

-/// This instruction returns the ISubc of the input values
-struct DxilInst_ISubc {
+/// This instruction returns the USubb of the input values
+struct DxilInst_USubb {
  const llvm::Instruction *Instr;
  // Construction and identification
-  DxilInst_ISubc(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  DxilInst_USubb(llvm::Instruction *pInstr) : Instr(pInstr) {}
  operator bool() const {
-    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::ISubc);
-  }
-  // Validation support
-  bool isAllowed() const { return true; }
-  bool isArgumentListValid() const {
-    if (3 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
-    return true;
-  }
-  // Accessors
-  llvm::Value *get_a() const { return Instr->getOperand(1); }
-  llvm::Value *get_b() const { return Instr->getOperand(2); }
-};
-
-/// This instruction returns the USubc of the input values
-struct DxilInst_USubc {
-  const llvm::Instruction *Instr;
-  // Construction and identification
-  DxilInst_USubc(llvm::Instruction *pInstr) : Instr(pInstr) {}
-  operator bool() const {
-    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::USubc);
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::USubb);
  }
  // Validation support
  bool isAllowed() const { return true; }
@ -2519,6 +2499,54 @@ struct DxilInst_EvalCentroid {
  llvm::Value *get_inputColIndex() const { return Instr->getOperand(3); }
 };

+/// This instruction returns the sample index in a sample-frequency pixel shader
+struct DxilInst_SampleIndex {
+  const llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_SampleIndex(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::SampleIndex);
+  }
+  // Validation support
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (1 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+};
+
+/// This instruction returns the coverage mask input in a pixel shader
+struct DxilInst_Coverage {
+  const llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_Coverage(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::Coverage);
+  }
+  // Validation support
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (1 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+};
+
+/// This instruction returns underestimated coverage input from conservative rasterization in a pixel shader
+struct DxilInst_InnerCoverage {
+  const llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_InnerCoverage(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::InnerCoverage);
+  }
+  // Validation support
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (1 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+};
+
 /// This instruction reads the thread ID
 struct DxilInst_ThreadId {
  const llvm::Instruction *Instr;
@ -2643,6 +2671,22 @@ struct DxilInst_EmitThenCutStream {
  llvm::Value *get_streamId() const { return Instr->getOperand(1); }
 };

+/// This instruction GSInstanceID
+struct DxilInst_GSInstanceID {
+  const llvm::Instruction *Instr;
+  // Construction and identification
+  DxilInst_GSInstanceID(llvm::Instruction *pInstr) : Instr(pInstr) {}
+  operator bool() const {
+    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::GSInstanceID);
+  }
+  // Validation support
+  bool isAllowed() const { return true; }
+  bool isArgumentListValid() const {
+    if (1 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
+    return true;
+  }
+};
+
 /// This instruction creates a double value
 struct DxilInst_MakeDouble {
  const llvm::Instruction *Instr;
@ -2809,24 +2853,6 @@ struct DxilInst_CycleCounterLegacy {
  }
 };

-/// This instruction returns the hyperbolic tangent of the specified value
-struct DxilInst_Htan {
-  const llvm::Instruction *Instr;
-  // Construction and identification
-  DxilInst_Htan(llvm::Instruction *pInstr) : Instr(pInstr) {}
-  operator bool() const {
-    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::Htan);
-  }
-  // Validation support
-  bool isAllowed() const { return true; }
-  bool isArgumentListValid() const {
-    if (2 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
-    return true;
-  }
-  // Accessors
-  llvm::Value *get_value() const { return Instr->getOperand(1); }
-};
-
 /// This instruction returns 1 for the first lane in the wave
 struct DxilInst_WaveIsFirstLane {
  const llvm::Instruction *Instr;
@ -3196,22 +3222,6 @@ struct DxilInst_BitcastF64toI64 {
  llvm::Value *get_value() const { return Instr->getOperand(1); }
 };

-/// This instruction GSInstanceID
-struct DxilInst_GSInstanceID {
-  const llvm::Instruction *Instr;
-  // Construction and identification
-  DxilInst_GSInstanceID(llvm::Instruction *pInstr) : Instr(pInstr) {}
-  operator bool() const {
-    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::GSInstanceID);
-  }
-  // Validation support
-  bool isAllowed() const { return true; }
-  bool isArgumentListValid() const {
-    if (1 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
-    return true;
-  }
-};
-
 /// This instruction legacy fuction to convert float (f32) to half (f16) (this is not related to min-precision)
 struct DxilInst_LegacyF32ToF16 {
  const llvm::Instruction *Instr;
@ -3337,53 +3347,5 @@ struct DxilInst_WavePrefixBitCount {
  // Accessors
  llvm::Value *get_value() const { return Instr->getOperand(1); }
 };
-
-/// This instruction returns the sample index in a sample-frequency pixel shader
-struct DxilInst_SampleIndex {
-  const llvm::Instruction *Instr;
-  // Construction and identification
-  DxilInst_SampleIndex(llvm::Instruction *pInstr) : Instr(pInstr) {}
-  operator bool() const {
-    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::SampleIndex);
-  }
-  // Validation support
-  bool isAllowed() const { return true; }
-  bool isArgumentListValid() const {
-    if (1 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
-    return true;
-  }
-};
-
-/// This instruction returns the coverage mask input in a pixel shader
-struct DxilInst_Coverage {
-  const llvm::Instruction *Instr;
-  // Construction and identification
-  DxilInst_Coverage(llvm::Instruction *pInstr) : Instr(pInstr) {}
-  operator bool() const {
-    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::Coverage);
-  }
-  // Validation support
-  bool isAllowed() const { return true; }
-  bool isArgumentListValid() const {
-    if (1 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
-    return true;
-  }
-};
-
-/// This instruction returns underestimated coverage input from conservative rasterization in a pixel shader
-struct DxilInst_InnerCoverage {
-  const llvm::Instruction *Instr;
-  // Construction and identification
-  DxilInst_InnerCoverage(llvm::Instruction *pInstr) : Instr(pInstr) {}
-  operator bool() const {
-    return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::InnerCoverage);
-  }
-  // Validation support
-  bool isAllowed() const { return true; }
-  bool isArgumentListValid() const {
-    if (1 != llvm::dyn_cast<llvm::CallInst>(Instr)->getNumArgOperands()) return false;
-    return true;
-  }
-};
 // INSTR-HELPER:END
 } // namespace hlsl
--- a/include/dxc/HLSL/DxilMetadataHelper.h
+++ b/include/dxc/HLSL/DxilMetadataHelper.h
@ -72,6 +72,9 @@ public:
  // Entry points.
  static const char kDxilEntryPointsMDName[];

+  // Root Signature, for intermediate use, not valid in final DXIL module.
+  static const char kDxilRootSignatureMDName[];
+
  static const unsigned kDxilEntryPointNumFields  = 5;
  static const unsigned kDxilEntryPointFunction   = 0;  // Entry point function symbol.
  static const unsigned kDxilEntryPointName       = 1;  // Entry point unmangled name.
@ -181,7 +184,6 @@ public:
  static const unsigned kDxilDSStateTag         = 2;
  static const unsigned kDxilHSStateTag         = 3;
  static const unsigned kDxilNumThreadsTag      = 4;
-  static const unsigned kDxilRootSignatureTag   = 5;

  // GSState.
  static const unsigned kDxilGSStateNumFields               = 5;
@ -261,11 +263,11 @@ public:
  void LoadDxilSignatures(const llvm::MDOperand &MDO, DxilSignature &InputSig, 
                          DxilSignature &OutputSig, DxilSignature &PCSig);
  llvm::MDTuple *EmitSignatureMetadata(const DxilSignature &Sig);
-  llvm::Metadata *EmitRootSignature(RootSignatureHandle &RootSig);
+  void EmitRootSignature(RootSignatureHandle &RootSig);
  void LoadSignatureMetadata(const llvm::MDOperand &MDO, DxilSignature &Sig);
  llvm::MDTuple *EmitSignatureElement(const DxilSignatureElement &SE);
  void LoadSignatureElement(const llvm::MDOperand &MDO, DxilSignatureElement &SE);
-  void LoadRootSignature(const llvm::MDOperand &MDO, RootSignatureHandle &RootSig);
+  void LoadRootSignature(RootSignatureHandle &RootSig);

  // Resources.
  llvm::MDTuple *EmitDxilResourceTuple(llvm::MDTuple *pSRVs, llvm::MDTuple *pUAVs, 
--- a/include/dxc/HLSL/DxilModule.h
+++ b/include/dxc/HLSL/DxilModule.h
@ -87,6 +87,7 @@ public:
  const std::vector<std::unique_ptr<DxilResource> > &GetUAVs() const;

  void RemoveUnusedResources();
+  void RemoveFunction(llvm::Function *F);

  // Signatures.
  DxilSignature &GetInputSignature();
@ -97,6 +98,9 @@ public:
  const DxilSignature &GetPatchConstantSignature() const;
  const RootSignatureHandle &GetRootSignature() const;

+  // Remove Root Signature from module metadata
+  void StripRootSignatureFromMetadata();
+
  // DXIL type system.
  DxilTypeSystem &GetTypeSystem();

@ -118,9 +122,13 @@ public:
  void ResetPatchConstantSignature(DxilSignature *pValue);
  void ResetRootSignature(RootSignatureHandle *pValue);
  void ResetTypeSystem(DxilTypeSystem *pValue);
+  void ResetOP(hlsl::OP *hlslOP);

  void StripDebugRelatedCode();
  llvm::DebugInfoFinder &GetOrCreateDebugInfoFinder();
+
+  static DxilModule *TryGetDxilModule(llvm::Module *pModule);
+
 public:
  // Shader properties.
  class ShaderFlags {
--- a/include/dxc/HLSL/DxilOperations.h
+++ b/include/dxc/HLSL/DxilOperations.h
@ -23,6 +23,7 @@ class Instruction;
 #include "llvm/IR/Attributes.h"

 #include "DxilConstants.h"
+#include <unordered_map>

 namespace hlsl {

@ -37,6 +38,9 @@ public:
  OP(llvm::LLVMContext &Ctx, llvm::Module *pModule);

  llvm::Function *GetOpFunc(OpCode OpCode, llvm::Type *pOverloadType);
+  llvm::ArrayRef<llvm::Function *> GetOpFuncList(OpCode OpCode) const;
+  void RemoveFunction(llvm::Function *F);
+  llvm::Type *GetOverloadType(OpCode OpCode, llvm::Function *F);
  llvm::LLVMContext &GetCtx() { return m_Ctx; }
  llvm::Type *GetHandleType() const;
  llvm::Type *GetDimensionsType() const;
@ -96,7 +100,8 @@ private:
    llvm::Function *pOverloads[kNumTypeOverloads];
  };
  OpCodeCacheItem m_OpCodeClassCache[(unsigned)OpCodeClass::NumOpClasses];
-
+  std::unordered_map<llvm::Function *, OpCodeClass> m_FunctionToOpClass;
+  void RefreshCache(llvm::Module *pModule);
 private:
  // Static properties.
  struct OpCodeProperty {
--- a/include/dxc/HLSL/DxilPipelineStateValidation.h
+++ b/include/dxc/HLSL/DxilPipelineStateValidation.h
@ -12,6 +12,9 @@
 #ifndef __DXIL_PIPELINE_STATE_VALIDATION__H__
 #define __DXIL_PIPELINE_STATE_VALIDATION__H__

+#include <stdint.h>
+#include <string.h>
+
 // Versioning is additive and based on size
 struct PSVRuntimeInfo0
 {
@ -20,20 +23,20 @@ struct PSVRuntimeInfo0
      char OutputPositionPresent;
    } VS;
    struct HSInfo {
-      UINT InputControlPointCount;      // max control points == 32
-      UINT OutputControlPointCount;     // max control points == 32
-      UINT TessellatorDomain;           // hlsl::DXIL::TessellatorDomain/D3D11_SB_TESSELLATOR_DOMAIN
-      UINT TessellatorOutputPrimitive;  // hlsl::DXIL::TessellatorOutputPrimitive/D3D11_SB_TESSELLATOR_OUTPUT_PRIMITIVE
+      uint32_t InputControlPointCount;      // max control points == 32
+      uint32_t OutputControlPointCount;     // max control points == 32
+      uint32_t TessellatorDomain;           // hlsl::DXIL::TessellatorDomain/D3D11_SB_TESSELLATOR_DOMAIN
+      uint32_t TessellatorOutputPrimitive;  // hlsl::DXIL::TessellatorOutputPrimitive/D3D11_SB_TESSELLATOR_OUTPUT_PRIMITIVE
    } HS;
    struct DSInfo {
-      UINT InputControlPointCount;      // max control points == 32
+      uint32_t InputControlPointCount;      // max control points == 32
      char OutputPositionPresent;
-      UINT TessellatorDomain;           // hlsl::DXIL::TessellatorDomain/D3D11_SB_TESSELLATOR_DOMAIN
+      uint32_t TessellatorDomain;           // hlsl::DXIL::TessellatorDomain/D3D11_SB_TESSELLATOR_DOMAIN
    } DS;
    struct GSInfo {
-      UINT InputPrimitive;              // hlsl::DXIL::InputPrimitive/D3D10_SB_PRIMITIVE
-      UINT OutputTopology;              // hlsl::DXIL::PrimitiveTopology/D3D10_SB_PRIMITIVE_TOPOLOGY
-      UINT OutputStreamMask;            // max streams == 4
+      uint32_t InputPrimitive;              // hlsl::DXIL::InputPrimitive/D3D10_SB_PRIMITIVE
+      uint32_t OutputTopology;              // hlsl::DXIL::PrimitiveTopology/D3D10_SB_PRIMITIVE_TOPOLOGY
+      uint32_t OutputStreamMask;            // max streams == 4
      char OutputPositionPresent;
    } GS;
    struct PSInfo {
@ -41,8 +44,8 @@ struct PSVRuntimeInfo0
      char SampleFrequency;
    } PS;
  };
-  UINT MinimumExpectedWaveLaneCount;  // minimum lane count required, 0 if unused
-  UINT MaximumExpectedWaveLaneCount;  // maximum lane count required, 0xffffffff if unused
+  uint32_t MinimumExpectedWaveLaneCount;  // minimum lane count required, 0 if unused
+  uint32_t MaximumExpectedWaveLaneCount;  // maximum lane count required, 0xffffffff if unused
 };
 // PSVRuntimeInfo1 would derive and extend

@ -66,21 +69,21 @@ enum class PSVResourceType
 // Versioning is additive and based on size
 struct PSVResourceBindInfo0
 {
-  UINT ResType;     // PSVResourceType
-  UINT Space;
-  UINT LowerBound;
-  UINT UpperBound;
+  uint32_t ResType;     // PSVResourceType
+  uint32_t Space;
+  uint32_t LowerBound;
+  uint32_t UpperBound;
 };
 // PSVResourceBindInfo1 would derive and extend

 class DxilPipelineStateValidation
 {
-  UINT m_uPSVRuntimeInfoSize;
+  uint32_t m_uPSVRuntimeInfoSize;
  PSVRuntimeInfo0* m_pPSVRuntimeInfo0;
-  UINT m_uResourceCount;
-  UINT m_uPSVResourceBindInfoSize;
+  uint32_t m_uResourceCount;
+  uint32_t m_uPSVResourceBindInfoSize;
  void* m_pPSVResourceBindInfo;
-  UINT m_uSize;
+  uint32_t m_uSize;

 public:
  DxilPipelineStateValidation() : 
@ -93,47 +96,47 @@ public:
  }

  // Init() from PSV0 blob part that looks like:
-  // UINT PSVRuntimeInfo_size
+  // uint32_t PSVRuntimeInfo_size
  // { PSVRuntimeInfoN structure }
-  // UINT ResourceCount
+  // uint32_t ResourceCount
  // ---  end of blob if ResourceCount == 0  ---
-  // UINT PSVResourceBindInfo_size
+  // uint32_t PSVResourceBindInfo_size
  // { PSVResourceBindInfoN structure } * ResourceCount
  // returns true if no errors occurred.
-  bool InitFromPSV0(const void* pBits, UINT size) {
+  bool InitFromPSV0(const void* pBits, uint32_t size) {
    if(!(pBits != nullptr)) return false;
-    const BYTE* pCurBits = (BYTE*)pBits;
-    UINT minsize = sizeof(PSVRuntimeInfo0) + sizeof(UINT) * 2;
+    const uint8_t* pCurBits = (uint8_t*)pBits;
+    uint32_t minsize = sizeof(PSVRuntimeInfo0) + sizeof(uint32_t) * 2;
    if(!(size >= minsize)) return false;
-    m_uPSVRuntimeInfoSize = *((const UINT*)pCurBits);
+    m_uPSVRuntimeInfoSize = *((const uint32_t*)pCurBits);
    if(!(m_uPSVRuntimeInfoSize >= sizeof(PSVRuntimeInfo0))) return false;
-    pCurBits += sizeof(UINT);
-    minsize = m_uPSVRuntimeInfoSize + sizeof(UINT) * 2;
+    pCurBits += sizeof(uint32_t);
+    minsize = m_uPSVRuntimeInfoSize + sizeof(uint32_t) * 2;
    if(!(size >= minsize)) return false;
    m_pPSVRuntimeInfo0 = const_cast<PSVRuntimeInfo0*>((const PSVRuntimeInfo0*)pCurBits);
    pCurBits += m_uPSVRuntimeInfoSize;
-    m_uResourceCount = *(const UINT*)pCurBits;
-    pCurBits += sizeof(UINT);
+    m_uResourceCount = *(const uint32_t*)pCurBits;
+    pCurBits += sizeof(uint32_t);
    if (m_uResourceCount > 0) {
-      minsize += sizeof(UINT);
+      minsize += sizeof(uint32_t);
      if(!(size >= minsize)) return false;
-      m_uPSVResourceBindInfoSize = *(const UINT*)pCurBits;
-      pCurBits += sizeof(UINT);
+      m_uPSVResourceBindInfoSize = *(const uint32_t*)pCurBits;
+      pCurBits += sizeof(uint32_t);
      minsize += m_uPSVResourceBindInfoSize * m_uResourceCount;
      if(!(m_uPSVResourceBindInfoSize >= sizeof(PSVResourceBindInfo0))) return false;
      if(!(size >= minsize)) return false;
-      m_pPSVResourceBindInfo = static_cast<void*>(const_cast<BYTE*>(pCurBits));
+      m_pPSVResourceBindInfo = static_cast<void*>(const_cast<uint8_t*>(pCurBits));
    }
    return true;
  }

  // Initialize a new buffer
  // call with null pBuffer to get required size
-  bool InitNew(UINT ResourceCount, void *pBuffer, UINT *pSize) {
+  bool InitNew(uint32_t ResourceCount, void *pBuffer, uint32_t *pSize) {
    if(!(pSize)) return false;
-    UINT size = sizeof(PSVRuntimeInfo0) + sizeof(UINT) * 2;
+    uint32_t size = sizeof(PSVRuntimeInfo0) + sizeof(uint32_t) * 2;
    if (ResourceCount) {
-      size += sizeof(UINT) + (sizeof(PSVResourceBindInfo0) * ResourceCount);
+      size += sizeof(uint32_t) + (sizeof(PSVResourceBindInfo0) * ResourceCount);
    }
    if (pBuffer) {
      if(!(*pSize >= size)) return false;
@ -141,22 +144,22 @@ public:
      *pSize = size;
      return true;
    }
-    ::ZeroMemory(pBuffer, size);
+    memset(pBuffer, 0, size);
    m_uPSVRuntimeInfoSize = sizeof(PSVRuntimeInfo0);
-    BYTE* pCurBits = (BYTE*)pBuffer;
-    *(UINT*)pCurBits = sizeof(PSVRuntimeInfo0);
-    pCurBits += sizeof(UINT);
+    uint8_t* pCurBits = (uint8_t*)pBuffer;
+    *(uint32_t*)pCurBits = sizeof(PSVRuntimeInfo0);
+    pCurBits += sizeof(uint32_t);
    m_pPSVRuntimeInfo0 = (PSVRuntimeInfo0*)pCurBits;
    pCurBits += sizeof(PSVRuntimeInfo0);

    // Set resource info:
    m_uResourceCount = ResourceCount;
-    *(UINT*)pCurBits = ResourceCount;
-    pCurBits += sizeof(UINT);
+    *(uint32_t*)pCurBits = ResourceCount;
+    pCurBits += sizeof(uint32_t);
    if (ResourceCount > 0) {
      m_uPSVResourceBindInfoSize = sizeof(PSVResourceBindInfo0);
-      *(UINT*)pCurBits = m_uPSVResourceBindInfoSize;
-      pCurBits += sizeof(UINT);
+      *(uint32_t*)pCurBits = m_uPSVResourceBindInfoSize;
+      pCurBits += sizeof(uint32_t);
      m_pPSVResourceBindInfo = pCurBits;
    }
    return true;
@ -166,14 +169,14 @@ public:
    return m_pPSVRuntimeInfo0;
  }

-  UINT GetBindCount() const {
+  uint32_t GetBindCount() const {
    return m_uResourceCount;
  }

-  PSVResourceBindInfo0* GetPSVResourceBindInfo0(UINT index) {
+  PSVResourceBindInfo0* GetPSVResourceBindInfo0(uint32_t index) {
    if (index < m_uResourceCount && m_pPSVResourceBindInfo &&
        sizeof(PSVResourceBindInfo0) <= m_uPSVResourceBindInfoSize) {
-      return (PSVResourceBindInfo0*)((BYTE*)m_pPSVResourceBindInfo +
+      return (PSVResourceBindInfo0*)((uint8_t*)m_pPSVResourceBindInfo +
        (index * m_uPSVResourceBindInfoSize));
    }
    return nullptr;
--- a/include/dxc/HLSL/DxilRootSignature.h
+++ b/include/dxc/HLSL/DxilRootSignature.h
@ -19,6 +19,10 @@
 struct IDxcBlob;
 struct IDxcBlobEncoding;

+namespace llvm {
+  class raw_ostream;
+}
+
 namespace hlsl {

 // Forward declarations.
@ -318,13 +322,17 @@ public:
  bool IsEmpty() const {
    return m_pDesc == nullptr && m_pSerialized == nullptr;
  }
+  IDxcBlob *GetSerialized() const { return m_pSerialized; }
  const uint8_t *GetSerializedBytes() const;
  unsigned GetSerializedSize() const;

  void Assign(const DxilVersionedRootSignatureDesc *pDesc, IDxcBlob *pSerialized);
  void Clear();
-  void LoadSerialized(const uint8_t *pData, unsigned length);
+  void LoadSerialized(const uint8_t *pData, uint32_t length);
  void EnsureSerializedAvailable();
+  void Deserialize();
+
+  const DxilVersionedRootSignatureDesc *GetDesc() const { return m_pDesc; }
 };

 void DeleteRootSignature(const DxilVersionedRootSignatureDesc *pRootSignature);  
@ -334,10 +342,20 @@ void ConvertRootSignature(const DxilVersionedRootSignatureDesc* pRootSignatureIn
                          DxilRootSignatureVersion RootSignatureVersionOut,  
                          const DxilVersionedRootSignatureDesc ** ppRootSignatureOut);

-void SerializeRootSignature(
-    const DxilVersionedRootSignatureDesc *pRootSignature,
-    _Outptr_ IDxcBlob **ppBlob, _Outptr_ IDxcBlobEncoding **ppErrorBlob,
-    bool bAllowReservedRegisterSpace);
+void SerializeRootSignature(const DxilVersionedRootSignatureDesc *pRootSignature,
+                            _Outptr_ IDxcBlob **ppBlob, _Outptr_ IDxcBlobEncoding **ppErrorBlob,
+                            bool bAllowReservedRegisterSpace);
+
+void DeserializeRootSignature(__in_bcount(SrcDataSizeInBytes) const void *pSrcData,
+                              __in uint32_t SrcDataSizeInBytes,
+                              __out const DxilVersionedRootSignatureDesc **ppRootSignature);
+
+// Takes PSV - pipeline state validation data, not shader container.
+bool VerifyRootSignatureWithShaderPSV(__in const DxilVersionedRootSignatureDesc *pDesc,
+                                      __in DXIL::ShaderKind ShaderKind,
+                                      _In_reads_bytes_(PSVSize) const void *pPSVData,
+                                      __in uint32_t PSVSize,
+                                      __in llvm::raw_ostream &DiagStream);

 } // namespace hlsl

--- a/include/dxc/HLSL/DxilShaderModel.h
+++ b/include/dxc/HLSL/DxilShaderModel.h
@ -53,6 +53,7 @@ public:
  unsigned SupportsUAV() const { return m_bUAVs; }
  unsigned SupportsTypedUAVs() const { return m_bTypedUavs; }
  unsigned GetUAVRegLimit() const { return m_NumUAVRegs; }
+  DXIL::PackingStrategy GetDefaultPackingStrategy() const { return DXIL::PackingStrategy::PrefixStable; }

  static unsigned Count() { return kNumShaderModels - 1; }
  static const ShaderModel *Get(unsigned Idx);
--- a/include/dxc/HLSL/DxilSignature.h
+++ b/include/dxc/HLSL/DxilSignature.h
@ -40,7 +40,7 @@ public:
  const std::vector<std::unique_ptr<DxilSignatureElement> > &GetElements() const;

  // Packs the signature elements per DXIL constraints and returns the number of rows used for the signature
-  unsigned PackElements();
+  unsigned PackElements(DXIL::PackingStrategy packing);

  // Returns true if all signature elements that should be allocated are allocated
  bool IsFullyAllocated();
--- a/include/dxc/HLSL/DxilSignatureAllocator.h
+++ b/include/dxc/HLSL/DxilSignatureAllocator.h
@ -74,11 +74,16 @@ public:
  ConflictType DetectColConflict(const DxilSignatureElement *SE, unsigned row, unsigned col);
  void PlaceElement(const DxilSignatureElement *SE, unsigned row, unsigned col);

-  // Simple greedy in-order packer used by PackMain
+  unsigned PackNext(DxilSignatureElement* SE, unsigned startRow, unsigned numRows, unsigned startCol = 0);
+
+  // Simple greedy in-order packer used by PackOptimized
  unsigned PackGreedy(std::vector<DxilSignatureElement*> elements, unsigned startRow, unsigned numRows, unsigned startCol = 0);

-  // Main packing algorithm
-  unsigned PackMain(std::vector<DxilSignatureElement*> elements, unsigned startRow, unsigned numRows);
+  // Optimized packing algorithm - appended elements may affect positions of prior elements.
+  unsigned PackOptimized(std::vector<DxilSignatureElement*> elements, unsigned startRow, unsigned numRows);
+
+  // Pack in a prefix-stable way - appended elements do not affect positions of prior elements.
+  unsigned PackPrefixStable(std::vector<DxilSignatureElement*> elements, unsigned startRow, unsigned numRows);

 };

--- a/include/dxc/HLSL/DxilTypeSystem.h
+++ b/include/dxc/HLSL/DxilTypeSystem.h
@ -11,13 +11,13 @@

 #pragma once
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/MapVector.h"
 #include "dxc/HLSL/DxilCompType.h"
 #include "dxc/HLSL/DxilInterpolationMode.h"

 #include <memory>
 #include <string>
 #include <vector>
-#include <map>

 namespace llvm {
 class LLVMContext;
@ -152,8 +152,8 @@ private:
 /// Use this class to represent structure type annotations in HL and DXIL.
 class DxilTypeSystem {
 public:
-  using StructAnnotationMap = std::map<const llvm::StructType *, std::unique_ptr<DxilStructAnnotation> >;
-  using FunctionAnnotationMap = std::map<const llvm::Function *, std::unique_ptr<DxilFunctionAnnotation> >;
+  using StructAnnotationMap = llvm::MapVector<const llvm::StructType *, std::unique_ptr<DxilStructAnnotation> >;
+  using FunctionAnnotationMap = llvm::MapVector<const llvm::Function *, std::unique_ptr<DxilFunctionAnnotation> >;

  DxilTypeSystem(llvm::Module *pModule);

--- a/include/dxc/HLSL/DxilValidation.h
+++ b/include/dxc/HLSL/DxilValidation.h
@ -11,10 +11,14 @@

 #pragma once

-#include <system_error>
+#include <memory>
+#include "dxc/Support/Global.h"
+#include "dxc/HLSL/DxilConstants.h"

 namespace llvm {
 class Module;
+class LLVMContext;
+class raw_ostream;
 }

 namespace hlsl {
@ -26,6 +30,13 @@ enum class ValidationRule : unsigned {
  // Bitcode
  BitcodeValid, // TODO - Module must be bitcode-valid

+  // Container
+  ContainerPartInvalid, // DXIL Container must not contain unknown parts
+  ContainerPartMatches, // DXIL Container Parts must match Module
+  ContainerPartMissing, // DXIL Container requires certain parts, corresponding to module
+  ContainerPartRepeated, // DXIL Container must have only one of each part type
+  ContainerRootSignatureIncompatible, // Root Signature in DXIL Container must be compatible with shader
+
  // Declaration
  DeclDxilFnExtern, // External function must be a DXIL function
  DeclDxilNsReserved, // The DXIL reserved prefixes must only be used by built-in functions and types
@ -96,6 +107,7 @@ enum class ValidationRule : unsigned {
  InstrSamplerModeForSample, // sample/_l/_d/_cl_s/gather instruction requires sampler declared in default mode
  InstrSamplerModeForSampleC, // sample_c_*/gather_c instructions require sampler declared in comparison mode
  InstrStructBitCast, // Bitcast on struct types is not allowed
+  InstrTGSMRaceCond, // Race condition writing to shared memory detected, consider making this write conditional
  InstrTextureOffset, // offset texture instructions must take offset which can resolve to integer literal in the range -8 to 7
  InstrUndefResultForGetDimension, // GetDimensions used undef dimension %0 on %1
  InstrWriteMaskForTypedUAVStore, // store on typed uav must write to all four components of the UAV
@ -166,7 +178,7 @@ enum class ValidationRule : unsigned {
  SmGSTotalOutputVertexDataRange, // Declared output vertex count (%0) multiplied by the total number of declared scalar components of output data (%1) equals %2.  This value cannot be greater than %3
  SmGSValidInputPrimitive, // GS input primitive unrecognized
  SmGSValidOutputPrimitiveTopology, // GS output primitive topology unrecognized
-  SmHSInputControlPointCountRange, // HS input control point count must be [1..%0].  %1 specified
+  SmHSInputControlPointCountRange, // HS input control point count must be [0..%0].  %1 specified
  SmHullPassThruControlPointCountMatch, // For pass thru hull shader, input control point count must match output control point count
  SmInsideTessFactorSizeMatchDomain, // InsideTessFactor rows, columns (%0, %1) invalid for domain %2.  Expected %3 rows and 1 column.
  SmInvalidResourceCompType, // Invalid resource return type
@ -202,9 +214,11 @@ enum class ValidationRule : unsigned {
  SmTriOutputPrimitiveMismatch, // Hull Shader declared with Tri Domain must specify output primitive point, triangle_cw or triangle_ccw. Line output is not compatible with the Tri domain
  SmUndefinedOutput, // Not all elements of output %0 were written
  SmValidDomain, // Invalid Tessellator Domain specified. Must be isoline, tri or quad
+  SmZeroHSInputControlPointWithInput, // When HS input control point count is 0, no input signature should exist

  // Type system
  TypesDefined, // Type must be defined based on DXIL primitives
+  TypesI8, // I8 can only be used as immediate value for intrinsic
  TypesIntWidth, // Int type must be of valid width
  TypesNoMultiDim, // Only one dimension allowed for array type
  TypesNoVector, // Vector types must not be present
@ -216,7 +230,47 @@ enum class ValidationRule : unsigned {

 const char *GetValidationRuleText(ValidationRule value);
 void GetValidationVersion(_Out_ unsigned *pMajor, _Out_ unsigned *pMinor);
-std::error_code ValidateDxilModule(_In_ llvm::Module *pModule,
-                                   _In_opt_ llvm::Module *pDebugModule);
+HRESULT ValidateDxilModule(_In_ llvm::Module *pModule,
+                           _In_opt_ llvm::Module *pDebugModule);
+
+// DXIL Container Verification Functions (return false on failure)
+
+bool VerifySignatureMatches(_In_ llvm::Module *pModule,
+                            hlsl::DXIL::SignatureKind SigKind,
+                            _In_reads_bytes_(SigSize) const void *pSigData,
+                            _In_ uint32_t SigSize);
+
+// PSV = data for Pipeline State Validation
+bool VerifyPSVMatches(_In_ llvm::Module *pModule,
+                      _In_reads_bytes_(PSVSize) const void *pPSVData,
+                      _In_ uint32_t PSVSize);
+
+bool VerifyFeatureInfoMatches(_In_ llvm::Module *pModule,
+                              _In_reads_bytes_(FeatureInfoSize) const void *pFeatureInfoData,
+                              _In_ uint32_t FeatureInfoSize);
+
+// Validate the container parts, assuming supplied module is valid, loaded from the container provided
+struct DxilContainerHeader;
+HRESULT ValidateDxilContainerParts(_In_ llvm::Module *pModule,
+                                   _In_opt_ llvm::Module *pDebugModule,
+                                   _In_reads_bytes_(ContainerSize) const DxilContainerHeader *pContainer,
+                                   _In_ uint32_t ContainerSize);
+
+// Loads module, validating load, but not module.
+HRESULT ValidateLoadModule(_In_reads_bytes_(ILLength) const char *pIL,
+                           _In_ uint32_t ILLength,
+                           _In_ std::unique_ptr<llvm::Module> &pModule,
+                           _In_ llvm::LLVMContext &Ctx,
+                           _In_ llvm::raw_ostream &DiagStream);
+
+// Load and validate Dxil module from bitcode.
+HRESULT ValidateDxilBitcode(_In_reads_bytes_(ILLength) const char *pIL,
+                            _In_ uint32_t ILLength,
+                            _In_ llvm::raw_ostream &DiagStream);
+
+// Full container validation, including ValidateDxilModule
+HRESULT ValidateDxilContainer(_In_reads_bytes_(ContainerSize) const void *pContainer,
+                              _In_ uint32_t ContainerSize,
+                              _In_ llvm::raw_ostream &DiagStream);

 }
--- a/include/dxc/HLSL/HLModule.h
+++ b/include/dxc/HLSL/HLModule.h
@ -87,7 +87,7 @@ struct HLFunctionProps {
 struct HLOptions {
  HLOptions()
      : bDefaultRowMajor(false), bIEEEStrict(false), bDisableOptimizations(false),
-        bLegacyCBufferLoad(false), unused(0) {
+        bLegacyCBufferLoad(false), PackingStrategy(0), unused(0) {
  }
  uint32_t GetHLOptionsRaw() const;
  void SetHLOptionsRaw(uint32_t data);
@ -96,7 +96,9 @@ struct HLOptions {
  unsigned bAllResourcesBound      : 1;
  unsigned bDisableOptimizations   : 1;
  unsigned bLegacyCBufferLoad      : 1;
-  unsigned unused                  : 27;
+  unsigned PackingStrategy         : 2;
+  static_assert((unsigned)DXIL::PackingStrategy::Invalid < 4, "otherwise 2 bits is not enough to store PackingStrategy");
+  unsigned unused                  : 25;
 };

 /// Use this class to manipulate HLDXIR of a shader.
@ -228,6 +230,7 @@ public:
  DxilSignature *ReleaseOutputSignature();
  DxilSignature *ReleasePatchConstantSignature();
  DxilTypeSystem *ReleaseTypeSystem();
+  OP *ReleaseOP();
  RootSignatureHandle *ReleaseRootSignature();

  llvm::DebugInfoFinder &GetOrCreateDebugInfoFinder();
--- a/include/dxc/HLSL/HLOperationLowerExtension.h
+++ b/include/dxc/HLSL/HLOperationLowerExtension.h
@ -14,15 +14,19 @@
 #include "dxc/HLSL/HLSLExtensionsCodegenHelper.h"
 #include "llvm/ADT/StringRef.h"
 #include <string>
+#include <unordered_map>

 namespace llvm {
  class Value;
  class CallInst;
  class Function;
  class StringRef;
+  class Instruction;
 }

 namespace hlsl {
+  class OP;
+
  // Lowers HLSL extensions from HL operation to DXIL operation.
  class ExtensionLowering {
  public:
@ -32,11 +36,14 @@ namespace hlsl {
      NoTranslation,  // Propagate the call arguments as is down to dxil.
      Replicate,      // Scalarize the vector arguments and replicate the call.
      Pack,           // Convert the vector arguments into structs.
+      Resource,       // Convert return value to resource return and explode vectors.
    };

+    typedef std::unordered_map<llvm::Instruction *, llvm::Value *> HandleMap;
+
    // Create the lowering using the given strategy and custom codegen helper.
-    ExtensionLowering(llvm::StringRef strategy, HLSLExtensionsCodegenHelper *helper);
-    ExtensionLowering(Strategy strategy, HLSLExtensionsCodegenHelper *helper);
+    ExtensionLowering(llvm::StringRef strategy, HLSLExtensionsCodegenHelper *helper, const HandleMap &handleMap, OP& hlslOp);
+    ExtensionLowering(Strategy strategy, HLSLExtensionsCodegenHelper *helper, const HandleMap &handleMap, OP& hlslOp);

    // Translate the HL op call to a DXIL op call.
    // Returns a new value if translation was successful.
@ -62,24 +69,13 @@ namespace hlsl {
  private:
    Strategy m_strategy;
    HLSLExtensionsCodegenHelper *m_helper;
+    const HandleMap &m_handleMap;
+    OP &m_hlslOp;

    llvm::Value *Unknown(llvm::CallInst *CI);
    llvm::Value *NoTranslation(llvm::CallInst *CI);
    llvm::Value *Replicate(llvm::CallInst *CI);
    llvm::Value *Pack(llvm::CallInst *CI);
-
-    // Translate the HL call by replicating the call for each vector element.
-    //
-    // For example,
-    //
-    //    <2xi32> %r = call @ext.foo(i32 %op, <2xi32> %v)
-    //    ==>
-    //    %r.1 = call @ext.foo.s(i32 %op, i32 %v.1)
-    //    %r.2 = call @ext.foo.s(i32 %op, i32 %v.2)
-    //    <2xi32> %r.v.1 = insertelement %r.1, 0, <2xi32> undef
-    //    <2xi32> %r.v.2 = insertelement %r.2, 1, %r.v.1
-    //
-    // You can then RAWU %r with %r.v.2. The RAWU is not done by the translate function.
-    static llvm::Value *TranslateReplicating(llvm::CallInst *CI, llvm::Function *ReplicatedFunction);
+    llvm::Value *Resource(llvm::CallInst *CI);
  };
 }
--- a/include/dxc/HLSL/HLSLExtensionsCodegenHelper.h
+++ b/include/dxc/HLSL/HLSLExtensionsCodegenHelper.h
@ -27,9 +27,10 @@ namespace hlsl {
 //  1. You can mark certain defines as "semantic" defines which
 //     will be preserved as metadata in the final DXIL.
 //  2. You can add new HLSL intrinsic functions.
+//  3. You can read a root signature from a custom define.
 //
 // This class provides an interface for generating the DXIL bitcode
-// needed for the two types of extensions above.
+// needed for the types of extensions above.
 //  
 class HLSLExtensionsCodegenHelper {
 public:
@ -64,6 +65,16 @@ public:
  // Get the name to use for the dxil intrinsic function.
  virtual std::string GetIntrinsicName(unsigned opcode) = 0;

+  // Struct to hold a root signature that is read from a define.
+  struct CustomRootSignature {
+    std::string RootSignature;
+    unsigned  EncodedSourceLocation;
+    enum Status { NOT_FOUND = 0, FOUND };
+  };
+
+  // Get custom defined root signature.
+  virtual CustomRootSignature::Status GetCustomRootSignature(CustomRootSignature *out) = 0;
+
  // Virtual destructor.
  virtual ~HLSLExtensionsCodegenHelper() {};
 };
--- a/include/dxc/Support/ErrorCodes.h
+++ b/include/dxc/Support/ErrorCodes.h
@ -77,3 +77,21 @@

 // 0x80AA0010 - Error parsing DDI signature.
 #define DXC_E_INCORRECT_DDI_SIGNATURE                 DXC_MAKE_HRESULT(DXC_SEVERITY_ERROR,FACILITY_DXC,(0x0010))
+
+// 0x80AA0011 - Duplicate part exists in dxil container.
+#define DXC_E_DUPLICATE_PART                          DXC_MAKE_HRESULT(DXC_SEVERITY_ERROR,FACILITY_DXC,(0x0011))
+
+// 0x80AA0012 - Error finding part in dxil container.
+#define DXC_E_MISSING_PART                            DXC_MAKE_HRESULT(DXC_SEVERITY_ERROR,FACILITY_DXC,(0x0012))
+
+// 0x80AA0013 - Malformed DXIL Container.
+#define DXC_E_MALFORMED_CONTAINER                     DXC_MAKE_HRESULT(DXC_SEVERITY_ERROR,FACILITY_DXC,(0x0013))
+
+// 0x80AA0014 - Incorrect Root Signature for shader.
+#define DXC_E_INCORRECT_ROOT_SIGNATURE                DXC_MAKE_HRESULT(DXC_SEVERITY_ERROR,FACILITY_DXC,(0x0014))
+
+// 0x80AA0015 - DXIL container is missing DebugInfo part.
+#define DXC_E_CONTAINER_MISSING_DEBUG                 DXC_MAKE_HRESULT(DXC_SEVERITY_ERROR,FACILITY_DXC,(0x0015))
+
+// 0x80AA0016 - Unexpected failure in macro expansion.
+#define DXC_E_MACRO_EXPANSION_FAILURE                 DXC_MAKE_HRESULT(DXC_SEVERITY_ERROR,FACILITY_DXC,(0x0016))
--- a/include/dxc/Support/Global.h
+++ b/include/dxc/Support/Global.h
@ -43,6 +43,7 @@ void CheckLLVMErrorCode(const std::error_code &ec);
 #define IFC(x)      { hr = (x); if (DXC_FAILED(hr)) goto Cleanup; }
 #define IFR(x)      { HRESULT __hr = (x); if (DXC_FAILED(__hr)) return __hr; }
 #define IFRBOOL(x,y){ if (!(x)) return (y); }
+#define IFCBOOL(x,y){ if (!(x)) { hr = (y); goto Cleanup; } }
 #define IFCOOM(x)   { if (nullptr == (x)) { hr = E_OUTOFMEMORY; goto Cleanup; } }
 #define IFROOM(x)   { if (nullptr == (x)) { return E_OUTOFMEMORY; } }
 #define IFCPTR(x)   { if (nullptr == (x)) { hr = E_POINTER; goto Cleanup; }}
@ -53,13 +54,14 @@ void CheckLLVMErrorCode(const std::error_code &ec);
 #define IFTARG(x)   { if (!(x)) { throw ::hlsl::Exception(E_INVALIDARG); }}
 #define IFTLLVM(x)  { CheckLLVMErrorCode(x); }
 #define IFTMSG(x, msg) { HRESULT __hr = (x); if (DXC_FAILED(__hr)) throw ::hlsl::Exception(__hr, msg); }
+#define IFTBOOLMSG(x, y, msg) { if (!(x)) throw ::hlsl::Exception(y, msg); }

 // Propagate an C++ exception into an HRESULT.
 #define CATCH_CPP_ASSIGN_HRESULT() \
  catch (std::bad_alloc&)                   { hr = E_OUTOFMEMORY; } \
  catch (hlsl::Exception& _hlsl_exception_) {                       \
    _Analysis_assume_(DXC_FAILED(_hlsl_exception_.hr));             \
-    return hr = _hlsl_exception_.hr;                                \
+    hr = _hlsl_exception_.hr;                                       \
  }                                                                 \
  catch (...)                               { hr = E_FAIL; }
 #define CATCH_CPP_RETURN_HRESULT() \
--- a/include/dxc/Support/HLSLOptions.h
+++ b/include/dxc/Support/HLSLOptions.h
@ -39,7 +39,7 @@ enum HlslFlags {
  DriverOption = (1 << 13),
  NoArgumentUnused = (1 << 14),
  CoreOption = (1 << 15),
-  ISenseOption = (1 << 15),
+  ISenseOption = (1 << 16),
 };

 enum ID {
@ -93,7 +93,7 @@ public:
  llvm::StringRef EntryPoint;   // OPT_entrypoint
  llvm::StringRef ExternalFn;   // OPT_external_fn
  llvm::StringRef ExternalLib;  // OPT_external_lib
-  llvm::StringRef ExtractRootSignatureFile; // OPT_extractrootsignature
+  llvm::StringRef ExtractPrivateFile; // OPT_getprivate
  llvm::StringRef ForceRootSigVer; // OPT_force_rootsig_ver
  llvm::StringRef InputFile; // OPT_INPUT
  llvm::StringRef OutputHeader; // OPT_Fh
@ -102,6 +102,10 @@ public:
  llvm::StringRef Preprocess; // OPT_P
  llvm::StringRef TargetProfile; // OPT_target_profile
  llvm::StringRef VariableName; // OPT_Vn
+  llvm::StringRef PrivateSource; // OPT_setprivate
+  llvm::StringRef RootSignatureSource; // OPT_setrootsignature
+  llvm::StringRef VerifyRootSignatureSource; //OPT_verifyrootsignature
+  llvm::StringRef RootSignatureDefine; // OPT_rootsig_define

  bool AllResourcesBound; // OPT_all_resources_bound
  bool AstDump; // OPT_ast_dump
@ -109,7 +113,6 @@ public:
  bool CodeGenHighLevel; // OPT_fcgl
  bool DebugInfo; // OPT__SLASH_Zi
  bool DumpBin;        // OPT_dumpbin
-  bool EnableUnboundedDescriptorTables; // OPT_enable_unbounded_descriptor_tables
  bool WarningAsError; // OPT__SLASH_WX
  bool IEEEStrict;     // OPT_Gis
  bool DefaultColMajor;  // OPT_Zpc
@ -130,8 +133,20 @@ public:
  bool UseInstructionByteOffsets; // OPT_No
  bool UseInstructionNumbers; // OPT_Ni
  bool NotUseLegacyCBufLoad;  // OPT_not_use_legacy_cbuf_load
+  bool PackPrefixStable;  // OPT_pack_prefix_stable
+  bool PackOptimized;  // OPT_pack_optimized
  bool DisplayIncludeProcess; // OPT__vi
  bool RecompileFromBinary; // OPT _Recompile (Recompiling the DXBC binary file not .hlsl file)
+  bool StripDebug; // OPT_Qstrip_debug
+  bool StripRootSignature; // OPT_Qstrip_rootsignature
+  bool StripPrivate; // OPT_Qstrip_priv
+  bool StripReflection; // OPT_Qstrip_reflect
+  bool ExtractRootSignature; // OPT_extractrootsignature
+  bool DisassembleColorCoded; // OPT_Cc
+  bool DisassembleInstNumbers; // OPT_Ni
+  bool DisassembleByteOffset; // OPT_No
+  bool DisaseembleHex; // OPT_Lx. NOTE(review): the name misspells "Disassemble"; left unchanged because the option reader assigns this field by name.
+  bool IsRootSignatureProfile(); // True when TargetProfile is "rootsig_1_0" or "rootsig_1_1".
 };

 /// Use this class to capture, convert and handle the lifetime for the
--- a/include/dxc/Support/HLSLOptions.td
+++ b/include/dxc/Support/HLSLOptions.td
@ -87,27 +87,27 @@ def H : Flag<["-"], "H">, Flags<[CoreOption]>, Group<hlslcomp_Group>,
 def I : JoinedOrSeparate<["-", "/"], "I">, Group<hlslcomp_Group>, Flags<[CoreOption]>,
    HelpText<"Add directory to include search path">;
 def O0 : Flag<["-", "/"], "O0">, Group<hlsloptz_Group>, Flags<[CoreOption]>,
-    HelpText<"Optimization Level 0 - Disables instruction reordering">;
+    HelpText<"Optimization Level 0">;
 def O1 : Flag<["-", "/"], "O1">, Group<hlsloptz_Group>, Flags<[CoreOption]>,
-    HelpText<"Optimization Level 1 - All standard optimizations">;
+    HelpText<"Optimization Level 1">;
 def O2 : Flag<["-", "/"], "O2">, Group<hlsloptz_Group>, Flags<[CoreOption]>,
-    HelpText<"Optimization Level 2 - Same as O1. Reserved for future use.">;
+    HelpText<"Optimization Level 2">;
 def O3 : Flag<["-", "/"], "O3">, Group<hlsloptz_Group>, Flags<[CoreOption]>,
-    HelpText<"Optimization Level 3 - Same as O1. Reserved for future use.">;
+    HelpText<"Optimization Level 3 (Default)">;
 def O4 : Flag<["-", "/"], "O4">, Group<hlsloptz_Group>, Flags<[CoreOption]>,
-    HelpText<"Optimization Level 4 - Same as O1. Reserved for future use.">;
+    HelpText<"Optimization Level 4">;
 def Odump : Flag<["-", "/"], "Odump">, Group<hlslcomp_Group>, Flags<[CoreOption]>,
    HelpText<"Print the optimizer commands.">;
 def Qunused_arguments : Flag<["-"], "Qunused-arguments">, Group<hlslcore_Group>, Flags<[CoreOption]>,
  HelpText<"Don't emit warning for unused driver arguments">;
 def Wall : Flag<["-"], "Wall">, Group<hlslcomp_Group>, Flags<[CoreOption]>;
 def Wdeprecated : Flag<["-"], "Wdeprecated">, Group<hlslcomp_Group>, Flags<[CoreOption]>;
-def W_Joined : Joined<["-"], "W">, Group<hlslcomp_Group>, Flags<[CoreOption]>,
-  MetaVarName<"<warning>">, HelpText<"Enable the specified warning">;
+//def W_Joined : Joined<["-"], "W">, Group<hlslcomp_Group>, Flags<[CoreOption]>,
+//  MetaVarName<"<warning>">, HelpText<"Enable the specified warning">;
 def d_Flag : Flag<["-"], "d">, Group<d_Group>;
 def d_Joined : Joined<["-"], "d">, Group<d_Group>;
-def fcolor_diagnostics : Flag<["-"], "fcolor-diagnostics">, Group<hlslcomp_Group>,
-  Flags<[CoreOption, DriverOption]>, HelpText<"Use colors in diagnostics">;
+//def fcolor_diagnostics : Flag<["-"], "fcolor-diagnostics">, Group<hlslcomp_Group>,
+//  Flags<[CoreOption, DriverOption]>, HelpText<"Use colors in diagnostics">;
 def fdiagnostics_color : Flag<["-"], "fdiagnostics-color">, Group<hlslcomp_Group>,
  Flags<[CoreOption, DriverOption]>;
 def fdiagnostics_color_EQ : Joined<["-"], "fdiagnostics-color=">, Group<hlslcomp_Group>;
@ -117,8 +117,8 @@ def fconstexpr_depth_EQ : Joined<["-"], "fconstexpr-depth=">, Group<f_Group>;
 def fconstexpr_steps_EQ : Joined<["-"], "fconstexpr-steps=">, Group<f_Group>;
 def fconstexpr_backtrace_limit_EQ : Joined<["-"], "fconstexpr-backtrace-limit=">,
                                    Group<f_Group>;
-def fdiagnostics_show_option : Flag<["-"], "fdiagnostics-show-option">, Group<hlslcomp_Group>,
-    Flags<[CoreOption]>, HelpText<"Print option name with mappable diagnostics">;
+//def fdiagnostics_show_option : Flag<["-"], "fdiagnostics-show-option">, Group<hlslcomp_Group>,
+//    Flags<[CoreOption]>, HelpText<"Print option name with mappable diagnostics">;
 def fdiagnostics_show_category_EQ : Joined<["-"], "fdiagnostics-show-category=">, Group<hlslcomp_Group>;
 def ferror_limit_EQ : Joined<["-"], "ferror-limit=">, Group<hlslcomp_Group>, Flags<[CoreOption]>;

@ -128,27 +128,29 @@ def fno_unsafe_math_optimizations : Flag<["-"], "fno-unsafe-math-optimizations">
  Group<hlsloptz_Group>;
 def fassociative_math : Flag<["-"], "fassociative-math">, Group<hlsloptz_Group>;
 def fno_associative_math : Flag<["-"], "fno-associative-math">, Group<hlsloptz_Group>;
-def freciprocal_math :
-  Flag<["-"], "freciprocal-math">, Group<hlsloptz_Group>, Flags<[CoreOption]>,
-  HelpText<"Allow division operations to be reassociated">;
+//def freciprocal_math :
+//  Flag<["-"], "freciprocal-math">, Group<hlsloptz_Group>, Flags<[CoreOption]>,
+//  HelpText<"Allow division operations to be reassociated">;
 def fno_reciprocal_math : Flag<["-"], "fno-reciprocal-math">, Group<hlsloptz_Group>;
 def ffinite_math_only : Flag<["-"], "ffinite-math-only">, Group<hlsloptz_Group>, Flags<[CoreOption]>;
 def fno_finite_math_only : Flag<["-"], "fno-finite-math-only">, Group<hlsloptz_Group>;
 def fsigned_zeros : Flag<["-"], "fsigned-zeros">, Group<hlsloptz_Group>;
-def fno_signed_zeros :
-  Flag<["-"], "fno-signed-zeros">, Group<hlsloptz_Group>, Flags<[CoreOption]>,
-  HelpText<"Allow optimizations that ignore the sign of floating point zeros">;
+//def fno_signed_zeros :
+//  Flag<["-"], "fno-signed-zeros">, Group<hlsloptz_Group>, Flags<[CoreOption]>,
+//  HelpText<"Allow optimizations that ignore the sign of floating point zeros">;
 def fhonor_nans : Flag<["-"], "fhonor-nans">, Group<hlsloptz_Group>;
 def fno_honor_nans : Flag<["-"], "fno-honor-nans">, Group<hlsloptz_Group>;
 def fhonor_infinities : Flag<["-"], "fhonor-infinities">, Group<hlsloptz_Group>;
 def fno_honor_infinities : Flag<["-"], "fno-honor-infinities">, Group<hlsloptz_Group>;
 //def ftrapping_math : Flag<["-"], "ftrapping-math">, Group<f_Group>;
 //def fno_trapping_math : Flag<["-"], "fno-trapping-math">, Group<f_Group>;
-def ffp_contract : Joined<["-"], "ffp-contract=">, Group<hlsloptz_Group>,
-  Flags<[CoreOption]>, HelpText<"Form fused FP ops (e.g. FMAs): fast (everywhere)"
-  " | on (according to FP_CONTRACT pragma, default) | off (never fuse)">;
+//def ffp_contract : Joined<["-"], "ffp-contract=">, Group<hlsloptz_Group>,
+//  Flags<[CoreOption]>, HelpText<"Form fused FP ops (e.g. FMAs): fast (everywhere)"
+//  " | on (according to FP_CONTRACT pragma, default) | off (never fuse)">;
 def flimited_precision_EQ : Joined<["-"], "flimited-precision=">, Group<hlsloptz_Group>;

+
+/*
 def fno_caret_diagnostics : Flag<["-"], "fno-caret-diagnostics">, Group<hlslcomp_Group>,
 Flags<[CoreOption]>;
 def fno_color_diagnostics : Flag<["-"], "fno-color-diagnostics">, Group<hlslcomp_Group>,
@ -171,7 +173,7 @@ def freroll_loops : Flag<["-"], "freroll-loops">, Group<hlsloptz_Group>,
  HelpText<"Turn on loop reroller">, Flags<[CoreOption]>;
 def fno_reroll_loops : Flag<["-"], "fno-reroll-loops">, Group<hlsloptz_Group>,
  HelpText<"Turn off loop reroller">;
-
+*/
 def help : Flag<["-", "--", "/"], "help">, Flags<[DriverOption]>, Group<hlslcore_Group>,
  HelpText<"Display available options">;
 /*
@ -214,11 +216,17 @@ def external_fn : Separate<["-", "/"], "external-fn">, Group<hlslcore_Group>, Fl
 def fcgl : Flag<["-", "/"], "fcgl">, Group<hlslcore_Group>, Flags<[CoreOption, HelpHidden]>,
  HelpText<"Generate high-level code only">;
 def not_use_legacy_cbuf_load : Flag<["-", "/"], "not_use_legacy_cbuf_load">, Group<hlslcomp_Group>, Flags<[CoreOption]>,
-  HelpText<"Not use legacy cbuffer load">;  
+  HelpText<"Do not use legacy cbuffer load">;
+def pack_prefix_stable : Flag<["-", "/"], "pack_prefix_stable">, Group<hlslcomp_Group>, Flags<[CoreOption]>,
+  HelpText<"(default) Pack signatures preserving prefix-stable property - appended elements will not disturb placement of prior elements">;
+def pack_optimized : Flag<["-", "/"], "pack_optimized">, Group<hlslcomp_Group>, Flags<[CoreOption]>,
+  HelpText<"Optimize signature packing assuming identical signature provided for each connecting stage">;
 def hlsl_version : Separate<["-", "/"], "HV">, Group<hlslcomp_Group>, Flags<[CoreOption]>,
-  HelpText<"HLSL version (2015, 2016)">;
+  HelpText<"HLSL version (Only supports 2016 for now)">;
 def no_warnings : Flag<["-", "/"], "no-warnings">, Group<hlslcomp_Group>, Flags<[CoreOption]>,
  HelpText<"Suppress warnings">;
+def rootsig_define : Separate<["-", "/"], "rootsig-define">, Group<hlslcomp_Group>, Flags<[CoreOption]>,
+  HelpText<"Read root signature from a #define">;

 //////////////////////////////////////////////////////////////////////////////
 // fxc-based flags that don't match those previously defined.
@ -239,7 +247,7 @@ def VD : Flag<["-", "/"], "Vd">, Flags<[CoreOption]>, Group<hlslcomp_Group>,
 def _SLASH_Zi : Flag<["-", "/"], "Zi">, Flags<[CoreOption]>, Group<hlslcomp_Group>,
  HelpText<"Enable debug information">;
 def recompile : Flag<["-", "/"], "recompile">, Flags<[CoreOption]>, Group<hlslcomp_Group>,
-  HelpText<"recompile from Container or DXIL Bitcode file (not .hlsl file)">;
+  HelpText<"recompile from DXIL container with Debug Info or Debug Info bitcode file">;
 def Zpr : Flag<["-", "/"], "Zpr">, Flags<[CoreOption]>, Group<hlslcomp_Group>,
  HelpText<"Pack matrices in row-major order">;
 def Zpc : Flag<["-", "/"], "Zpc">, Flags<[CoreOption]>, Group<hlslcomp_Group>,
@ -258,12 +266,12 @@ def Fc : JoinedOrSeparate<["-", "/"], "Fc">, MetaVarName<"<file>">, HelpText<"Ou
 //def Fx : JoinedOrSeparate<["-", "/"], "Fx">, MetaVarName<"<file>">, HelpText<"Output assembly code and hex listing file">;
 def Fh : JoinedOrSeparate<["-", "/"], "Fh">, MetaVarName<"<file>">, HelpText<"Output header file containing object code">, Flags<[DriverOption]>, Group<hlslcomp_Group>;
 def Fe : JoinedOrSeparate<["-", "/"], "Fe">, MetaVarName<"<file>">, HelpText<"Output warnings and errors to a specific file">, Flags<[DriverOption]>, Group<hlslcomp_Group>;
-def Fd : JoinedOrSeparate<["-", "/"], "Fd">, MetaVarName<"<file>">, HelpText<"Extract shader PDB and write to given file">, Flags<[DriverOption]>, Group<hlslcomp_Group>;
+def Fd : JoinedOrSeparate<["-", "/"], "Fd">, MetaVarName<"<file>">, HelpText<"Extract LLVM Debug IR and write to given file">, Flags<[DriverOption]>, Group<hlslcomp_Group>;
 def Vn : JoinedOrSeparate<["-", "/"], "Vn">, MetaVarName<"<name>">, HelpText<"Use <name> as variable name in header file">, Flags<[DriverOption]>, Group<hlslcomp_Group>;
-def Cc : Flag<["-", "/"], "Cc">, HelpText<"Output color coded assembly listings">, Group<hlslcomp_Group>;
-def Ni : Flag<["-", "/"], "Ni">, HelpText<"Output instruction numbers in assembly listings">, Group<hlslcomp_Group>;
-def No : Flag<["-", "/"], "No">, HelpText<"Output instruction byte offsets in assembly listings">, Group<hlslcomp_Group>;
-def Lx : Flag<["-", "/"], "Lx">, HelpText<"Output hexadecimal literals">, Group<hlslcomp_Group>;
+def Cc : Flag<["-", "/"], "Cc">, HelpText<"Output color coded assembly listings">, Group<hlslcomp_Group>, Flags<[DriverOption]>;
+def Ni : Flag<["-", "/"], "Ni">, HelpText<"Output instruction numbers in assembly listings">, Group<hlslcomp_Group>, Flags<[DriverOption]>;
+def No : Flag<["-", "/"], "No">, HelpText<"Output instruction byte offsets in assembly listings">, Group<hlslcomp_Group>, Flags<[DriverOption]>;
+def Lx : Flag<["-", "/"], "Lx">, HelpText<"Output hexadecimal literals">, Group<hlslcomp_Group>, Flags<[DriverOption]>;

 // In place of 'E' for clang; fxc uses 'E' for entry point.
 def P : Separate<["-", "/"], "P">, Flags<[DriverOption]>, Group<hlslutil_Group>,
@ -273,19 +281,20 @@ def P : Separate<["-", "/"], "P">, Flags<[DriverOption]>, Group<hlslutil_Group>,

 def dumpbin : Flag<["-", "/"], "dumpbin">, Flags<[DriverOption]>, Group<hlslutil_Group>,
  HelpText<"Load a binary file rather than compiling">;
-def Qstrip_reflect : Flag<["-", "/"], "Qstrip_reflect">, Group<hlslutil_Group>,
-  HelpText<"Strip reflection data from shader bytecode">;
-def Qstrip_debug : Flag<["-", "/"], "Qstrip_debug">, Group<hlslutil_Group>,
-  HelpText<"Strip debug information from 4_0+ shader bytecode">;
-def Qstrip_priv : Flag<["-", "/"], "Qstrip_priv">, Group<hlslutil_Group>,
-  HelpText<"Strip private data from shader bytecode">;
+def Qstrip_reflect : Flag<["-", "/"], "Qstrip_reflect">, Flags<[DriverOption]>, Group<hlslutil_Group>,
+  HelpText<"Strip reflection data from shader bytecode  (must be used with /Fo <file>)">;
+def Qstrip_debug : Flag<["-", "/"], "Qstrip_debug">, Flags<[DriverOption]>, Group<hlslutil_Group>,
+  HelpText<"Strip debug information from 4_0+ shader bytecode  (must be used with /Fo <file>)">;
+def Qstrip_priv : Flag<["-", "/"], "Qstrip_priv">, Flags<[DriverOption]>, Group<hlslutil_Group>,
+  HelpText<"Strip private data from shader bytecode  (must be used with /Fo <file>)">;

-def Qstrip_rootsignature : Flag<["-", "/"], "Qstrip_rootsignature">,     Group<hlslutil_Group>, HelpText<"Strip root signature data from shader bytecode">;
-def setrootsignature     : JoinedOrSeparate<["-", "/"], "setrootsignature">,     MetaVarName<"<file>">, Group<hlslutil_Group>, HelpText<"Attach root signature to shader bytecode">;
-def extractrootsignature : JoinedOrSeparate<["-", "/"], "extractrootsignature">, MetaVarName<"<file>">, Group<hlslutil_Group>, HelpText<"Extract root signature from shader bytecode">;
-def verifyrootsignature  : JoinedOrSeparate<["-", "/"], "verifyrootsignature">,  MetaVarName<"<file>">, Group<hlslutil_Group>, HelpText<"Verify shader bytecode with root signature">;
+def Qstrip_rootsignature : Flag<["-", "/"], "Qstrip_rootsignature">, Flags<[DriverOption]>, Group<hlslutil_Group>, HelpText<"Strip root signature data from shader bytecode  (must be used with /Fo <file>)">;
+def setrootsignature     : JoinedOrSeparate<["-", "/"], "setrootsignature">,     MetaVarName<"<file>">, Flags<[DriverOption]>, Group<hlslutil_Group>, HelpText<"Attach root signature to shader bytecode">;
+def extractrootsignature : Flag<["-", "/"], "extractrootsignature">, Flags<[DriverOption]>, Group<hlslutil_Group>, HelpText<"Extract root signature from shader bytecode (must be used with /Fo <file>)">;
+def verifyrootsignature  : JoinedOrSeparate<["-", "/"], "verifyrootsignature">,  MetaVarName<"<file>">, Flags<[DriverOption]>, Group<hlslutil_Group>, HelpText<"Verify shader bytecode with root signature">;
 def force_rootsig_ver    : JoinedOrSeparate<["-", "/"], "force_rootsig_ver">,    Flags<[CoreOption]>, MetaVarName<"<profile>">, Group<hlslcomp_Group>, HelpText<"force root signature version (rootsig_1_1 if omitted)">;

+/*
 def shtemplate : JoinedOrSeparate<["-", "/"], "shtemplate">, MetaVarName<"<file>">, Group<hlslcomp_Group>,
  HelpText<"Template shader file for merging/matching resources">;
 def mergeUAVs : JoinedOrSeparate<["-", "/"], "mergeUAVs">, MetaVarName<"<file>">, Group<hlslcomp_Group>,
@ -296,15 +305,16 @@ def res_may_alias : Flag<["-", "/"], "res_may_alias">, Flags<[CoreOption]>, Grou
  HelpText<"Assume that UAVs/SRVs may alias">;
 def enable_unbounded_descriptor_tables : Flag<["-", "/"], "enable_unbounded_descriptor_tables">, Flags<[CoreOption]>, Group<hlslcomp_Group>,
  HelpText<"Enables unbounded descriptor tables">;
+*/
 def all_resources_bound : Flag<["-", "/"], "all_resources_bound">, Flags<[CoreOption]>, Group<hlslcomp_Group>,
  HelpText<"Enables agressive flattening">;

-def setprivate : JoinedOrSeparate<["-", "/"], "setprivate">, MetaVarName<"<file>">, Group<hlslutil_Group>,
+def setprivate : JoinedOrSeparate<["-", "/"], "setprivate">, Flags<[DriverOption]>, MetaVarName<"<file>">, Group<hlslutil_Group>,
  HelpText<"Private data to add to compiled shader blob">;
-def getprivate : JoinedOrSeparate<["-", "/"], "getprivate">, MetaVarName<"<file>">, Group<hlslutil_Group>,
+def getprivate : JoinedOrSeparate<["-", "/"], "getprivate">, Flags<[DriverOption]>, MetaVarName<"<file>">, Group<hlslutil_Group>,
  HelpText<"Save private data from shader blob">;

-def nologo : Flag<["-", "/"], "nologo">, Group<hlslcore_Group>,
+def nologo : Flag<["-", "/"], "nologo">, Group<hlslcore_Group>, Flags<[DriverOption]>,
  HelpText<"Suppress copyright message">;

 // Also removed: compress, decompress, /Gch (child effect), /Gec (back compat), /Gpp (partial precision)
--- a/include/dxc/Support/dxcapi.use.h
+++ b/include/dxc/Support/dxcapi.use.h
@ -62,9 +62,13 @@ public:

  template <typename TInterface>
  HRESULT CreateInstance(REFCLSID clsid, _Outptr_ TInterface** pResult) {
+    return CreateInstance(clsid, __uuidof(TInterface), (IUnknown**)pResult); // Typed convenience overload: forwards using TInterface's IID.
+  }
+
+  HRESULT CreateInstance(REFCLSID clsid, REFIID riid, _Outptr_ IUnknown **pResult) { // Creates the object for clsid and returns interface riid through pResult.
    if (pResult == nullptr) return E_POINTER; // The caller must supply an output slot.
    if (m_dll == nullptr) return E_FAIL; // No module handle is held (presumably the DLL was never loaded - confirm).
-    HRESULT hr = m_createFn(clsid, __uuidof(TInterface), (LPVOID*)pResult);
+    HRESULT hr = m_createFn(clsid, riid, (LPVOID*)pResult); // Delegates to the creation function held in m_createFn; its HRESULT is returned as-is.
    return hr;
  }

@ -79,6 +83,12 @@ public:
      m_dll = nullptr;
    }
  }
+
+  HMODULE Detach() { // Hands the held module handle to the caller without unloading it here.
+    HMODULE module = m_dll;
+    m_dll = nullptr; // This object no longer refers to the module after the call.
+    return module; // NOTE(review): the caller presumably becomes responsible for freeing the module - confirm.
+  }
 };

 inline DxcDefine GetDefine(_In_ LPCWSTR name, LPCWSTR value) {
@ -94,13 +104,13 @@ void IFT_Data(HRESULT hr, _In_opt_ LPCWSTR data);
 void EnsureEnabled(DxcDllSupport &dxcSupport);
 void ReadFileIntoBlob(DxcDllSupport &dxcSupport, _In_ LPCWSTR pFileName,
                      _Outptr_ IDxcBlobEncoding **ppBlobEncoding);
-void WriteBlobToConsole(_In_opt_ IDxcBlob *pBlob);
+void WriteBlobToConsole(_In_opt_ IDxcBlob *pBlob, DWORD streamType = STD_OUTPUT_HANDLE);
 void WriteBlobToFile(_In_opt_ IDxcBlob *pBlob, _In_ LPCWSTR pFileName);
 void WriteBlobToHandle(_In_opt_ IDxcBlob *pBlob, HANDLE hFile, _In_opt_ LPCWSTR pFileName);
 void WriteUtf8ToConsole(_In_opt_count_(charCount) const char *pText,
-                        int charCount);
+                        int charCount, DWORD streamType = STD_OUTPUT_HANDLE);
 void WriteUtf8ToConsoleSizeT(_In_opt_count_(charCount) const char *pText,
-                             size_t charCount);
+                             size_t charCount, DWORD streamType = STD_OUTPUT_HANDLE);
 void WriteOperationErrorsToConsole(_In_ IDxcOperationResult *pResult,
                                   bool outputWarnings);
 void WriteOperationResultToConsole(_In_ IDxcOperationResult *pRewriteResult,
--- a/include/dxc/dxcapi.h
+++ b/include/dxc/dxcapi.h
@ -163,7 +163,8 @@ IDxcCompiler : public IUnknown {

 static const UINT32 DxcValidatorFlags_Default = 0;
 static const UINT32 DxcValidatorFlags_InPlaceEdit = 1;  // Validator is allowed to update shader blob in-place.
-static const UINT32 DxcValidatorFlags_ValidMask = 0x1;
+static const UINT32 DxcValidatorFlags_RootSignatureOnly = 2;
+static const UINT32 DxcValidatorFlags_ValidMask = 0x3;

 struct __declspec(uuid("A6E82BD2-1FD7-4826-9811-2857E797F49A"))
 IDxcValidator : public IUnknown {
@ -175,6 +176,14 @@ IDxcValidator : public IUnknown {
    ) = 0;
 };

+struct __declspec(uuid("334b1f50-2292-4b35-99a1-25588d8c17fe"))
+IDxcContainerBuilder : public IUnknown {
+  virtual HRESULT STDMETHODCALLTYPE Load(_In_ IDxcBlob *pDxilContainerHeader) = 0;                // Loads an existing DxilContainer into the builder
+  virtual HRESULT STDMETHODCALLTYPE AddPart(_In_ UINT32 fourCC, _In_ IDxcBlob *pSource) = 0;      // Adds pSource to the container as the part identified by fourCC
+  virtual HRESULT STDMETHODCALLTYPE RemovePart(_In_ UINT32 fourCC) = 0;                           // Removes the part identified by fourCC
+  virtual HRESULT STDMETHODCALLTYPE SerializeContainer(_Out_ IDxcOperationResult **ppResult) = 0; // Builds a container from the builder's current state; the outcome is returned through ppResult
+};
+
 struct __declspec(uuid("091f7a26-1c1f-4948-904b-e6e3a8a771d5"))
 IDxcAssembler : public IUnknown {
  // Assemble dxil in ll or llvm bitcode to DXIL container.
@ -278,4 +287,11 @@ __declspec(selectany) extern const GUID CLSID_DxcOptimizer = {
    {0x9b, 0x6b, 0xb1, 0x24, 0xe7, 0xa5, 0x20, 0x4c}
 };

+// {94134294-411f-4574-b4d0-8741e25240d2}
+__declspec(selectany) extern const GUID CLSID_DxcContainerBuilder = {
+  0x94134294,
+  0x411f,
+  0x4574,  
+  { 0xb4, 0xd0, 0x87, 0x41, 0xe2, 0x52, 0x40, 0xd2 }
+};
 #endif
--- a/include/llvm/Analysis/CFGPrinter.h
+++ b/include/llvm/Analysis/CFGPrinter.h
@ -122,9 +122,11 @@ struct DOTGraphTraits<const Function*> : public DefaultDOTGraphTraits {
 } // End llvm namespace

 namespace llvm {
+  class PassRegistry; // HLSL Change
  class FunctionPass;
  FunctionPass *createCFGPrinterPass ();
  FunctionPass *createCFGOnlyPrinterPass ();
+  void initializeCFGPrinterPasses(PassRegistry &Registry); // HLSL Change
 } // End llvm namespace

 #endif
--- a/lib/Analysis/CFGPrinter.cpp
+++ b/lib/Analysis/CFGPrinter.cpp
@ -26,7 +26,7 @@ namespace {
  struct CFGViewer : public FunctionPass {
    static char ID; // Pass identifcation, replacement for typeid
    CFGViewer() : FunctionPass(ID) {
-      initializeCFGOnlyViewerPass(*PassRegistry::getPassRegistry());
+      // initializeCFGOnlyViewerPass(*PassRegistry::getPassRegistry()); // HLSL Change - initialize up front
    }

    bool runOnFunction(Function &F) override {
@ -56,7 +56,7 @@ namespace {
  struct CFGOnlyViewer : public FunctionPass {
    static char ID; // Pass identifcation, replacement for typeid
    CFGOnlyViewer() : FunctionPass(ID) {
-      initializeCFGOnlyViewerPass(*PassRegistry::getPassRegistry());
+      // initializeCFGOnlyViewerPass(*PassRegistry::getPassRegistry()); // HLSL Change - initialize up front
    }

    bool runOnFunction(Function &F) override {
@ -87,7 +87,7 @@ namespace {
  struct CFGPrinter : public FunctionPass {
    static char ID; // Pass identification, replacement for typeid
    CFGPrinter() : FunctionPass(ID) {
-      initializeCFGPrinterPass(*PassRegistry::getPassRegistry());
+      // initializeCFGPrinterPass(*PassRegistry::getPassRegistry()); // HLSL Change - initialize up front
    }

    bool runOnFunction(Function &F) override {
@ -129,7 +129,7 @@ namespace {
  struct CFGOnlyPrinter : public FunctionPass {
    static char ID; // Pass identification, replacement for typeid
    CFGOnlyPrinter() : FunctionPass(ID) {
-      initializeCFGOnlyPrinterPass(*PassRegistry::getPassRegistry());
+      // initializeCFGOnlyPrinterPass(*PassRegistry::getPassRegistry()); // HLSL Change - initialize up front
    }

    bool runOnFunction(Function &F) override {
@ -192,3 +192,11 @@ FunctionPass *llvm::createCFGOnlyPrinterPass () {
  return new CFGOnlyPrinter();
 }

+// HLSL Change Starts
+void llvm::initializeCFGPrinterPasses(PassRegistry &Registry) { // Runs the initialize routine of all four CFG printer/viewer passes against Registry.
+  initializeCFGPrinterPass(Registry);
+  initializeCFGOnlyPrinterPass(Registry);
+  initializeCFGViewerPass(Registry);
+  initializeCFGOnlyViewerPass(Registry);
+} // Needed because the pass constructors in this file no longer perform this initialization themselves.
+// HLSL Change Ends
--- a/lib/DxcSupport/HLSLOptions.cpp
+++ b/lib/DxcSupport/HLSLOptions.cpp
@ -113,6 +113,11 @@ void DxcDefines::BuildDefines() {
  }
 }

+bool DxcOpts::IsRootSignatureProfile() { // Reports whether TargetProfile names a root signature profile.
+  return TargetProfile == "rootsig_1_0" || // Exact string match; only these two profile names are recognized.
+      TargetProfile == "rootsig_1_1";
+}
+
 MainArgs::MainArgs(int argc, const wchar_t **argv, int skipArgCount) {
  if (argc > skipArgCount) {
    Utf8StringVector.reserve(argc - skipArgCount);
@ -208,7 +213,8 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,
  DXASSERT(opts.ExternalLib.empty() == opts.ExternalFn.empty(),
           "else flow above is incorrect");

-  opts.OutputWarnings = Args.hasFlag(OPT_no_warnings, OPT_INVALID, true);
+  // when no-warnings option is present, do not output warnings.
+  opts.OutputWarnings = Args.hasFlag(OPT_INVALID, OPT_no_warnings, true);
  opts.EntryPoint = Args.getLastArgValue(OPT_entrypoint);
  // Entry point is required in arguments only for drivers; APIs take this through an argument.
  // The value should default to 'main', but we let the caller apply this policy.
@ -236,7 +242,7 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,
  // OutputLibrary not supported (Fl)
  opts.AssemblyCode = Args.getLastArgValue(OPT_Fc);
  opts.DebugFile = Args.getLastArgValue(OPT_Fd);
-  opts.ExtractRootSignatureFile = Args.getLastArgValue(OPT_extractrootsignature);
+  opts.ExtractPrivateFile = Args.getLastArgValue(OPT_getprivate);
  opts.OutputObject = Args.getLastArgValue(OPT_Fo);
  opts.OutputHeader = Args.getLastArgValue(OPT_Fh);
  opts.OutputWarningsFile = Args.getLastArgValue(OPT_Fe);
@ -251,6 +257,10 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,
  opts.VariableName = Args.getLastArgValue(OPT_Vn);
  opts.InputFile = Args.getLastArgValue(OPT_INPUT);
  opts.ForceRootSigVer = Args.getLastArgValue(OPT_force_rootsig_ver);
+  opts.PrivateSource = Args.getLastArgValue(OPT_setprivate);
+  opts.RootSignatureSource = Args.getLastArgValue(OPT_setrootsignature);
+  opts.VerifyRootSignatureSource = Args.getLastArgValue(OPT_verifyrootsignature);
+  opts.RootSignatureDefine = Args.getLastArgValue(OPT_rootsig_define);

  if (!opts.ForceRootSigVer.empty() && opts.ForceRootSigVer != "rootsig_1_0" &&
      opts.ForceRootSigVer != "rootsig_1_1") {
@ -284,13 +294,24 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,
  opts.DefaultRowMajor = Args.hasFlag(OPT_Zpr, OPT_INVALID, false);
  opts.DefaultColMajor = Args.hasFlag(OPT_Zpc, OPT_INVALID, false);
  opts.DumpBin = Args.hasFlag(OPT_dumpbin, OPT_INVALID, false);
-  opts.EnableUnboundedDescriptorTables = Args.hasFlag(OPT_enable_unbounded_descriptor_tables, OPT_INVALID, false);
  opts.NotUseLegacyCBufLoad = Args.hasFlag(OPT_not_use_legacy_cbuf_load, OPT_INVALID, false);
+  opts.PackPrefixStable = Args.hasFlag(OPT_pack_prefix_stable, OPT_INVALID, false);
+  opts.PackOptimized = Args.hasFlag(OPT_pack_optimized, OPT_INVALID, false);
  opts.DisplayIncludeProcess = Args.hasFlag(OPT_H, OPT_INVALID, false);
  opts.WarningAsError = Args.hasFlag(OPT__SLASH_WX, OPT_INVALID, false);
  opts.AvoidFlowControl = Args.hasFlag(OPT_Gfa, OPT_INVALID, false);
  opts.PreferFlowControl = Args.hasFlag(OPT_Gfp, OPT_INVALID, false);
  opts.RecompileFromBinary = Args.hasFlag(OPT_recompile, OPT_INVALID, false);
+  opts.StripDebug = Args.hasFlag(OPT_Qstrip_debug, OPT_INVALID, false);
+  opts.StripRootSignature = Args.hasFlag(OPT_Qstrip_rootsignature, OPT_INVALID, false);
+  opts.StripPrivate = Args.hasFlag(OPT_Qstrip_priv, OPT_INVALID, false);
+  opts.StripReflection = Args.hasFlag(OPT_Qstrip_reflect, OPT_INVALID, false);
+  opts.ExtractRootSignature = Args.hasFlag(OPT_extractrootsignature, OPT_INVALID, false);
+  opts.DisassembleColorCoded = Args.hasFlag(OPT_Cc, OPT_INVALID, false);
+  opts.DisassembleInstNumbers = Args.hasFlag(OPT_Ni, OPT_INVALID, false);
+  opts.DisassembleByteOffset = Args.hasFlag(OPT_No, OPT_INVALID, false);
+  opts.DisaseembleHex = Args.hasFlag(OPT_Lx, OPT_INVALID, false);
+
  if (opts.DefaultColMajor && opts.DefaultRowMajor) {
    errors << "Cannot specify /Zpr and /Zpc together, use /? to get usage information";
    return 1;
@ -299,6 +320,10 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,
    errors << "Cannot specify /Gfa and /Gfp together, use /? to get usage information";
    return 1;
  }
+  if (opts.PackPrefixStable && opts.PackOptimized) {
+    errors << "Cannot specify /pack_prefix_stable and /pack_optimized together, use /? to get usage information";
+    return 1;
+  }
  // TODO: more fxc option check.
  // ERR_RES_MAY_ALIAS_ONLY_IN_CS_5
  // ERR_NOT_ABLE_TO_FLATTEN on if that contain side effects
@ -311,7 +336,7 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,

  if ((flagsToInclude & hlsl::options::DriverOption) && opts.InputFile.empty()) {
    // Input file is required in arguments only for drivers; APIs take this through an argument.
-    errors << "Required input file argument is missing.";
+    errors << "Required input file argument is missing. use -help to get more information.";
    return 1;
  }
  if (opts.OutputHeader.empty() && !opts.VariableName.empty()) {
@ -321,7 +346,7 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,

  if (!opts.Preprocess.empty() &&
      (!opts.OutputHeader.empty() || !opts.OutputObject.empty() ||
-       opts.OutputWarnings || !opts.OutputWarningsFile.empty())) {
+       !opts.OutputWarnings || !opts.OutputWarningsFile.empty())) {
    errors << "Preprocess cannot be specified with other options.";
    return 1;
  }
@ -334,7 +359,7 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,
    if (opts.AllResourcesBound || opts.AvoidFlowControl ||
        opts.CodeGenHighLevel || opts.DebugInfo || opts.DefaultColMajor ||
        opts.DefaultRowMajor || opts.Defines.size() != 0 ||
-        opts.DisableOptimizations || opts.EnableUnboundedDescriptorTables ||
+        opts.DisableOptimizations || 
        !opts.EntryPoint.empty() || !opts.ForceRootSigVer.empty() ||
        opts.PreferFlowControl || !opts.TargetProfile.empty()) {
      errors << "Cannot specify compilation options when reading a binary file.";
--- a/lib/DxcSupport/dxcapi.use.cpp
+++ b/lib/DxcSupport/dxcapi.use.cpp
@ -76,7 +76,7 @@ void WriteOperationErrorsToConsole(_In_ IDxcOperationResult *pResult,
    CComPtr<IDxcBlobEncoding> pErrors;
    IFT(pResult->GetErrorBuffer(&pErrors));
    if (pErrors.p != nullptr) {
-      WriteBlobToConsole(pErrors);
+      WriteBlobToConsole(pErrors, STD_ERROR_HANDLE);
    }
  }
 }
@ -87,16 +87,16 @@ void WriteOperationResultToConsole(_In_ IDxcOperationResult *pRewriteResult,

  CComPtr<IDxcBlob> pBlob;
  IFT(pRewriteResult->GetResult(&pBlob));
-  WriteBlobToConsole(pBlob);
+  WriteBlobToConsole(pBlob, STD_OUTPUT_HANDLE);
 }

-void WriteBlobToConsole(_In_opt_ IDxcBlob *pBlob) {
+void WriteBlobToConsole(_In_opt_ IDxcBlob *pBlob, DWORD streamType) { // Writes the blob's bytes as text to the stream selected by streamType.
  if (pBlob == nullptr) { // A null blob is a silent no-op.
    return;
  }

  // Assume UTF-8 for now, which is typically the case for dxcompiler output.
-  WriteUtf8ToConsoleSizeT((char *)pBlob->GetBufferPointer(), pBlob->GetBufferSize());
+  WriteUtf8ToConsoleSizeT((char *)pBlob->GetBufferPointer(), pBlob->GetBufferSize(), streamType); // The buffer size is passed as the character count.
 }

 void WriteBlobToFile(_In_opt_ IDxcBlob *pBlob, _In_ LPCWSTR pFileName) {
@ -125,7 +125,7 @@ void WriteBlobToHandle(_In_opt_ IDxcBlob *pBlob, _In_ HANDLE hFile, _In_opt_ LPC
 }

 void WriteUtf8ToConsole(_In_opt_count_(charCount) const char *pText,
-                        int charCount) {
+                        int charCount, DWORD streamType) {
  if (charCount == 0 || pText == nullptr) {
    return;
  }
@ -139,19 +139,28 @@ void WriteUtf8ToConsole(_In_opt_count_(charCount) const char *pText,

  std::string consoleMessage;
  Unicode::UTF16ToConsoleString(utf16Message, &consoleMessage, &lossy);
-  printf("%s\n", consoleMessage.c_str());
+  if (streamType == STD_OUTPUT_HANDLE) { // Route the converted message to the requested standard stream.
+    fprintf(stdout, "%s\n", consoleMessage.c_str());
+  }
+  else if (streamType == STD_ERROR_HANDLE) {
+    fprintf(stderr, "%s\n", consoleMessage.c_str());
+  }
+  else {
+    delete[] utf16Message; // Free the buffer here: the throw below skips the delete[] that follows this chain.
+    throw hlsl::Exception(E_INVALIDARG); // Only the stdout and stderr stream types are supported.
+  }
  delete[] utf16Message;
 }

 void WriteUtf8ToConsoleSizeT(_In_opt_count_(charCount) const char *pText,
-  size_t charCount) {
+  size_t charCount, DWORD streamType) { // size_t-count wrapper around WriteUtf8ToConsole.
  if (charCount == 0) { // Nothing to write.
    return;
  }

  int charCountInt;
  IFT(SizeTToInt(charCount, &charCountInt)); // Narrows the count to int; IFT presumably throws when the conversion fails - confirm.
-  WriteUtf8ToConsole(pText, charCountInt);
+  WriteUtf8ToConsole(pText, charCountInt, streamType); // Forwards the text, the narrowed count and the stream selector.
 }

 } // namespace dxc
--- a/lib/HLSL/CMakeLists.txt
+++ b/lib/HLSL/CMakeLists.txt
@ -9,6 +9,7 @@ add_llvm_library(LLVMHLSL
  DxilContainerReflection.cpp
  DxilGenerationPass.cpp
  DxilInterpolationMode.cpp
+  DxilLegalizeSampleOffsetPass.cpp
  DxilMetadataHelper.cpp
  DxilModule.cpp
  DXILOperations.cpp
--- a/lib/HLSL/DxcOptimizer.cpp
+++ b/lib/HLSL/DxcOptimizer.cpp
@ -31,6 +31,7 @@
 #include "llvm/IR/IRPrintingPasses.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Verifier.h"
+#include "llvm/Analysis/CFGPrinter.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"

 #include <algorithm>
@ -75,6 +76,7 @@ HRESULT SetupRegistryPassForHLSL() {
    initializeDxilCondenseResourcesPass(Registry);
    initializeDxilEmitMetadataPass(Registry);
    initializeDxilGenerationPassPass(Registry);
+    initializeDxilLegalizeSampleOffsetPassPass(Registry);
    initializeDxilPrecisePropagatePassPass(Registry);
    initializeDynamicIndexingVectorToArrayPass(Registry);
    initializeEarlyCSELegacyPassPass(Registry);
@ -109,6 +111,7 @@ HRESULT SetupRegistryPassForHLSL() {
    initializePromotePassPass(Registry);
    initializePruneEHPass(Registry);
    initializeReassociatePass(Registry);
+    initializeReducibilityAnalysisPass(Registry);
    initializeRegToMemHlslPass(Registry);
    initializeRewriteSymbolsPass(Registry);
    initializeSCCPPass(Registry);
@ -129,6 +132,8 @@ HRESULT SetupRegistryPassForHLSL() {
    initializeTypeBasedAliasAnalysisPass(Registry);
    initializeVerifierLegacyPassPass(Registry);
    // INIT-PASSES:END
+    // Not schematized - exclusively for compiler authors.
+    initializeCFGPrinterPasses(Registry);
  }
  CATCH_CPP_RETURN_HRESULT();
  return S_OK;
--- a/lib/HLSL/DxilCompType.cpp
+++ b/lib/HLSL/DxilCompType.cpp
@ -281,7 +281,7 @@ const char *CompType::GetName() const {

 static const char *s_TypeKindHLSLNames[(unsigned)CompType::Kind::LastEntry] = {
  "unknown",
-  "bool", "min16i", "min16i", "int", "uint", "int64_t", "uint64_t",
+  "bool", "min16i", "min16ui", "int", "uint", "int64_t", "uint64_t",
  "min16f", "float", "double",
  "snorm_min16f", "unorm_min16f", "snorm_float", "unorm_float", "snorm_double", "unorm_double",
 };
--- a/lib/HLSL/DxilCondenseResources.cpp
+++ b/lib/HLSL/DxilCondenseResources.cpp
@ -66,7 +66,8 @@ public:
    DxilModule &DM = M.GetOrCreateDxilModule();

    // Switch tbuffers to SRVs, as they have been treated as cbuffers up to this point.
-    PatchTBuffers(DM);
+    if (DM.GetCBuffers().size())
+      PatchTBuffers(DM);

    // Remove unused resource.
    DM.RemoveUnusedResources();
--- a/lib/HLSL/DxilContainerAssembler.cpp
+++ b/lib/HLSL/DxilContainerAssembler.cpp
@ -132,7 +132,7 @@ struct sort_sig {
  }
 };

-class DxilProgramSignatureWriter {
+class DxilProgramSignatureWriter : public DxilPartWriter {
 private:
  const DxilSignature &m_signature;
  DXIL::TessellatorDomain m_domain;
@ -174,6 +174,7 @@ private:
      eltRows = pElement->GetRows() / eltCount;

    DxilProgramSignatureElement sig;
+    memset(&sig, 0, sizeof(DxilProgramSignatureElement));
    sig.Stream = pElement->GetOutputStream();
    sig.SemanticName = GetSemanticOffset(pElement);
    sig.SystemValue = KindToSystemValue(pElement->GetKind(), m_domain);
@ -229,11 +230,11 @@ public:
    calcSizes();
  }

-  uint32_t size() const {
+  __override uint32_t size() const {
    return m_lastOffset;
  }

-  void write(AbstractMemoryStream *pStream) {
+  __override void write(AbstractMemoryStream *pStream) {
    UINT64 startPos = pStream->GetPosition();
    const std::vector<std::unique_ptr<hlsl::DxilSignatureElement>> &elements = m_signature.GetElements();

@ -274,7 +275,22 @@ public:
  }
 };

-class DxilProgramRootSignatureWriter {
+DxilPartWriter *hlsl::NewProgramSignatureWriter(const DxilModule &M, DXIL::SignatureKind Kind) {
+  // Each kind selects its signature and input flag; other kinds yield nullptr.
+  const auto domain = M.GetTessellatorDomain();
+  switch (Kind) {
+  case DXIL::SignatureKind::Input:
+    return new DxilProgramSignatureWriter(M.GetInputSignature(), domain, true);
+  case DXIL::SignatureKind::Output:
+    return new DxilProgramSignatureWriter(M.GetOutputSignature(), domain, false);
+  case DXIL::SignatureKind::PatchConstant:
+    return new DxilProgramSignatureWriter(M.GetPatchConstantSignature(), domain,
+                                          /*IsInput*/ M.GetShaderModel()->IsDS());
+  }
+  return nullptr;
+}
+
+class DxilProgramRootSignatureWriter : public DxilPartWriter {
 private:
  const RootSignatureHandle &m_Sig;
 public:
@ -288,7 +304,11 @@ public:
  }
 };

-class DxilFeatureInfoWriter {
+DxilPartWriter *hlsl::NewRootSignatureWriter(const RootSignatureHandle &S) {
+  return new DxilProgramRootSignatureWriter(S); // allocated with new; not freed here
+}
+
+class DxilFeatureInfoWriter : public DxilPartWriter  {
 private:
  // Only save the shader properties after create class for it.
  DxilShaderFeatureInfo featureInfo;
@ -296,24 +316,28 @@ public:
  DxilFeatureInfoWriter(const DxilModule &M) {
    featureInfo.FeatureFlags = M.m_ShaderFlags.GetFeatureInfo();
  }
-  uint32_t size() const {
+  __override uint32_t size() const {
    return sizeof(DxilShaderFeatureInfo);
  }
-  void write(AbstractMemoryStream *pStream) {
+  __override void write(AbstractMemoryStream *pStream) {
    IFT(WriteStreamValue(pStream, featureInfo.FeatureFlags));
  }
 };

-class DxilPSVWriter {
+DxilPartWriter *hlsl::NewFeatureInfoWriter(const DxilModule &M) {
+  return new DxilFeatureInfoWriter(M); // allocated with new; not freed here
+}
+
+class DxilPSVWriter : public DxilPartWriter  {
 private:
-  DxilModule &m_Module;
+  const DxilModule &m_Module;
  UINT m_uTotalResources;
  DxilPipelineStateValidation m_PSV;
  uint32_t m_PSVBufferSize;
  SmallVector<char, 512> m_PSVBuffer;

 public:
-  DxilPSVWriter(DxilModule &module) : m_Module(module) {
+  DxilPSVWriter(const DxilModule &module) : m_Module(module) {
    UINT uCBuffers = m_Module.GetCBuffers().size();
    UINT uSamplers = m_Module.GetSamplers().size();
    UINT uSRVs = m_Module.GetSRVs().size();
@ -321,11 +345,11 @@ public:
    m_uTotalResources = uCBuffers + uSamplers + uSRVs + uUAVs;
    m_PSV.InitNew(m_uTotalResources, nullptr, &m_PSVBufferSize);
  }
-  size_t size() {
+  __override uint32_t size() const {
    return m_PSVBufferSize;
  }

-  void write(AbstractMemoryStream *pStream) {
+  __override void write(AbstractMemoryStream *pStream) {
    m_PSVBuffer.resize(m_PSVBufferSize);
    m_PSV.InitNew(m_uTotalResources, m_PSVBuffer.data(), &m_PSVBufferSize);
    DXASSERT_NOMSG(m_PSVBuffer.size() == m_PSVBufferSize);
@ -339,7 +363,7 @@ public:
    switch (SM->GetKind()) {
      case ShaderModel::Kind::Vertex: {
        pInfo->VS.OutputPositionPresent = 0;
-        DxilSignature &S = m_Module.GetOutputSignature();
+        const DxilSignature &S = m_Module.GetOutputSignature();
        for (auto &&E : S.GetElements()) {
          if (E->GetKind() == Semantic::Kind::Position) {
            // Ideally, we might check never writes mask here,
@ -360,7 +384,7 @@ public:
      case ShaderModel::Kind::Domain: {
        pInfo->DS.InputControlPointCount = (UINT)m_Module.GetInputControlPointCount();
        pInfo->DS.OutputPositionPresent = 0;
-        DxilSignature &S = m_Module.GetOutputSignature();
+        const DxilSignature &S = m_Module.GetOutputSignature();
        for (auto &&E : S.GetElements()) {
          if (E->GetKind() == Semantic::Kind::Position) {
            // Ideally, we might check never writes mask here,
@ -382,7 +406,7 @@ public:
          pInfo->GS.OutputStreamMask = 1; // This is what runtime expects.
        }
        pInfo->GS.OutputPositionPresent = 0;
-        DxilSignature &S = m_Module.GetOutputSignature();
+        const DxilSignature &S = m_Module.GetOutputSignature();
        for (auto &&E : S.GetElements()) {
          if (E->GetKind() == Semantic::Kind::Position) {
            // Ideally, we might check never writes mask here,
@ -397,7 +421,7 @@ public:
        pInfo->PS.DepthOutput = 0;
        pInfo->PS.SampleFrequency = 0;
        {
-          DxilSignature &S = m_Module.GetInputSignature();
+          const DxilSignature &S = m_Module.GetInputSignature();
          for (auto &&E : S.GetElements()) {
            if (E->GetInterpolationMode()->IsAnySample() ||
                E->GetKind() == Semantic::Kind::SampleIndex) {
@ -406,7 +430,7 @@ public:
          }
        }
        {
-          DxilSignature &S = m_Module.GetOutputSignature();
+          const DxilSignature &S = m_Module.GetOutputSignature();
          for (auto &&E : S.GetElements()) {
            if (E->IsAnyDepth()) {
              pInfo->PS.DepthOutput = 1;
@ -483,10 +507,11 @@ public:
  }
 };

-class DxilContainerWriter {
-public:
-  typedef std::function<void(AbstractMemoryStream*)> WriteFn;
+DxilPartWriter *hlsl::NewPSVWriter(const DxilModule &M) {
+  return new DxilPSVWriter(M); // allocated with new; not freed here
+}

+class DxilContainerWriter_impl : public DxilContainerWriter  {
 private:
  class DxilPart {
  public:
@ -501,24 +526,26 @@ private:
  llvm::SmallVector<DxilPart, 8> m_Parts;

 public:
-  void AddPart(uint32_t FourCC, uint32_t Size, WriteFn Write) {
+  __override void AddPart(uint32_t FourCC, uint32_t Size, WriteFn Write) {
    m_Parts.emplace_back(FourCC, Size, Write);
  }

-  void write(AbstractMemoryStream *pStream) {
+  // Sums the part payload sizes and converts them to a full container size.
+  __override uint32_t size() const {
+    uint32_t payloadBytes = 0;
+    for (const DxilPart &entry : m_Parts)
+      payloadBytes += entry.Header.PartSize;
+    return (uint32_t)GetDxilContainerSizeFromParts((uint32_t)m_Parts.size(), payloadBytes);
+  }
+
+  __override void write(AbstractMemoryStream *pStream) {
    DxilContainerHeader header;
    const uint32_t PartCount = (uint32_t)m_Parts.size();
-    const uint32_t OffsetTableSize = sizeof(uint32_t) * PartCount;
-    uint32_t containerSizeInBytes =
-      (uint32_t)sizeof(DxilContainerHeader) + OffsetTableSize +
-      (uint32_t)sizeof(DxilPartHeader) * PartCount;
-    for (auto &&part : m_Parts) {
-      containerSizeInBytes += part.Header.PartSize;
-    }
+    uint32_t containerSizeInBytes = size();
    InitDxilContainer(&header, PartCount, containerSizeInBytes);
    IFT(pStream->Reserve(header.ContainerSizeInBytes));
    IFT(WriteStreamValue(pStream, header));
-    uint32_t offset = sizeof(header) + OffsetTableSize;
+    uint32_t offset = sizeof(header) + (uint32_t)GetOffsetTableSize(PartCount);
    for (auto &&part : m_Parts) {
      IFT(WriteStreamValue(pStream, offset));
      offset += sizeof(DxilPartHeader) + part.Header.PartSize;
@ -533,6 +560,10 @@ public:
  }
 };

+DxilContainerWriter *hlsl::NewDxilContainerWriter() {
+  return new DxilContainerWriter_impl(); // allocated with new; not freed here
+}
+
 static bool HasDebugInfo(const Module &M) {
  for (Module::const_named_metadata_iterator NMI = M.named_metadata_begin(),
                                             NME = M.named_metadata_end();
@ -575,7 +606,7 @@ static void WriteProgramPart(const ShaderModel *pModel,
  }
 }

-void hlsl::SerializeDxilContainerForModule(Module *pModule,
+void hlsl::SerializeDxilContainerForModule(DxilModule *pModule,
                                           AbstractMemoryStream *pModuleBitcode,
                                           AbstractMemoryStream *pFinalStream) {
  // TODO: add a flag to update the module and remove information that is not part
@ -585,21 +616,17 @@ void hlsl::SerializeDxilContainerForModule(Module *pModule,
  DXASSERT_NOMSG(pModuleBitcode != nullptr);
  DXASSERT_NOMSG(pFinalStream != nullptr);

-  CComPtr<AbstractMemoryStream> pProgramStream;
-  DxilModule dxilModule(pModule);
-  dxilModule.LoadDxilMetadata();
-
-  DxilProgramSignatureWriter inputSigWriter(dxilModule.GetInputSignature(),
-                                            dxilModule.GetTessellatorDomain(),
+  DxilProgramSignatureWriter inputSigWriter(pModule->GetInputSignature(),
+                                            pModule->GetTessellatorDomain(),
                                            /*IsInput*/ true);
-  DxilProgramSignatureWriter outputSigWriter(dxilModule.GetOutputSignature(),
-                                             dxilModule.GetTessellatorDomain(),
+  DxilProgramSignatureWriter outputSigWriter(pModule->GetOutputSignature(),
+                                             pModule->GetTessellatorDomain(),
                                             /*IsInput*/ false);
-  DxilPSVWriter PSVWriter(dxilModule);
-  DxilContainerWriter writer;
+  DxilPSVWriter PSVWriter(*pModule);
+  DxilContainerWriter_impl writer;

  // Write the feature part.
-  DxilFeatureInfoWriter featureInfoWriter(dxilModule);
+  DxilFeatureInfoWriter featureInfoWriter(*pModule);
  writer.AddPart(DFCC_FeatureInfo, featureInfoWriter.size(), [&](AbstractMemoryStream *pStream) {
    featureInfoWriter.write(pStream);
  });
@ -613,10 +640,10 @@ void hlsl::SerializeDxilContainerForModule(Module *pModule,
  });

  DxilProgramSignatureWriter patchConstantSigWriter(
-      dxilModule.GetPatchConstantSignature(), dxilModule.GetTessellatorDomain(),
-      /*IsInput*/ dxilModule.GetShaderModel()->IsDS());
+      pModule->GetPatchConstantSignature(), pModule->GetTessellatorDomain(),
+      /*IsInput*/ pModule->GetShaderModel()->IsDS());

-  if (dxilModule.GetPatchConstantSignature().GetElements().size()) {
+  if (pModule->GetPatchConstantSignature().GetElements().size()) {
    writer.AddPart(DFCC_PatchConstantSignature, patchConstantSigWriter.size(),
                   [&](AbstractMemoryStream *pStream) {
                     patchConstantSigWriter.write(pStream);
@ -629,32 +656,40 @@ void hlsl::SerializeDxilContainerForModule(Module *pModule,
  });

  // Write the root signature (RTS0) part.
-  DxilProgramRootSignatureWriter rootSigWriter(dxilModule.GetRootSignature());
-  if (!dxilModule.GetRootSignature().IsEmpty()) {
+  DxilProgramRootSignatureWriter rootSigWriter(pModule->GetRootSignature());
+  CComPtr<AbstractMemoryStream> pInputProgramStream = pModuleBitcode;
+  if (!pModule->GetRootSignature().IsEmpty()) {
    writer.AddPart(
        DFCC_RootSignature, rootSigWriter.size(),
        [&](AbstractMemoryStream *pStream) { rootSigWriter.write(pStream); });
+    pModule->StripRootSignatureFromMetadata();
+    pInputProgramStream.Release();
+    CComPtr<IMalloc> pMalloc;
+    IFT(CoGetMalloc(1, &pMalloc));
+    IFT(CreateMemoryStream(pMalloc, &pInputProgramStream));
+    raw_stream_ostream outStream(pInputProgramStream.p);
+    WriteBitcodeToFile(pModule->GetModule(), outStream, true);
  }

  // If we have debug information present, serialize it to a debug part, then use the stripped version as the canonical program version.
-  pProgramStream = pModuleBitcode;
-  if (HasDebugInfo(*pModule)) {
+  CComPtr<AbstractMemoryStream> pProgramStream = pInputProgramStream;
+  if (HasDebugInfo(*pModule->GetModule())) {
    uint32_t debugInUInt32, debugPaddingBytes;
-    GetPaddedProgramPartSize(pModuleBitcode, debugInUInt32, debugPaddingBytes);
+    GetPaddedProgramPartSize(pInputProgramStream, debugInUInt32, debugPaddingBytes);
    writer.AddPart(DFCC_ShaderDebugInfoDXIL, debugInUInt32 * sizeof(uint32_t) + sizeof(DxilProgramHeader), [&](AbstractMemoryStream *pStream) {
-      WriteProgramPart(dxilModule.GetShaderModel(), pModuleBitcode, pStream);
+      WriteProgramPart(pModule->GetShaderModel(), pInputProgramStream, pStream);
    });

    pProgramStream.Release();

-    llvm::StripDebugInfo(*pModule);
-    dxilModule.StripDebugRelatedCode();
+    llvm::StripDebugInfo(*pModule->GetModule());
+    pModule->StripDebugRelatedCode();

    CComPtr<IMalloc> pMalloc;
    IFT(CoGetMalloc(1, &pMalloc));
    IFT(CreateMemoryStream(pMalloc, &pProgramStream));
    raw_stream_ostream outStream(pProgramStream.p);
-    WriteBitcodeToFile(pModule, outStream, true);
+    WriteBitcodeToFile(pModule->GetModule(), outStream, true);
  }

  // Compute padded bitcode size.
@ -663,8 +698,23 @@ void hlsl::SerializeDxilContainerForModule(Module *pModule,

  // Write the program part.
  writer.AddPart(DFCC_DXIL, programInUInt32 * sizeof(uint32_t) + sizeof(DxilProgramHeader), [&](AbstractMemoryStream *pStream) {
-    WriteProgramPart(dxilModule.GetShaderModel(), pProgramStream, pStream);
+    WriteProgramPart(pModule->GetShaderModel(), pProgramStream, pStream);
  });

  writer.write(pFinalStream);
 }
+
+void hlsl::SerializeDxilContainerForRootSignature(hlsl::RootSignatureHandle *pRootSigHandle,
+                                     AbstractMemoryStream *pFinalStream) {
+  DXASSERT_NOMSG(pRootSigHandle != nullptr);
+  DXASSERT_NOMSG(pFinalStream != nullptr);
+  DxilContainerWriter_impl writer;
+  // Add the root signature (RTS0) part; an empty handle adds no part at all.
+  DxilProgramRootSignatureWriter rootSigWriter(*pRootSigHandle);
+  if (!pRootSigHandle->IsEmpty()) {
+    writer.AddPart(
+        DFCC_RootSignature, rootSigWriter.size(),
+        [&](AbstractMemoryStream *pStream) { rootSigWriter.write(pStream); });
+  }
+  writer.write(pFinalStream); // emits the container (possibly with zero parts)
+}
--- a/lib/HLSL/DxilGenerationPass.cpp
+++ b/lib/HLSL/DxilGenerationPass.cpp
--- a/lib/HLSL/DxilLegalizeSampleOffsetPass.cpp
+++ b/lib/HLSL/DxilLegalizeSampleOffsetPass.cpp
@ -0,0 +1,211 @@
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// DxilLegalizeSampleOffsetPass.cpp                                          //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+// DxilLegalizeSampleOffsetPass implementation.                              //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+
+#include "dxc/HLSL/DxilGenerationPass.h"
+#include "dxc/HLSL/DxilModule.h"
+#include "dxc/HLSL/DxilOperations.h"
+
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+
+#include <unordered_set>
+
+using std::vector;
+using std::unique_ptr;
+using namespace llvm;
+using namespace hlsl;
+
+///////////////////////////////////////////////////////////////////////////////
+// Legalize Sample offset.
+
+namespace {
+// When optimizations are disabled, try to legalize sample offset.
+class DxilLegalizeSampleOffsetPass : public FunctionPass {
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  explicit DxilLegalizeSampleOffsetPass() : FunctionPass(ID) {}
+
+  const char *getPassName() const override {
+    return "DXIL legalize sample offset";
+  }
+
+  bool runOnFunction(Function &F) override {
+    DxilModule &DM = F.getParent()->GetOrCreateDxilModule();
+    hlsl::OP *hlslOP = DM.GetOP();
+
+    std::vector<Instruction *> illegalOffsets;
+
+    CollectIllegalOffsets(illegalOffsets, F, hlslOP);
+
+    if (illegalOffsets.empty())
+      return false; // nothing to legalize; F is left untouched
+
+    // Run mem2reg and, when an offset sits inside a loop, the unroll pipeline.
+    TryUnrollLoop(illegalOffsets, F);
+
+    // Collect the offsets again, now that the passes above have rewritten F.
+    std::vector<Instruction *> ssaIllegalOffsets;
+    CollectIllegalOffsets(ssaIllegalOffsets, F, hlslOP);
+
+    // Simplify each collected offset instruction.
+    LegalizeOffsets(ssaIllegalOffsets);
+
+    // Demote registers back to memory to remove PHINodes and keep code shape.
+    legacy::FunctionPassManager PM(F.getParent());
+    PM.add(createDemoteRegisterToMemoryHlslPass());
+    PM.run(F);
+
+    FinalCheck(illegalOffsets, F, hlslOP); // re-collects; emits an error if any remain
+
+    return true;
+  }
+
+private:
+  void TryUnrollLoop(std::vector<Instruction *> &illegalOffsets, Function &F);
+  void CollectIllegalOffsets(std::vector<Instruction *> &illegalOffsets,
+                             Function &F, hlsl::OP *hlslOP);
+  void CollectIllegalOffsets(std::vector<Instruction *> &illegalOffsets,
+                             Function &F, DXIL::OpCode opcode,
+                             hlsl::OP *hlslOP);
+  void LegalizeOffsets(const std::vector<Instruction *> &illegalOffsets);
+  void FinalCheck(std::vector<Instruction *> &illegalOffsets, Function &F,
+                  hlsl::OP *hlslOP);
+};
+
+char DxilLegalizeSampleOffsetPass::ID = 0;
+
+// Returns true when at least one of the collected offset instructions lives
+// in a basic block that belongs to a loop of F.
+bool HasIllegalOffsetInLoop(std::vector<Instruction *> &illegalOffsets,
+                            Function &F) {
+  // Loop info is computed locally from a freshly built dominator tree.
+  DominatorTreeAnalysis domAnalysis;
+  DominatorTree domTree = domAnalysis.run(F);
+  LoopInfo loops;
+  loops.Analyze(domTree);
+
+  for (Instruction *offsetInst : illegalOffsets) {
+    // The first offset found inside any loop settles the answer.
+    if (loops.getLoopFor(offsetInst->getParent()))
+      return true;
+  }
+
+  return false;
+}
+
+// Appends every offset operand of the sample call CI that is an Instruction
+// to illegalOffsets.
+void CollectIllegalOffset(CallInst *CI,
+                          std::vector<Instruction *> &illegalOffsets) {
+  const unsigned firstIdx = DXIL::OperandIndex::kTextureSampleOffset0OpIdx;
+  const unsigned lastIdx = DXIL::OperandIndex::kTextureSampleOffset2OpIdx;
+
+  // An undef first offset means the call carries no offset at all.
+  if (isa<UndefValue>(CI->getArgOperand(firstIdx)))
+    return;
+
+  for (unsigned opIdx = firstIdx; opIdx <= lastIdx; ++opIdx)
+    if (auto *offsetInst = dyn_cast<Instruction>(CI->getArgOperand(opIdx)))
+      illegalOffsets.emplace_back(offsetInst);
+}
+}
+
+void DxilLegalizeSampleOffsetPass::FinalCheck(
+    std::vector<Instruction *> &illegalOffsets, Function &F, hlsl::OP *hlslOP) {
+  // Re-scan F; anything still collected could not be legalized.
+  std::vector<Instruction *> remaining;
+  CollectIllegalOffsets(remaining, F, hlslOP);
+  if (remaining.empty())
+    return;
+  const StringRef kIllegalOffsetError =
+      "Offsets for Sample* must be immediated value. "
+      "Consider unroll the loop manually and use O3, it may help in some "
+      "cases\n";
+  // One line per offending offset, prefixed by its debug location if any.
+  std::string errorMsg;
+  raw_string_ostream errorStr(errorMsg);
+  for (Instruction *offsetInst : remaining) {
+    if (const DebugLoc &loc = offsetInst->getDebugLoc())
+      loc.print(errorStr);
+    errorStr << " " << kIllegalOffsetError;
+  }
+  errorStr.flush();
+  F.getContext().emitError(errorMsg);
+}
+
+void DxilLegalizeSampleOffsetPass::TryUnrollLoop(
+    std::vector<Instruction *> &illegalOffsets, Function &F) {
+  const bool offsetInLoop = HasIllegalOffsetInLoop(illegalOffsets, F);
+  legacy::FunctionPassManager passes(F.getParent());
+  // mem2reg always runs so the offsets can be simplified afterwards.
+  passes.add(createPromoteMemoryToRegisterPass());
+  if (offsetInLoop) {
+    passes.add(createCFGSimplificationPass());
+    passes.add(createLCSSAPass());
+    passes.add(createLoopSimplifyPass());
+    passes.add(createLoopRotatePass());
+    passes.add(createLoopUnrollPass(-2, -1, 0, 0));
+  }
+  passes.run(F);
+}
+
+void DxilLegalizeSampleOffsetPass::CollectIllegalOffsets(
+    std::vector<Instruction *> &illegalOffsets, Function &CurF,
+    hlsl::OP *hlslOP) {
+  // Sample-family opcodes scanned, in the order they are visited.
+  static const DXIL::OpCode kSampleOps[] = {
+      DXIL::OpCode::Sample,     DXIL::OpCode::SampleBias,
+      DXIL::OpCode::SampleCmp,  DXIL::OpCode::SampleCmpLevelZero,
+      DXIL::OpCode::SampleGrad, DXIL::OpCode::SampleLevel};
+
+  for (DXIL::OpCode sampleOp : kSampleOps)
+    CollectIllegalOffsets(illegalOffsets, CurF, sampleOp, hlslOP);
+}
+
+void DxilLegalizeSampleOffsetPass::CollectIllegalOffsets(
+    std::vector<Instruction *> &illegalOffsets, Function &CurF,
+    DXIL::OpCode opcode, hlsl::OP *hlslOP) {
+  // Walk every function hlslOP lists for this opcode; null entries are
+  // skipped.
+  for (Function *opFunc : hlslOP->GetOpFuncList(opcode)) {
+    if (opFunc == nullptr)
+      continue;
+
+    for (User *user : opFunc->users()) {
+      CallInst *call = cast<CallInst>(user);
+      // Only calls located inside CurF are of interest.
+      if (call->getParent()->getParent() == &CurF)
+        CollectIllegalOffset(call, illegalOffsets);
+    }
+  }
+}
+
+void DxilLegalizeSampleOffsetPass::LegalizeOffsets(
+    const std::vector<Instruction *> &illegalOffsets) {
+  for (Instruction *I : illegalOffsets) // NOTE(review): list is not de-duplicated
+    llvm::recursivelySimplifyInstruction(I); // presumably may erase I — confirm no stale reuse
+}
+
+FunctionPass *llvm::createDxilLegalizeSampleOffsetPass() {
+  return new DxilLegalizeSampleOffsetPass(); // allocated with new; not freed here
+}
+
+INITIALIZE_PASS(DxilLegalizeSampleOffsetPass, "dxil-legalize-sample-offset",
+                "DXIL legalize sample offset", false, false)
--- a/lib/HLSL/DxilMetadataHelper.cpp
+++ b/lib/HLSL/DxilMetadataHelper.cpp
@ -17,7 +17,6 @@
 #include "dxc/HLSL/DxilSignature.h"
 #include "dxc/HLSL/DxilTypeSystem.h"
 #include "dxc/HLSL/DxilRootSignature.h"
-#include "dxc/HLSL/DxilValidation.h"

 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
@ -27,6 +26,8 @@
 #include "llvm/Support/raw_ostream.h"
 #include <array>

+#include "dxc/Support/WinIncludes.h"
+
 using namespace llvm;
 using std::string;
 using std::vector;
@ -45,6 +46,9 @@ const char DxilMDHelper::kDxilControlFlowHintMDName[]                 = "dx.cont
 const char DxilMDHelper::kDxilPreciseAttributeMDName[]                = "dx.precise";
 const char DxilMDHelper::kDxilValidatorVersionMDName[]                = "dx.valver";

+// This named metadata is not valid in final module (should be moved to DxilContainer)
+const char DxilMDHelper::kDxilRootSignatureMDName[]                   = "dx.rootSignature";
+
 static std::array<const char *, 6> DxilMDNames = {
  DxilMDHelper::kDxilVersionMDName,
  DxilMDHelper::kDxilShaderModelMDName,
@ -131,10 +135,10 @@ void DxilMDHelper::LoadDxilShaderModel(const ShaderModel *&pSM) {
  ShaderModelName += "_" + std::to_string(Major) + "_" + std::to_string(Minor);
  pSM = ShaderModel::GetByName(ShaderModelName.c_str());
  if (!pSM->IsValid()) {
-    string ErrorMsg = hlsl::GetValidationRuleText(hlsl::ValidationRule::SmName);
-    size_t offset = ErrorMsg.find("%0");
-    if (offset != string::npos)
-      ErrorMsg.replace(offset, 2, ShaderModelName);
+    char ErrorMsgTxt[40];
+    StringCchPrintfA(ErrorMsgTxt, _countof(ErrorMsgTxt),
+                     "Unknown shader model '%s'", ShaderModelName.c_str());
+    string ErrorMsg(ErrorMsgTxt);
    throw hlsl::Exception(DXC_E_INCORRECT_DXIL_METADATA, ErrorMsg);
  }
 }
@ -223,16 +227,21 @@ MDTuple *DxilMDHelper::EmitDxilSignatures(const DxilSignature &InputSig,
  return pSignatureTupleMD;
 }

-llvm::Metadata *DxilMDHelper::EmitRootSignature(RootSignatureHandle &RootSig) {
+void DxilMDHelper::EmitRootSignature(RootSignatureHandle &RootSig) {
  if (RootSig.IsEmpty()) {
-    return nullptr;
+    return;
  }

  RootSig.EnsureSerializedAvailable();
  Constant *V = llvm::ConstantDataArray::get(
      m_Ctx, llvm::ArrayRef<uint8_t>(RootSig.GetSerializedBytes(),
                                     RootSig.GetSerializedSize()));
-  return ConstantAsMetadata::get(V);
+
+  NamedMDNode *pRootSignatureNamedMD = m_pModule->getNamedMetadata(kDxilRootSignatureMDName);
+  IFTBOOL(pRootSignatureNamedMD == nullptr, DXC_E_INCORRECT_DXIL_METADATA);
+  pRootSignatureNamedMD = m_pModule->getOrInsertNamedMetadata(kDxilRootSignatureMDName);
+  pRootSignatureNamedMD->addOperand(MDNode::get(m_Ctx, {ConstantAsMetadata::get(V)}));
+  return ;
 }

 void DxilMDHelper::LoadDxilSignatures(const MDOperand &MDO, 
@ -278,10 +287,17 @@ void DxilMDHelper::LoadSignatureMetadata(const MDOperand &MDO, DxilSignature &Si
  }
 }

-void DxilMDHelper::LoadRootSignature(const MDOperand &MDO, RootSignatureHandle &Sig) {
-  if (MDO.get() == nullptr)
+void DxilMDHelper::LoadRootSignature(RootSignatureHandle &Sig) {
+  NamedMDNode *pRootSignatureNamedMD = m_pModule->getNamedMetadata(kDxilRootSignatureMDName);
+  if(!pRootSignatureNamedMD)
    return;

+  IFTBOOL(pRootSignatureNamedMD->getNumOperands() == 1, DXC_E_INCORRECT_DXIL_METADATA);
+
+  MDNode *pNode = pRootSignatureNamedMD->getOperand(0);
+  IFTBOOL(pNode->getNumOperands() == 1, DXC_E_INCORRECT_DXIL_METADATA);
+  const MDOperand &MDO = pNode->getOperand(0);
+
  const ConstantAsMetadata *pMetaData = dyn_cast<ConstantAsMetadata>(MDO.get());
  IFTBOOL(pMetaData != nullptr, DXC_E_INCORRECT_DXIL_METADATA);
  const ConstantDataArray *pData =
@ -547,15 +563,7 @@ void DxilMDHelper::EmitDxilTypeSystem(DxilTypeSystem &TypeSystem, vector<GlobalV
    // Emit struct type field annotations.
    Metadata *pMD = EmitDxilStructAnnotation(*pA);

-    // Declare a global dummy variable.
-    string GVName = string(kDxilTypeSystemHelperVariablePrefix) + std::to_string(GVIdx);
-    GlobalVariable *pGV = new GlobalVariable(*m_pModule, pStructType, true, GlobalValue::ExternalLinkage, 
-                                             nullptr, GVName, nullptr,
-                                             GlobalVariable::NotThreadLocal, DXIL::kDeviceMemoryAddrSpace);
-    // Mark GV as being used for LLVM.
-    LLVMUsed.emplace_back(pGV);
-
-    MDVals.push_back(ValueAsMetadata::get(pGV));
+    MDVals.push_back(ValueAsMetadata::get(UndefValue::get(pStructType)));
    MDVals.push_back(pMD);
  }

@ -596,11 +604,11 @@ void DxilMDHelper::LoadDxilTypeSystemNode(const llvm::MDTuple &MDT,
    IFTBOOL((MDT.getNumOperands() & 0x1) == 1, DXC_E_INCORRECT_DXIL_METADATA);

    for (unsigned i = 1; i < MDT.getNumOperands(); i += 2) {
-      GlobalVariable *pGV =
-          dyn_cast<GlobalVariable>(ValueMDToValue(MDT.getOperand(i)));
+      Constant *pGV =
+          dyn_cast<Constant>(ValueMDToValue(MDT.getOperand(i)));
      IFTBOOL(pGV != nullptr, DXC_E_INCORRECT_DXIL_METADATA);
      StructType *pGVType =
-          dyn_cast<StructType>(pGV->getType()->getPointerElementType());
+          dyn_cast<StructType>(pGV->getType());
      IFTBOOL(pGVType != nullptr, DXC_E_INCORRECT_DXIL_METADATA);

      DxilStructAnnotation *pSA = TypeSystem.AddStructAnnotation(pGVType);
--- a/lib/HLSL/DxilModule.cpp
+++ b/lib/HLSL/DxilModule.cpp
@ -22,6 +22,8 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/Support/raw_ostream.h"
 #include <unordered_set>

@ -31,6 +33,21 @@ using std::vector;
 using std::unique_ptr;


+namespace {
+class DxilErrorDiagnosticInfo : public DiagnosticInfo {
+private:
+  const char *m_message; // pointer is stored as given; the text is not copied
+public:
+  DxilErrorDiagnosticInfo(const char *str)
+    : DiagnosticInfo(DK_FirstPluginKind, DiagnosticSeverity::DS_Error),
+    m_message(str) { }
+
+  __override void print(DiagnosticPrinter &DP) const {
+    DP << m_message; // streams the stored message text to the printer
+  }
+};
+} // anon namespace
+
 namespace hlsl {

 //------------------------------------------------------------------------------
@ -66,7 +83,7 @@ DxilModule::DxilModule(Module *pModule)

  m_NumThreads[0] = m_NumThreads[1] = m_NumThreads[2] = 0;

-#ifdef _DEBUG
+#if defined(_DEBUG) || defined(DBG)
  // Pin LLVM dump methods.
  void (__thiscall Module::*pfnModuleDump)() const = &Module::dump;
  void (__thiscall Type::*pfnTypeDump)() const = &Type::dump;
@ -338,6 +355,9 @@ void DxilModule::CollectShaderFlags(ShaderFlags &Flags) {
          case DXIL::OpCode::InnerCoverage:
            hasInnerCoverage = true;
            break;
+          default:
+            // Normal opcodes.
+            break;
          }
        }
      }
@ -421,6 +441,9 @@ void DxilModule::CollectShaderFlags(ShaderFlags &Flags) {
    case DXIL::ResourceKind::StructuredBuffer:
      hasRawAndStructuredBuffer = true;
      break;
+    default:
+      // Not raw/structured.
+      break;
    }
  }
  for (auto &SRV : m_SRVs) {
@ -429,6 +452,9 @@ void DxilModule::CollectShaderFlags(ShaderFlags &Flags) {
    case DXIL::ResourceKind::StructuredBuffer:
      hasRawAndStructuredBuffer = true;
      break;
+    default:
+      // Not raw/structured.
+      break;
    }
  }
  
@ -716,6 +742,13 @@ static void ConvertUsedResource(std::unordered_set<unsigned> &immResID,
  }
 }

+void DxilModule::RemoveFunction(llvm::Function *F) { // Drops this module's bookkeeping for F (annotation + hlsl::OP entry); F itself is not erased here.
+  DXASSERT_NOMSG(F != nullptr); // F must be non-null.
+  if (m_pTypeSystem.get()->GetFunctionAnnotation(F)) // Erase only when an annotation is actually registered for F.
+    m_pTypeSystem.get()->EraseFunctionAnnotation(F);
+  m_pOP->RemoveFunction(F); // NOTE(review): presumably evicts F from hlsl::OP's function cache - confirm in DxilOperations.
+}
+
 void DxilModule::RemoveUnusedResources() {
  hlsl::OP *hlslOP = GetOP();
  Function *createHandleFunc = hlslOP->GetOpFunc(DXIL::OpCode::CreateHandle, Type::getVoidTy(GetCtx()));
@ -808,6 +841,13 @@ const RootSignatureHandle &DxilModule::GetRootSignature() const {
  return *m_RootSignature;
 }

+void DxilModule::StripRootSignatureFromMetadata() { // Erases the named metadata node kDxilRootSignatureMDName from the LLVM module; m_RootSignature is not touched.
+  NamedMDNode *pRootSignatureNamedMD = GetModule()->getNamedMetadata(DxilMDHelper::kDxilRootSignatureMDName);
+  if (pRootSignatureNamedMD) { // No such node: nothing to strip.
+    GetModule()->eraseNamedMetadata(pRootSignatureNamedMD);
+  }
+}
+
 void DxilModule::ResetInputSignature(DxilSignature *pValue) {
  m_InputSignature.reset(pValue);
 }
@ -832,6 +872,10 @@ void DxilModule::ResetTypeSystem(DxilTypeSystem *pValue) {
  m_pTypeSystem.reset(pValue);
 }

+void DxilModule::ResetOP(hlsl::OP *hlslOP) { // Replaces the module's hlsl::OP helper with hlslOP.
+  m_pOP.reset(hlslOP); // NOTE(review): presumably m_pOP is an owning smart pointer, so ownership of hlslOP transfers here - confirm.
+}
+
 void DxilModule::EmitLLVMUsed() {
  if (m_LLVMUsed.empty())
    return;
@ -887,6 +931,10 @@ void DxilModule::EmitDxilMetadata() {
  vector<MDNode *> Entries;
  Entries.emplace_back(pEntry);
  m_pMDHelper->EmitDxilEntryPoints(Entries);
+
+  if (!m_RootSignature->IsEmpty()) {
+    m_pMDHelper->EmitRootSignature(*m_RootSignature.get());
+  }
 }

 bool DxilModule::IsKnownNamedMetaData(llvm::NamedMDNode &Node) {
@ -916,6 +964,8 @@ void DxilModule::LoadDxilMetadata() {
  LoadDxilResources(*pResources);
  LoadDxilShaderProperties(*pProperties);
  m_pMDHelper->LoadDxilTypeSystem(*m_pTypeSystem.get());
+
+  m_pMDHelper->LoadRootSignature(*m_RootSignature.get());
 }

 MDTuple *DxilModule::EmitDxilResources() {
@ -1062,11 +1112,6 @@ MDTuple *DxilModule::EmitDxilShaderProperties() {
    MDVals.emplace_back(pMDTuple);
  }

-  if (!m_RootSignature->IsEmpty()) {
-    MDVals.emplace_back(m_pMDHelper->Uint32ToConstMD(DxilMDHelper::kDxilRootSignatureTag));
-    MDVals.emplace_back(m_pMDHelper->EmitRootSignature(*m_RootSignature.get()));
-  }
-
  if (!MDVals.empty())
    return MDNode::get(m_Ctx, MDVals);
  else
@ -1120,10 +1165,6 @@ void DxilModule::LoadDxilShaderProperties(const MDOperand &MDO) {
                                   m_MaxTessellationFactor);
      break;

-    case DxilMDHelper::kDxilRootSignatureTag:
-      m_pMDHelper->LoadRootSignature(MDO, *m_RootSignature.get());
-      break;
-
    default:
      DXASSERT(false, "Unknown extended shader properties tag");
      break;
@ -1188,6 +1229,34 @@ DebugInfoFinder &DxilModule::GetOrCreateDebugInfoFinder() {
  }
  return *m_pDebugInfoFinder;
 }
+
+hlsl::DxilModule *hlsl::DxilModule::TryGetDxilModule(llvm::Module *pModule) { // Returns pModule's DxilModule, or nullptr after reporting an error diagnostic if loading throws.
+  LLVMContext &Ctx = pModule->getContext(); // pModule is dereferenced unconditionally; it must be non-null.
+  std::string diagStr;
+  raw_string_ostream diagStream(diagStr); // Message is accumulated into diagStr through this stream.
+
+  hlsl::DxilModule *pDxilModule = nullptr; // Stays nullptr on any failure path.
+  // TODO: report more detailed errors from DxilMDHelper.
+  try {
+    pDxilModule = &pModule->GetOrCreateDxilModule();
+  } catch (const ::hlsl::Exception &hlslException) { // Known failure type: build a descriptive message.
+    diagStream << "load dxil metadata failed -";
+    try { // Guarded because retrieving the message may itself throw.
+      const char *msg = hlslException.what();
+      if (msg == nullptr || *msg == '\0') // No text available: fall back to the numeric hr value.
+        diagStream << " error code " << hlslException.hr << "\n";
+      else
+        diagStream << msg; // NOTE(review): appended with no leading space or trailing newline, unlike the other branches - confirm intent.
+    } catch (...) {
+      diagStream << " unable to retrieve error message.\n";
+    }
+    Ctx.diagnose(DxilErrorDiagnosticInfo(diagStream.str().c_str())); // The diagnostic borrows diagStr's buffer, which lives until this function returns.
+  } catch (...) { // Any other exception type: report a fixed message.
+    Ctx.diagnose(DxilErrorDiagnosticInfo("load dxil metadata failed - unknown error.\n"));
+  }
+  return pDxilModule;
+}
+
 } // namespace hlsl

 namespace llvm {
--- a/lib/HLSL/DxilOperations.cpp
+++ b/lib/HLSL/DxilOperations.cpp
@ -63,6 +63,7 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
  {  OC::Atan,                    "Atan",                     OCC::Unary,                    "unary",                      false,  true,  true, false, false, false, false, false, false, Attribute::ReadNone, },
  {  OC::Hcos,                    "Hcos",                     OCC::Unary,                    "unary",                      false,  true,  true, false, false, false, false, false, false, Attribute::ReadNone, },
  {  OC::Hsin,                    "Hsin",                     OCC::Unary,                    "unary",                      false,  true,  true, false, false, false, false, false, false, Attribute::ReadNone, },
+  {  OC::Htan,                    "Htan",                     OCC::Unary,                    "unary",                      false,  true,  true, false, false, false, false, false, false, Attribute::ReadNone, },
  {  OC::Exp,                     "Exp",                      OCC::Unary,                    "unary",                      false,  true,  true, false, false, false, false, false, false, Attribute::ReadNone, },
  {  OC::Frc,                     "Frc",                      OCC::Unary,                    "unary",                      false,  true,  true, false, false, false, false, false, false, Attribute::ReadNone, },
  {  OC::Log,                     "Log",                      OCC::Unary,                    "unary",                      false,  true,  true, false, false, false, false, false, false, Attribute::ReadNone, },
@ -97,11 +98,9 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
  {  OC::UMul,                    "UMul",                     OCC::BinaryWithTwoOuts,        "binaryWithTwoOuts",          false, false, false, false, false, false, false,  true, false, Attribute::ReadNone, },
  {  OC::UDiv,                    "UDiv",                     OCC::BinaryWithTwoOuts,        "binaryWithTwoOuts",          false, false, false, false, false, false, false,  true, false, Attribute::ReadNone, },

-  // Binary int with carry                                                                                                  void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::IAddc,                   "IAddc",                    OCC::BinaryWithCarry,          "binaryWithCarry",            false, false, false, false, false, false, false,  true, false, Attribute::ReadNone, },
-  {  OC::UAddc,                   "UAddc",                    OCC::BinaryWithCarry,          "binaryWithCarry",            false, false, false, false, false, false, false,  true, false, Attribute::ReadNone, },
-  {  OC::ISubc,                   "ISubc",                    OCC::BinaryWithCarry,          "binaryWithCarry",            false, false, false, false, false, false, false,  true, false, Attribute::ReadNone, },
-  {  OC::USubc,                   "USubc",                    OCC::BinaryWithCarry,          "binaryWithCarry",            false, false, false, false, false, false, false,  true, false, Attribute::ReadNone, },
+  // Binary uint with carry or borrow                                                                                       void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
+  {  OC::UAddc,                   "UAddc",                    OCC::BinaryWithCarryOrBorrow,  "binaryWithCarryOrBorrow",    false, false, false, false, false, false, false,  true, false, Attribute::ReadNone, },
+  {  OC::USubb,                   "USubb",                    OCC::BinaryWithCarryOrBorrow,  "binaryWithCarryOrBorrow",    false, false, false, false, false, false, false,  true, false, Attribute::ReadNone, },

  // Tertiary float                                                                                                         void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
  {  OC::FMad,                    "FMad",                     OCC::Tertiary,                 "tertiary",                   false,  true,  true,  true, false, false, false, false, false, Attribute::ReadNone, },
@ -138,8 +137,8 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
  // Resources                                                                                                              void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
  {  OC::TextureLoad,             "TextureLoad",              OCC::TextureLoad,              "textureLoad",                false,  true,  true, false, false, false,  true,  true, false, Attribute::ReadOnly, },
  {  OC::TextureStore,            "TextureStore",             OCC::TextureStore,             "textureStore",               false,  true,  true, false, false, false,  true,  true, false, Attribute::None,     },
-  {  OC::BufferLoad,              "BufferLoad",               OCC::BufferLoad,               "bufferLoad",                 false,  true,  true, false, false, false,  true,  true,  true, Attribute::ReadOnly, },
-  {  OC::BufferStore,             "BufferStore",              OCC::BufferStore,              "bufferStore",                false,  true,  true, false, false, false,  true,  true,  true, Attribute::None,     },
+  {  OC::BufferLoad,              "BufferLoad",               OCC::BufferLoad,               "bufferLoad",                 false,  true,  true, false, false, false,  true,  true, false, Attribute::ReadOnly, },
+  {  OC::BufferStore,             "BufferStore",              OCC::BufferStore,              "bufferStore",                false,  true,  true, false, false, false,  true,  true, false, Attribute::None,     },
  {  OC::BufferUpdateCounter,     "BufferUpdateCounter",      OCC::BufferUpdateCounter,      "bufferUpdateCounter",         true, false, false, false, false, false, false, false, false, Attribute::None,     },
  {  OC::CheckAccessFullyMapped,  "CheckAccessFullyMapped",   OCC::CheckAccessFullyMapped,   "checkAccessFullyMapped",     false, false, false, false, false, false, false,  true, false, Attribute::ReadOnly, },
  {  OC::GetDimensions,           "GetDimensions",            OCC::GetDimensions,            "getDimensions",               true, false, false, false, false, false, false, false, false, Attribute::ReadOnly, },
@ -148,10 +147,6 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
  {  OC::TextureGather,           "TextureGather",            OCC::TextureGather,            "textureGather",              false, false,  true, false, false, false, false,  true, false, Attribute::ReadOnly, },
  {  OC::TextureGatherCmp,        "TextureGatherCmp",         OCC::TextureGatherCmp,         "textureGatherCmp",           false, false,  true, false, false, false, false,  true, false, Attribute::ReadOnly, },

-  //                                                                                                                        void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::ToDelete5,               "ToDelete5",                OCC::Reserved,                 "reserved",                    true, false, false, false, false, false, false, false, false, Attribute::None,     },
-  {  OC::ToDelete6,               "ToDelete6",                OCC::Reserved,                 "reserved",                    true, false, false, false, false, false, false, false, false, Attribute::None,     },
-
  // Resources - sample                                                                                                     void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
  {  OC::Texture2DMSGetSamplePosition, "Texture2DMSGetSamplePosition", OCC::Texture2DMSGetSamplePosition, "texture2DMSGetSamplePosition",   true, false, false, false, false, false, false, false, false, Attribute::ReadOnly, },
  {  OC::RenderTargetGetSamplePosition, "RenderTargetGetSamplePosition", OCC::RenderTargetGetSamplePosition, "renderTargetGetSamplePosition",   true, false, false, false, false, false, false, false, false, Attribute::ReadOnly, },
@ -172,6 +167,9 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
  {  OC::EvalSnapped,             "EvalSnapped",              OCC::EvalSnapped,              "evalSnapped",                false,  true,  true, false, false, false, false, false, false, Attribute::ReadNone, },
  {  OC::EvalSampleIndex,         "EvalSampleIndex",          OCC::EvalSampleIndex,          "evalSampleIndex",            false,  true,  true, false, false, false, false, false, false, Attribute::ReadNone, },
  {  OC::EvalCentroid,            "EvalCentroid",             OCC::EvalCentroid,             "evalCentroid",               false,  true,  true, false, false, false, false, false, false, Attribute::ReadNone, },
+  {  OC::SampleIndex,             "SampleIndex",              OCC::SampleIndex,              "sampleIndex",                false, false, false, false, false, false, false,  true, false, Attribute::ReadNone, },
+  {  OC::Coverage,                "Coverage",                 OCC::Coverage,                 "coverage",                   false, false, false, false, false, false, false,  true, false, Attribute::ReadNone, },
+  {  OC::InnerCoverage,           "InnerCoverage",            OCC::InnerCoverage,            "innerCoverage",              false, false, false, false, false, false, false,  true, false, Attribute::ReadNone, },

  // Compute shader                                                                                                         void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
  {  OC::ThreadId,                "ThreadId",                 OCC::ThreadId,                 "threadId",                   false, false, false, false, false, false, false,  true, false, Attribute::ReadNone, },
@ -183,21 +181,12 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
  {  OC::EmitStream,              "EmitStream",               OCC::EmitStream,               "emitStream",                  true, false, false, false, false, false, false, false, false, Attribute::None,     },
  {  OC::CutStream,               "CutStream",                OCC::CutStream,                "cutStream",                   true, false, false, false, false, false, false, false, false, Attribute::None,     },
  {  OC::EmitThenCutStream,       "EmitThenCutStream",        OCC::EmitThenCutStream,        "emitThenCutStream",           true, false, false, false, false, false, false, false, false, Attribute::None,     },
+  {  OC::GSInstanceID,            "GSInstanceID",             OCC::GSInstanceID,             "gsInstanceID",               false, false, false, false, false, false, false,  true, false, Attribute::ReadNone, },

  // Double precision                                                                                                       void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
  {  OC::MakeDouble,              "MakeDouble",               OCC::MakeDouble,               "makeDouble",                 false, false, false,  true, false, false, false, false, false, Attribute::ReadNone, },
-
-  //                                                                                                                        void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::ToDelete1,               "ToDelete1",                OCC::Reserved,                 "reserved",                    true, false, false, false, false, false, false, false, false, Attribute::None,     },
-  {  OC::ToDelete2,               "ToDelete2",                OCC::Reserved,                 "reserved",                    true, false, false, false, false, false, false, false, false, Attribute::None,     },
-
-  // Double precision                                                                                                       void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
  {  OC::SplitDouble,             "SplitDouble",              OCC::SplitDouble,              "splitDouble",                false, false, false,  true, false, false, false, false, false, Attribute::ReadNone, },

-  //                                                                                                                        void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::ToDelete3,               "ToDelete3",                OCC::Reserved,                 "reserved",                    true, false, false, false, false, false, false, false, false, Attribute::None,     },
-  {  OC::ToDelete4,               "ToDelete4",                OCC::Reserved,                 "reserved",                    true, false, false, false, false, false, false, false, false, Attribute::None,     },
-
  // Domain and hull shader                                                                                                 void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
  {  OC::LoadOutputControlPoint,  "LoadOutputControlPoint",   OCC::LoadOutputControlPoint,   "loadOutputControlPoint",     false,  true,  true, false, false, false,  true,  true, false, Attribute::ReadNone, },
  {  OC::LoadPatchConstant,       "LoadPatchConstant",        OCC::LoadPatchConstant,        "loadPatchConstant",          false,  true,  true, false, false, false,  true,  true, false, Attribute::ReadNone, },
@ -211,34 +200,23 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
  {  OC::PrimitiveID,             "PrimitiveID",              OCC::PrimitiveID,              "primitiveID",                false, false, false, false, false, false, false,  true, false, Attribute::ReadNone, },

  // Other                                                                                                                  void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::CycleCounterLegacy,      "CycleCounterLegacy",       OCC::CycleCounterLegacy,       "cycleCounterLegacy",          true, false, false, false, false, false, false, false, false, Attribute::ReadNone, },
-
-  // Unary float                                                                                                            void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::Htan,                    "Htan",                     OCC::Unary,                    "unary",                      false,  true,  true, false, false, false, false, false, false, Attribute::ReadNone, },
+  {  OC::CycleCounterLegacy,      "CycleCounterLegacy",       OCC::CycleCounterLegacy,       "cycleCounterLegacy",          true, false, false, false, false, false, false, false, false, Attribute::None,     },

  // Wave                                                                                                                   void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::WaveCaptureReserved,     "WaveCaptureReserved",      OCC::Reserved,                 "reserved",                    true, false, false, false, false, false, false, false, false, Attribute::None,     },
-  {  OC::WaveIsFirstLane,         "WaveIsFirstLane",          OCC::WaveIsFirstLane,          "waveIsFirstLane",             true, false, false, false, false, false, false, false, false, Attribute::ReadOnly, },
-  {  OC::WaveGetLaneIndex,        "WaveGetLaneIndex",         OCC::WaveGetLaneIndex,         "waveGetLaneIndex",            true, false, false, false, false, false, false, false, false, Attribute::ReadOnly, },
-  {  OC::WaveGetLaneCount,        "WaveGetLaneCount",         OCC::WaveGetLaneCount,         "waveGetLaneCount",            true, false, false, false, false, false, false, false, false, Attribute::ReadOnly, },
-  {  OC::WaveIsHelperLaneReserved, "WaveIsHelperLaneReserved", OCC::Reserved,                 "reserved",                    true, false, false, false, false, false, false, false, false, Attribute::None,     },
-  {  OC::WaveAnyTrue,             "WaveAnyTrue",              OCC::WaveAnyTrue,              "waveAnyTrue",                 true, false, false, false, false, false, false, false, false, Attribute::ReadOnly, },
-  {  OC::WaveAllTrue,             "WaveAllTrue",              OCC::WaveAllTrue,              "waveAllTrue",                 true, false, false, false, false, false, false, false, false, Attribute::ReadOnly, },
-  {  OC::WaveActiveAllEqual,      "WaveActiveAllEqual",       OCC::WaveActiveAllEqual,       "waveActiveAllEqual",         false,  true,  true,  true,  true,  true,  true,  true,  true, Attribute::ReadOnly, },
-  {  OC::WaveActiveBallot,        "WaveActiveBallot",         OCC::WaveActiveBallot,         "waveActiveBallot",            true, false, false, false, false, false, false, false, false, Attribute::ReadOnly, },
-  {  OC::WaveReadLaneAt,          "WaveReadLaneAt",           OCC::WaveReadLaneAt,           "waveReadLaneAt",             false,  true,  true,  true,  true,  true,  true,  true,  true, Attribute::ReadOnly, },
-  {  OC::WaveReadLaneFirst,       "WaveReadLaneFirst",        OCC::WaveReadLaneFirst,        "waveReadLaneFirst",          false,  true,  true, false,  true,  true,  true,  true,  true, Attribute::ReadOnly, },
-  {  OC::WaveActiveOp,            "WaveActiveOp",             OCC::WaveActiveOp,             "waveActiveOp",               false,  true,  true,  true,  true,  true,  true,  true,  true, Attribute::ReadOnly, },
-  {  OC::WaveActiveBit,           "WaveActiveBit",            OCC::WaveActiveBit,            "waveActiveBit",              false, false, false, false, false,  true,  true,  true,  true, Attribute::ReadOnly, },
-  {  OC::WavePrefixOp,            "WavePrefixOp",             OCC::WavePrefixOp,             "wavePrefixOp",               false,  true,  true,  true, false,  true,  true,  true,  true, Attribute::ReadOnly, },
-  {  OC::WaveGetOrderedIndex,     "WaveGetOrderedIndex",      OCC::Reserved,                 "reserved",                    true, false, false, false, false, false, false, false, false, Attribute::None,     },
-
-  //                                                                                                                        void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::GlobalOrderedCountIncReserved, "GlobalOrderedCountIncReserved", OCC::Reserved,                 "reserved",                    true, false, false, false, false, false, false, false, false, Attribute::None,     },
-
-  // Wave                                                                                                                   void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::QuadReadLaneAt,          "QuadReadLaneAt",           OCC::QuadReadLaneAt,           "quadReadLaneAt",             false,  true,  true,  true,  true,  true,  true,  true,  true, Attribute::ReadOnly, },
-  {  OC::QuadOp,                  "QuadOp",                   OCC::QuadOp,                   "quadOp",                     false,  true,  true,  true, false,  true,  true,  true,  true, Attribute::ReadOnly, },
+  {  OC::WaveIsFirstLane,         "WaveIsFirstLane",          OCC::WaveIsFirstLane,          "waveIsFirstLane",             true, false, false, false, false, false, false, false, false, Attribute::None,     },
+  {  OC::WaveGetLaneIndex,        "WaveGetLaneIndex",         OCC::WaveGetLaneIndex,         "waveGetLaneIndex",            true, false, false, false, false, false, false, false, false, Attribute::ReadNone, },
+  {  OC::WaveGetLaneCount,        "WaveGetLaneCount",         OCC::WaveGetLaneCount,         "waveGetLaneCount",            true, false, false, false, false, false, false, false, false, Attribute::ReadNone, },
+  {  OC::WaveAnyTrue,             "WaveAnyTrue",              OCC::WaveAnyTrue,              "waveAnyTrue",                 true, false, false, false, false, false, false, false, false, Attribute::None,     },
+  {  OC::WaveAllTrue,             "WaveAllTrue",              OCC::WaveAllTrue,              "waveAllTrue",                 true, false, false, false, false, false, false, false, false, Attribute::None,     },
+  {  OC::WaveActiveAllEqual,      "WaveActiveAllEqual",       OCC::WaveActiveAllEqual,       "waveActiveAllEqual",         false,  true,  true,  true,  true,  true,  true,  true,  true, Attribute::None,     },
+  {  OC::WaveActiveBallot,        "WaveActiveBallot",         OCC::WaveActiveBallot,         "waveActiveBallot",            true, false, false, false, false, false, false, false, false, Attribute::None,     },
+  {  OC::WaveReadLaneAt,          "WaveReadLaneAt",           OCC::WaveReadLaneAt,           "waveReadLaneAt",             false,  true,  true,  true,  true,  true,  true,  true,  true, Attribute::None,     },
+  {  OC::WaveReadLaneFirst,       "WaveReadLaneFirst",        OCC::WaveReadLaneFirst,        "waveReadLaneFirst",          false,  true,  true, false,  true,  true,  true,  true,  true, Attribute::None,     },
+  {  OC::WaveActiveOp,            "WaveActiveOp",             OCC::WaveActiveOp,             "waveActiveOp",               false,  true,  true,  true,  true,  true,  true,  true,  true, Attribute::None,     },
+  {  OC::WaveActiveBit,           "WaveActiveBit",            OCC::WaveActiveBit,            "waveActiveBit",              false, false, false, false, false,  true,  true,  true,  true, Attribute::None,     },
+  {  OC::WavePrefixOp,            "WavePrefixOp",             OCC::WavePrefixOp,             "wavePrefixOp",               false,  true,  true,  true, false,  true,  true,  true,  true, Attribute::None,     },
+  {  OC::QuadReadLaneAt,          "QuadReadLaneAt",           OCC::QuadReadLaneAt,           "quadReadLaneAt",             false,  true,  true,  true,  true,  true,  true,  true,  true, Attribute::None,     },
+  {  OC::QuadOp,                  "QuadOp",                   OCC::QuadOp,                   "quadOp",                     false,  true,  true,  true, false,  true,  true,  true,  true, Attribute::None,     },

  // Bitcasts with different sizes                                                                                          void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
  {  OC::BitcastI16toF16,         "BitcastI16toF16",          OCC::BitcastI16toF16,          "bitcastI16toF16",             true, false, false, false, false, false, false, false, false, Attribute::ReadNone, },
@ -248,9 +226,6 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
  {  OC::BitcastI64toF64,         "BitcastI64toF64",          OCC::BitcastI64toF64,          "bitcastI64toF64",             true, false, false, false, false, false, false, false, false, Attribute::ReadNone, },
  {  OC::BitcastF64toI64,         "BitcastF64toI64",          OCC::BitcastF64toI64,          "bitcastF64toI64",             true, false, false, false, false, false, false, false, false, Attribute::ReadNone, },

-  // GS                                                                                                                     void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::GSInstanceID,            "GSInstanceID",             OCC::GSInstanceID,             "gsInstanceID",               false, false, false, false, false, false, false,  true, false, Attribute::ReadNone, },
-
  // Legacy floating-point                                                                                                  void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
  {  OC::LegacyF32ToF16,          "LegacyF32ToF16",           OCC::LegacyF32ToF16,           "legacyF32ToF16",              true, false, false, false, false, false, false, false, false, Attribute::ReadNone, },
  {  OC::LegacyF16ToF32,          "LegacyF16ToF32",           OCC::LegacyF16ToF32,           "legacyF16ToF32",              true, false, false, false, false, false, false, false, false, Attribute::ReadNone, },
@ -261,13 +236,8 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
  {  OC::LegacyDoubleToUInt32,    "LegacyDoubleToUInt32",     OCC::LegacyDoubleToUInt32,     "legacyDoubleToUInt32",        true, false, false, false, false, false, false, false, false, Attribute::ReadNone, },

  // Wave                                                                                                                   void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::WaveAllBitCount,         "WaveAllBitCount",          OCC::WaveAllOp,                "waveAllOp",                   true, false, false, false, false, false, false, false, false, Attribute::ReadOnly, },
-  {  OC::WavePrefixBitCount,      "WavePrefixBitCount",       OCC::WavePrefixOp,             "wavePrefixOp",                true, false, false, false, false, false, false, false, false, Attribute::ReadOnly, },
-
-  // Pixel shader                                                                                                           void,     h,     f,     d,    i1,    i8,   i16,   i32,   i64  function attribute
-  {  OC::SampleIndex,             "SampleIndex",              OCC::SampleIndex,              "sampleIndex",                false, false, false, false, false, false, false,  true, false, Attribute::ReadNone, },
-  {  OC::Coverage,                "Coverage",                 OCC::Coverage,                 "coverage",                   false, false, false, false, false, false, false,  true, false, Attribute::ReadNone, },
-  {  OC::InnerCoverage,           "InnerCoverage",            OCC::InnerCoverage,            "innerCoverage",              false, false, false, false, false, false, false,  true, false, Attribute::ReadNone, },
+  {  OC::WaveAllBitCount,         "WaveAllBitCount",          OCC::WaveAllOp,                "waveAllOp",                   true, false, false, false, false, false, false, false, false, Attribute::None,     },
+  {  OC::WavePrefixBitCount,      "WavePrefixBitCount",       OCC::WavePrefixOp,             "wavePrefixOp",                true, false, false, false, false, false, false, false, false, Attribute::None,     },
 };
 // OPCODE-OLOADS:END

@ -379,14 +349,13 @@ bool OP::IsDxilOpWave(OpCode C) {
  unsigned op = (unsigned)C;
  /* <py::lines('OPCODE-WAVE')>hctdb_instrhelp.get_instrs_pred("op", "is_wave")</py>*/
  // OPCODE-WAVE:BEGIN
-  // Instructions: WaveCaptureReserved=114, WaveIsFirstLane=115,
-  // WaveGetLaneIndex=116, WaveGetLaneCount=117, WaveIsHelperLaneReserved=118,
-  // WaveAnyTrue=119, WaveAllTrue=120, WaveActiveAllEqual=121,
-  // WaveActiveBallot=122, WaveReadLaneAt=123, WaveReadLaneFirst=124,
-  // WaveActiveOp=125, WaveActiveBit=126, WavePrefixOp=127,
-  // WaveGetOrderedIndex=128, QuadReadLaneAt=130, QuadOp=131,
-  // WaveAllBitCount=144, WavePrefixBitCount=145
-  return 114 <= op && op <= 128 || 130 <= op && op <= 131 || 144 <= op && op <= 145;
+  // Instructions: WaveIsFirstLane=110, WaveGetLaneIndex=111,
+  // WaveGetLaneCount=112, WaveAnyTrue=113, WaveAllTrue=114,
+  // WaveActiveAllEqual=115, WaveActiveBallot=116, WaveReadLaneAt=117,
+  // WaveReadLaneFirst=118, WaveActiveOp=119, WaveActiveBit=120,
+  // WavePrefixOp=121, QuadReadLaneAt=122, QuadOp=123, WaveAllBitCount=135,
+  // WavePrefixBitCount=136
+  return 110 <= op && op <= 123 || 135 <= op && op <= 136;
  // OPCODE-WAVE:END
 }

@ -394,10 +363,10 @@ bool OP::IsDxilOpGradient(OpCode C) {
  unsigned op = (unsigned)C;
  /* <py::lines('OPCODE-GRADIENT')>hctdb_instrhelp.get_instrs_pred("op", "is_gradient")</py>*/
  // OPCODE-GRADIENT:BEGIN
-  // Instructions: Sample=61, SampleBias=62, SampleCmp=65, TextureGather=74,
-  // TextureGatherCmp=75, CalculateLOD=84, DerivCoarseX=86, DerivCoarseY=87,
-  // DerivFineX=88, DerivFineY=89
-  return 61 <= op && op <= 62 || op == 65 || 74 <= op && op <= 75 || op == 84 || 86 <= op && op <= 89;
+  // Instructions: Sample=60, SampleBias=61, SampleCmp=64, TextureGather=73,
+  // TextureGatherCmp=74, CalculateLOD=81, DerivCoarseX=83, DerivCoarseY=84,
+  // DerivFineX=85, DerivFineY=86
+  return 60 <= op && op <= 61 || op == 64 || 73 <= op && op <= 74 || op == 81 || 83 <= op && op <= 86;
  // OPCODE-GRADIENT:END
 }

@ -441,6 +410,20 @@ OP::OP(LLVMContext &Ctx, Module *pModule)

  Type *Int4Types[4] = { Type::getInt32Ty(m_Ctx), Type::getInt32Ty(m_Ctx), Type::getInt32Ty(m_Ctx), Type::getInt32Ty(m_Ctx) }; // HiHi, HiLo, LoHi, LoLo
  m_pInt4Type = GetOrCreateStructType(m_Ctx, Int4Types, "dx.types.fouri32", pModule);
+  // Try to find existing intrinsic function.
+  RefreshCache(pModule);
+}
+
+// Re-populates the DXIL op function cache from op functions that already exist
+// in pModule. For every function recognized by IsDxilOpFunc that has at least
+// one user, the opcode is read from the first user (which must be a call) and
+// the overload type is derived from the function signature; GetOpFunc is then
+// called so that the function gets recorded in the cache.
+void OP::RefreshCache(llvm::Module *pModule) {
+  for (Function &F : pModule->functions()) {
+    if (OP::IsDxilOpFunc(&F) && !F.user_empty()) {
+      // cast<> asserts that the first user really is a call instruction.
+      CallInst *CI = cast<CallInst>(*F.user_begin());
+      // Named 'opcode' rather than 'OpCode' to avoid shadowing the enum type.
+      OpCode opcode = OP::GetDxilOpFuncCallInst(CI);
+      Type *pOverloadType = OP::GetOverloadType(opcode, &F);
+      // GetOpFunc stores the function in the cache slot as a side effect.
+      Function *OpFunc = GetOpFunc(opcode, pOverloadType);
+      // OpFunc is only consumed by the assert below; keep builds where the
+      // assert expands to nothing free of unused-variable warnings.
+      (void)OpFunc;
+      DXASSERT_NOMSG(OpFunc == &F);
+    }
+  }
 }

 Function *OP::GetOpFunc(OpCode OpCode, Type *pOverloadType) {
@ -448,7 +431,8 @@ Function *OP::GetOpFunc(OpCode OpCode, Type *pOverloadType) {
  _Analysis_assume_(0 <= (unsigned)OpCode && OpCode < OpCode::NumOpCodes);
  DXASSERT(IsOverloadLegal(OpCode, pOverloadType), "otherwise the caller requested illegal operation overload (eg HLSL function with unsupported types for mapped intrinsic function)");
  unsigned TypeSlot = GetTypeSlot(pOverloadType);
-  Function *&F = m_OpCodeClassCache[(unsigned)m_OpCodeProps[(unsigned)OpCode].OpCodeClass].pOverloads[TypeSlot];
+  OpCodeClass opClass = m_OpCodeProps[(unsigned)OpCode].OpCodeClass;
+  Function *&F = m_OpCodeClassCache[(unsigned)opClass].pOverloads[TypeSlot];
  if (F != nullptr)
    return F;

@ -514,6 +498,7 @@ Function *OP::GetOpFunc(OpCode OpCode, Type *pOverloadType) {
  case OpCode::Atan:                   A(pETy);     A(pI32); A(pETy); break;
  case OpCode::Hcos:                   A(pETy);     A(pI32); A(pETy); break;
  case OpCode::Hsin:                   A(pETy);     A(pI32); A(pETy); break;
+  case OpCode::Htan:                   A(pETy);     A(pI32); A(pETy); break;
  case OpCode::Exp:                    A(pETy);     A(pI32); A(pETy); break;
  case OpCode::Frc:                    A(pETy);     A(pI32); A(pETy); break;
  case OpCode::Log:                    A(pETy);     A(pI32); A(pETy); break;
@ -548,11 +533,9 @@ Function *OP::GetOpFunc(OpCode OpCode, Type *pOverloadType) {
  case OpCode::UMul:                   A(p2I32);    A(pI32); A(pETy); A(pETy); break;
  case OpCode::UDiv:                   A(p2I32);    A(pI32); A(pETy); A(pETy); break;

-    // Binary int with carry
-  case OpCode::IAddc:                  A(pI32C);    A(pI32); A(pETy); A(pETy); break;
+    // Binary uint with carry or borrow
  case OpCode::UAddc:                  A(pI32C);    A(pI32); A(pETy); A(pETy); break;
-  case OpCode::ISubc:                  A(pI32C);    A(pI32); A(pETy); A(pETy); break;
-  case OpCode::USubc:                  A(pI32C);    A(pI32); A(pETy); A(pETy); break;
+  case OpCode::USubb:                  A(pI32C);    A(pI32); A(pETy); A(pETy); break;

    // Tertiary float
  case OpCode::FMad:                   A(pETy);     A(pI32); A(pETy); A(pETy); A(pETy); break;
@ -599,10 +582,6 @@ Function *OP::GetOpFunc(OpCode OpCode, Type *pOverloadType) {
  case OpCode::TextureGather:          RRT(pETy);   A(pI32); A(pRes); A(pRes); A(pF32); A(pF32); A(pF32); A(pF32); A(pI32); A(pI32); A(pI32); break;
  case OpCode::TextureGatherCmp:       RRT(pETy);   A(pI32); A(pRes); A(pRes); A(pF32); A(pF32); A(pF32); A(pF32); A(pI32); A(pI32); A(pI32); A(pF32); break;

-    // 
-  case OpCode::ToDelete5:              A(pV);       A(pI32); break;
-  case OpCode::ToDelete6:              A(pV);       A(pI32); break;
-
    // Resources - sample
  case OpCode::Texture2DMSGetSamplePosition:A(pPos);     A(pI32); A(pRes); A(pI32); break;
  case OpCode::RenderTargetGetSamplePosition:A(pPos);     A(pI32); A(pI32); break;
@ -623,6 +602,9 @@ Function *OP::GetOpFunc(OpCode OpCode, Type *pOverloadType) {
  case OpCode::EvalSnapped:            A(pETy);     A(pI32); A(pI32); A(pI32); A(pI8);  A(pI32); A(pI32); break;
  case OpCode::EvalSampleIndex:        A(pETy);     A(pI32); A(pI32); A(pI32); A(pI8);  A(pI32); break;
  case OpCode::EvalCentroid:           A(pETy);     A(pI32); A(pI32); A(pI32); A(pI8);  break;
+  case OpCode::SampleIndex:            A(pI32);     A(pI32); break;
+  case OpCode::Coverage:               A(pI32);     A(pI32); break;
+  case OpCode::InnerCoverage:          A(pI32);     A(pI32); break;

    // Compute shader
  case OpCode::ThreadId:               A(pI32);     A(pI32); A(pI32); break;
@ -634,21 +616,12 @@ Function *OP::GetOpFunc(OpCode OpCode, Type *pOverloadType) {
  case OpCode::EmitStream:             A(pV);       A(pI32); A(pI8);  break;
  case OpCode::CutStream:              A(pV);       A(pI32); A(pI8);  break;
  case OpCode::EmitThenCutStream:      A(pV);       A(pI32); A(pI8);  break;
+  case OpCode::GSInstanceID:           A(pI32);     A(pI32); break;

    // Double precision
  case OpCode::MakeDouble:             A(pF64);     A(pI32); A(pI32); A(pI32); break;
-
-    // 
-  case OpCode::ToDelete1:              A(pV);       A(pI32); break;
-  case OpCode::ToDelete2:              A(pV);       A(pI32); break;
-
-    // Double precision
  case OpCode::SplitDouble:            A(pSDT);     A(pI32); A(pF64); break;

-    // 
-  case OpCode::ToDelete3:              A(pV);       A(pI32); break;
-  case OpCode::ToDelete4:              A(pV);       A(pI32); break;
-
    // Domain and hull shader
  case OpCode::LoadOutputControlPoint: A(pETy);     A(pI32); A(pI32); A(pI32); A(pI8);  A(pI32); break;
  case OpCode::LoadPatchConstant:      A(pETy);     A(pI32); A(pI32); A(pI32); A(pI8);  break;
@ -664,15 +637,10 @@ Function *OP::GetOpFunc(OpCode OpCode, Type *pOverloadType) {
    // Other
  case OpCode::CycleCounterLegacy:     A(p2I32);    A(pI32); break;

-    // Unary float
-  case OpCode::Htan:                   A(pETy);     A(pI32); A(pETy); break;
-
    // Wave
-  case OpCode::WaveCaptureReserved:    A(pV);       A(pI32); break;
  case OpCode::WaveIsFirstLane:        A(pI1);      A(pI32); break;
  case OpCode::WaveGetLaneIndex:       A(pI32);     A(pI32); break;
  case OpCode::WaveGetLaneCount:       A(pI32);     A(pI32); break;
-  case OpCode::WaveIsHelperLaneReserved:A(pV);       A(pI32); break;
  case OpCode::WaveAnyTrue:            A(pI1);      A(pI32); A(pI1);  break;
  case OpCode::WaveAllTrue:            A(pI1);      A(pI32); A(pI1);  break;
  case OpCode::WaveActiveAllEqual:     A(pI1);      A(pI32); A(pETy); break;
@ -682,12 +650,6 @@ Function *OP::GetOpFunc(OpCode OpCode, Type *pOverloadType) {
  case OpCode::WaveActiveOp:           A(pETy);     A(pI32); A(pETy); A(pI8);  A(pI8);  break;
  case OpCode::WaveActiveBit:          A(pETy);     A(pI32); A(pETy); A(pI8);  break;
  case OpCode::WavePrefixOp:           A(pETy);     A(pI32); A(pETy); A(pI8);  A(pI8);  break;
-  case OpCode::WaveGetOrderedIndex:    A(pV);       A(pI32); break;
-
-    // 
-  case OpCode::GlobalOrderedCountIncReserved:A(pV);       A(pI32); break;
-
-    // Wave
  case OpCode::QuadReadLaneAt:         A(pETy);     A(pI32); A(pETy); A(pI32); break;
  case OpCode::QuadOp:                 A(pETy);     A(pI32); A(pETy); A(pI8);  break;

@ -699,9 +661,6 @@ Function *OP::GetOpFunc(OpCode OpCode, Type *pOverloadType) {
  case OpCode::BitcastI64toF64:        A(pF64);     A(pI32); A(pI64); break;
  case OpCode::BitcastF64toI64:        A(pI64);     A(pI32); A(pF64); break;

-    // GS
-  case OpCode::GSInstanceID:           A(pI32);     A(pI32); break;
-
    // Legacy floating-point
  case OpCode::LegacyF32ToF16:         A(pI32);     A(pI32); A(pF32); break;
  case OpCode::LegacyF16ToF32:         A(pF32);     A(pI32); A(pI32); break;
@ -714,11 +673,6 @@ Function *OP::GetOpFunc(OpCode OpCode, Type *pOverloadType) {
    // Wave
  case OpCode::WaveAllBitCount:        A(pI32);     A(pI32); A(pI1);  break;
  case OpCode::WavePrefixBitCount:     A(pI32);     A(pI32); A(pI1);  break;
-
-    // Pixel shader
-  case OpCode::SampleIndex:            A(pI32);     A(pI32); break;
-  case OpCode::Coverage:               A(pI32);     A(pI32); break;
-  case OpCode::InnerCoverage:          A(pI32);     A(pI32); break;
  // OPCODE-OLOAD-FUNCS:END
  default: DXASSERT(false, "otherwise unhandled case"); break;
  }
@ -728,15 +682,10 @@ Function *OP::GetOpFunc(OpCode OpCode, Type *pOverloadType) {
  FunctionType *pFT;
  DXASSERT(ArgTypes.size() > 1, "otherwise forgot to initialize arguments");
  pFT = FunctionType::get(ArgTypes[0], ArrayRef<Type*>(&ArgTypes[1], ArgTypes.size()-1), false);
-  if (pOverloadType != pV) {
-    F = Function::Create(pFT, GlobalValue::LinkageTypes::ExternalLinkage, 
-                         funcName,
-                         m_pModule);
-  } else {
-    F = Function::Create(pFT, GlobalValue::LinkageTypes::ExternalLinkage, 
-                         funcName,
-                         m_pModule);
-  }
+
+  F = cast<Function>(m_pModule->getOrInsertFunction(funcName, pFT));
+
+  m_FunctionToOpClass[F] = opClass;
  F->setCallingConv(CallingConv::C);
  F->addFnAttr(Attribute::NoUnwind);
  if (m_OpCodeProps[(unsigned)OpCode].FuncAttr != Attribute::None)
@ -745,6 +694,132 @@ Function *OP::GetOpFunc(OpCode OpCode, Type *pOverloadType) {
  return F;
 }

+// Returns the cached overload functions of the opcode class that OpCode
+// belongs to. Slots may be null when no function has been cached for them.
+llvm::ArrayRef<llvm::Function *> OP::GetOpFuncList(OpCode OpCode) const {
+  DXASSERT(0 <= (unsigned)OpCode && OpCode < OpCode::NumOpCodes, "otherwise caller passed OOB OpCode");
+  _Analysis_assume_(0 <= (unsigned)OpCode && OpCode < OpCode::NumOpCodes);
+  // Map the opcode to its class first, then index the per-class cache.
+  const unsigned classIndex = (unsigned)m_OpCodeProps[(unsigned)OpCode].OpCodeClass;
+  return m_OpCodeClassCache[classIndex].pOverloads;
+}
+
+// Drops F from the op function cache. Functions that are not DXIL op
+// functions, or that were never recorded in m_FunctionToOpClass, are ignored.
+void OP::RemoveFunction(Function *F) {
+  if (!OP::IsDxilOpFunc(F))
+    return;
+
+  // Use find() instead of operator[]: operator[] would default-insert an entry
+  // for an untracked function, leaving a stale key for a function that is
+  // about to go away and searching an arbitrary (default) opcode class.
+  auto it = m_FunctionToOpClass.find(F);
+  if (it == m_FunctionToOpClass.end())
+    return;
+
+  auto &overloads = m_OpCodeClassCache[(unsigned)it->second].pOverloads;
+  for (unsigned i = 0; i < kNumTypeOverloads; i++) {
+    if (F == overloads[i]) {
+      // Clear the cache slot and forget the function-to-class mapping.
+      overloads[i] = nullptr;
+      m_FunctionToOpClass.erase(it);
+      break;
+    }
+  }
+}
+
+llvm::Type *OP::GetOverloadType(OpCode OpCode, llvm::Function *F) { // Derives the overload type of op function F from its signature, keyed on OpCode.
+  DXASSERT(F, "not work on nullptr");
+  Type *Ty = F->getReturnType(); // Used by the default case and by the struct-return cases below.
+  FunctionType *FT = F->getFunctionType();
+/* <py::lines('OPCODE-OLOAD-TYPES')>hctdb_instrhelp.get_funcs_oload_type()</py>*/
+  switch (OpCode) {            // return     OpCode
+  // OPCODE-OLOAD-TYPES:BEGIN
+  case OpCode::IsNaN:
+  case OpCode::IsInf:
+  case OpCode::IsFinite:
+  case OpCode::IsNormal:
+  case OpCode::Countbits:
+  case OpCode::FirstbitLo:
+  case OpCode::FirstbitHi:
+  case OpCode::FirstbitSHi:
+  case OpCode::IMul:
+  case OpCode::UMul:
+  case OpCode::UDiv:
+  case OpCode::UAddc:
+  case OpCode::USubb:
+  case OpCode::WaveActiveAllEqual:
+    DXASSERT_NOMSG(FT->getNumParams() > 1);
+    return FT->getParamType(1); // Overload is the type of parameter 1 (parameter 0 is presumably the i32 opcode - confirm).
+  case OpCode::TempRegStore:
+    DXASSERT_NOMSG(FT->getNumParams() > 2);
+    return FT->getParamType(2); // Overload is the type of parameter 2.
+  case OpCode::MinPrecXRegStore:
+  case OpCode::StoreOutput:
+  case OpCode::BufferStore:
+  case OpCode::StorePatchConstant:
+    DXASSERT_NOMSG(FT->getNumParams() > 4);
+    return FT->getParamType(4); // Overload is the type of parameter 4.
+  case OpCode::TextureStore:
+    DXASSERT_NOMSG(FT->getNumParams() > 5);
+    return FT->getParamType(5); // Overload is the type of parameter 5.
+  case OpCode::MakeDouble:
+  case OpCode::SplitDouble:
+    return Type::getDoubleTy(m_Ctx); // Fixed overload: double, independent of the signature.
+  case OpCode::CheckAccessFullyMapped:
+  case OpCode::AtomicBinOp:
+  case OpCode::AtomicCompareExchange:
+  case OpCode::SampleIndex:
+  case OpCode::Coverage:
+  case OpCode::InnerCoverage:
+  case OpCode::ThreadId:
+  case OpCode::GroupId:
+  case OpCode::ThreadIdInGroup:
+  case OpCode::FlattenedThreadIdInGroup:
+  case OpCode::GSInstanceID:
+  case OpCode::OutputControlPointID:
+  case OpCode::PrimitiveID:
+    return IntegerType::get(m_Ctx, 32); // Fixed overload: i32.
+  case OpCode::CalculateLOD:
+  case OpCode::DomainLocation:
+    return Type::getFloatTy(m_Ctx); // Fixed overload: float.
+  case OpCode::CreateHandle:
+  case OpCode::BufferUpdateCounter:
+  case OpCode::GetDimensions:
+  case OpCode::Texture2DMSGetSamplePosition:
+  case OpCode::RenderTargetGetSamplePosition:
+  case OpCode::RenderTargetGetSampleCount:
+  case OpCode::Barrier:
+  case OpCode::Discard:
+  case OpCode::EmitStream:
+  case OpCode::CutStream:
+  case OpCode::EmitThenCutStream:
+  case OpCode::CycleCounterLegacy:
+  case OpCode::WaveIsFirstLane:
+  case OpCode::WaveGetLaneIndex:
+  case OpCode::WaveGetLaneCount:
+  case OpCode::WaveAnyTrue:
+  case OpCode::WaveAllTrue:
+  case OpCode::WaveActiveBallot:
+  case OpCode::BitcastI16toF16:
+  case OpCode::BitcastF16toI16:
+  case OpCode::BitcastI32toF32:
+  case OpCode::BitcastF32toI32:
+  case OpCode::BitcastI64toF64:
+  case OpCode::BitcastF64toI64:
+  case OpCode::LegacyF32ToF16:
+  case OpCode::LegacyF16ToF32:
+  case OpCode::LegacyDoubleToFloat:
+  case OpCode::LegacyDoubleToSInt32:
+  case OpCode::LegacyDoubleToUInt32:
+  case OpCode::WaveAllBitCount:
+  case OpCode::WavePrefixBitCount:
+    return Type::getVoidTy(m_Ctx); // Void is returned for these; presumably it marks ops without a type overload - confirm.
+  case OpCode::CBufferLoadLegacy:
+  case OpCode::Sample:
+  case OpCode::SampleBias:
+  case OpCode::SampleLevel:
+  case OpCode::SampleGrad:
+  case OpCode::SampleCmp:
+  case OpCode::SampleCmpLevelZero:
+  case OpCode::TextureLoad:
+  case OpCode::BufferLoad:
+  case OpCode::TextureGather:
+  case OpCode::TextureGatherCmp:
+  {
+    StructType *ST = cast<StructType>(Ty); // The return type must be a struct here; cast<> asserts otherwise.
+    return ST->getElementType(0); // Overload is the type of the struct's first element.
+  }
+  // OPCODE-OLOAD-TYPES:END
+  default: return Ty; // Every other opcode: the overload is the function's return type.
+  }
+}
+
 Type *OP::GetHandleType() const {
  return m_pHandleType;
 }
--- a/lib/HLSL/DxilRootSignature.cpp
+++ b/lib/HLSL/DxilRootSignature.cpp
--- a/lib/HLSL/DxilSignature.cpp
+++ b/lib/HLSL/DxilSignature.cpp
@ -92,7 +92,7 @@ bool DxilSignature::IsFullyAllocated() {
  return true;
 }

-unsigned DxilSignature::PackElements() {
+unsigned DxilSignature::PackElements(DXIL::PackingStrategy packing) {
  unsigned rowsUsed = 0;

  if (m_sigPointKind == DXIL::SigPointKind::GSOut) {
@ -106,7 +106,17 @@ unsigned DxilSignature::PackElements() {
    }
    for (unsigned i = 0; i < 4; ++i) {
      if (!elements[i].empty()) {
-        unsigned streamRowsUsed = alloc[i].PackMain(elements[i], 0, 32);
+        unsigned streamRowsUsed = 0;
+        switch (packing) {
+        case DXIL::PackingStrategy::PrefixStable:
+          streamRowsUsed = alloc[i].PackPrefixStable(elements[i], 0, 32);
+          break;
+        case DXIL::PackingStrategy::Optimized:
+          streamRowsUsed = alloc[i].PackOptimized(elements[i], 0, 32);
+          break;
+        default:
+          DXASSERT(false, "otherwise, invalid packing strategy supplied");
+        }
        if (streamRowsUsed > rowsUsed)
          rowsUsed = streamRowsUsed;
      }
@ -144,7 +154,16 @@ unsigned DxilSignature::PackElements() {
          continue;
        elements.push_back(SE.get());
      }
-      rowsUsed = alloc.PackMain(elements, 0, 32);
+      switch (packing) {
+      case DXIL::PackingStrategy::PrefixStable:
+        rowsUsed = alloc.PackPrefixStable(elements, 0, 32);
+        break;
+      case DXIL::PackingStrategy::Optimized:
+        rowsUsed = alloc.PackOptimized(elements, 0, 32);
+        break;
+      default:
+        DXASSERT(false, "otherwise, invalid packing strategy supplied");
+      }
    }
    break;

--- a/lib/HLSL/DxilSignatureAllocator.cpp
+++ b/lib/HLSL/DxilSignatureAllocator.cpp
@ -212,43 +212,44 @@ struct {

 } // anonymous namespace

-
-unsigned DxilSignatureAllocator::PackGreedy(std::vector<DxilSignatureElement*> elements, unsigned startRow, unsigned numRows, unsigned startCol) {
-  // Allocation failures should be caught by IsFullyAllocated()
+unsigned DxilSignatureAllocator::PackNext(DxilSignatureElement* SE, unsigned startRow, unsigned numRows, unsigned startCol) {
  unsigned rowsUsed = startRow;

-  for (auto &SE : elements) {
-    unsigned rows = SE->GetRows();
-    if (rows > numRows)
-      continue; // element will not fit
+  unsigned rows = SE->GetRows();
+  if (rows > numRows)
+    return rowsUsed; // element will not fit

-    unsigned cols = SE->GetCols();
-    DXASSERT_NOMSG(cols <= 4);
+  unsigned cols = SE->GetCols();
+  DXASSERT_NOMSG(startCol + cols <= 4);

-    bool bAllocated = false;
-    for (unsigned row = startRow; row <= (startRow + numRows - rows); ++row) {
-      if (DetectRowConflict(SE, row))
+  for (unsigned row = startRow; row <= (startRow + numRows - rows); ++row) {
+    if (DetectRowConflict(SE, row))
+      continue;
+    for (unsigned col = startCol; col <= 4 - cols; ++col) {
+      if (DetectColConflict(SE, row, col))
        continue;
-      for (unsigned col = startCol; col <= 4 - cols; ++col) {
-        if (DetectColConflict(SE, row, col))
-          continue;
-        PlaceElement(SE, row, col);
-        SE->SetStartRow((int)row);
-        SE->SetStartCol((int)col);
-        bAllocated = true;
-        if (row + rows > rowsUsed)
-          rowsUsed = row + rows;
-        break;
-      }
-      if (bAllocated)
-        break;
+      PlaceElement(SE, row, col);
+      SE->SetStartRow((int)row);
+      SE->SetStartCol((int)col);
+      return row + rows;
    }
  }

  return rowsUsed;
 }

-unsigned DxilSignatureAllocator::PackMain(std::vector<DxilSignatureElement*> elements, unsigned startRow, unsigned numRows) {
+// Places each element in turn with PackNext, in the order given, and returns
+// the largest row end reported (never less than startRow).
+unsigned DxilSignatureAllocator::PackGreedy(std::vector<DxilSignatureElement*> elements, unsigned startRow, unsigned numRows, unsigned startCol) {
+  // Allocation failures should be caught by IsFullyAllocated()
+  unsigned highestRowEnd = startRow;
+
+  for (DxilSignatureElement *pElement : elements) {
+    const unsigned rowEnd = PackNext(pElement, startRow, numRows, startCol);
+    if (rowEnd > highestRowEnd)
+      highestRowEnd = rowEnd;
+  }
+
+  return highestRowEnd;
+}
+
+unsigned DxilSignatureAllocator::PackOptimized(std::vector<DxilSignatureElement*> elements, unsigned startRow, unsigned numRows) {
  unsigned rowsUsed = startRow;

  // Clip/Cull needs special handling due to limitations unique to these.
@ -430,5 +431,60 @@ unsigned DxilSignatureAllocator::PackMain(std::vector<DxilSignatureElement*> ele
  return rowsUsed;
 }

+unsigned DxilSignatureAllocator::PackPrefixStable(std::vector<DxilSignatureElement*> elements, unsigned startRow, unsigned numRows) { // Packs elements one at a time, in the order given, via PackNext.
+  unsigned rowsUsed = startRow; // Running maximum of the row ends returned by PackNext; this is the return value.
+
+  // Special handling for prefix-stable clip/cull arguments
+  // - basically, do not pack with anything else to maximize chance to pack into two register limit
+  unsigned clipcullRegUsed = 0; // Highest row end handed out so far by clipcullAllocator.
+  DxilSignatureAllocator clipcullAllocator(2); // Separate allocator in which clip/cull elements are packed only against each other.
+  DxilSignatureElement clipcullTempElements[2] = {DXIL::SigPointKind::VSOut, DXIL::SigPointKind::VSOut}; // Placeholders that reserve rows in this allocator.
+
+  for (auto &SE : elements) {
+    // Clear any existing allocation
+    if (SE->IsAllocated()) {
+      SE->SetStartRow(-1);
+      SE->SetStartCol(-1);
+    }
+
+    switch (SE->GetInterpretation()) {
+      case DXIL::SemanticInterpretationKind::Arb:
+      case DXIL::SemanticInterpretationKind::SGV:
+        break; // Falls through to the generic PackNext call after the switch.
+      case DXIL::SemanticInterpretationKind::SV:
+        if (SE->GetKind() == DXIL::SemanticKind::ClipDistance || SE->GetKind() == DXIL::SemanticKind::CullDistance) {
+          unsigned used = clipcullAllocator.PackNext(SE, 0, 2); // Row end inside the clip/cull allocator; 0 when the element did not fit.
+          if (used) {
+            if (used > clipcullRegUsed) {
+              clipcullRegUsed = used;
+              // allocate placeholder element, reserving new row
+              clipcullTempElements[used - 1].Initialize(SE->GetName(),
+                                                        SE->GetCompType(),
+                                                        *SE->GetInterpolationMode(),
+                                                        1, 4);
+              rowsUsed = std::max(rowsUsed, PackNext(&clipcullTempElements[used - 1], startRow, numRows));
+            }
+            // Actually place element in correct row:
+            SE->SetStartRow(clipcullTempElements[used - 1].GetStartRow()); // The start column set by clipcullAllocator.PackNext is kept.
+          }
+          continue; // Clip/cull elements never reach the generic PackNext call below.
+        }
+        break;
+      case DXIL::SemanticInterpretationKind::TessFactor:
+        if (SE->GetRows() > 1) {
+          // Maximize opportunity for packing while preserving prefix-stable property
+          rowsUsed = std::max(rowsUsed, PackNext(SE, startRow, numRows, 3)); // Column search starts at column 3.
+          continue;
+        }
+        break;
+      default:
+        DXASSERT(false, "otherwise, unexpected interpretation for allocated element");
+    }
+    rowsUsed = std::max(rowsUsed, PackNext(SE, startRow, numRows)); // Generic placement for everything not handled above.
+  }
+
+  return rowsUsed;
+}
+

 } // namespace hlsl
--- a/lib/HLSL/DxilTypeSystem.cpp
+++ b/lib/HLSL/DxilTypeSystem.cpp
@ -173,9 +173,10 @@ DxilStructAnnotation *DxilTypeSystem::GetStructAnnotation(const StructType *pStr
 }

 void DxilTypeSystem::EraseStructAnnotation(const StructType *pStructType) {
-  auto it = m_StructAnnotations.find(pStructType);
-  DXASSERT_NOMSG(it != m_StructAnnotations.end());
-  m_StructAnnotations.erase(it);
+  DXASSERT_NOMSG(m_StructAnnotations.count(pStructType));
+  m_StructAnnotations.remove_if([pStructType](
+      const std::pair<const StructType *, std::unique_ptr<DxilStructAnnotation>>
+          &I) { return pStructType == I.first; });
 }

 DxilTypeSystem::StructAnnotationMap &DxilTypeSystem::GetStructAnnotationMap() {
@ -201,9 +202,10 @@ DxilFunctionAnnotation *DxilTypeSystem::GetFunctionAnnotation(const Function *pF
 }

 void DxilTypeSystem::EraseFunctionAnnotation(const Function *pFunction) {
-  auto it = m_FunctionAnnotations.find(pFunction);
-  DXASSERT_NOMSG(it != m_FunctionAnnotations.end());
-  m_FunctionAnnotations.erase(it);
+  DXASSERT_NOMSG(m_FunctionAnnotations.count(pFunction));
+  m_FunctionAnnotations.remove_if([pFunction](
+      const std::pair<const Function *, std::unique_ptr<DxilFunctionAnnotation>>
+          &I) { return pFunction == I.first; });
 }

 DxilTypeSystem::FunctionAnnotationMap &DxilTypeSystem::GetFunctionAnnotationMap() {
--- a/lib/HLSL/DxilValidation.cpp
+++ b/lib/HLSL/DxilValidation.cpp
--- a/lib/HLSL/HLMatrixLowerPass.cpp
+++ b/lib/HLSL/HLMatrixLowerPass.cpp
@ -1416,7 +1416,7 @@ void HLMatrixLowerPass::TranslateMatLoadStoreOnGlobal(
    Value *matGlobal, ArrayRef<Value *> vecGlobals,
    CallInst *matLdStInst) {
  // No dynamic indexing on matrix, flatten matrix to scalars.
-
+  // Internal global matrix use row major follow the initializer.
  Type *matType = matGlobal->getType()->getPointerElementType();
  unsigned col, row;
  HLMatrixLower::GetMatrixInfo(matType, col, row);
@ -2179,6 +2179,9 @@ void HLMatrixLowerPass::finalMatTranslation(Instruction *matInst) {
    case HLOpcodeGroup::HLSelect: {
      TranslateMatSelect(CI);
    } break;
+    default:
+      // Skip group already translated.
+      break;
    }
  }
 }
@ -2208,6 +2211,22 @@ static bool OnlyUsedByMatrixLdSt(Value *V) {
  return onlyLdSt;
 }

+// Recursively rebuilds the constant MA, a (possibly nested) array of matrix
+// constants, as a constant of the lowered array type ResultTy.
+static Constant *LowerMatrixArrayConst(Constant *MA, ArrayType *ResultTy) {
+  ArrayType *SrcArrayTy = dyn_cast<ArrayType>(MA->getType());
+  if (!SrcArrayTy) {
+    // Leaf case: not an array, so take element 0 of the constant, i.e.
+    // get float[row][col] from the struct.
+    return MA->getAggregateElement((unsigned)0);
+  }
+
+  // Array case: lower each element against the result's element type.
+  ArrayType *LoweredEltTy = cast<ArrayType>(ResultTy->getElementType());
+  std::vector<Constant *> LoweredElts;
+  for (unsigned Idx = 0; Idx < SrcArrayTy->getNumElements(); ++Idx) {
+    Constant *SrcElt = MA->getAggregateElement(Idx);
+    LoweredElts.emplace_back(LowerMatrixArrayConst(SrcElt, LoweredEltTy));
+  }
+  return ConstantArray::get(ResultTy, LoweredElts);
+}
+
 void HLMatrixLowerPass::runOnGlobalMatrixArray(GlobalVariable *GV) {
  // Lower to array of vector array like float[row][col].
  // DynamicIndexingVectorToArray will change it to scalar array.
@ -2227,10 +2246,11 @@ void HLMatrixLowerPass::runOnGlobalMatrixArray(GlobalVariable *GV) {
    Ty = ArrayType::get(Ty, *arraySize);

  Type *VecArrayTy = Ty;
-
-  // Matrix will use store to initialize.
-  // So set init val to undef.
-  Constant *InitVal = UndefValue::get(VecArrayTy);
+  Constant *OldInitVal = GV->getInitializer();
+  Constant *InitVal =
+      isa<UndefValue>(OldInitVal)
+          ? UndefValue::get(VecArrayTy)
+          : LowerMatrixArrayConst(OldInitVal, cast<ArrayType>(VecArrayTy));

  bool isConst = GV->isConstant();
  GlobalVariable::ThreadLocalMode TLMode = GV->getThreadLocalMode();
@ -2282,6 +2302,24 @@ void HLMatrixLowerPass::runOnGlobalMatrixArray(GlobalVariable *GV) {
  GV->eraseFromParent();
 }

+// Appends the scalar elements of matrix constant M to Elts, row by row.
+// An undef matrix contributes col*row undef scalars of the element type.
+static void FlattenMatConst(Constant *M, std::vector<Constant *> &Elts) {
+  unsigned numRows, numCols;
+  Type *ScalarTy = HLMatrixLower::GetMatrixInfo(M->getType(), numCols, numRows);
+
+  if (isa<UndefValue>(M)) {
+    Constant *UndefElt = UndefValue::get(ScalarTy);
+    for (unsigned idx = 0; idx < numCols * numRows; ++idx)
+      Elts.emplace_back(UndefElt);
+    return;
+  }
+
+  // Element 0 of the matrix constant holds the rows.
+  Constant *Rows = M->getAggregateElement((unsigned)0);
+  for (unsigned rowIdx = 0; rowIdx < numRows; ++rowIdx) {
+    Constant *RowVal = Rows->getAggregateElement(rowIdx);
+    for (unsigned colIdx = 0; colIdx < numCols; ++colIdx)
+      Elts.emplace_back(RowVal->getAggregateElement(colIdx));
+  }
+}
+
 void HLMatrixLowerPass::runOnGlobal(GlobalVariable *GV) {
  if (HLMatrixLower::IsMatrixArrayPointer(GV->getType())) {
    runOnGlobalMatrixArray(GV);
@ -2300,13 +2338,13 @@ void HLMatrixLowerPass::runOnGlobal(GlobalVariable *GV) {
  Module *M = GV->getParent();
  const DataLayout &DL = M->getDataLayout();

+  std::vector<Constant *> Elts;
+  FlattenMatConst(GV->getInitializer(), Elts);
+
  if (onlyLdSt) {
    Type *EltTy = vecTy->getVectorElementType();
    unsigned vecSize = vecTy->getVectorNumElements();
    std::vector<Value *> vecGlobals(vecSize);
-    // Matrix will use store to initialize.
-    // So set init val to undef.
-    Constant *InitVal = UndefValue::get(EltTy);

    GlobalVariable::ThreadLocalMode TLMode = GV->getThreadLocalMode();
    unsigned AddressSpace = GV->getType()->getAddressSpace();
@ -2315,6 +2353,7 @@ void HLMatrixLowerPass::runOnGlobal(GlobalVariable *GV) {
    unsigned size = DL.getTypeAllocSizeInBits(EltTy);
    unsigned align = DL.getPrefTypeAlignment(EltTy);
    for (int i = 0, e = vecSize; i != e; ++i) {
+      Constant *InitVal = Elts[i];
      GlobalVariable *EltGV = new llvm::GlobalVariable(
          *M, EltTy, /*IsConstant*/ isConst, linkage,
          /*InitVal*/ InitVal, GV->getName() + "." + Twine(i),
@ -2341,9 +2380,10 @@ void HLMatrixLowerPass::runOnGlobal(GlobalVariable *GV) {
  else {
    // lower to array of scalar here.
    ArrayType *AT = ArrayType::get(vecTy->getVectorElementType(), vecTy->getVectorNumElements());
+    Constant *InitVal = ConstantArray::get(AT, Elts);
    GlobalVariable *arrayMat = new llvm::GlobalVariable(
      *M, AT, /*IsConstant*/ false, llvm::GlobalValue::InternalLinkage,
-      /*InitVal*/ UndefValue::get(AT), GV->getName());
+      /*InitVal*/ InitVal, GV->getName());
    // Add debug info.
    if (m_HasDbgInfo) {
      DebugInfoFinder &Finder = m_pHLModule->GetOrCreateDebugInfoFinder();
--- a/lib/HLSL/HLModule.cpp
+++ b/lib/HLSL/HLModule.cpp
@ -212,6 +212,7 @@ void HLModule::RemoveFunction(llvm::Function *F) {
  m_HLFunctionPropsMap.erase(F);
  if (m_pTypeSystem.get()->GetFunctionAnnotation(F))
    m_pTypeSystem.get()->EraseFunctionAnnotation(F);
+  m_pOP->RemoveFunction(F);
 }

 template <typename TResource>
@ -315,6 +316,10 @@ DxilTypeSystem *HLModule::ReleaseTypeSystem() {
  return m_pTypeSystem.release();
 }

+hlsl::OP *HLModule::ReleaseOP() { // Returns the OP object held in m_pOP by calling release() on it.
+  return m_pOP.release(); // Presumably m_pOP is a unique_ptr, so the caller takes ownership - confirm.
+}
+
 RootSignatureHandle *HLModule::ReleaseRootSignature() {
  return m_RootSignature.release();
 }
@ -467,6 +472,10 @@ void HLModule::EmitHLMetadata() {
    NamedMDNode * resTyAnnotations = m_pModule->getOrInsertNamedMetadata(kHLDxilResourceTypeAnnotationMDName);
    resTyAnnotations->addOperand(EmitResTyAnnotations());
  }
+
+  if (!m_RootSignature->IsEmpty()) {
+    m_pMDHelper->EmitRootSignature(*m_RootSignature.get());
+  }
 }

 void HLModule::LoadHLMetadata() {
@ -541,6 +550,8 @@ void HLModule::LoadHLMetadata() {
    if (MDResTyAnnotations->getNumOperands())
      LoadResTyAnnotations(MDResTyAnnotations->getOperand(0));
  }
+
+  m_pMDHelper->LoadRootSignature(*m_RootSignature.get());
 }

 void HLModule::ClearHLMetadata(llvm::Module &M) {
@ -553,6 +564,7 @@ void HLModule::ClearHLMetadata(llvm::Module &M) {
    if (name == DxilMDHelper::kDxilVersionMDName ||
        name == DxilMDHelper::kDxilShaderModelMDName ||
        name == DxilMDHelper::kDxilEntryPointsMDName ||
+        name == DxilMDHelper::kDxilRootSignatureMDName ||
        name == DxilMDHelper::kDxilResourcesMDName ||
        name == DxilMDHelper::kDxilTypeSystemMDName ||
        name == kHLDxilFunctionPropertiesMDName || // TODO: adjust to proper name
@ -691,34 +703,11 @@ void HLModule::LoadResTyAnnotations(const llvm::MDOperand &MDO) {
 }

 MDTuple *HLModule::EmitHLShaderProperties() {
-  vector<Metadata *> MDVals;
-  if (!m_RootSignature->IsEmpty()) {
-    MDVals.emplace_back(m_pMDHelper->Uint32ToConstMD(DxilMDHelper::kDxilRootSignatureTag));
-    MDVals.emplace_back(m_pMDHelper->EmitRootSignature(*m_RootSignature.get()));
-  }
-  return MDNode::get(m_Ctx, MDVals);
+  return nullptr;
 }

 void HLModule::LoadHLShaderProperties(const MDOperand &MDO) {
-  if (MDO.get() == nullptr)
-    return;
-
-  const MDTuple *pTupleMD = dyn_cast<MDTuple>(MDO.get());
-  IFTBOOL(pTupleMD != nullptr, DXC_E_INCORRECT_DXIL_METADATA);
-  IFTBOOL((pTupleMD->getNumOperands() & 0x1) == 0, DXC_E_INCORRECT_DXIL_METADATA);
-  for (unsigned iNode = 0; iNode < pTupleMD->getNumOperands(); iNode += 2) {
-    unsigned Tag = DxilMDHelper::ConstMDToUint32(pTupleMD->getOperand(iNode));
-    const MDOperand &MDO = pTupleMD->getOperand(iNode + 1);
-    IFTBOOL(MDO.get() != nullptr, DXC_E_INCORRECT_DXIL_METADATA);
-    switch (Tag) {
-    case DxilMDHelper::kDxilRootSignatureTag:
-      m_pMDHelper->LoadRootSignature(MDO, *m_RootSignature.get());
-      break;
-    default:
-      // Ignore other extended properties for now.
-      break;
-    }
-  }
+  return;
 }

 // TODO: Don't check names.
@ -985,7 +974,8 @@ bool HLModule::HasPreciseAttributeWithMetadata(Instruction *I) {
 void HLModule::MarkPreciseAttributeWithMetadata(Instruction *I) {
  LLVMContext &Ctx = I->getContext();
  MDNode *preciseNode = MDNode::get(
-      Ctx, {MDString::get(Ctx, DxilMDHelper::kDxilPreciseAttributeMDName)});
+      Ctx,
+      {ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))});

  I->setMetadata(DxilMDHelper::kDxilPreciseAttributeMDName, preciseNode);
 }
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
--- a/lib/HLSL/HLOperationLowerExtension.cpp
+++ b/lib/HLSL/HLOperationLowerExtension.cpp
@ -34,6 +34,7 @@ ExtensionLowering::Strategy ExtensionLowering::GetStrategy(StringRef strategy) {
    case 'n': return Strategy::NoTranslation;
    case 'r': return Strategy::Replicate;
    case 'p': return Strategy::Pack;
+    case 'm': return Strategy::Resource;
    default: break;
  }
  return Strategy::Unknown;
@ -44,17 +45,18 @@ llvm::StringRef ExtensionLowering::GetStrategyName(Strategy strategy) {
    case Strategy::NoTranslation: return "n";
    case Strategy::Replicate:     return "r";
    case Strategy::Pack:          return "p";
+    case Strategy::Resource:      return "m"; // m for resource method
    default: break;
  }
  return "?";
 }

-ExtensionLowering::ExtensionLowering(Strategy strategy, HLSLExtensionsCodegenHelper *helper) 
-  : m_strategy(strategy), m_helper(helper)
+ExtensionLowering::ExtensionLowering(Strategy strategy, HLSLExtensionsCodegenHelper *helper, const HandleMap &handleMap, OP& hlslOp)
+  : m_strategy(strategy), m_helper(helper), m_handleMap(handleMap), m_hlslOp(hlslOp)
  {}

-ExtensionLowering::ExtensionLowering(StringRef strategy, HLSLExtensionsCodegenHelper *helper) 
-  : ExtensionLowering(GetStrategy(strategy), helper)
+ExtensionLowering::ExtensionLowering(StringRef strategy, HLSLExtensionsCodegenHelper *helper, const HandleMap &handleMap, OP& hlslOp)
+  : ExtensionLowering(GetStrategy(strategy), helper, handleMap, hlslOp)
  {}

 llvm::Value *ExtensionLowering::Translate(llvm::CallInst *CI) {
@ -62,6 +64,7 @@ llvm::Value *ExtensionLowering::Translate(llvm::CallInst *CI) {
  case Strategy::NoTranslation: return NoTranslation(CI);
  case Strategy::Replicate:     return Replicate(CI);
  case Strategy::Pack:          return Pack(CI);
+  case Strategy::Resource:      return Resource(CI);
  default: break;
  }
  return Unknown(CI);
@ -75,8 +78,17 @@ llvm::Value *ExtensionLowering::Unknown(CallInst *CI) {
 // Interface to describe how to translate types from HL-dxil to dxil.
 class FunctionTypeTranslator {
 public:
+  // Arguments can be exploded into multiple copies of the same type.
+  // For example a <2 x i32> could become { i32, 2 } if the vector
+  // is expanded in place or { i32, 1 } if the call is replicated.
+  struct ArgumentType {
+    Type *type;
+    int  count;
+
+    ArgumentType(Type *ty, int cnt = 1) : type(ty), count(cnt) {}
+  };
  virtual Type *TranslateReturnType(CallInst *CI) = 0;
-  virtual Type *TranslateArgumentType(Type *OrigArgType) = 0;
+  virtual ArgumentType TranslateArgumentType(Value *OrigArg) = 0;
 };

 // Class to create the new function with the translated types for low-level dxil.
@ -85,6 +97,10 @@ public:
  template <typename TypeTranslator>
  static Function *GetLoweredFunction(CallInst *CI, ExtensionLowering &lower) {
    TypeTranslator typeTranslator;
+    return GetLoweredFunction(typeTranslator, CI, lower);
+  }
+  
+  static Function *GetLoweredFunction(FunctionTypeTranslator &typeTranslator, CallInst *CI, ExtensionLowering &lower) {
    FunctionTranslator translator(typeTranslator, lower);
    return translator.GetLoweredFunction(CI);
  }
@ -120,9 +136,11 @@ private:
    SmallVector<Type *, 10> ParamTypes;
    ParamTypes.reserve(CI->getNumArgOperands());
    for (unsigned i = 0; i < CI->getNumArgOperands(); ++i) {
-      Type *OrigTy = CI->getArgOperand(i)->getType();
-      Type *TranslatedTy = m_typeTranslator.TranslateArgumentType(OrigTy);
-      ParamTypes.push_back(TranslatedTy);
+      Value *OrigArg = CI->getArgOperand(i);
+      FunctionTypeTranslator::ArgumentType newArgType = m_typeTranslator.TranslateArgumentType(OrigArg);
+      for (int i = 0; i < newArgType.count; ++i) {
+        ParamTypes.push_back(newArgType.type);
+      }
    }

    const bool IsVarArg = false;
@ -151,8 +169,8 @@ class NoTranslationTypeTranslator : public FunctionTypeTranslator {
  virtual Type *TranslateReturnType(CallInst *CI) override {
    return CI->getType();
  }
-  virtual Type *TranslateArgumentType(Type *OrigArgType) override {
-    return OrigArgType;
+  virtual ArgumentType TranslateArgumentType(Value *OrigArg) override {
+    return ArgumentType(OrigArg->getType());
  }
 };

@ -212,13 +230,13 @@ class ReplicatedFunctionTypeTranslator : public FunctionTypeTranslator {
    return RetTy;
  }

-  virtual Type *TranslateArgumentType(Type *OrigArgType) override {
-    Type *Ty = OrigArgType;
+  virtual ArgumentType TranslateArgumentType(Value *OrigArg) override {
+    Type *Ty = OrigArg->getType();
    if (Ty->isVectorTy()) {
      Ty = Ty->getVectorElementType();
    }

-    return Ty;
+    return ArgumentType(Ty);
  }

 };
@ -302,19 +320,27 @@ private:
  }
 };

-Value *ExtensionLowering::TranslateReplicating(CallInst *CI, Function *ReplicatedFunction) {
+// Translate the HL call by replicating the call for each vector element.
+//
+// For example,
+//
+//    <2xi32> %r = call @ext.foo(i32 %op, <2xi32> %v)
+//    ==>
+//    %r.1 = call @ext.foo.s(i32 %op, i32 %v.1)
+//    %r.2 = call @ext.foo.s(i32 %op, i32 %v.2)
+//    <2xi32> %r.v.1 = insertelement %r.1, 0, <2xi32> undef
+//    <2xi32> %r.v.2 = insertelement %r.2, 1, %r.v.1
+//
+// You can then RAUW %r with %r.v.2. The RAUW is not done by the translate function.
+Value *ExtensionLowering::Replicate(CallInst *CI) {
+  Function *ReplicatedFunction = FunctionTranslator::GetLoweredFunction<ReplicatedFunctionTypeTranslator>(CI, *this);
  if (!ReplicatedFunction)
-    return nullptr;
+    return NoTranslation(CI);

  ReplicateCall replicate(CI, *ReplicatedFunction);
  return replicate.Generate();
 }

-Value *ExtensionLowering::Replicate(CallInst *CI) {
-  Function *ReplicatedFunction = FunctionTranslator::GetLoweredFunction<ReplicatedFunctionTypeTranslator>(CI, *this);
-  return TranslateReplicating(CI, ReplicatedFunction);
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 // Packed Lowering.
 class PackCall {
@ -404,8 +430,8 @@ class PackedFunctionTypeTranslator : public FunctionTypeTranslator {
  virtual Type *TranslateReturnType(CallInst *CI) override {
    return TranslateIfVector(CI->getType());
  }
-  virtual Type *TranslateArgumentType(Type *OrigArgType) override {
-    return TranslateIfVector(OrigArgType);
+  virtual ArgumentType TranslateArgumentType(Value *OrigArg) override {
+    return ArgumentType(TranslateIfVector(OrigArg->getType()));
  }

  Type *TranslateIfVector(Type *ty) {
@ -418,13 +444,198 @@ class PackedFunctionTypeTranslator : public FunctionTypeTranslator {
 Value *ExtensionLowering::Pack(CallInst *CI) {
  Function *PackedFunction = FunctionTranslator::GetLoweredFunction<PackedFunctionTypeTranslator>(CI, *this);
  if (!PackedFunction)
-    return nullptr;
+    return NoTranslation(CI);

  PackCall pack(CI, *PackedFunction);
  Value *result = pack.Generate();
  return result;
 }

+///////////////////////////////////////////////////////////////////////////////
+// Resource Lowering.
+
+// Modify a call to a resource method. Makes the following transformations:
+//
+// 1. Convert non-void return value to dx.types.ResRet.
+// 2. Convert resource parameters to the corresponding dx.types.Handle value.
+// 3. Expand vectors in place as separate arguments.
+//
+// Example
+// -----------------------------------------------------------------------------
+//
+//  %0 = call <2 x float> MyBufferOp(i32 138, %class.Buffer %3, <2 x i32> <1 , 2> )
+//  %r = call %dx.types.ResRet.f32 MyBufferOp(i32 138, %dx.types.Handle %buf, i32 1, i32 2 )
+//  %x = extractvalue %r, 0
+//  %y = extractvalue %r, 1
+//  %v = <2 x float> undef
+//  %v.1 = insertelement %v,   %x, 0
+//  %v.2 = insertelement %v.1, %y, 1
+class ResourceMethodCall {
+public:
+  ResourceMethodCall(CallInst *CI, Function &explodedFunction, const ExtensionLowering::HandleMap &handleMap)
+    : m_CI(CI)
+    , m_explodedFunction(explodedFunction)
+    , m_handleMap(handleMap)
+    , m_builder(CI)
+  { }
+
+  Value *Generate() {
+    SmallVector<Value *, 16> args;
+    ExplodeArgs(args);
+    Value *result = CreateCall(args);
+    result = ConvertResult(result);
+    return result;
+  }
+  
+  // Check to see if the value is mapped to a handle in the handleMap.
+  static Instruction *IsResourceHandle(Value *OrigArg, const ExtensionLowering::HandleMap &handleMap) {
+    if (Instruction *Inst = dyn_cast<Instruction>(OrigArg)) {
+      if (handleMap.count(Inst))
+        return Inst;
+    }
+    return nullptr;
+  }
+  
+private:
+  CallInst *m_CI;
+  Function &m_explodedFunction;
+  const ExtensionLowering::HandleMap &m_handleMap;
+  IRBuilder<> m_builder;
+  
+  Value *GetResourceHandle(Value *OrigArg) {
+    if (Instruction *Inst = IsResourceHandle(OrigArg, m_handleMap))
+      return m_handleMap.at(Inst);
+    return nullptr;
+    
+  }
+
+  void ExplodeArgs(SmallVectorImpl<Value*> &args) {
+    for (Value *arg : m_CI->arg_operands()) {
+      // vector arg: <N x ty> -> ty, ty, ..., ty (N times)
+      if (arg->getType()->isVectorTy()) {
+        for (unsigned i = 0; i < arg->getType()->getVectorNumElements(); i++) {
+          Value *xarg = m_builder.CreateExtractElement(arg, i);
+          args.push_back(xarg);
+        }
+      }
+      // resource handle arg: handle -> dx.types.Handle
+      else if (Value *handle = GetResourceHandle(arg)) {
+        args.push_back(handle);
+      }
+      // any other value: arg -> arg
+      else {
+        args.push_back(arg);
+      }
+    }
+  }
+
+  Value *CreateCall(const SmallVectorImpl<Value*> &args) {
+    return m_builder.CreateCall(&m_explodedFunction, args);
+  }
+
+  Value *ConvertResult(Value *result) {
+    Type *origRetTy = m_CI->getType();
+    if (origRetTy->isVoidTy())
+      return ConvertVoidResult(result);
+    else if (origRetTy->isVectorTy())
+      return ConvertVectorResult(origRetTy, result);
+    else
+      return ConvertScalarResult(origRetTy, result);
+  }
+
+  // Void result does not need any conversion.
+  Value *ConvertVoidResult(Value *result) {
+    return result;
+  }
+
+  // Vector result will be populated with the elements from the resource return.
+  Value *ConvertVectorResult(Type *origRetTy, Value *result) {
+    Type *resourceRetTy = result->getType();
+    assert(origRetTy->isVectorTy());
+    assert(resourceRetTy->isStructTy() && "expected resource return type to be a struct");
+    
+    const unsigned vectorSize = origRetTy->getVectorNumElements();
+    const unsigned structSize = resourceRetTy->getStructNumElements();
+    const unsigned size = std::min(vectorSize, structSize);
+    assert(vectorSize < structSize);
+    
+    // Copy resource struct elements to vector.
+    Value *vector = UndefValue::get(origRetTy);
+    for (unsigned i = 0; i < size; ++i) {
+      Value *element = m_builder.CreateExtractValue(result, { i });
+      vector = m_builder.CreateInsertElement(vector, element, i);
+    }
+
+    return vector;
+  }
+
+  // Scalar result will be populated with the first element of the resource return.
+  Value *ConvertScalarResult(Type *origRetTy, Value *result) {
+    assert(origRetTy->isSingleValueType());
+    return m_builder.CreateExtractValue(result, { 0 });
+  }
+
+};
+
+// Translate function return and argument types for resource method lowering.
+class ResourceFunctionTypeTranslator : public FunctionTypeTranslator {
+public:
+  ResourceFunctionTypeTranslator(const ExtensionLowering::HandleMap &handleMap, OP& hlslOp)
+    : m_handleMap(handleMap)
+    , m_hlslOp(hlslOp)
+  { }
+
+  // Translate return type as follows:
+  //
+  // void     -> void
+  // <N x ty> -> dx.types.ResRet.ty
+  //  ty      -> dx.types.ResRet.ty
+  virtual Type *TranslateReturnType(CallInst *CI) override {
+    Type *RetTy = CI->getType();
+    if (RetTy->isVoidTy())
+      return RetTy;
+    else if (RetTy->isVectorTy())
+      RetTy = RetTy->getVectorElementType();
+
+    return m_hlslOp.GetResRetType(RetTy);
+  }
+  
+  // Translate argument type as follows:
+  //
+  // resource -> dx.types.Handle
+  // <N x ty> -> { ty, N }
+  //  ty      -> { ty, 1 }
+  virtual ArgumentType TranslateArgumentType(Value *OrigArg) override {
+    int count = 1;
+    Type *ty = OrigArg->getType();
+
+    if (ty->isVectorTy()) {
+      count = ty->getVectorNumElements();
+      ty = ty->getVectorElementType();
+    }
+    else if (ResourceMethodCall::IsResourceHandle(OrigArg, m_handleMap)) {
+      ty = m_hlslOp.GetHandleType();
+    }
+
+    return ArgumentType(ty, count);
+  }
+
+private:
+  const ExtensionLowering::HandleMap &m_handleMap;
+  OP& m_hlslOp;
+};
+
+Value *ExtensionLowering::Resource(CallInst *CI) {
+  ResourceFunctionTypeTranslator resourceTypeTranslator(m_handleMap, m_hlslOp);
+  Function *resourceFunction = FunctionTranslator::GetLoweredFunction(resourceTypeTranslator, CI, *this);
+  if (!resourceFunction)
+    return NoTranslation(CI);
+
+  ResourceMethodCall explode(CI, *resourceFunction, m_handleMap);
+  Value *result = explode.Generate();
+  return result;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Computing Extension Names.

--- a/lib/HLSL/ReducibilityAnalysis.cpp
+++ b/lib/HLSL/ReducibilityAnalysis.cpp
@ -48,10 +48,12 @@ class ReducibilityAnalysis : public FunctionPass {
 public:
  static char ID;

-  explicit ReducibilityAnalysis(IrreducibilityAction Action = IrreducibilityAction::ThrowException) 
-    : FunctionPass(ID), m_Action(Action), m_bReducible(false) {
-    initializeReducibilityAnalysisPass(*PassRegistry::getPassRegistry());
-  }
+  ReducibilityAnalysis()
+      : FunctionPass(ID), m_Action(IrreducibilityAction::ThrowException),
+        m_bReducible(false) {}
+
+  explicit ReducibilityAnalysis(IrreducibilityAction Action)
+      : FunctionPass(ID), m_Action(Action), m_bReducible(false) {}

  virtual bool runOnFunction(Function &F);

--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp
@ -28,6 +28,7 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "dxc/HLSL/HLModule.h" // HLSL Change
+#include "dxc/HLSL/DxilModule.h" // HLSL Change
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/RWMutex.h"
 #include "llvm/Support/StringPool.h"
@ -237,11 +238,13 @@ Type *Function::getReturnType() const {

 void Function::removeFromParent() {
  if (getParent()->HasHLModule()) getParent()->GetHLModule().RemoveFunction(this); // HLSL Change
+  if (getParent()->HasDxilModule()) getParent()->GetDxilModule().RemoveFunction(this); // HLSL Change
  getParent()->getFunctionList().remove(this);
 }

 void Function::eraseFromParent() {
  if (getParent()->HasHLModule()) getParent()->GetHLModule().RemoveFunction(this); // HLSL Change
+  if (getParent()->HasDxilModule()) getParent()->GetDxilModule().RemoveFunction(this); // HLSL Change
  getParent()->getFunctionList().erase(this);
 }

--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@ -209,6 +209,9 @@ static void addHLSLPasses(bool HLSLHighLevel, bool NoOpt, hlsl::HLSLExtensionsCo
  // Change dynamic indexing vector to array.
  MPM.add(createDynamicIndexingVectorToArrayPass(NoOpt));

+  MPM.add(createSimplifyInstPass());
+  MPM.add(createCFGSimplificationPass());
+
  MPM.add(createDxilGenerationPass(NoOpt, ExtHelper));

  MPM.add(createSimplifyInstPass());
@ -262,6 +265,7 @@ void PassManagerBuilder::populateModulePassManager(
    if (!HLSLHighLevel) {
      MPM.add(createMultiDimArrayToOneDimArrayPass());// HLSL Change
      MPM.add(createDxilCondenseResourcesPass()); // HLSL Change
+      MPM.add(createDxilLegalizeSampleOffsetPass()); // HLSL Change
      MPM.add(createDxilEmitMetadataPass());      // HLSL Change
    }
    // HLSL Change Ends.
@ -527,6 +531,8 @@ void PassManagerBuilder::populateModulePassManager(
  if (!HLSLHighLevel) {
    MPM.add(createMultiDimArrayToOneDimArrayPass());// HLSL Change
    MPM.add(createDxilCondenseResourcesPass());
+    if (DisableUnrollLoops)
+      MPM.add(createDxilLegalizeSampleOffsetPass()); // HLSL Change
    MPM.add(createDxilEmitMetadataPass());
  }
  // HLSL Change Ends.
--- a/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregatesHLSL.cpp
@ -2045,8 +2045,10 @@ bool SROA_HLSL::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size,
 }

 /// LoadVectorOrStructArray - Load vector array like [2 x <4 x float>] from
-///  arrays like 4 [2 x float].
-static Value *LoadVectorArray(ArrayType *AT, ArrayRef<Value *> NewElts,
+///  arrays like 4 [2 x float] or struct array like
+///  [2 x { <4 x float>, < 4 x uint> }]
+/// from arrays like [ 2 x <4 x float> ], [ 2 x <4 x uint> ].
+static Value *LoadVectorOrStructArray(ArrayType *AT, ArrayRef<Value *> NewElts,
                              SmallVector<Value *, 8> &idxList,
                              IRBuilder<> &Builder) {
  Type *EltTy = AT->getElementType();
@ -2059,7 +2061,7 @@ static Value *LoadVectorArray(ArrayType *AT, ArrayRef<Value *> NewElts,
    idxList.emplace_back(idx);

    if (ArrayType *EltAT = dyn_cast<ArrayType>(EltTy)) {
-      Value *EltVal = LoadVectorArray(EltAT, NewElts, idxList, Builder);
+      Value *EltVal = LoadVectorOrStructArray(EltAT, NewElts, idxList, Builder);
      retVal = Builder.CreateInsertValue(retVal, EltVal, i);
    } else {
      assert(EltTy->isVectorTy() ||
@ -2087,9 +2089,12 @@ static Value *LoadVectorArray(ArrayType *AT, ArrayRef<Value *> NewElts,
  }
  return retVal;
 }
+
 /// StoreVectorOrStructArray - Store vector array like [2 x <4 x float>] to
-///  arrays like 4 [2 x float].
-static void StoreVectorArray(ArrayType *AT, Value *val,
+///  arrays like 4 [2 x float] or struct array like
+///  [2 x { <4 x float>, < 4 x uint> }]
+/// to arrays like [ 2 x <4 x float> ], [ 2 x <4 x uint> ].
+static void StoreVectorOrStructArray(ArrayType *AT, Value *val,
                             ArrayRef<Value *> NewElts,
                             SmallVector<Value *, 8> &idxList,
                             IRBuilder<> &Builder) {
@ -2104,7 +2109,7 @@ static void StoreVectorArray(ArrayType *AT, Value *val,
    idxList.emplace_back(idx);

    if (ArrayType *EltAT = dyn_cast<ArrayType>(EltTy)) {
-      StoreVectorArray(EltAT, elt, NewElts, idxList, Builder);
+      StoreVectorOrStructArray(EltAT, elt, NewElts, idxList, Builder);
    } else {
      assert(EltTy->isVectorTy() ||
             EltTy->isStructTy() && "must be a vector or struct type");
@ -2532,16 +2537,21 @@ void SROA_Helper::RewriteForGEP(GEPOperator *GEP, IRBuilder<> &Builder) {
  }
 }

-/// isVectorArray - Check if T is array of vector.
-static bool isVectorArray(Type *T) {
+static Type *getArrayEltType(Type *T) {
+  while (isa<ArrayType>(T)) {
+    T = T->getArrayElementType();
+  }
+  return T;
+}
+
+/// isVectorOrStructArray - Check if T is array of vector or struct.
+static bool isVectorOrStructArray(Type *T) {
  if (!T->isArrayTy())
    return false;

-  while (T->getArrayElementType()->isArrayTy()) {
-    T = T->getArrayElementType();
-  }
+  T = getArrayEltType(T);

-  return T->getArrayElementType()->isVectorTy();
+  return T->isStructTy() || T->isVectorTy();
 }

 static void SimplifyStructValUsage(Value *StructVal, std::vector<Value *> Elts,
@ -2596,7 +2606,7 @@ void SROA_Helper::RewriteForLoad(LoadInst *LI) {
    LI->replaceAllUsesWith(Insert);
    DeadInsts.push_back(LI);
  } else if (isCompatibleAggregate(LIType, ValTy)) {
-    if (isVectorArray(LIType)) {
+    if (isVectorOrStructArray(LIType)) {
      // Replace:
      //   %res = load [2 x <2 x float>] * %alloc
      // with:
@ -2610,7 +2620,7 @@ void SROA_Helper::RewriteForLoad(LoadInst *LI) {
      SmallVector<Value *, 8> idxList;
      idxList.emplace_back(zero);
      Value *newLd =
-          LoadVectorArray(cast<ArrayType>(LIType), NewElts, idxList, Builder);
+          LoadVectorOrStructArray(cast<ArrayType>(LIType), NewElts, idxList, Builder);
      LI->replaceAllUsesWith(newLd);
      DeadInsts.push_back(LI);
    } else {
@ -2674,7 +2684,7 @@ void SROA_Helper::RewriteForStore(StoreInst *SI) {
    }
    DeadInsts.push_back(SI);
  } else if (isCompatibleAggregate(SIType, ValTy)) {
-    if (isVectorArray(SIType)) {
+    if (isVectorOrStructArray(SIType)) {
      // Replace:
      //   store [2 x <2 x i32>] %val, [2 x <2 x i32>]* %alloc, align 16
      // with:
@ -2701,7 +2711,7 @@ void SROA_Helper::RewriteForStore(StoreInst *SI) {
      Value *zero = ConstantInt::get(i32Ty, 0);
      SmallVector<Value *, 8> idxList;
      idxList.emplace_back(zero);
-      StoreVectorArray(AT, Val, NewElts, idxList, Builder);
+      StoreVectorOrStructArray(AT, Val, NewElts, idxList, Builder);
      DeadInsts.push_back(SI);
    } else {
      // Replace:
@ -2715,9 +2725,9 @@ void SROA_Helper::RewriteForStore(StoreInst *SI) {
      Module *M = SI->getModule();
      for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
        Value *Extract = Builder.CreateExtractValue(Val, i, Val->getName());
-        if (!HLMatrixLower::IsMatrixType(Extract->getType()))
+        if (!HLMatrixLower::IsMatrixType(Extract->getType())) {
          Builder.CreateStore(Extract, NewElts[i]);
-        else {
+        } else {
          // Generate Matrix Store.
          HLModule::EmitHLOperationCall(
              Builder, HLOpcodeGroup::HLMatLoadStore,
@ -3201,6 +3211,44 @@ bool SROA_Helper::DoScalarReplacement(Value *V, std::vector<Value *> &Elts,
  return true;
 }

+static Constant *GetEltInit(Type *Ty, Constant *Init, unsigned idx,
+                            Type *EltTy) {
+  if (isa<UndefValue>(Init))
+    return UndefValue::get(EltTy);
+
+  if (StructType *ST = dyn_cast<StructType>(Ty)) {
+    return Init->getAggregateElement(idx);
+  } else if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+    return Init->getAggregateElement(idx);
+  } else {
+    ArrayType *AT = cast<ArrayType>(Ty);
+    ArrayType *EltArrayTy = cast<ArrayType>(EltTy);
+    std::vector<Constant *> Elts;
+    if (!AT->getElementType()->isArrayTy()) {
+      for (unsigned i = 0; i < AT->getNumElements(); i++) {
+        // Get Array[i]
+        Constant *InitArrayElt = Init->getAggregateElement(i);
+        // Get Array[i].idx
+        InitArrayElt = InitArrayElt->getAggregateElement(idx);
+        Elts.emplace_back(InitArrayElt);
+      }
+      return ConstantArray::get(EltArrayTy, Elts);
+    } else {
+      Type *EltTy = AT->getElementType();
+      ArrayType *NestEltArrayTy = cast<ArrayType>(EltArrayTy->getElementType());
+      // Nested array.
+      for (unsigned i = 0; i < AT->getNumElements(); i++) {
+        // Get Array[i]
+        Constant *InitArrayElt = Init->getAggregateElement(i);
+        // Get Array[i].idx
+        InitArrayElt = GetEltInit(EltTy, InitArrayElt, idx, NestEltArrayTy);
+        Elts.emplace_back(InitArrayElt);
+      }
+      return ConstantArray::get(EltArrayTy, Elts);
+    }
+  }
+}
+
 /// DoScalarReplacement - Split V into AllocaInsts with Builder and save the new AllocaInsts into Elts.
 /// Then do SROA on V.
 bool SROA_Helper::DoScalarReplacement(GlobalVariable *GV, std::vector<Value *> &Elts,
@ -3242,7 +3290,7 @@ bool SROA_Helper::DoScalarReplacement(GlobalVariable *GV, std::vector<Value *> &
    Elts.reserve(numTypes);
    //DxilStructAnnotation *SA = typeSys.GetStructAnnotation(ST);
    for (int i = 0, e = numTypes; i != e; ++i) {
-      Constant *EltInit = cast<Constant>(Builder.CreateExtractValue(Init, i));
+      Constant *EltInit = GetEltInit(Ty, Init, i, ST->getElementType(i));
      GlobalVariable *EltGV = new llvm::GlobalVariable(
          *M, ST->getContainedType(i), /*IsConstant*/ isConst, linkage,
          /*InitVal*/ EltInit, GV->getName() + "." + Twine(i),
@ -3261,7 +3309,7 @@ bool SROA_Helper::DoScalarReplacement(GlobalVariable *GV, std::vector<Value *> &
    Type *EltTy = VT->getElementType();
    //DxilStructAnnotation *SA = typeSys.GetStructAnnotation(ST);
    for (int i = 0, e = numElts; i != e; ++i) {
-      Constant *EltInit = cast<Constant>(Builder.CreateExtractElement(Init, i));
+      Constant *EltInit = GetEltInit(Ty, Init, i, EltTy);
      GlobalVariable *EltGV = new llvm::GlobalVariable(
          *M, EltTy, /*IsConstant*/ isConst, linkage,
          /*InitVal*/ EltInit, GV->getName() + "." + Twine(i),
@ -3302,8 +3350,7 @@ bool SROA_Helper::DoScalarReplacement(GlobalVariable *GV, std::vector<Value *> &
      for (int i = 0, e = numTypes; i != e; ++i) {
        Type *EltTy =
            CreateNestArrayTy(ElST->getContainedType(i), nestArrayTys);
-        // Don't need InitVal, struct type will use store to init.
-        Constant *EltInit = UndefValue::get(EltTy);
+        Constant *EltInit = GetEltInit(Ty, Init, i, EltTy);
        GlobalVariable *EltGV = new llvm::GlobalVariable(
            *M, EltTy, /*IsConstant*/ isConst, linkage,
            /*InitVal*/ EltInit, GV->getName() + "." + Twine(i),
@ -3329,8 +3376,7 @@ bool SROA_Helper::DoScalarReplacement(GlobalVariable *GV, std::vector<Value *> &
          CreateNestArrayTy(ElVT->getElementType(), nestArrayTys);

      for (int i = 0, e = ElVT->getNumElements(); i != e; ++i) {
-        // Don't need InitVal, struct type will use store to init.
-        Constant *EltInit = UndefValue::get(scalarArrayTy);
+        Constant *EltInit = GetEltInit(Ty, Init, i, scalarArrayTy);
        GlobalVariable *EltGV = new llvm::GlobalVariable(
            *M, scalarArrayTy, /*IsConstant*/ isConst, linkage,
            /*InitVal*/ EltInit, GV->getName() + "." + Twine(i),
@ -4069,8 +4115,11 @@ void SROA_Parameter_HLSL::flattenArgument(
                DXASSERT(data->getType()->isPointerTy(),
                         "Append value must be pointer.");
                IRBuilder<> Builder(CI);
-                Value *ldInst = Builder.CreateLoad(data);
-                Builder.CreateStore(ldInst, outputVal);
+
+                llvm::SmallVector<llvm::Value *, 16> idxList;
+                SplitCpy(data->getType(), outputVal, data, idxList,
+                         /*bAllowReplace*/ false, Builder);
+
                CI->setArgOperand(HLOperandIndex::kStreamAppendDataOpIndex, outputVal);
              }
              else {
@ -4089,9 +4138,13 @@ void SROA_Parameter_HLSL::flattenArgument(
                DXASSERT_LOCALVAR(eltCount, eltCount == EltPtrList.size(), "invalid element count");

                for (unsigned i = HLOperandIndex::kStreamAppendDataOpIndex; i < CI->getNumArgOperands(); i++) {
-                  Value *Elt = Builder.CreateLoad(CI->getArgOperand(i));
-                  Value *EltPtr = EltPtrList[i-HLOperandIndex::kStreamAppendDataOpIndex];
-                  Builder.CreateStore(Elt, EltPtr);
+                  Value *DataPtr = CI->getArgOperand(i);
+                  Value *EltPtr =
+                      EltPtrList[i - HLOperandIndex::kStreamAppendDataOpIndex];
+
+                  llvm::SmallVector<llvm::Value *, 16> idxList;
+                  SplitCpy(DataPtr->getType(), EltPtr, DataPtr, idxList,
+                           /*bAllowReplace*/ false, Builder);
                  CI->setArgOperand(i, EltPtr);
                }
              }
@ -4255,6 +4308,17 @@ static void LegalizeDxilInputOutputs(Function *F, DxilFunctionAnnotation *EntryA
      bNeedTemp = true;
      bLoadOutputFromTemp = true;
      bStoreInputToTemp = true;
+    } else if (bLoad && bStore) {
+      bNeedTemp = true;
+      switch (qual) {
+      case DxilParamInputQual::InputPrimitive:
+      case DxilParamInputQual::InputPatch:
+      case DxilParamInputQual::OutputPatch:
+        bStoreInputToTemp = true;
+        break;
+      default:
+        DXASSERT(0, "invalid input qual here");
+      }
    }

    if (HLMatrixLower::IsMatrixType(Ty)) {
@ -4629,8 +4693,7 @@ void SROA_Parameter_HLSL::createFlattenedFunctionCall(Function *F, Function *fla
       DILocation *DL = DILocation::get(F->getContext(), funcDI->getLine(), 0,  funcDI);
       DIB.insertDeclare(retValAddr, RetVar, Expr, DL, CI);
    }
-    // Create store after call.
-    RetBuilder.CreateStore(CI, retValAddr);
+
    // Load ret value and replace CI.
    Value *newRetVal = RetBuilder.CreateLoad(retValAddr);
    CI->replaceAllUsesWith(newRetVal);
@ -5166,6 +5229,26 @@ bool DynamicIndexingVectorToArray::runOnFunction(Function &F) {
  return size > 0;
 }

+static Constant *VectorConstToArray(Type *VecTy, Constant *C, ArrayType *ArrayTy) {
+  if (VecTy->isVectorTy()) {
+    SmallVector<Constant *, 4> Elts;
+    for (unsigned i=0;i<VecTy->getVectorNumElements();i++) {
+      Elts.emplace_back(C->getAggregateElement(i));
+    }
+    return ConstantArray::get(ArrayTy, Elts);
+  } else {
+    ArrayType *AT = cast<ArrayType>(VecTy);
+    Type *EltTy = AT->getElementType();
+    ArrayType *EltArrayTy = cast<ArrayType>(ArrayTy->getElementType());
+    SmallVector<Constant *, 4> Elts;
+    for (unsigned i=0;i<AT->getNumElements();i++) {
+      Constant *Elt = VectorConstToArray(EltTy, C->getAggregateElement(i), EltArrayTy);
+      Elts.emplace_back(Elt);
+    }
+    return ConstantArray::get(ArrayTy, Elts);
+  }
+}
+
 void DynamicIndexingVectorToArray::runOnInternalGlobal(GlobalVariable *GV,
                                                       HLModule *HLM) {
  Type *Ty = GV->getType()->getPointerElementType();
@ -5197,15 +5280,7 @@ void DynamicIndexingVectorToArray::runOnInternalGlobal(GlobalVariable *GV,
      InitVal = ConstantAggregateZero::get(AT);
    else if (!isa<UndefValue>(vecInitVal)) {
      // build arrayInitVal.
-      // Only vector initializer could reach here.
-      // Complex case will use store to init.
-      DXASSERT_NOMSG(vecInitVal->getType()->isVectorTy());
-      ConstantDataVector *CDV = cast<ConstantDataVector>(vecInitVal);
-      unsigned vecSize = CDV->getType()->getVectorNumElements();
-      std::vector<Constant *> vals;
-      for (unsigned i = 0; i < vecSize; i++)
-        vals.emplace_back(CDV->getAggregateElement(i));
-      InitVal = ConstantArray::get(AT, vals);
+      InitVal = VectorConstToArray(vecInitVal->getType(), vecInitVal, AT);
    }
  }

@ -5394,6 +5469,18 @@ void MultiDimArrayToOneDimArray::flattenAlloca(AllocaInst *AI) {
  AI->eraseFromParent();
 }

+static void FlattenMultiDimConstArray(Constant *V,
+                                      std::vector<Constant *> &Elts) {
+  if (!V->getType()->isArrayTy()) {
+    Elts.emplace_back(V);
+  } else {
+    ArrayType *AT = cast<ArrayType>(V->getType());
+    for (unsigned i = 0; i < AT->getNumElements(); i++) {
+      FlattenMultiDimConstArray(V->getAggregateElement(i), Elts);
+    }
+  }
+}
+
 void MultiDimArrayToOneDimArray::flattenGlobal(GlobalVariable *GV, DxilModule *DM) {
  Type *Ty = GV->getType()->getElementType();

@ -5416,8 +5503,11 @@ void MultiDimArrayToOneDimArray::flattenGlobal(GlobalVariable *GV, DxilModule *D
      InitVal = ConstantAggregateZero::get(AT);
    else if (isa<UndefValue>(InitVal))
      InitVal = UndefValue::get(AT);
-    else
-      DXASSERT(0, "invalid initializer");
+    else {
+      std::vector<Constant *> Elts;
+      FlattenMultiDimConstArray(InitVal, Elts);
+      InitVal = ConstantArray::get(AT, Elts);
+    }
  } else {
    InitVal = UndefValue::get(AT);
  }
--- a/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/lib/Transforms/Scalar/Scalarizer.cpp
@ -344,8 +344,8 @@ void Scalarizer::transferMetadata(Instruction *Op, const ValueVector &CV) {
      // HLSL Change Begins
      // Transfer FPMath flag.
      if (FPMathOperator *FPMath = dyn_cast<FPMathOperator>(New)) {
-        FPMathOperator *FPMathOp = dyn_cast<FPMathOperator>(Op);
-        New->copyFastMathFlags(FPMathOp->getFastMathFlags());
+        if (FPMathOperator *FPMathOp = dyn_cast<FPMathOperator>(Op))
+          New->copyFastMathFlags(FPMathOp->getFastMathFlags());
      }
      // HLSL Change Ends
    }
--- a/tools/clang/include/clang/AST/HlslTypes.h
+++ b/tools/clang/include/clang/AST/HlslTypes.h
@ -66,6 +66,8 @@ enum HLSLScalarType {
  HLSLScalarType_uint64,
 };

+HLSLScalarType MakeUnsigned(HLSLScalarType T);
+
 static const HLSLScalarType HLSLScalarType_minvalid = HLSLScalarType_bool;
 static const HLSLScalarType HLSLScalarType_max = HLSLScalarType_uint64;
 static const size_t HLSLScalarTypeCount = static_cast<size_t>(HLSLScalarType_max) + 1;
--- a/tools/clang/include/clang/Basic/Attr.td
+++ b/tools/clang/include/clang/Basic/Attr.td
@ -838,6 +838,12 @@ def HLSLTriangleAdj : InheritableAttr {
  let Documentation = [Undocumented];
 }

+def HLSLGloballyCoherent : InheritableAttr {
+  let Spellings = [CXX11<"", "globallycoherent", 2015>];
+  let Subjects = SubjectList<[Var]>;
+  let Documentation = [Undocumented];
+}
+
 // HLSL Change Ends

 def C11NoReturn : InheritableAttr {
--- a/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td
@ -7227,6 +7227,7 @@ def warn_sync_fetch_and_nand_semantics_change : Warning<
  InGroup<DiagGroup<"sync-fetch-and-nand-semantics-changed">>;

 // Type
+def err_sema_invalid_sign_spec : Error<"'%0' cannot be signed or unsigned">;
 def ext_invalid_sign_spec : Extension<"'%0' cannot be signed or unsigned">;
 def warn_receiver_forward_class : Warning<
    "receiver %0 is a forward class and corresponding @interface may not exist">,
@ -7453,6 +7454,8 @@ def err_hlsl_matrix_member_mixing_refs: Error<
  "matrix subscript '%0' mixes one-based and zero-based references">;
 def err_hlsl_matrix_member_out_of_bounds: Error<
  "matrix subscript '%0' is out of bounds">;
+def err_hlsl_matrix_row_index_out_of_bounds: Error<
+  "matrix row index '%0' is out of bounds">;
 def err_hlsl_matrix_member_too_many_positions: Error<
  "more than four positions are referenced in '%0'">;
 def err_hlsl_matrix_member_zero_in_one_based: Error<
@ -7463,6 +7466,8 @@ def err_hlsl_vector_member_empty: Error<
  "vector swizzle is empty '%0'">;
 def err_hlsl_vector_member_out_of_bounds: Error<
  "vector swizzle '%0' is out of bounds">;
+def err_hlsl_vector_element_index_out_of_bounds: Error<
+  "vector element index '%0' is out of bounds">;
 def err_hlsl_vector_member_too_many_positions: Error<
  "more than four positions are referenced in '%0'">;
 def err_hlsl_missing_type_specifier : Error< // Patterened after err_missing_type_specifier
@ -7505,6 +7510,8 @@ def err_hlsl_unsupported_array_size: Error<
  "array dimension must be between 1 and 65536">;
 def err_hlsl_unsupported_bool_lvalue_op : Error<
  "operator cannot be used with a bool lvalue">;
+def err_hlsl_unsupported_lvalue_cast_op : Error<
+  "cannot truncate lvalue vector/matrix">;
 def err_hlsl_unsupported_buffer_packoffset : Error<
  "packoffset is only allowed within a constant buffer, not on the constant buffer declaration">;
 def err_hlsl_unsupported_cbuffer_register : Error<
@ -7554,6 +7561,8 @@ def err_hlsl_vla : Error< // Patterened after err_opencl_vla
  "variable length arrays are not supported in HLSL">;
 def err_hlsl_type_empty_init : Error<
  "%0 cannot have an explicit empty initializer">;
+def err_hlsl_control_flow_cond_not_scalar : Error<
+  "%0 statement conditional expressions must evaluate to a scalar">;
 def err_hlsl_unsupportedvectortype : Error<
  "%0 is declared with type %1, but only primitive scalar values are supported">;
 def err_hlsl_unsupportedvectorsize : Error<
@ -7630,6 +7639,8 @@ def warn_hlsl_unused_call : Warning<
  "ignoring return value of function that only reads data">,
  InGroup<UnusedValue>;
 }
+def err_hlsl_func_in_func_decl : Error<
+   "function declaration is not allowed in function parameters">;
 // HLSL Change Ends

 let CategoryName = "OpenMP Issue" in {
--- a/tools/clang/include/clang/Basic/TokenKinds.def
+++ b/tools/clang/include/clang/Basic/TokenKinds.def
@ -510,11 +510,13 @@ KEYWORD(line                        , KEYHLSL)
 KEYWORD(lineadj                     , KEYHLSL)
 KEYWORD(triangle                    , KEYHLSL)
 KEYWORD(triangleadj                 , KEYHLSL)
+KEYWORD(globallycoherent            , KEYHLSL)
 KEYWORD(interface                   , KEYHLSL)
 KEYWORD(sampler_state               , KEYHLSL)
 KEYWORD(technique                   , KEYHLSL)
 ALIAS("Technique", technique        , KEYHLSL)
 ALIAS("technique10", technique      , KEYHLSL)
+ALIAS("technique11", technique      , KEYHLSL)

 // OpenCL address space qualifiers
 KEYWORD(__global                    , KEYOPENCL)
--- a/tools/clang/include/clang/Frontend/CodeGenOptions.h
+++ b/tools/clang/include/clang/Frontend/CodeGenOptions.h
@ -196,6 +196,8 @@ public:
  std::vector<std::string> HLSLArguments;
  /// Helper for generating llvm bitcode for hlsl extensions.
  std::shared_ptr<hlsl::HLSLExtensionsCodegenHelper> HLSLExtensionsCodegen;
+  /// Signature packing mode (0 == default for target)
+  unsigned HLSLSignaturePackingStrategy = 0;
  // HLSL Change Ends
  /// Regular expression to select optimizations for which we should enable
  /// optimization remarks. Transformation passes whose name matches this
--- a/tools/clang/include/clang/Frontend/FrontendActions.h
+++ b/tools/clang/include/clang/Frontend/FrontendActions.h
@ -13,6 +13,12 @@
 #include <string>
 #include <vector>

+// HLSL Change Begin.
+namespace hlsl {
+class RootSignatureHandle;
+}
+// HLSL Change End.
+
 namespace clang {

 class Module;
@ -235,6 +241,26 @@ protected:

  bool hasPCHSupport() const override { return true; }
 };
+
+// HLSL Change Begin.
+class HLSLRootSignatureAction : public PreprocessorFrontendAction {
+private:
+  std::string HLSLRootSignatureMacro; // name of the macro looked up by ExecuteAction
+  unsigned rootSigMajor;
+  unsigned rootSigMinor;
+  std::unique_ptr<hlsl::RootSignatureHandle> rootSigHandle;
+
+protected:
+  void ExecuteAction() override;
+
+public:
+  HLSLRootSignatureAction(StringRef rootSigMacro, unsigned major,
+                          unsigned minor);
+  /// Transfer ownership of the root signature handle to the caller, for use
+  /// after the action has been run. The member is moved from by this call.
+  std::unique_ptr<hlsl::RootSignatureHandle> takeRootSigHandle();
+};
+// HLSL Change End.
  
 }  // end namespace clang

--- a/tools/clang/include/clang/Lex/HLSLMacroExpander.h
+++ b/tools/clang/include/clang/Lex/HLSLMacroExpander.h
@ -0,0 +1,63 @@
+//===--- HLSLMacroExpander.h - Standalone Macro expansion ------*- C++ -*-===//
+///////////////////////////////////////////////////////////////////////////////
+//                                                                           //
+// HLSLMacroExpander.h                                                       //
+// Copyright (C) Microsoft Corporation. All rights reserved.                 //
+// This file is distributed under the University of Illinois Open Source     //
+// License. See LICENSE.TXT for details.                                     //
+//                                                                           //
+//  This file defines utilities for expanding macros after lexing has        //
+//  completed. Normally, macros are expanded as part of the lexing           //
+//  phase and returned in an expanded form directly from the lexer.          //
+//  For hlsl we need to be able to expand macros after the fact to           //
+//  correctly capture semantic defines and root signature defines.           //
+//                                                                           //
+///////////////////////////////////////////////////////////////////////////////
+#ifndef LLVM_CLANG_LEX_HLSLMACROEXPANDER_H
+#define LLVM_CLANG_LEX_HLSLMACROEXPANDER_H
+
+#include "clang/Basic/SourceLocation.h"
+
+#include <string>
+#include <utility>
+
+namespace clang {
+  class Preprocessor;
+  class Token;
+  class MacroInfo;
+}
+
+namespace llvm {
+  class StringRef;
+}
+
+namespace hlsl {
+class MacroExpander {
+public:
+  // Bit flags for the constructor's options argument.
+  enum Option : unsigned {
+    // Strip quotes from string literals. Enables concatenating adjacent
+    // string literals into a single value.
+    STRIP_QUOTES = 1 << 1,
+  };
+
+  // Binds the expander to PP; options is a bitwise OR of Option flags.
+  MacroExpander(clang::Preprocessor &PP, unsigned options = 0);
+
+  // Expand the given macro into the output string.
+  // Returns true if macro was expanded successfully; a null macro or a
+  // null out pointer is reported as failure.
+  bool ExpandMacro(clang::MacroInfo *macro, std::string *out);
+
+  // Look in the preprocessor for a macro with the provided name.
+  // Return nullptr if the macro could not be found.
+  static clang::MacroInfo *FindMacroInfo(clang::Preprocessor &PP, llvm::StringRef macroName);
+
+private:
+  clang::Preprocessor &PP;
+  clang::FileID m_expansionFileId; // scratch file entered by ExpandMacro
+  bool m_stripQuotes;              // true when STRIP_QUOTES was requested
+};
+}
+
+#endif // header include guard
--- a/tools/clang/include/clang/Parse/ParseHLSL.h
+++ b/tools/clang/include/clang/Parse/ParseHLSL.h
@ -20,6 +20,7 @@ class raw_ostream;
 namespace hlsl {
 enum class DxilRootSignatureVersion;
 struct DxilVersionedRootSignatureDesc;
+class  RootSignatureHandle;
 }

 namespace clang {
@ -33,6 +34,10 @@ bool ParseHLSLRootSignature(_In_count_(Len) const char *pData, unsigned Len,
 void ReportHLSLRootSigError(clang::DiagnosticsEngine &Diags,
                            clang::SourceLocation Loc,
                            _In_count_(Len) const char *pData, unsigned Len);
+void CompileRootSignature(StringRef rootSigStr, DiagnosticsEngine &Diags,
+                          SourceLocation SLoc,
+                          hlsl::DxilRootSignatureVersion rootSigVer,
+                          hlsl::RootSignatureHandle *pRootSigHandle);
 }

 #endif
--- a/tools/clang/include/clang/Sema/DeclSpec.h
+++ b/tools/clang/include/clang/Sema/DeclSpec.h
@ -588,15 +588,10 @@ public:

  /// \brief Return true if any type-specifier has been found.
  bool hasTypeSpecifier() const {
-    // HLSL Note: snorm and unorm are not, by themselves, good enough to generate a type
-    // (unlike, for example, 'unsigned' in 'unsigned i = 0;').
-    // If they were, the parser would need to be updated, because typedefs will not work
-    // correctly (so, unsigned min16uint foo fails because min16uint isn't a keyword, and
-    // it will try to parse it as the identifier name).
-    return getTypeSpecType() != DeclSpec::TST_unspecified ||
-           getTypeSpecWidth() != DeclSpec::TSW_unspecified ||
-           getTypeSpecComplex() != DeclSpec::TSC_unspecified ||
-           getTypeSpecSign() != DeclSpec::TSS_unspecified;
+    // HLSL Change - unsigned is not a complete type specifier, so the sign is not checked.
+    return getTypeSpecType() != DeclSpec::TST_unspecified ||
+           getTypeSpecWidth() != DeclSpec::TSW_unspecified ||
+           getTypeSpecComplex() != DeclSpec::TSC_unspecified;
  }

  /// \brief Return a bitmask of which flavors of specifiers this
--- a/tools/clang/include/clang/Sema/Sema.h
+++ b/tools/clang/include/clang/Sema/Sema.h
@ -8781,6 +8781,9 @@ private:
  void CheckArrayAccess(const Expr *BaseExpr, const Expr *IndexExpr,
                        const ArraySubscriptExpr *ASE=nullptr,
                        bool AllowOnePastEnd=true, bool IndexNegated=false);
+  // HLSL Change Starts - checking array subscript access to vector or matrix member
+  void CheckHLSLArrayAccess(const Expr *expr);
+  // HLSL Change ends
  void CheckArrayAccess(const Expr *E);
  // Used to grab the relevant information from a FormatAttr and a
  // FunctionDeclaration.
--- a/tools/clang/include/clang/Sema/SemaHLSL.h
+++ b/tools/clang/include/clang/Sema/SemaHLSL.h
@ -73,6 +73,10 @@ void DiagnoseAssignmentResultForHLSL(
  clang::Sema::AssignmentAction Action,
  bool *Complained);

+void DiagnoseControlFlowConditionForHLSL(clang::Sema *self,
+                                         clang::Expr *condExpr,
+                                         llvm::StringRef StmtName);
+
 void DiagnosePackingOffset(
  clang::Sema* self,
  clang::SourceLocation loc,
@ -115,6 +119,11 @@ void InitializeInitSequenceForHLSL(
  bool TopLevelOfInitList,
  _Inout_ clang::InitializationSequence* initSequence);

+unsigned CaculateInitListArraySizeForHLSL(
+  _In_ clang::Sema* sema,
+  _In_ const clang::InitListExpr *InitList,
+  _In_ const clang::QualType EltTy);
+
 bool IsConversionToLessOrEqualElements(
  _In_ clang::Sema* self,
  const clang::ExprResult& sourceExpr,
@ -239,4 +248,15 @@ clang::QualType CheckVectorConditional(

 }

+bool IsTypeNumeric(_In_ clang::Sema* self, _In_ clang::QualType &type);
+
+// Applies the given type-specifier sign (TSS) to the parsed type and returns the
+// resulting type. The given parsed type is also replaced with the new type.
+clang::QualType ApplyTypeSpecSignToParsedType(
+    _In_ clang::Sema* self,
+    _In_ clang::QualType &type,
+    _In_ clang::TypeSpecifierSign TSS,
+    _In_ clang::SourceLocation Loc
+);
+
 #endif
--- a/tools/clang/lib/AST/Decl.cpp
+++ b/tools/clang/lib/AST/Decl.cpp
@ -1950,6 +1950,7 @@ VarDecl::isThisDeclarationADefinition(ASTContext &C) const {
      getTemplateSpecializationKind() != TSK_ExplicitSpecialization)
    return DeclarationOnly;

+  if (!getASTContext().getLangOpts().HLSL) // HLSL Change - take extern as define to match fxc.
  if (hasExternalStorage())
    return DeclarationOnly;

--- a/tools/clang/lib/AST/HlslTypes.cpp
+++ b/tools/clang/lib/AST/HlslTypes.cpp
@ -426,4 +426,21 @ hlsl::ParameterModifier ParamModFromAttrs(llvm::ArrayRef<InheritableAttr *> attr
  return ParameterModifier::FromInOut(isIn, isOut);
 }

+// Returns the unsigned counterpart of a signed integer scalar type
+// (int, int_min16, int64). Any other scalar type is returned unchanged.
+HLSLScalarType MakeUnsigned(HLSLScalarType T) {
+    switch (T) {
+    case HLSLScalarType_int:
+        return HLSLScalarType_uint;
+    case HLSLScalarType_int_min16:
+        return HLSLScalarType_uint_min16;
+    case HLSLScalarType_int64:
+        return HLSLScalarType_uint64;
+    default:
+        // Explicit default: the remaining enumerators are intentionally
+        // passed through (and -Wswitch stays quiet about them).
+        break;
+    }
+    return T;
+}
+
 }
--- a/tools/clang/lib/Basic/Targets.cpp
+++ b/tools/clang/lib/Basic/Targets.cpp
@ -6987,6 +6987,9 @@ public:
    BigEndian = false;
    TLSSupported = false;
    LongWidth = LongAlign = 64;
+    BoolWidth = 32;
+    // To avoid member for alignment.
+    BoolAlign = 8;

    // using the Microsoft ABI.
    TheCXXABI.set(TargetCXXABI::Microsoft);
@ -7031,6 +7034,9 @@ public:
  DXIL_32TargetInfo(const llvm::Triple &Triple) : DXILTargetInfo(Triple) {
    LongDoubleWidth = LongDoubleAlign = 64;
    LongDoubleFormat = &llvm::APFloat::IEEEdouble;
+    BoolWidth = 32;
+    // To avoid member for alignment.
+    BoolAlign = 8;
    // TODO: Update Description for DXIL
    DescriptionString = "e-m:e-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32";
  }
--- a/tools/clang/lib/CodeGen/CGDecl.cpp
+++ b/tools/clang/lib/CodeGen/CGDecl.cpp
@ -343,12 +343,6 @@ void CodeGenFunction::EmitStaticVarDecl(const VarDecl &D,
  llvm::Value *&DMEntry = LocalDeclMap[&D];
  assert(!DMEntry && "Decl already exists in localdeclmap!");

-  // HLSL Change Begins.
-  if (D.getType()->isIncompleteArrayType() && getLangOpts().HLSL) {
-    CGM.getHLSLRuntime().UpdateHLSLIncompleteArrayType(const_cast<VarDecl&>(D));
-  }
-  // HLSL Change Ends.
-
  // Check to see if we already have a global variable for this
  // declaration.  This can happen when double-emitting function
  // bodies, e.g. with complete and base constructors.
@ -911,12 +905,6 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
  if (Ty->isVariablyModifiedType())
    EmitVariablyModifiedType(Ty);

-  // HLSL Change Begins.
-  if (Ty->isIncompleteArrayType() && getLangOpts().HLSL) {
-    Ty = CGM.getHLSLRuntime().UpdateHLSLIncompleteArrayType(const_cast<VarDecl&>(D));
-  }
-  // HLSL Change Ends.
-
  llvm::Value *DeclPtr;
  if (Ty->isConstantSizeType()) {
    bool NRVO = getLangOpts().ElideConstructors &&
--- a/tools/clang/lib/CodeGen/CGExpr.cpp
+++ b/tools/clang/lib/CodeGen/CGExpr.cpp
@ -1251,7 +1251,11 @@ llvm::Value *CodeGenFunction::EmitFromMemory(llvm::Value *Value, QualType Ty) {
  if (hasBooleanRepresentation(Ty)) {
    assert(Value->getType()->isIntegerTy(getContext().getTypeSize(Ty)) &&
           "wrong value rep of bool");
-    return Builder.CreateTrunc(Value, Builder.getInt1Ty(), "tobool");
+    // HLSL Change Begin.
+    // Use ne v, 0 to convert to i1 instead of trunc.
+    return Builder.CreateICmpNE(
+        Value, llvm::ConstantInt::get(Value->getType(), 0), "tobool");
+    // HLSL Change End.
  }

  return Value;
@ -1686,6 +1690,36 @@ void CodeGenFunction::EmitStoreThroughExtVectorComponentLValue(RValue Src,
  if (VTy == nullptr && getContext().getLangOpts().HLSL)
    VTy =
        hlsl::ConvertHLSLVecMatTypeToExtVectorType(getContext(), Dst.getType());
+  llvm::Value * VecDstPtr = Dst.getExtVectorAddr();
+  llvm::Value *Zero = Builder.getInt32(0);
+  if (VTy) {
+    llvm::Type *VecTy = VecDstPtr->getType()->getPointerElementType();
+    unsigned NumSrcElts = VTy->getNumElements();
+    if (VecTy->getVectorNumElements() == NumSrcElts) {
+      // Full vector write, create one store.
+      for (unsigned i = 0; i < VecTy->getVectorNumElements(); i++) {
+        if (llvm::Constant *Elt = Elts->getAggregateElement(i)) {
+          llvm::Value *SrcElt = Builder.CreateExtractElement(SrcVal, i);
+          Vec = Builder.CreateInsertElement(Vec, SrcElt, Elt);
+        }
+      }
+      Builder.CreateStore(Vec, VecDstPtr);
+    } else {
+      for (unsigned i = 0; i < VecTy->getVectorNumElements(); i++) {
+        if (llvm::Constant *Elt = Elts->getAggregateElement(i)) {
+          llvm::Value *EltGEP = Builder.CreateGEP(VecDstPtr, {Zero, Elt});
+          llvm::Value *SrcElt = Builder.CreateExtractElement(SrcVal, i);
+          Builder.CreateStore(SrcElt, EltGEP);
+        }
+      }
+    }
+  } else {
+    // If the Src is a scalar (not a vector) it must be updating one element.
+    llvm::Value *EltGEP = Builder.CreateGEP(
+        VecDstPtr, {Zero, Elts->getAggregateElement((unsigned)0)});
+    Builder.CreateStore(SrcVal, EltGEP);
+  }
+  return;
  // HLSL Change Ends
  if (VTy) {  // HLSL Change
    unsigned NumSrcElts = VTy->getNumElements();
@ -1897,12 +1931,6 @@ static LValue EmitGlobalVarDeclLValue(CodeGenFunction &CGF,
      CGF.CGM.getCXXABI().usesThreadWrapperFunction())
    return CGF.CGM.getCXXABI().EmitThreadLocalVarDeclLValue(CGF, VD, T);

-  // HLSL Change Begins.
-  if (VD->getType()->isIncompleteArrayType() && CGF.getLangOpts().HLSL) {
-    T = CGF.CGM.getHLSLRuntime().UpdateHLSLIncompleteArrayType(const_cast<VarDecl&>(*VD));
-  }
-  // HLSL Change Ends.
-
  llvm::Value *V = CGF.CGM.GetAddrOfGlobalVar(VD);
  llvm::Type *RealVarTy = CGF.getTypes().ConvertTypeForMem(VD->getType());
  V = EmitBitCastOfLValueToProperType(CGF, V, RealVarTy);
@ -3346,6 +3374,19 @@ LValue CodeGenFunction::EmitCastLValue(const CastExpr *E) {
    llvm::Value *bitcast = Builder.CreateBitCast(LV.getAddress(), ResultType);
    return MakeAddrLValue(bitcast, ToType);
  }
+  case CK_FlatConversion: {
+    // HLSL only single inheritance.
+    // Just bitcast.
+    QualType ToType = getContext().getLValueReferenceType(E->getType());
+
+    LValue LV = EmitLValue(E->getSubExpr());
+    llvm::Value *This = LV.getAddress();
+
+    // bitcast to target type
+    llvm::Type *ResultType = ConvertType(ToType);
+    llvm::Value *bitcast = Builder.CreateBitCast(This, ResultType);
+    return MakeAddrLValue(bitcast, ToType);
+  }
  // HLSL Change Ends
  case CK_ZeroToOCLEvent:
    llvm_unreachable("NULL to OpenCL event lvalue cast is not valid");
--- a/tools/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/tools/clang/lib/CodeGen/CGExprCXX.cpp
@ -221,7 +221,37 @ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr(

      llvm::Value *This = nullptr;
      if (Base->getValueKind() != ExprValueKind::VK_RValue) {
-        This = EmitLValue(Base).getAddress();
+        // Emit the base lvalue exactly once; both branches below reuse LV.
+        LValue LV = EmitLValue(Base);
+        if (LV.isSimple()) {
+          This = LV.getAddress();
+          if (isa<ExtMatrixElementExpr>(Base)) {
+            // Matrix element base: call through a temporary copy of the value.
+            llvm::Value *Val = Builder.CreateLoad(This);
+            This = Builder.CreateAlloca(Val->getType());
+            Builder.CreateStore(Val, This);
+          }
+        } else {
+          // Ext-vector element (swizzle) base: gather the selected elements
+          // into a temporary of the swizzle's type and use that as 'this'.
+          assert(LV.isExtVectorElt() && "must be ext vector here");
+          This = LV.getExtVectorAddr();
+          llvm::Constant *Elts = LV.getExtVectorElts();
+          llvm::Type *Ty = ConvertType(LV.getType());
+
+          llvm::Constant *zero = Builder.getInt32(0);
+          llvm::Value *TmpThis = Builder.CreateAlloca(Ty);
+          // Copy source element Elts[i] into slot i of the temporary.
+          for (unsigned i = 0; i < Ty->getVectorNumElements(); i++) {
+            llvm::Value *EltIdx = Elts->getAggregateElement(i);
+            llvm::Value *EltGEP = Builder.CreateGEP(This, {zero, EltIdx});
+            llvm::Value *TmpEltGEP =
+                Builder.CreateGEP(TmpThis, {zero, Builder.getInt32(i)});
+            llvm::Value *Elt = Builder.CreateLoad(EltGEP);
+            Builder.CreateStore(Elt, TmpEltGEP);
+          }
+          This = TmpThis;
+        }
      } else {
        llvm::Value *Val = EmitScalarExpr(Base);
        This = Builder.CreateAlloca(Val->getType());
--- a/tools/clang/lib/CodeGen/CGExprConstant.cpp
+++ b/tools/clang/lib/CodeGen/CGExprConstant.cpp
@ -25,6 +25,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
+#include "CGHLSLRuntime.h"   // HLSL Change
 using namespace clang;
 using namespace CodeGen;

@ -744,8 +745,11 @@ public:
    // HLSL Change Begins.
    case CK_FlatConversion:
      return nullptr;
-    case CK_HLSLVectorSplat:
-      return nullptr;
+    case CK_HLSLVectorSplat: {
+      unsigned vecSize = hlsl::GetHLSLVecSize(E->getType());
+      std::vector<llvm::Constant*> Elts(vecSize, C);
+      return llvm::ConstantVector::get(Elts);
+    }
    // HLSL Change Ends.
    }
    llvm_unreachable("Invalid CastKind");
@ -833,16 +837,13 @@ public:
  }

  llvm::Constant *VisitInitListExpr(InitListExpr *ILE) {
+    // HLSL Change Begins.
+    if (CGM.getLangOpts().HLSL)
+      return CGM.getHLSLRuntime().EmitHLSLConstInitListExpr(CGM, ILE);
+    // HLSL Change Ends.
    if (ILE->getType()->isArrayType())
      return EmitArrayInitialization(ILE);

-    // HLSL Change Begins.
-    if (hlsl::IsHLSLVecType(ILE->getType()))
-      return CGM.EmitConstantExpr(ILE, ILE->getType(), CGF);
-    if (hlsl::IsHLSLMatType(ILE->getType()))
-      return nullptr;
-    // HLSL Change Ends.
-
    if (ILE->getType()->isRecordType())
      return EmitRecordInitialization(ILE);

--- a/tools/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/tools/clang/lib/CodeGen/CGExprScalar.cpp
@ -1818,6 +1818,8 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
  }
  case CK_HLSLCC_IntegralToBoolean:
    return EmitIntToBoolConversion(Visit(E));
+  case CK_HLSLCC_FloatingToBoolean:
+    return EmitFloatToBoolConversion(Visit(E));
  // HLSL Change Ends
  }

--- a/tools/clang/lib/CodeGen/CGHLSLMS.cpp
+++ b/tools/clang/lib/CodeGen/CGHLSLMS.cpp
--- a/tools/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/tools/clang/lib/CodeGen/CGHLSLRuntime.h
@ -17,6 +17,7 @@ namespace llvm {
 class Function;
 template <typename T, unsigned N> class SmallVector;
 class Value;
+class Constant;
 class TerminatorInst;
 class Type;
 template <typename T> class ArrayRef;
@ -61,8 +62,7 @@ public:
  virtual llvm::Value *EmitHLSLInitListExpr(CodeGenFunction &CGF, InitListExpr *E,
      // The destPtr when emiting aggregate init, for normal case, it will be null.
      llvm::Value *DestPtr) = 0;
-
-  virtual clang::QualType UpdateHLSLIncompleteArrayType(VarDecl &D) = 0;
+  virtual llvm::Constant *EmitHLSLConstInitListExpr(CodeGenModule &CGM, InitListExpr *E) = 0;

  virtual void EmitHLSLOutParamConversionInit(
      CodeGenFunction &CGF, const FunctionDecl *FD, const CallExpr *E,
--- a/tools/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/tools/clang/lib/CodeGen/CodeGenModule.cpp
@ -2037,12 +2037,6 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D) {
  const VarDecl *InitDecl;
  const Expr *InitExpr = D->getAnyInitializer(InitDecl);

-  // HLSL Change Begins.
-  if (D->getType()->isIncompleteArrayType() && getLangOpts().HLSL) {
-    getHLSLRuntime().UpdateHLSLIncompleteArrayType(const_cast<VarDecl&>(*D));
-  }
-  // HLSL Change Ends.
-
  if (!InitExpr) {
    // This is a tentative definition; tentative definitions are
    // implicitly initialized with { 0 }.
--- a/tools/clang/lib/CodeGen/TargetInfo.cpp
+++ b/tools/clang/lib/CodeGen/TargetInfo.cpp
@ -6208,9 +6208,10 @@ ABIArgInfo MSDXILABIInfo::classifyArgumentType(QualType Ty) const {
  if (const EnumType *EnumTy = Ty->getAs<EnumType>())
    Ty = EnumTy->getDecl()->getIntegerType();

-  // Return aggregates type as indirect by value
+  // Classify aggregate types as indirect, passed by reference.
+  // Passing by value does not work for out params.
  if (isAggregateTypeForABI(Ty))
-    return ABIArgInfo::getIndirect(0, /* byval */ true);
+    return ABIArgInfo::getIndirect(0, /* byval */ false);

  return (Ty->isPromotableIntegerType() ? ABIArgInfo::getExtend()
                                        : ABIArgInfo::getDirect());
--- a/tools/clang/lib/Frontend/FrontendActions.cpp
+++ b/tools/clang/lib/Frontend/FrontendActions.cpp
@ -17,6 +17,7 @@
 #include "clang/Frontend/MultiplexConsumer.h"
 #include "clang/Frontend/Utils.h"
 #include "clang/Lex/HeaderSearch.h"
+#include "clang/Lex/HLSLMacroExpander.h"
 #include "clang/Lex/Pragma.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Parse/Parser.h"
@ -27,7 +28,10 @@
 #include "llvm/Support/raw_ostream.h"
 #include <memory>
 #include <system_error>
-
+// HLSL Change Begin.
+#include "dxc/HLSL/DxilRootSignature.h"
+#include "clang/Parse/ParseHLSL.h"
+// HLSL Change End.
 using namespace clang;

 //===----------------------------------------------------------------------===//
@ -697,6 +701,71 @@ void PrintPreprocessedAction::ExecuteAction() {
                           CI.getPreprocessorOutputOpts());
 }

+// HLSL Change Begin.
+// Stores the macro name and requested version; allocates a default handle.
+HLSLRootSignatureAction::HLSLRootSignatureAction(StringRef rootSigMacro,
+                                                 unsigned major, unsigned minor)
+    : HLSLRootSignatureMacro(rootSigMacro), rootSigMajor(major),
+      rootSigMinor(minor),
+      rootSigHandle(std::make_unique<hlsl::RootSignatureHandle>()) {}
+
+void HLSLRootSignatureAction::ExecuteAction() {
+  CompilerInstance &CI = getCompilerInstance();
+  Preprocessor &PP = CI.getPreprocessor();
+  // Ignore unknown pragmas.
+  PP.IgnorePragmas();
+
+  // Lex the main file to EOF, discarding every token, before the macro lookup below.
+  PP.EnterMainSourceFile();
+
+  Token Tok;
+  do PP.Lex(Tok);
+  while (Tok.isNot(tok::eof));
+
+  hlsl::DxilRootSignatureVersion  rootSigVer;
+  if (rootSigMinor == 0) {
+    rootSigVer = hlsl::DxilRootSignatureVersion::Version_1_0;
+  }
+  else {
+    // Any non-zero minor selects Version_1_1; it is asserted to be exactly 1.
+    assert(rootSigMinor == 1 &&
+      "else CGMSHLSLRuntime Constructor needs to be updated");
+    rootSigVer = hlsl::DxilRootSignatureVersion::Version_1_1;
+  }
+  // Only major version 1 is expected.
+  assert(rootSigMajor == 1 &&
+           "else CGMSHLSLRuntime Constructor needs to be updated");
+
+  // Try to find HLSLRootSignatureMacro in macros.
+  MacroInfo *rootSigMacro = hlsl::MacroExpander::FindMacroInfo(PP, HLSLRootSignatureMacro);
+  DiagnosticsEngine &Diags = CI.getDiagnostics();
+  if (!rootSigMacro)  {
+    std::string cannotFindMacro =
+        "undeclared identifier " + HLSLRootSignatureMacro;
+    SourceLocation SLoc = Tok.getLocation(); // Tok is the eof token here
+    ReportHLSLRootSigError(Diags, SLoc, cannotFindMacro.c_str(),
+                           cannotFindMacro.size());
+    return;
+  }
+
+  // Expand HLSLRootSignatureMacro; errors are reported at its definition.
+  SourceLocation SLoc = rootSigMacro->getDefinitionLoc();
+  std::string rootSigString;
+  hlsl::MacroExpander expander(PP, hlsl::MacroExpander::STRIP_QUOTES);
+  if (!expander.ExpandMacro(rootSigMacro, &rootSigString)) {
+    StringRef error("error expanding root signature macro");
+    ReportHLSLRootSigError(Diags, SLoc, error.data(), error.size());
+    return;
+  }
+
+  // Compile the expanded root signature into rootSigHandle.
+  clang::CompileRootSignature(rootSigString, Diags, SLoc, rootSigVer, rootSigHandle.get());
+}
+
+// Moves the root signature handle out of the action and returns it.
+std::unique_ptr<hlsl::RootSignatureHandle>
+HLSLRootSignatureAction::takeRootSigHandle() { return std::move(rootSigHandle); }
+// HLSL Change End.
+
 void PrintPreambleAction::ExecuteAction() {
  switch (getCurrentFileKind()) {
  case IK_C:
--- a/tools/clang/lib/Lex/CMakeLists.txt
+++ b/tools/clang/lib/Lex/CMakeLists.txt
@ -5,6 +5,7 @@ set(LLVM_LINK_COMPONENTS support)
 add_clang_library(clangLex
  HeaderMap.cpp
  HeaderSearch.cpp
+  HLSLMacroExpander.cpp
  Lexer.cpp
  LiteralSupport.cpp
  MacroArgs.cpp
--- a/tools/clang/lib/Lex/HLSLMacroExpander.cpp
+++ b/tools/clang/lib/Lex/HLSLMacroExpander.cpp
@ -0,0 +1,169 @@
+//===--- HLSLMacroExpander.cpp - Standalone Macro expansion -----*- C++ -*-===//
+//                                                                            //
+// HLSLMacroExpander.cpp                                                      //
+// Copyright (C) Microsoft Corporation. All rights reserved.                  //
+// This file is distributed under the University of Illinois Open Source      //
+// License. See LICENSE.TXT for details.                                      //
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MacroExpander class.
+//
+//===----------------------------------------------------------------------===//
+#include "clang/Lex/HLSLMacroExpander.h"
+
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Lex/Lexer.h"
+#include "clang/Lex/MacroInfo.h"
+#include "clang/Lex/ModuleMap.h"
+#include "clang/Lex/PPCallbacks.h"
+#include "clang/Lex/Preprocessor.h"
+#include "clang/Lex/PTHLexer.h"
+#include "clang/Lex/PTHManager.h"
+#include "clang/Lex/Token.h"
+#include "clang/Lex/TokenLexer.h"
+#include "llvm/ADT/StringRef.h"
+
+#include "dxc/Support/Global.h"
+using namespace clang;
+using namespace llvm;
+using namespace hlsl;
+
+// Creates an expander bound to PP_ and registers the empty scratch file that
+// ExpandMacro later enters. Throws hlsl::Exception if that setup fails.
+MacroExpander::MacroExpander(Preprocessor &PP_, unsigned options)
+  : PP(PP_)
+  , m_expansionFileId()
+  , m_stripQuotes((options & STRIP_QUOTES) != 0)
+{
+  // ExpandMacro needs a file on the preprocessor's lexing stack, so an
+  // empty in-memory buffer is set aside purely for expanding macros.
+  std::unique_ptr<llvm::MemoryBuffer> scratchBuffer =
+      llvm::MemoryBuffer::getMemBuffer("", "<hlsl-semantic-defines>");
+  if (!scratchBuffer) {
+    DXASSERT(false, "Cannot create macro expansion source buffer");
+    throw hlsl::Exception(DXC_E_MACRO_EXPANSION_FAILURE);
+  }
+
+  // SourceManager offers no API to look up a previously added file, so a
+  // fresh empty file is added by every expander that gets constructed.
+  // SourceManager could be taught to get/set a macro file id, much like
+  // getPreambleFileID, but macros should only be expanded once (root
+  // signature) or twice (semantic defines), so adding an empty file each
+  // time is probably not a big deal.
+  m_expansionFileId =
+      PP.getSourceManager().createFileID(std::move(scratchBuffer));
+  if (m_expansionFileId.isInvalid()) {
+    DXASSERT(false, "Could not create FileID for macro expnasion?");
+    throw hlsl::Exception(DXC_E_MACRO_EXPANSION_FAILURE);
+  }
+}
+
+
+// Non-owning data/length pair describing a token's literal text.
+struct LiteralData {
+  const char *Data;  // first character to emit
+  unsigned Length;   // number of characters to emit starting at Data
+};
+
+// Get the literal data from a literal token.
+// For string literals, when stripQuotes is true the delimiting quotes (and
+// any encoding prefix such as L, u8, u or U) are dropped and only the raw
+// value is returned. Every other case returns the token text unchanged.
+static LiteralData GetLiteralData(const Token &Tok, bool stripQuotes) {
+  if (!stripQuotes || !tok::isStringLiteral(Tok.getKind()))
+    return LiteralData{ Tok.getLiteralData(), Tok.getLength() };
+  // start_offset skips the prefix and opening quote; end_offset the closing quote.
+  unsigned start_offset = 0;
+  unsigned end_offset = 0;
+  switch (Tok.getKind()) {
+  case tok::string_literal:       start_offset = 1; end_offset = 1;  break; // "foo"
+  case tok::wide_string_literal:  start_offset = 2; end_offset = 1;  break; // L"foo"
+  case tok::utf8_string_literal:  start_offset = 3; end_offset = 1;  break; // u8"foo"
+  case tok::utf16_string_literal: start_offset = 2; end_offset = 1;  break; // u"foo"
+  case tok::utf32_string_literal: start_offset = 2; end_offset = 1;  break; // U"foo"
+  default: break;
+  }
+
+  unsigned length = Tok.getLength() - (start_offset + end_offset);
+  if (length > Tok.getLength()) { // Check for unsigned underflow.
+    DXASSERT(false, "string literal quote count is wrong?");
+    start_offset = 0;
+    length = Tok.getLength();
+  }
+
+  return LiteralData {Tok.getLiteralData() + start_offset, length};
+}
+
+// Decide whether a separating space is written before Tok.
+// A space is emitted only when Tok has leading whitespace and it does not
+// directly follow a string literal whose quotes are being stripped.
+static bool ShouldPrintLeadingSpace(const Token &Tok, const Token &PrevTok, bool stripQuotes) {
+  // When quotes are stripped, whatever follows a string literal is glued
+  // onto it so that adjacent string values paste together without a gap.
+  const bool gluedToStringLiteral =
+      tok::isStringLiteral(PrevTok.getKind()) && stripQuotes;
+
+  // Tokens without leading whitespace never get a space.
+  if (Tok.hasLeadingSpace() && !gluedToStringLiteral)
+    return true;
+
+  return false;
+}
+
+// Expands pMacro by re-lexing it with the preprocessor's lexer and writes the
+// text to *out. Returns false for null arguments or if the raw token fails.
+bool MacroExpander::ExpandMacro(MacroInfo *pMacro, std::string *out) {
+  if (!pMacro || !out)
+    return false;
+  MacroInfo &macro = *pMacro;
+
+  // Initialize the token from the macro definition location.
+  Token Tok;
+  bool failed = PP.getRawToken(macro.getDefinitionLoc(), Tok);
+  if (failed)
+    return false;
+
+  // Start the lexing process. Use an outer file to make the preprocessor happy.
+  PP.EnterSourceFile(m_expansionFileId, nullptr, PP.getSourceManager().getLocForStartOfFile(m_expansionFileId));
+  PP.EnterMacro(Tok, macro.getDefinitionEndLoc(), &macro, nullptr);
+  PP.Lex(Tok);
+  llvm::raw_string_ostream OS(*out);
+
+  // Keep track of previous token to print spaces correctly.
+  Token PrevTok;
+  PrevTok.startToken();
+
+  // Lex tokens until eof, writing each one to the output stream.
+  while (!Tok.is(tok::eof)) {
+    if (ShouldPrintLeadingSpace(Tok, PrevTok, m_stripQuotes)) {
+      OS << ' ';
+    }
+    if (IdentifierInfo *II = Tok.getIdentifierInfo()) {
+      OS << II->getName(); // identifiers are written by name
+    }
+    else if (Tok.isLiteral() && !Tok.needsCleaning() &&
+      Tok.getLiteralData()) {
+      LiteralData literalData = GetLiteralData(Tok, m_stripQuotes);
+      OS.write(literalData.Data, literalData.Length);
+    }
+    else {
+      std::string S = PP.getSpelling(Tok); // everything else: preprocessor spelling
+      OS.write(&S[0], S.size());
+    }
+    PrevTok = Tok;
+    PP.Lex(Tok);
+  }
+
+  return true;
+}
+
+// Resolve a macro by name through the preprocessor's identifier lookup.
+// Yields nullptr when no identifier is returned for the name.
+MacroInfo *MacroExpander::FindMacroInfo(clang::Preprocessor &PP, StringRef macroName) {
+  IdentifierInfo *identifier = PP.getIdentifierInfo(macroName);
+  // With an identifier in hand, its macro info (possibly null) is the answer.
+  if (identifier != nullptr)
+    return PP.getMacroInfo(identifier);
+
+  return nullptr;
+}
--- a/tools/clang/lib/Lex/LiteralSupport.cpp
+++ b/tools/clang/lib/Lex/LiteralSupport.cpp
@ -581,6 +581,11 @@ NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
    switch (*s) {
    case 'f':      // FP Suffix for "float"
    case 'F':
+// HLSL Change Starts
+// TODO : When we support true half type, these suffixes should be treated differently from f/F
+    case 'h':
+    case 'H':
+// HLSL Change Ends
      if (!isFPConstant) break;  // Error for integer constant.
      if (isFloat || isLong) break; // FF, LF invalid.
      isFloat = true;
--- a/tools/clang/lib/Parse/ParseDecl.cpp
+++ b/tools/clang/lib/Parse/ParseDecl.cpp
@ -215,7 +215,8 @@ static void ParseRegisterNumberForHLSL(_In_z_ const char *name,
  DXASSERT_NOMSG(diagId != nullptr);

  if (*name != 'b' && *name != 'c' && *name != 'i' && *name != 's' &&
-      *name != 't' && *name != 'u') {
+      *name != 't' && *name != 'u' && *name != 'B' && *name != 'C' &&
+	  *name != 'I' && *name != 'S' && *name != 'T' && *name != 'U') {
    *diagId = diag::err_hlsl_unsupported_register_type;
    *registerType = 0;
    *registerNumber = 0;
@ -250,7 +251,7 @@ void ParsePackSubcomponent(_In_z_ const char* name, _Out_ unsigned* subcomponent

  char registerType;
  ParseRegisterNumberForHLSL(name, &registerType, subcomponent, diagId);
-  if (registerType != 'c')
+  if (registerType != 'c' && registerType != 'C')
  {
    *diagId = diag::err_hlsl_unsupported_register_type;
        return;
@ -1889,7 +1890,6 @@ Parser::DeclGroupPtrTy Parser::ParseDeclaration(unsigned Context,
  Decl *OwnedType = nullptr;
  switch (Tok.getKind()) {
  case tok::kw_template:
-  case tok::kw_export:
    // HLSL Change Starts
    if (getLangOpts().HLSL) {
      Diag(Tok, diag::err_hlsl_reserved_keyword) << Tok.getName();
@ -1900,6 +1900,14 @@ Parser::DeclGroupPtrTy Parser::ParseDeclaration(unsigned Context,
    ProhibitAttributes(attrs);
    SingleDecl = ParseDeclarationStartingWithTemplate(Context, DeclEnd);
    break;
+    // HLSL Change Begin.
+  case tok::kw_export:
+    // Ignore export for now.
+    ConsumeToken();
+    return ParseSimpleDeclaration(Context, DeclEnd, attrs,
+                                  true);
+    break;
+    // HLSL Change End.
  case tok::kw_inline:
    // Could be the start of an inline namespace. Allowed as an ext in C++03.
    if (getLangOpts().CPlusPlus && NextToken().is(tok::kw_namespace) && !getLangOpts().HLSL) { // HLSL Change - disallowed in HLSL
@ -2167,6 +2175,51 @@ Parser::DeclGroupPtrTy Parser::ParseDeclGroup(ParsingDeclSpec &DS,
    return DeclGroupPtrTy();
  }

+  // HLSL Change Starts: change global variables that will be in constant buffer to be constant by default 
+  // Global variables that are groupshared, static, or typedef 
+  // will not be part of constant buffer and therefore should not be const by default.
+  if (getLangOpts().HLSL && !D.isFunctionDeclarator() &&
+      D.getContext() == Declarator::TheContext::FileContext &&
+      DS.getStorageClassSpec() != DeclSpec::SCS::SCS_static &&
+      DS.getStorageClassSpec() != DeclSpec::SCS::SCS_typedef
+      ) {
+
+    // Check whether or not there is a 'groupshared' attribute
+    AttributeList *attrList = DS.getAttributes().getList();
+    bool isGroupShared = false;
+    while (attrList) {
+        if (attrList->getName()->getName().compare(
+            StringRef(tok::getTokenName(tok::kw_groupshared))) == 0) {
+            isGroupShared = true;
+            break;
+        }
+      attrList = attrList->getNext();
+    }
+    if (!isGroupShared) {
+      // check whether or not the given data is the typename or primitive types
+      if (DS.isTypeRep()) {
+        QualType type = DS.getRepAsType().get();
+        // canonical types of HLSL Object types are not canonical for some
+        // reason. other HLSL Object types of vector/matrix/array should be
+        // treated as const.
+        if (type.getCanonicalType().isCanonical() &&
+            IsTypeNumeric(&Actions, type)) {
+          unsigned int diagID;
+          const char *prevSpec;
+          DS.SetTypeQual(DeclSpec::TQ_const, D.getDeclSpec().getLocStart(),
+                         prevSpec, diagID, getLangOpts());
+        }
+      } else {
+        // If not a typename, it is a basic type and should be treated as const.
+        unsigned int diagID;
+        const char *prevSpec;
+        DS.SetTypeQual(DeclSpec::TQ_const, D.getDeclSpec().getLocStart(),
+                       prevSpec, diagID, getLangOpts());
+      }
+    }
+  }
+  // HLSL Change Ends
+
  // Save late-parsed attributes for now; they need to be parsed in the
  // appropriate function scope after the function Decl has been constructed.
  // These will be parsed in ParseFunctionDefinition or ParseLexedAttrList.
@ -2493,6 +2546,16 @@ Decl *Parser::ParseDeclarationAfterDeclaratorAndAttributes(
        return nullptr;
      }

+
+      // HLSL Change Begin.
+      // Skip the initializer of effect object.
+      if (D.isInvalidType()) {
+        SkipUntil(tok::semi, StopBeforeMatch); // skip until ';'
+        Actions.ActOnUninitializedDecl(ThisDecl, TypeContainsAuto);
+        return nullptr;
+      }
+      // HLSL Change End.
+
      ExprResult Init(ParseInitializer());

      // If this is the only decl in (possibly) range based for statement,
@ -3466,9 +3529,12 @@ void Parser::ParseDeclarationSpecifiers(DeclSpec &DS,
          Actions.isCurrentClassName(*Tok.getIdentifierInfo(), getCurScope()) &&
          isConstructorDeclarator(/*Unqualified*/true))
        goto DoneWithDeclSpec;
-
+      // HLSL Change start - modify TypeRep for unsigned vectors/matrix
+      QualType qt = TypeRep.get();
+      QualType newType = ApplyTypeSpecSignToParsedType(&Actions, qt, DS.getTypeSpecSign(), Loc);
      isInvalid = DS.SetTypeSpecType(DeclSpec::TST_typename, Loc, PrevSpec,
-                                     DiagID, TypeRep, Policy);
+                                     DiagID, ParsedType::make(newType), Policy);
+      // HLSL Change end
      if (isInvalid)
        break;

@ -3621,6 +3687,7 @@ HLSLReservedKeyword:
    case tok::kw_precise:
    case tok::kw_shared:
    case tok::kw_groupshared:
+    case tok::kw_globallycoherent:
    case tok::kw_uniform:
    case tok::kw_in:
    case tok::kw_out:
@ -5086,6 +5153,7 @@ bool Parser::isDeclarationSpecifier(bool DisambiguatingWithExpression) {
  case tok::kw_precise:
  case tok::kw_shared:
  case tok::kw_groupshared:
+  case tok::kw_globallycoherent:
  case tok::kw_uniform:
  case tok::kw_in:
  case tok::kw_out:
--- a/tools/clang/lib/Parse/ParseHLSL.cpp
+++ b/tools/clang/lib/Parse/ParseHLSL.cpp
@ -95,6 +95,21 @@ Decl *Parser::ParseConstBuffer(unsigned Context, SourceLocation &DeclEnd,
  Actions.ActOnStartHLSLBufferView();
  Parser::DeclGroupPtrTy dcl = ParseDeclGroup(PDS, Declarator::FileContext);

+  // Check if the register type is valid
+  NamedDecl *namedDecl = cast<NamedDecl>(dcl.get().getSingleDecl());
+  ArrayRef<hlsl::UnusualAnnotation*> annotations = namedDecl->getUnusualAnnotations();
+  for (hlsl::UnusualAnnotation* annotation : annotations) {
+    if (annotation->getKind() == hlsl::UnusualAnnotation::UnusualAnnotationKind::UA_RegisterAssignment) {
+        hlsl::RegisterAssignment *regAssignment = (hlsl::RegisterAssignment *)(annotation);
+        if (isCBuffer && regAssignment->RegisterType != 'b' && regAssignment->RegisterType != 'B') {
+            Diag(namedDecl->getLocation(), diag::err_hlsl_unsupported_cbuffer_register);
+        }
+        else if (!isCBuffer && regAssignment->RegisterType != 't' && regAssignment->RegisterType != 'T') {
+            Diag(namedDecl->getLocation(), diag::err_hlsl_unsupported_tbuffer_register);
+        }
+    }
+  }
+
  Decl *decl = Actions.ActOnHLSLBufferView(getCurScope(), BufferLoc, dcl, isCBuffer);

  return decl;
--- a/tools/clang/lib/Parse/ParseTentative.cpp
+++ b/tools/clang/lib/Parse/ParseTentative.cpp
@ -1290,6 +1290,7 @@ Parser::isCXXDeclarationSpecifier(Parser::TPResult BracedCastResult,
  case tok::kw_precise:
  case tok::kw_shared:
  case tok::kw_groupshared:
+  case tok::kw_globallycoherent:
  case tok::kw_uniform:
  case tok::kw_row_major:
  case tok::kw_column_major:
--- a/tools/clang/lib/Sema/DeclSpec.cpp
+++ b/tools/clang/lib/Sema/DeclSpec.cpp
@ -1035,15 +1035,22 @@ void DeclSpec::Finish(DiagnosticsEngine &D, Preprocessor &PP, const PrintingPoli

  // signed/unsigned are only valid with int/char/wchar_t.
  if (TypeSpecSign != TSS_unspecified) {
+    // HLSL Change starts - signed/unsigned are not complete type specifiers.
+    #if 0
    if (TypeSpecType == TST_unspecified)
-      TypeSpecType = TST_int; // unsigned -> unsigned int, signed -> signed int.
-    else if (TypeSpecType != TST_int  && TypeSpecType != TST_int128 &&
-             TypeSpecType != TST_char && TypeSpecType != TST_wchar) {
+      TypeSpecType = TST_int;
+    #endif
+    // shorthand vectors and matrices can have signed/unsigned specifiers.
+    // If other typenames are used with signed/unsigned, it is already diagnosed by hlsl external source
+    if (TypeSpecType != TST_int && TypeSpecType != TST_int128 &&
+        TypeSpecType != TST_char && TypeSpecType != TST_wchar &&
+        TypeSpecType != TST_typename) {
      Diag(D, TSSLoc, diag::err_invalid_sign_spec)
        << getSpecifierName((TST)TypeSpecType, Policy);
      // signed double -> double.
      TypeSpecSign = TSS_unspecified;
    }
+    // HLSL Change end
  }

  // Validate the width of the type.
--- a/tools/clang/lib/Sema/Sema.cpp
+++ b/tools/clang/lib/Sema/Sema.cpp
@ -359,6 +359,13 @@ ExprResult Sema::ImpCastExprToType(Expr *E, QualType Ty,
  assert((VK == VK_RValue || !E->isRValue()) && "can't cast rvalue to lvalue");
 #endif

+  if (VK == VK_LValue) {
+    if (Kind == CastKind::CK_HLSLVectorTruncationCast ||
+        Kind == CastKind::CK_HLSLMatrixTruncationCast) {
+      Diag(E->getLocStart(), diag::err_hlsl_unsupported_lvalue_cast_op);
+    }
+  }
+
  // Check whether we're implicitly casting from a nullable type to a nonnull
  // type.
  if (auto exprNullability = E->getType()->getNullability(Context)) {
--- a/tools/clang/lib/Sema/SemaCast.cpp
+++ b/tools/clang/lib/Sema/SemaCast.cpp
@ -2104,12 +2104,14 @@ void CastOperation::CheckCXXCStyleCast(bool FunctionalStyle,
  }

  // HLSL Change Starts
-  // Check for HLSL vector or matrix shrinking.
+  // Check for HLSL vector/matrix/array/struct shrinking.
  if (ValueKind == VK_RValue && 
+      !FunctionalStyle &&
      !isPlaceholder(BuiltinType::Overload) &&
      Self.getLangOpts().HLSL &&
      SrcExpr.get()->isLValue() &&
-      hlsl::IsHLSLVecMatType(SrcExpr.get()->getType()) &&
+      // Cannot use casts on basic type l-values
+      !SrcExpr.get()->getType().getCanonicalType()->isBuiltinType() &&
      hlsl::IsConversionToLessOrEqualElements(&Self, SrcExpr, DestType, true)) {
    ValueKind = VK_LValue;
  }
--- a/tools/clang/lib/Sema/SemaChecking.cpp
+++ b/tools/clang/lib/Sema/SemaChecking.cpp
@ -8378,6 +8378,7 @@ static bool IsTailPaddedMemberArray(Sema &S, llvm::APInt Size,
  return true;
 }

+
 void Sema::CheckArrayAccess(const Expr *BaseExpr, const Expr *IndexExpr,
                            const ArraySubscriptExpr *ASE,
                            bool AllowOnePastEnd, bool IndexNegated) {
@ -8531,6 +8532,14 @@ void Sema::CheckArrayAccess(const Expr *expr) {
          CheckArrayAccess(rhs);
        return;
      }
+      // HLSL Change Starts : Access checking for HLSL vector and matrix array subscript
+      case Stmt::CXXOperatorCallExprClass : {
+        if (getLangOpts().HLSL) {
+            CheckHLSLArrayAccess(expr);
+        }
+        return;
+      }
+      // HLSL Change Ends
      default:
        return;
    }
--- a/tools/clang/lib/Sema/SemaDecl.cpp
+++ b/tools/clang/lib/Sema/SemaDecl.cpp
@ -9392,8 +9392,7 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init,
  } else if (VDecl->isFileVarDecl()) {
    if (VDecl->getStorageClass() == SC_Extern &&
        (!getLangOpts().CPlusPlus ||
-         !(Context.getBaseElementType(VDecl->getType()).isConstQualified() ||
-           VDecl->isExternC())) &&
+          !VDecl->isExternC()) &&
        !isTemplateInstantiation(VDecl->getTemplateSpecializationKind()))
      Diag(VDecl->getLocation(), diag::warn_extern_init);

--- a/tools/clang/lib/Sema/SemaExpr.cpp
+++ b/tools/clang/lib/Sema/SemaExpr.cpp
@ -9345,10 +9345,38 @@ static void DiagnoseConstAssignment(Sema &S, const Expr *E,
  S.Diag(Loc, diag::err_typecheck_assign_const) << ExprRange << ConstUnknown;
 }

+// Checks the object reached through (possibly chained) operator calls in E and reports a
+// const-assignment error at Loc when it is a const-qualified HLSL vector or matrix.
+// E must be a CXXOperatorCallExpr. Returns true if the error was reported, false otherwise.
+static bool HLSLCheckForModifiableLValue(
+    Expr *E,
+    SourceLocation Loc,
+    Sema &S
+) {
+  assert(isa<CXXOperatorCallExpr>(E));
+  // Follow first arguments downward (e.g. the double subscript on a matrix)
+  // until the first argument is no longer itself an operator call.
+  const Expr *target = cast<CXXOperatorCallExpr>(E)->getArg(0);
+  while (const CXXOperatorCallExpr *nested = dyn_cast<CXXOperatorCallExpr>(target))
+    target = nested->getArg(0);
+  const QualType targetType = target->getType();
+  if (!targetType.isConstQualified() || !(hlsl::IsMatrixType(&S, targetType) || hlsl::IsVectorType(&S, targetType)))
+    return false;
+  DiagnoseConstAssignment(S, target, Loc);
+  return true;
+}
+
 /// CheckForModifiableLvalue - Verify that E is a modifiable lvalue.  If not,
 /// emit an error and return true.  If so, return false.
 bool CheckForModifiableLvalue(Expr *E, SourceLocation Loc, Sema &S) { // HLSL Change: export this function
  assert(!E->hasPlaceholderType(BuiltinType::PseudoObject));
+  // HLSL Change Starts - check const for array subscript operator for HLSL vector/matrix
+  if (S.Context.getLangOpts().HLSL && E->getStmtClass() == Stmt::CXXOperatorCallExprClass) {
+      // check if it's a vector or matrix
+      return HLSLCheckForModifiableLValue(E, Loc, S);
+  }
+  // HLSL Change Ends
+
  SourceLocation OrigLoc = Loc;
  Expr::isModifiableLvalueResult IsLV = E->isModifiableLvalue(S.Context,
                                                              &Loc);
--- a/tools/clang/lib/Sema/SemaHLSL.cpp
+++ b/tools/clang/lib/Sema/SemaHLSL.cpp
@ -284,6 +284,9 @@ enum ArBasicKind {
 #define IS_BPROP_MIN_PRECISION(_Props) \
    (((_Props) & BPROP_MIN_PRECISION) != 0)

+#define IS_BPROP_UNSIGNABLE(_Props) \
+    (IS_BPROP_AINT(_Props) && GET_BPROP_BITS(_Props) != BPROP_BITS12)
+
 const UINT g_uBasicKindProps[] =
 {
  BPROP_PRIMITIVE | BPROP_BOOLEAN | BPROP_INTEGER | BPROP_NUMERIC | BPROP_BITS0,  // AR_BASIC_BOOL
@ -453,6 +456,9 @@ C_ASSERT(ARRAYSIZE(g_uBasicKindProps) == AR_BASIC_MAXIMUM_COUNT);
 #define IS_BASIC_MIN_PRECISION(_Kind) \
    IS_BPROP_MIN_PRECISION(GetBasicKindProps(_Kind))

+#define IS_BASIC_UNSIGNABLE(_Kind) \
+    IS_BPROP_UNSIGNABLE(GetBasicKindProps(_Kind))
+
 #define BITWISE_ENUM_OPS(_Type)                                         \
 inline _Type operator|(_Type F1, _Type F2)                              \
 {                                                                       \
@ -2735,6 +2741,35 @@ private:

  HRESULT CombineDimensions(QualType leftType, QualType rightType, QualType *resultType);

+  // Returns the shorthand typedef for a rowCount x colCount matrix of scalarType, creating and caching it on first use.
+  clang::TypedefDecl *LookupMatrixShorthandType(HLSLScalarType scalarType, UINT rowCount, UINT colCount) {
+    // Counts are unsigned and index the cache as (count - 1), so the valid range is 1..4 (">= 0" was always true).
+    DXASSERT_NOMSG(scalarType != HLSLScalarType::HLSLScalarType_unknown &&
+                   rowCount >= 1 && rowCount <= 4 && colCount >= 1 && colCount <= 4);
+    TypedefDecl *qts = m_matrixShorthandTypes[scalarType][rowCount - 1][colCount - 1];
+    if (qts == nullptr) {
+      QualType type = LookupMatrixType(scalarType, rowCount, colCount);
+      qts = CreateMatrixSpecializationShorthand(*m_context, type, scalarType,
+                                                rowCount, colCount);
+      m_matrixShorthandTypes[scalarType][rowCount - 1][colCount - 1] = qts;
+    }
+    return qts;
+  }
+
+  // Returns the shorthand typedef for a colCount-element vector of scalarType, creating and caching it on first use.
+  clang::TypedefDecl *LookupVectorShorthandType(HLSLScalarType scalarType, UINT colCount) {
+    // colCount is unsigned and indexes the cache as (colCount - 1), so the valid range is 1..4 (">= 0" was always true).
+    DXASSERT_NOMSG(scalarType != HLSLScalarType::HLSLScalarType_unknown &&
+                   colCount >= 1 && colCount <= 4);
+    TypedefDecl *qts = m_vectorTypedefs[scalarType][colCount - 1];
+    if (qts == nullptr) {
+      QualType type = LookupVectorType(scalarType, colCount);
+      qts = CreateVectorSpecializationShorthand(*m_context, type, scalarType, colCount);
+      m_vectorTypedefs[scalarType][colCount - 1] = qts;
+    }
+    return qts;
+  }
+
 public:
  HLSLExternalSource() :
    m_context(nullptr),
@ -2822,11 +2857,7 @@ public:
      else if (parsedType == HLSLScalarType_float_min10)
        m_sema->Diag(R.getNameLoc(), diag::warn_hlsl_sema_minprecision_promotion) << "min10float" << "min16float";

-      TypedefDecl* qts = m_matrixShorthandTypes[parsedType][rowCount - 1][colCount - 1];
-      if (qts == nullptr) {
-        qts = CreateMatrixSpecializationShorthand(*m_context, qt, parsedType, rowCount, colCount);
-        m_matrixShorthandTypes[parsedType][rowCount - 1][colCount - 1] = qts;
-      }
+      TypedefDecl* qts = LookupMatrixShorthandType(parsedType, rowCount, colCount);

      R.addDecl(qts);
      return true;
@ -2838,11 +2869,7 @@ public:
      else if (parsedType == HLSLScalarType_float_min10)
        m_sema->Diag(R.getNameLoc(), diag::warn_hlsl_sema_minprecision_promotion) << "min10float" << "min16float";

-      TypedefDecl* qts = m_vectorTypedefs[parsedType][colCount - 1];
-      if (qts == nullptr) {
-        qts = CreateVectorSpecializationShorthand(*m_context, qt, parsedType, colCount);
-        m_vectorTypedefs[parsedType][colCount - 1] = qts;
-      }
+      TypedefDecl *qts = LookupVectorShorthandType(parsedType, colCount);

      R.addDecl(qts);
      return true;
@ -3073,15 +3100,15 @@ public:
      const HLSL_INTRINSIC *pPrior = nullptr;
      UINT64 lookupCookie = 0;
      CA2W wideTypeName(typeName);
-      table->LookupIntrinsic(wideTypeName, L"*", &pIntrinsic, &lookupCookie);
-      while (pIntrinsic != nullptr) {
+      HRESULT found = table->LookupIntrinsic(wideTypeName, L"*", &pIntrinsic, &lookupCookie);
+      while (pIntrinsic != nullptr && SUCCEEDED(found)) {
        if (!AreIntrinsicTemplatesEquivalent(pIntrinsic, pPrior)) {
          AddObjectIntrinsicTemplate(recordDecl, startDepth, pIntrinsic);
          // NOTE: this only works with the current implementation because
          // intrinsics are alive as long as the table is alive.
          pPrior = pIntrinsic;
        }
-        table->LookupIntrinsic(wideTypeName, L"*", &pIntrinsic, &lookupCookie);
+        found = table->LookupIntrinsic(wideTypeName, L"*", &pIntrinsic, &lookupCookie);
      }
    }
  }
@ -3600,6 +3627,12 @@ public:
    _In_ ExprResult &RHS,
    _In_ SourceLocation QuestionLoc);

+  clang::QualType ApplyTypeSpecSignToParsedType(
+      _In_ clang::QualType &type,
+      _In_ TypeSpecifierSign TSS,
+      _In_ SourceLocation Loc
+  );
+
  bool CheckRangedTemplateArgument(SourceLocation diagLoc, llvm::APSInt& sintValue)
  {
    if (!sintValue.isStrictlyPositive() || sintValue.getLimitedValue() > 4)
@ -3868,6 +3901,7 @@ public:

  FunctionDecl* AddHLSLIntrinsicMethod(
    LPCSTR tableName,
+    LPCSTR lowering,
    _In_ const HLSL_INTRINSIC* intrinsic,
    _In_ FunctionTemplateDecl *FunctionTemplate,
    ArrayRef<Expr *> Args,
@ -3956,7 +3990,7 @@ public:
      SC_Extern, InlineSpecifiedFalse, IsConstexprFalse, NoLoc);

    // Add intrinsic attr
-    AddHLSLIntrinsicAttr(method, *m_context, tableName, "", intrinsic);
+    AddHLSLIntrinsicAttr(method, *m_context, tableName, lowering, intrinsic);

    // Record this function template specialization.
    TemplateArgumentList *argListCopy = TemplateArgumentList::CreateCopy(
@ -3982,6 +4016,7 @@ public:
  UINT64 ScoreFunction(OverloadCandidateSet::iterator &Cand);
  UINT64 ScoreImplicitConversionSequence(const ImplicitConversionSequence *s);
  unsigned GetNumElements(QualType anyType);
+  unsigned GetNumBasicElements(QualType anyType);
  unsigned GetNumConvertCheckElts(QualType leftType, unsigned leftSize, QualType rightType, unsigned rightSize);
  QualType GetNthElementType(QualType type, unsigned index);
  bool IsPromotion(ArBasicKind leftKind, ArBasicKind rightKind);
@ -4300,7 +4335,8 @@ static bool CombineObjectTypes(ArBasicKind Target, __in ArBasicKind Source,
  return false;
 }

-static ArBasicKind LiteralToConcrete(Expr *litExpr) {
+static ArBasicKind LiteralToConcrete(Expr *litExpr,
+                                     HLSLExternalSource *pHLSLExternalSource) {
  if (IntegerLiteral *intLit = dyn_cast<IntegerLiteral>(litExpr)) {
    llvm::APInt val = intLit->getValue();
    unsigned width = val.getActiveBits();
@ -4328,7 +4364,7 @@ static ArBasicKind LiteralToConcrete(Expr *litExpr) {
    else
      return AR_BASIC_FLOAT64;
  } else if (UnaryOperator *UO = dyn_cast<UnaryOperator>(litExpr)) {
-    ArBasicKind kind = LiteralToConcrete(UO->getSubExpr());
+    ArBasicKind kind = LiteralToConcrete(UO->getSubExpr(), pHLSLExternalSource);
    if (UO->getOpcode() == UnaryOperator::Opcode::UO_Minus) {
      if (kind == ArBasicKind::AR_BASIC_UINT32)
        kind = ArBasicKind::AR_BASIC_INT32;
@ -4337,20 +4373,32 @@ static ArBasicKind LiteralToConcrete(Expr *litExpr) {
    }
    return kind;
  } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(litExpr)) {
-    ArBasicKind kind = LiteralToConcrete(BO->getLHS());
-    ArBasicKind kind1 = LiteralToConcrete(BO->getRHS());
+    ArBasicKind kind = LiteralToConcrete(BO->getLHS(), pHLSLExternalSource);
+    ArBasicKind kind1 = LiteralToConcrete(BO->getRHS(), pHLSLExternalSource);
    CombineBasicTypes(kind, kind1, &kind);
    return kind;
+  } else if (ParenExpr *PE = dyn_cast<ParenExpr>(litExpr)) {
+    ArBasicKind kind = LiteralToConcrete(PE->getSubExpr(), pHLSLExternalSource);
+    return kind;
+  } else if (ConditionalOperator *CO = dyn_cast<ConditionalOperator>(litExpr)) {
+    ArBasicKind kind = LiteralToConcrete(CO->getLHS(), pHLSLExternalSource);
+    ArBasicKind kind1 = LiteralToConcrete(CO->getRHS(), pHLSLExternalSource);
+    CombineBasicTypes(kind, kind1, &kind);
+    return kind;
+  } else if (ImplicitCastExpr *IC = dyn_cast<ImplicitCastExpr>(litExpr)) {
+    // Use target Type for cast.
+    ArBasicKind kind = pHLSLExternalSource->GetTypeElementKind(IC->getType());
+    return kind;
  } else {
    // Could only be function call.
    CallExpr *CE = cast<CallExpr>(litExpr);
    // TODO: calculate the function call result.
    if (CE->getNumArgs() == 1)
-      return LiteralToConcrete(CE->getArg(0));
+      return LiteralToConcrete(CE->getArg(0), pHLSLExternalSource);
    else {
-      ArBasicKind kind = LiteralToConcrete(CE->getArg(0));
+      ArBasicKind kind = LiteralToConcrete(CE->getArg(0), pHLSLExternalSource);
      for (unsigned i = 1; i < CE->getNumArgs(); i++) {
-        ArBasicKind kindI = LiteralToConcrete(CE->getArg(i));
+        ArBasicKind kindI = LiteralToConcrete(CE->getArg(i), pHLSLExternalSource);
        CombineBasicTypes(kind, kindI, &kind);
      }
      return kind;
@ -4367,9 +4415,10 @@ static bool SearchTypeInTable(ArBasicKind kind, const ArBasicKind *pCT) {
  return false;
 }

-static ArBasicKind ConcreteLiteralType(Expr *litExpr,
-                                                 ArBasicKind kind,
-                                                 unsigned uLegalComponentTypes) {
+static ArBasicKind
+ConcreteLiteralType(Expr *litExpr, ArBasicKind kind,
+                    unsigned uLegalComponentTypes,
+                    HLSLExternalSource *pHLSLExternalSource) {
  const ArBasicKind *pCT = g_LegalIntrinsicCompTypes[uLegalComponentTypes];
  ArBasicKind defaultKind = *pCT;
  // Use first none literal kind as defaultKind.
@ -4383,7 +4432,7 @@ static ArBasicKind ConcreteLiteralType(Expr *litExpr,
    break;
  }

-  ArBasicKind litKind = LiteralToConcrete(litExpr);
+  ArBasicKind litKind = LiteralToConcrete(litExpr, pHLSLExternalSource);

  if (kind == AR_BASIC_LITERAL_INT) {
    // Search for match first.
@ -4493,7 +4542,7 @@ bool HLSLExternalSource::MatchArguments(
      //   CombineBasicTypes will cover the rest cases.
      if (!affectRetType) {
        TypeInfoEltKind = ConcreteLiteralType(
-            pCallArg, TypeInfoEltKind, pIntrinsicArg->uLegalComponentTypes);
+            pCallArg, TypeInfoEltKind, pIntrinsicArg->uLegalComponentTypes, this);
      }
    }

@ -5144,6 +5193,60 @@ unsigned HLSLExternalSource::GetNumElements(QualType anyType) {
  }
 }

+// Counts the basic elements contained in anyType after reducing it to its
+// structural form: basic and object types count as one, vectors and matrices
+// report their element count, arrays multiply their element count by the count
+// of the element type, and records add up their non-empty bases and fields.
+// A null type, or any other kind (asserted to be void), yields 0.
+unsigned HLSLExternalSource::GetNumBasicElements(QualType anyType) {
+  if (anyType.isNull())
+    return 0;
+
+  anyType = GetStructuralForm(anyType);
+  const ArTypeObjectKind kind = GetTypeObjectKind(anyType);
+
+  // Single-element kinds.
+  if (kind == AR_TOBJ_BASIC || kind == AR_TOBJ_OBJECT)
+    return 1;
+
+  // Vectors and matrices report their own element count.
+  if (kind == AR_TOBJ_MATRIX || kind == AR_TOBJ_VECTOR)
+    return GetElementCount(anyType);
+
+  if (kind == AR_TOBJ_ARRAY) {
+    const unsigned arrayLength = GetElementCount(anyType);
+    const unsigned perElement = GetNumBasicElements(
+        QualType(anyType->getArrayElementTypeNoTypeQual(), 0));
+    return arrayLength * perElement;
+  }
+
+  if (kind == AR_TOBJ_COMPOUND) {
+    // TODO: consider caching this value for perf
+    unsigned count = 0;
+    RecordDecl *record = anyType->getAs<RecordType>()->getDecl();
+    // Base classes come first; bases that declare no fields are skipped.
+    if (const CXXRecordDecl *cxxRecord = dyn_cast<CXXRecordDecl>(record)) {
+      for (const auto &base : cxxRecord->bases()) {
+        const CXXRecordDecl *baseDecl = cast<CXXRecordDecl>(
+            base.getType()->castAs<RecordType>()->getDecl());
+        if (!baseDecl->field_empty())
+          count += GetNumBasicElements(QualType(baseDecl->getTypeForDecl(), 0));
+      }
+    }
+    // Then every field declared directly on the record.
+    for (RecordDecl::field_iterator field = record->field_begin(),
+                                    fieldEnd = record->field_end();
+         field != fieldEnd; ++field)
+      count += GetNumBasicElements(field->getType());
+    return count;
+  }
+
+  // Nothing else is expected here except void.
+  DXASSERT(kind == AR_TOBJ_VOID,
+           "otherwise the type cannot be classified or is not supported");
+  return 0;
+}
+
 unsigned HLSLExternalSource::GetNumConvertCheckElts(QualType leftType,
                                                    unsigned leftSize,
                                                    QualType rightType,
@ -5896,6 +5999,28 @@ bool HLSLExternalSource::IsConversionToLessOrEqualElements(
          targetType.getCanonicalType().getUnqualifiedType()) {
    return true;
  }
+  // DerivedFrom is less.
+  if (sourceTypeInfo.ShapeKind == AR_TOBJ_COMPOUND ||
+      GetTypeObjectKind(sourceType) == AR_TOBJ_COMPOUND) {
+    const RecordType *targetRT = targetType->getAsStructureType();
+    if (!targetRT)
+      targetRT = dyn_cast<RecordType>(targetType);
+
+    const RecordType *sourceRT = sourceType->getAsStructureType();
+    if (!sourceRT)
+      sourceRT = dyn_cast<RecordType>(sourceType);
+
+    if (targetRT && sourceRT) {
+      RecordDecl *targetRD = targetRT->getDecl();
+      RecordDecl *sourceRD = sourceRT->getDecl();
+      const CXXRecordDecl *targetCXXRD = dyn_cast<CXXRecordDecl>(targetRD);
+      const CXXRecordDecl *sourceCXXRD = dyn_cast<CXXRecordDecl>(sourceRD);
+      if (targetCXXRD && sourceCXXRD) {
+        if (sourceCXXRD->isDerivedFrom(targetCXXRD))
+          return true;
+      }
+    }
+  }

  if (sourceTypeInfo.ShapeKind != AR_TOBJ_SCALAR &&
    sourceTypeInfo.ShapeKind != AR_TOBJ_VECTOR &&
@ -7291,6 +7416,21 @@ void HLSLExternalSource::CheckBinOpForHLSL(
  // Handle Assign and Comma operators and return
  switch (Opc)
  {
+  case BO_AddAssign:
+  case BO_AndAssign:
+  case BO_DivAssign:
+  case BO_MulAssign:
+  case BO_RemAssign:
+  case BO_ShlAssign:
+  case BO_ShrAssign:
+  case BO_SubAssign:
+  case BO_XorAssign: {
+    extern bool CheckForModifiableLvalue(Expr * E, SourceLocation Loc,
+                                         Sema & S);
+    if (CheckForModifiableLvalue(LHS.get(), OpLoc, *m_sema)) {
+      return;
+    }
+  } break;
  case BO_Assign: {
      extern bool CheckForModifiableLvalue(Expr *E, SourceLocation Loc, Sema &S);
      if (CheckForModifiableLvalue(LHS.get(), OpLoc, *m_sema)) {
@ -7707,6 +7847,49 @@ clang::QualType HLSLExternalSource::CheckVectorConditional(
  return ResultTy;
 }

+// Applies an explicit sign specifier to a parsed type. Besides primitive integer
+// types, only shorthand vectors and matrices can be made unsigned; a type the
+// specifier does not apply to is returned unchanged. Loc is currently unused.
+// NOTE(review): an array kind yields the unsigned scalar type, not an array of it - confirm this is intended.
+clang::QualType HLSLExternalSource::ApplyTypeSpecSignToParsedType(
+    _In_ clang::QualType &type, _In_ clang::TypeSpecifierSign TSS,
+    _In_ clang::SourceLocation Loc) {
+  if (TSS == TypeSpecifierSign::TSS_unspecified)
+    return type;
+  DXASSERT(TSS != TypeSpecifierSign::TSS_signed, "else signed keyword is supported in HLSL");
+
+  const ArTypeObjectKind objKind = GetTypeObjectKind(type);
+  const bool isVector = objKind == AR_TOBJ_VECTOR;
+  const bool isMatrix = objKind == AR_TOBJ_MATRIX;
+  if (!isVector && !isMatrix && objKind != AR_TOBJ_BASIC && objKind != AR_TOBJ_ARRAY)
+    return type;
+
+  // Only element kinds accepted by IS_BASIC_UNSIGNABLE are rewritten; others are left alone.
+  const ArBasicKind elementKind = GetTypeElementKind(type);
+  if (!IS_BASIC_UNSIGNABLE(elementKind))
+    return type;
+
+  // Map the element kind to its scalar type and switch that to the unsigned flavor.
+  const HLSLScalarType newScalarType = MakeUnsigned(ScalarTypeForBasic(elementKind));
+
+  if (isVector) {
+    UINT colCount = GetHLSLVecSize(type);
+    TypedefDecl *vectorTypedef = LookupVectorShorthandType(newScalarType, colCount);
+    return m_context->getTypeDeclType(vectorTypedef);
+  }
+  if (isMatrix) {
+    UINT rowCount, colCount;
+    GetRowsAndCols(type, rowCount, colCount);
+    TypedefDecl *matrixTypedef =
+        LookupMatrixShorthandType(newScalarType, rowCount, colCount);
+    return m_context->getTypeDeclType(matrixTypedef);
+  }
+
+  // Remaining kinds: basic or array.
+  DXASSERT_NOMSG(objKind == AR_TOBJ_BASIC || objKind == AR_TOBJ_ARRAY);
+  return m_scalarTypes[newScalarType];
+}
+
 Sema::TemplateDeductionResult HLSLExternalSource::DeduceTemplateArgumentsForHLSL(
  FunctionTemplateDecl *FunctionTemplate,
  TemplateArgumentListInfo *ExplicitTemplateArgs, ArrayRef<Expr *> Args,
@ -7791,7 +7974,7 @@ Sema::TemplateDeductionResult HLSLExternalSource::DeduceTemplateArgumentsForHLSL
      continue;
    }

-    Specialization = AddHLSLIntrinsicMethod(cursor.GetTableName(), *cursor, FunctionTemplate, Args, argTypes, argCount);
+    Specialization = AddHLSLIntrinsicMethod(cursor.GetTableName(), cursor.GetLoweringStrategy(), *cursor, FunctionTemplate, Args, argTypes, argCount);
    DXASSERT_NOMSG(Specialization->getPrimaryTemplate()->getCanonicalDecl() ==
      FunctionTemplate->getCanonicalDecl());

@ -8198,6 +8381,19 @@ void hlsl::DiagnoseAssignmentResultForHLSL(Sema* self,
    ->DiagnoseAssignmentResultForHLSL(ConvTy, Loc, DstType, SrcType, SrcExpr, Action, Complained);
 }

+// Reports err_hlsl_control_flow_cond_not_scalar (with StmtName) at the first vector or
+// matrix truncation cast found in the chain of implicit casts wrapping condExpr.
+void hlsl::DiagnoseControlFlowConditionForHLSL(Sema *self, Expr *condExpr, StringRef StmtName) {
+  // Peel implicit casts one layer at a time; stop at the first non-implicit-cast expression.
+  for (ImplicitCastExpr *IC; (IC = dyn_cast<ImplicitCastExpr>(condExpr)); condExpr = IC->getSubExpr()) {
+    const CastKind castKind = IC->getCastKind();
+    if (castKind != CastKind::CK_HLSLMatrixTruncationCast && castKind != CastKind::CK_HLSLVectorTruncationCast)
+      continue;
+    self->Diag(condExpr->getLocStart(), diag::err_hlsl_control_flow_cond_not_scalar) << StmtName;
+    return;
+  }
+}
+
 static bool ShaderModelsMatch(const StringRef& left, const StringRef& right)
 {
  // TODO: handle shorthand cases.
@ -8262,7 +8458,8 @@ void hlsl::DiagnoseRegisterType(
  case AR_BASIC_MIN16INT:
  case AR_BASIC_MIN16UINT:
    expected = "'b', 'c', or 'i'";
-    isValid = registerType == 'b' || registerType == 'c' || registerType == 'i';
+    isValid = registerType == 'b' || registerType == 'c' || registerType == 'i' ||
+		registerType == 'B' || registerType == 'C' || registerType == 'I';
    break;

  case AR_OBJECT_TEXTURE1D:
@ -8275,7 +8472,8 @@ void hlsl::DiagnoseRegisterType(
  case AR_OBJECT_TEXTURE2DMS:
  case AR_OBJECT_TEXTURE2DMS_ARRAY:
    expected = "'t' or 's'";
-    isValid = registerType == 't' || registerType == 's';
+    isValid = registerType == 't' || registerType == 's' ||
+		    registerType == 'T' || registerType == 'S';
    break;

  case AR_OBJECT_SAMPLER:
@ -8285,12 +8483,13 @@ void hlsl::DiagnoseRegisterType(
  case AR_OBJECT_SAMPLERCUBE:
  case AR_OBJECT_SAMPLERCOMPARISON:
    expected = "'s' or 't'";
-    isValid = registerType == 's' || registerType == 't';
+    isValid = registerType == 's' || registerType == 't' ||
+		registerType == 'S' || registerType == 'T';
    break;

  case AR_OBJECT_BUFFER:
    expected = "'t'";
-    isValid = registerType == 't';
+    isValid = registerType == 't' || registerType == 'T';
    break;

  case AR_OBJECT_POINTSTREAM:
@ -8313,13 +8512,13 @@ void hlsl::DiagnoseRegisterType(
  case AR_OBJECT_RWTEXTURE3D:
  case AR_OBJECT_RWBUFFER:
    expected = "'u'";
-    isValid = registerType == 'u';
+    isValid = registerType == 'u' || registerType == 'U';
    break;

  case AR_OBJECT_BYTEADDRESS_BUFFER:
  case AR_OBJECT_STRUCTURED_BUFFER:
    expected = "'t'";
-    isValid = registerType == 't';
+    isValid = registerType == 't' || registerType == 'T';
    break;

  case AR_OBJECT_CONSUME_STRUCTURED_BUFFER:
@ -8329,16 +8528,16 @@ void hlsl::DiagnoseRegisterType(
  case AR_OBJECT_RWSTRUCTURED_BUFFER_CONSUME:
  case AR_OBJECT_APPEND_STRUCTURED_BUFFER:
    expected = "'u'";
-    isValid = registerType == 'u';
+    isValid = registerType == 'u' || registerType == 'U';
    break;

  case AR_OBJECT_CONSTANT_BUFFER:
    expected = "'b'";
-    isValid = registerType == 'b';
+    isValid = registerType == 'b' || registerType == 'B';
    break;
  case AR_OBJECT_TEXTURE_BUFFER:
    expected = "'t'";
-    isValid = registerType == 't';
+    isValid = registerType == 't' || registerType == 'T';
    break;

  case AR_OBJECT_ROVBUFFER:
@ -8350,7 +8549,7 @@ void hlsl::DiagnoseRegisterType(
  case AR_OBJECT_ROVTEXTURE2D_ARRAY:
  case AR_OBJECT_ROVTEXTURE3D:
    expected = "'u'";
-    isValid = registerType == 'u';
+    isValid = registerType == 'u' || registerType == 'U';
    break;

  case AR_OBJECT_LEGACY_EFFECT:   // Used for all unsupported but ignored legacy effect types
@ -8449,6 +8648,36 @@ void hlsl::InitializeInitSequenceForHLSL(Sema *self,
    ->InitializeInitSequenceForHLSL(Entity, Kind, Args, TopLevelOfInitList, initSequence);
 }

+// Sums the number of basic elements contributed by every initializer in
+// InitList. Nested initializer lists are flattened recursively; any other
+// initializer contributes GetNumBasicElements() of its own type.
+static unsigned CaculateInitListSize(HLSLExternalSource *hlslSource,
+                                     const clang::InitListExpr *InitList) {
+  unsigned elementCount = 0;
+  const unsigned numInits = InitList->getNumInits();
+  for (unsigned idx = 0; idx != numInits; ++idx) {
+    const clang::Expr *init = InitList->getInit(idx);
+    const InitListExpr *nestedList = dyn_cast<InitListExpr>(init);
+    elementCount += nestedList
+                        ? CaculateInitListSize(hlslSource, nestedList)
+                        : hlslSource->GetNumBasicElements(init->getType());
+  }
+  return elementCount;
+}
+
+// Computes how many elements of type EltTy the initializer list InitList
+// provides, for sizing an array declared without explicit bounds.
+//
+// Returns the element count when the flattened initializer supplies a
+// non-zero number of basic elements that is an exact multiple of the number
+// of basic elements in EltTy. Returns 0 otherwise, including when EltTy
+// reports zero basic elements (which would otherwise divide by zero).
+unsigned hlsl::CaculateInitListArraySizeForHLSL(
+  _In_ clang::Sema* sema,
+  _In_ const clang::InitListExpr *InitList,
+  _In_ const clang::QualType EltTy) {
+  HLSLExternalSource *hlslSource = HLSLExternalSource::FromSema(sema);
+  const unsigned totalSize = CaculateInitListSize(hlslSource, InitList);
+  const unsigned eltSize = hlslSource->GetNumBasicElements(EltTy);
+
+  // Guard the modulo/division below: '%' and '/' by zero are undefined.
+  if (totalSize == 0 || eltSize == 0) {
+    return 0;
+  }
+  // A partial trailing element means no array size can be deduced.
+  if ((totalSize % eltSize) != 0) {
+    return 0;
+  }
+  return totalSize / eltSize;
+}
+
 bool hlsl::IsConversionToLessOrEqualElements(
  _In_ clang::Sema* self,
  const clang::ExprResult& sourceExpr,
@ -9375,6 +9604,10 @@ void hlsl::HandleDeclAttributeForHLSL(Sema &S, Decl *D, const AttributeList &A,
    declAttr = ::new (S.Context) HLSLTriangleAdjAttr(A.getRange(), S.Context,
      A.getAttributeSpellingListIndex());
    break;
+  case AttributeList::AT_HLSLGloballyCoherent:
+    declAttr = ::new (S.Context) HLSLGloballyCoherentAttr(
+        A.getRange(), S.Context, A.getAttributeSpellingListIndex());
+    break;

  default:
    Handled = false;
@ -9577,7 +9810,7 @@ Decl* Sema::ActOnStartHLSLBuffer(
    case hlsl::UnusualAnnotation::UA_RegisterAssignment: {
      hlsl::RegisterAssignment* registerAssignment = cast<hlsl::RegisterAssignment>(*unusualIter);

-      if (registerAssignment->RegisterType != expectedRegisterType) {
+      if (registerAssignment->RegisterType != expectedRegisterType && registerAssignment->RegisterType != toupper(expectedRegisterType)) {
        Diag(registerAssignment->Loc, cbuffer ? diag::err_hlsl_unsupported_cbuffer_register : 
                                                diag::err_hlsl_unsupported_tbuffer_register);
      } else if (registerAssignment->ShaderProfile.size() > 0) {
@ -9764,6 +9997,16 @@ bool Sema::DiagnoseHLSLDecl(Declarator &D, DeclContext *DC,
  bool isStatic = storage == DeclSpec::SCS::SCS_static;
  bool isExtern = storage == DeclSpec::SCS::SCS_extern;

+  bool hasSignSpec = D.getDeclSpec().getTypeSpecSign() != DeclSpec::TSS::TSS_unspecified;
+
+  // Function declarations are not allowed in parameter declaration
+  // TODO : Remove this check once we support function declarations/pointers in HLSL
+  if (isParameter && isFunction) {
+      Diag(D.getLocStart(), diag::err_hlsl_func_in_func_decl);
+      D.setInvalidType();
+      return false;
+  }
+
  assert(
    (1 == (isLocalVar ? 1 : 0) + (isGlobal ? 1 : 0) + (isField ? 1 : 0) +
    (isTypedef ? 1 : 0) + (isFunction ? 1 : 0) + (isMethod ? 1 : 0) +
@ -9810,12 +10053,13 @@ bool Sema::DiagnoseHLSLDecl(Declarator &D, DeclContext *DC,

  // Check for deprecated effect object type here, warn, and invalidate decl
  bool bDeprecatedEffectObject = false;
+  bool bIsObject = false;
  if (hlsl::IsObjectType(this, qt, &bDeprecatedEffectObject)) {
+    bIsObject = true;
    if (bDeprecatedEffectObject) {
      Diag(D.getLocStart(), diag::warn_hlsl_effect_object);
-      // Setting to invalid but not returning false prevents cascading errors
-      // on subsequent references to the decl
      D.setInvalidType();
+      return false;
    }
    // Add methods if not ready.
    HLSLExternalSource *hlslSource = HLSLExternalSource::FromSema(this);
@ -9861,6 +10105,24 @@ bool Sema::DiagnoseHLSLDecl(Declarator &D, DeclContext *DC,
    }
  }

+  if (hasSignSpec) {
+     HLSLExternalSource *hlslSource = HLSLExternalSource::FromSema(this);
+     ArTypeObjectKind objKind = hlslSource->GetTypeObjectKind(qt);
+     ArBasicKind basicKind = hlslSource->GetTypeElementKind(qt);
+     // vectors or matrices can only have unsigned integer types.
+     if (objKind == AR_TOBJ_MATRIX || objKind == AR_TOBJ_VECTOR || objKind == AR_TOBJ_BASIC || objKind == AR_TOBJ_ARRAY) {
+         if (!IS_BASIC_UNSIGNABLE(basicKind)) {
+             Diag(D.getLocStart(), diag::err_sema_invalid_sign_spec)
+                 << g_ArBasicTypeNames[basicKind];
+             result = false;
+         }
+     }
+     else {
+         Diag(D.getLocStart(), diag::err_sema_invalid_sign_spec) << g_ArBasicTypeNames[basicKind];
+         result = false;
+     }
+  }
+
  // Validate attributes
  clang::AttributeList
    *pPrecise = nullptr,
@ -9915,6 +10177,13 @@ bool Sema::DiagnoseHLSLDecl(Declarator &D, DeclContext *DC,
      }
      pGroupShared = pAttr;
      break;
+    case AttributeList::AT_HLSLGloballyCoherent:
+      if (!bIsObject) {
+        Diag(pAttr->getLoc(), diag::err_hlsl_varmodifierna)
+            << pAttr->getName() << "non-UAV type";
+        result = false;
+      }
+      break;
    case AttributeList::AT_HLSLUniform:
      if (!(isGlobal || isParameter)) {
        Diag(pAttr->getLoc(), diag::err_hlsl_varmodifierna)
@ -10516,6 +10785,10 @@ void hlsl::CustomPrintHLSLAttr(const clang::Attr *A, llvm::raw_ostream &Out, con
    Out << "triangleadj ";
    break;

+  case clang::attr::HLSLGloballyCoherent:
+    Out << "globallycoherent ";
+    break;
+
  default:
    A->printPretty(Out, Policy);
    break;
@ -10566,6 +10839,7 @@ bool hlsl::IsHLSLAttr(clang::attr::Kind AttrKind) {
  case clang::attr::HLSLLineAdj:
  case clang::attr::HLSLTriangle:
  case clang::attr::HLSLTriangleAdj:
+  case clang::attr::HLSLGloballyCoherent:
    return true;
  }
  
@ -10630,3 +10904,53 @@ clang::QualType hlsl::CheckVectorConditional(
 {
  return HLSLExternalSource::FromSema(self)->CheckVectorConditional(Cond, LHS, RHS, QuestionLoc);
 }
+
+// Free-function wrapper around HLSLExternalSource::IsTypeNumeric for the
+// external source attached to `self`. The count that the member function
+// writes through its second argument is discarded.
+bool IsTypeNumeric(_In_ clang::Sema* self, _In_ clang::QualType &type) {
+  HLSLExternalSource *hlslSource = HLSLExternalSource::FromSema(self);
+  UINT ignoredCount;
+  const bool isNumeric = hlslSource->IsTypeNumeric(type, &ignoredCount);
+  return isNumeric;
+}
+
+// Diagnoses compile-time-constant subscripts that fall outside the bounds of
+// an HLSL vector or matrix.
+//
+// expr must be a CXXOperatorCallExpr for operator[] (asserted). When the
+// subscript evaluates to an integer constant:
+//   - if the subscripted operand is a vector, the index is checked against
+//     the vector size (err_hlsl_vector_element_index_out_of_bounds);
+//   - if it is a matrix, the index is checked against the row count
+//     (err_hlsl_matrix_row_index_out_of_bounds).
+// Non-constant subscripts and other operand types are left undiagnosed.
+void Sema::CheckHLSLArrayAccess(const Expr *expr) {
+  DXASSERT_NOMSG(isa<CXXOperatorCallExpr>(expr));
+  const CXXOperatorCallExpr *OperatorCallExpr = cast<CXXOperatorCallExpr>(expr);
+  DXASSERT_NOMSG(OperatorCallExpr->getOperator() == OverloadedOperatorKind::OO_Subscript);
+
+  const Expr *RHS = OperatorCallExpr->getArg(1); // first subscript expression
+  llvm::APSInt index;
+  if (!RHS->EvaluateAsInt(index, Context)) {
+      return; // index is not a compile-time constant; nothing to check
+  }
+
+  int64_t intIndex = index.getLimitedValue();
+  const QualType LHSQualType = OperatorCallExpr->getArg(0)->getType();
+  if (IsVectorType(this, LHSQualType)) {
+      uint32_t vectorSize = GetHLSLVecSize(LHSQualType);
+      // If the expression is a double subscript operator on a matrix
+      // (e.g. x[0][1]), we also have to check the first subscript operator
+      // by recursively calling this function for the inner
+      // CXXOperatorCallExpr.
+      if (isa<CXXOperatorCallExpr>(OperatorCallExpr->getArg(0))) {
+          CheckHLSLArrayAccess(cast<CXXOperatorCallExpr>(OperatorCallExpr->getArg(0)));
+      }
+      // Compare in 64 bits: truncating the index to 32 bits first would let
+      // an index whose low 32 bits happen to be in range slip through.
+      if (intIndex < 0 || (uint64_t)intIndex >= vectorSize) {
+          Diag(RHS->getExprLoc(),
+              diag::err_hlsl_vector_element_index_out_of_bounds)
+              << (int)intIndex;
+      }
+  }
+  else if (IsMatrixType(this, LHSQualType)) {
+      uint32_t rowCount, colCount;
+      GetHLSLMatRowColCount(LHSQualType, rowCount, colCount);
+      // Same 64-bit comparison as the vector case above.
+      if (intIndex < 0 || (uint64_t)intIndex >= rowCount) {
+          Diag(RHS->getExprLoc(), diag::err_hlsl_matrix_row_index_out_of_bounds)
+              << (int)intIndex;
+      }
+  }
+}
+
+// Free-function entry point that forwards to
+// HLSLExternalSource::ApplyTypeSpecSignToParsedType on the external source
+// attached to `self`, passing type, TSS and Loc through unchanged.
+clang::QualType ApplyTypeSpecSignToParsedType(
+    _In_ clang::Sema* self,
+    _In_ clang::QualType &type,
+    _In_ clang::TypeSpecifierSign TSS,
+    _In_ clang::SourceLocation Loc
+)
+{
+    HLSLExternalSource *hlslSource = HLSLExternalSource::FromSema(self);
+    return hlslSource->ApplyTypeSpecSignToParsedType(type, TSS, Loc);
+}
--- a/tools/clang/lib/Sema/SemaInit.cpp
+++ b/tools/clang/lib/Sema/SemaInit.cpp
@ -6530,6 +6530,26 @@ InitializationSequence::Perform(Sema &S,
        CurInit = shouldBindAsTemporary(InitEntity)
          ? S.MaybeBindToTemporary(InitList)
          : InitList;
+        // Hack: We must update *ResultType if available in order to set the
+        // bounds of arrays, e.g. in 'int ar[] = {1, 2, 3};'.
+        // Worst case: 'const int (&arref)[] = {1, 2, 3};'.
+        if (ResultType &&
+            ResultType->getNonReferenceType()->isIncompleteArrayType()) {
+          const IncompleteArrayType *IncompleteAT =
+              S.getASTContext().getAsIncompleteArrayType(
+                  ResultType->getNonReferenceType());
+          QualType EltTy = IncompleteAT->getElementType();
+          unsigned arraySize = hlsl::CaculateInitListArraySizeForHLSL(&S, InitList, EltTy);
+          if (arraySize) {
+            llvm::APInt Size(
+                /*numBits=*/32, arraySize);
+            QualType AT = S.getASTContext().getConstantArrayType(
+                EltTy, Size, ArrayType::ArraySizeModifier::Normal,
+                /*IndexTypeQuals=*/0);
+            *ResultType = AT;
+            InitList->setType(AT);
+          }
+        }
      } else { // HLSL Change Ends code below is conditional
        InitListChecker PerformInitList(S, InitEntity, Kind, // HLSL Change - added Kind
          InitList, Ty, /*VerifyOnly=*/false);
--- a/tools/clang/lib/Sema/SemaOverload.cpp
+++ b/tools/clang/lib/Sema/SemaOverload.cpp
@ -144,6 +144,7 @@ ImplicitConversionRank clang::GetConversionRank(ImplicitConversionKind Kind) {
  };
  static_assert(_countof(Rank) == ICK_Num_Conversion_Kinds,
      "Otherwise, GetConversionRank is out of sync with ImplicitConversionKind"); // HLSL Change
+  assert((int)Kind < (int)ICK_Num_Conversion_Kinds); // HLSL Change
  return Rank[(int)Kind];
 }

@ -4927,6 +4928,7 @@ TryObjectArgumentInitialization(Sema &S, QualType FromType,
  ICS.Standard.BindsToRvalue = FromClassification.isRValue();
  ICS.Standard.BindsImplicitObjectArgumentWithoutRefQualifier
    = (Method->getRefQualifier() == RQ_None);
+  ICS.Standard.ComponentConversion = ICK_Identity;
  return ICS;
 }

--- a/tools/clang/lib/Sema/SemaStmt.cpp
+++ b/tools/clang/lib/Sema/SemaStmt.cpp
@ -37,6 +37,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
+#include "clang/Sema/SemaHLSL.h" // HLSL Change
 using namespace clang;
 using namespace sema;

@ -523,7 +524,9 @@ Sema::ActOnIfStmt(SourceLocation IfLoc, FullExprArg CondVal, Decl *CondVar,
  Expr *ConditionExpr = CondResult.getAs<Expr>();
  if (!ConditionExpr)
    return StmtError();
-
+  // HLSL Change Begin.
+  hlsl::DiagnoseControlFlowConditionForHLSL(this, ConditionExpr, "if");
+  // HLSL Change End.
  DiagnoseUnusedExprResult(thenStmt);

  if (!elseStmt) {
@ -1267,6 +1270,11 @@ Sema::ActOnWhileStmt(SourceLocation WhileLoc, FullExprArg Cond,
  Expr *ConditionExpr = CondResult.get();
  if (!ConditionExpr)
    return StmtError();
+
+  // HLSL Change Begin.
+  hlsl::DiagnoseControlFlowConditionForHLSL(this, ConditionExpr, "while");
+  // HLSL Change End.
+
  CheckBreakContinueBinding(ConditionExpr);

  DiagnoseUnusedExprResult(Body);
@ -1294,6 +1302,11 @@ Sema::ActOnDoStmt(SourceLocation DoLoc, Stmt *Body,
  if (CondResult.isInvalid())
    return StmtError();
  Cond = CondResult.get();
+  // HLSL Change Begin.
+  if (Cond) {
+    hlsl::DiagnoseControlFlowConditionForHLSL(this, Cond, "do-while");
+  }
+  // HLSL Change End.

  DiagnoseUnusedExprResult(Body);

@ -1670,7 +1683,12 @@ Sema::ActOnForStmt(SourceLocation ForLoc, SourceLocation LParenLoc,
    if (SecondResult.isInvalid())
      return StmtError();
  }
-
+  // HLSL Change Begin.
+  Expr *Cond = SecondResult.get();
+  if (Cond) {
+    hlsl::DiagnoseControlFlowConditionForHLSL(this, Cond, "for");
+  }
+  // HLSL Change End.
  Expr *Third  = third.release().getAs<Expr>();

  DiagnoseUnusedExprResult(First);
@ -1680,7 +1698,7 @@ Sema::ActOnForStmt(SourceLocation ForLoc, SourceLocation LParenLoc,
  if (isa<NullStmt>(Body))
    getCurCompoundScope().setHasEmptyLoopBodies();

-  return new (Context) ForStmt(Context, First, SecondResult.get(), ConditionVar,
+  return new (Context) ForStmt(Context, First, Cond, ConditionVar,
                               Third, Body, ForLoc, LParenLoc, RParenLoc);
 }

--- a/tools/clang/lib/Sema/SemaType.cpp
+++ b/tools/clang/lib/Sema/SemaType.cpp
@ -1388,7 +1388,6 @@ static QualType ConvertDeclSpecToType(TypeProcessingState &state) {
  }
  case DeclSpec::TST_typename: {
    assert(DS.getTypeSpecWidth() == 0 && DS.getTypeSpecComplex() == 0 &&
-           DS.getTypeSpecSign() == 0 &&
           "Can't handle qualifiers on typedef names yet!");
    Result = S.GetTypeFromParser(DS.getRepAsType());
    if (Result.isNull()) {
--- a/tools/clang/lib/Sema/gen_intrin_main_tables_15.h
+++ b/tools/clang/lib/Sema/gen_intrin_main_tables_15.h
@ -1246,8 +1246,8 @@ static const HLSL_INTRINSIC g_Intrinsics[] =
    (UINT)hlsl::IntrinsicOp::IOP_EvaluateAttributeAtSample, false, true, -1, 3, g_Intrinsics_Args7,
    (UINT)hlsl::IntrinsicOp::IOP_EvaluateAttributeCentroid, false, true, -1, 2, g_Intrinsics_Args8,
    (UINT)hlsl::IntrinsicOp::IOP_EvaluateAttributeSnapped, false, true, -1, 3, g_Intrinsics_Args9,
-    (UINT)hlsl::IntrinsicOp::IOP_GetRenderTargetSampleCount, false, false, -1, 1, g_Intrinsics_Args10,
-    (UINT)hlsl::IntrinsicOp::IOP_GetRenderTargetSamplePosition, false, false, -1, 2, g_Intrinsics_Args11,
+    (UINT)hlsl::IntrinsicOp::IOP_GetRenderTargetSampleCount, false, true, -1, 1, g_Intrinsics_Args10,
+    (UINT)hlsl::IntrinsicOp::IOP_GetRenderTargetSamplePosition, false, true, -1, 2, g_Intrinsics_Args11,
    (UINT)hlsl::IntrinsicOp::IOP_GroupMemoryBarrier, false, false, -1, 1, g_Intrinsics_Args12,
    (UINT)hlsl::IntrinsicOp::IOP_GroupMemoryBarrierWithGroupSync, false, false, -1, 1, g_Intrinsics_Args13,
    (UINT)hlsl::IntrinsicOp::IOP_InterlockedAdd, false, false, -1, 3, g_Intrinsics_Args14,
@ -1266,40 +1266,40 @@ static const HLSL_INTRINSIC g_Intrinsics[] =
    (UINT)hlsl::IntrinsicOp::IOP_InterlockedXor, false, false, -1, 3, g_Intrinsics_Args27,
    (UINT)hlsl::IntrinsicOp::IOP_InterlockedXor, false, false, -1, 4, g_Intrinsics_Args28,
    (UINT)hlsl::IntrinsicOp::IOP_NonUniformResourceIndex, false, true, -1, 2, g_Intrinsics_Args29,
-    (UINT)hlsl::IntrinsicOp::IOP_Process2DQuadTessFactorsAvg, false, false, -1, 6, g_Intrinsics_Args30,
-    (UINT)hlsl::IntrinsicOp::IOP_Process2DQuadTessFactorsMax, false, false, -1, 6, g_Intrinsics_Args31,
-    (UINT)hlsl::IntrinsicOp::IOP_Process2DQuadTessFactorsMin, false, false, -1, 6, g_Intrinsics_Args32,
-    (UINT)hlsl::IntrinsicOp::IOP_ProcessIsolineTessFactors, false, false, -1, 5, g_Intrinsics_Args33,
-    (UINT)hlsl::IntrinsicOp::IOP_ProcessQuadTessFactorsAvg, false, false, -1, 6, g_Intrinsics_Args34,
-    (UINT)hlsl::IntrinsicOp::IOP_ProcessQuadTessFactorsMax, false, false, -1, 6, g_Intrinsics_Args35,
-    (UINT)hlsl::IntrinsicOp::IOP_ProcessQuadTessFactorsMin, false, false, -1, 6, g_Intrinsics_Args36,
-    (UINT)hlsl::IntrinsicOp::IOP_ProcessTriTessFactorsAvg, false, false, -1, 6, g_Intrinsics_Args37,
-    (UINT)hlsl::IntrinsicOp::IOP_ProcessTriTessFactorsMax, false, false, -1, 6, g_Intrinsics_Args38,
-    (UINT)hlsl::IntrinsicOp::IOP_ProcessTriTessFactorsMin, false, false, -1, 6, g_Intrinsics_Args39,
-    (UINT)hlsl::IntrinsicOp::IOP_QuadReadAcrossDiagonal, false, true, -1, 2, g_Intrinsics_Args40,
-    (UINT)hlsl::IntrinsicOp::IOP_QuadReadAcrossX, false, true, -1, 2, g_Intrinsics_Args41,
-    (UINT)hlsl::IntrinsicOp::IOP_QuadReadAcrossY, false, true, -1, 2, g_Intrinsics_Args42,
-    (UINT)hlsl::IntrinsicOp::IOP_QuadReadLaneAt, false, true, -1, 3, g_Intrinsics_Args43,
-    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveAllEqual, false, true, -1, 2, g_Intrinsics_Args44,
-    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveAllTrue, false, true, -1, 2, g_Intrinsics_Args45,
-    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveAnyTrue, false, true, -1, 2, g_Intrinsics_Args46,
-    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveBallot, false, true, -1, 2, g_Intrinsics_Args47,
-    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveBitAnd, false, true, -1, 2, g_Intrinsics_Args48,
-    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveBitOr, false, true, -1, 2, g_Intrinsics_Args49,
-    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveBitXor, false, true, -1, 2, g_Intrinsics_Args50,
-    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveCountBits, false, true, -1, 2, g_Intrinsics_Args51,
-    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveMax, false, true, -1, 2, g_Intrinsics_Args52,
-    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveMin, false, true, -1, 2, g_Intrinsics_Args53,
-    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveProduct, false, true, -1, 2, g_Intrinsics_Args54,
-    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveSum, false, true, -1, 2, g_Intrinsics_Args55,
+    (UINT)hlsl::IntrinsicOp::IOP_Process2DQuadTessFactorsAvg, false, true, -1, 6, g_Intrinsics_Args30,
+    (UINT)hlsl::IntrinsicOp::IOP_Process2DQuadTessFactorsMax, false, true, -1, 6, g_Intrinsics_Args31,
+    (UINT)hlsl::IntrinsicOp::IOP_Process2DQuadTessFactorsMin, false, true, -1, 6, g_Intrinsics_Args32,
+    (UINT)hlsl::IntrinsicOp::IOP_ProcessIsolineTessFactors, false, true, -1, 5, g_Intrinsics_Args33,
+    (UINT)hlsl::IntrinsicOp::IOP_ProcessQuadTessFactorsAvg, false, true, -1, 6, g_Intrinsics_Args34,
+    (UINT)hlsl::IntrinsicOp::IOP_ProcessQuadTessFactorsMax, false, true, -1, 6, g_Intrinsics_Args35,
+    (UINT)hlsl::IntrinsicOp::IOP_ProcessQuadTessFactorsMin, false, true, -1, 6, g_Intrinsics_Args36,
+    (UINT)hlsl::IntrinsicOp::IOP_ProcessTriTessFactorsAvg, false, true, -1, 6, g_Intrinsics_Args37,
+    (UINT)hlsl::IntrinsicOp::IOP_ProcessTriTessFactorsMax, false, true, -1, 6, g_Intrinsics_Args38,
+    (UINT)hlsl::IntrinsicOp::IOP_ProcessTriTessFactorsMin, false, true, -1, 6, g_Intrinsics_Args39,
+    (UINT)hlsl::IntrinsicOp::IOP_QuadReadAcrossDiagonal, false, false, -1, 2, g_Intrinsics_Args40,
+    (UINT)hlsl::IntrinsicOp::IOP_QuadReadAcrossX, false, false, -1, 2, g_Intrinsics_Args41,
+    (UINT)hlsl::IntrinsicOp::IOP_QuadReadAcrossY, false, false, -1, 2, g_Intrinsics_Args42,
+    (UINT)hlsl::IntrinsicOp::IOP_QuadReadLaneAt, false, false, -1, 3, g_Intrinsics_Args43,
+    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveAllEqual, false, false, -1, 2, g_Intrinsics_Args44,
+    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveAllTrue, false, false, -1, 2, g_Intrinsics_Args45,
+    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveAnyTrue, false, false, -1, 2, g_Intrinsics_Args46,
+    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveBallot, false, false, -1, 2, g_Intrinsics_Args47,
+    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveBitAnd, false, false, -1, 2, g_Intrinsics_Args48,
+    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveBitOr, false, false, -1, 2, g_Intrinsics_Args49,
+    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveBitXor, false, false, -1, 2, g_Intrinsics_Args50,
+    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveCountBits, false, false, -1, 2, g_Intrinsics_Args51,
+    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveMax, false, false, -1, 2, g_Intrinsics_Args52,
+    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveMin, false, false, -1, 2, g_Intrinsics_Args53,
+    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveProduct, false, false, -1, 2, g_Intrinsics_Args54,
+    (UINT)hlsl::IntrinsicOp::IOP_WaveActiveSum, false, false, -1, 2, g_Intrinsics_Args55,
    (UINT)hlsl::IntrinsicOp::IOP_WaveGetLaneCount, false, true, -1, 1, g_Intrinsics_Args56,
    (UINT)hlsl::IntrinsicOp::IOP_WaveGetLaneIndex, false, true, -1, 1, g_Intrinsics_Args57,
-    (UINT)hlsl::IntrinsicOp::IOP_WaveIsFirstLane, false, true, -1, 1, g_Intrinsics_Args58,
-    (UINT)hlsl::IntrinsicOp::IOP_WavePrefixCountBits, false, true, -1, 2, g_Intrinsics_Args59,
-    (UINT)hlsl::IntrinsicOp::IOP_WavePrefixProduct, false, true, -1, 2, g_Intrinsics_Args60,
-    (UINT)hlsl::IntrinsicOp::IOP_WavePrefixSum, false, true, -1, 2, g_Intrinsics_Args61,
-    (UINT)hlsl::IntrinsicOp::IOP_WaveReadLaneAt, false, true, -1, 3, g_Intrinsics_Args62,
-    (UINT)hlsl::IntrinsicOp::IOP_WaveReadLaneFirst, false, true, -1, 2, g_Intrinsics_Args63,
+    (UINT)hlsl::IntrinsicOp::IOP_WaveIsFirstLane, false, false, -1, 1, g_Intrinsics_Args58,
+    (UINT)hlsl::IntrinsicOp::IOP_WavePrefixCountBits, false, false, -1, 2, g_Intrinsics_Args59,
+    (UINT)hlsl::IntrinsicOp::IOP_WavePrefixProduct, false, false, -1, 2, g_Intrinsics_Args60,
+    (UINT)hlsl::IntrinsicOp::IOP_WavePrefixSum, false, false, -1, 2, g_Intrinsics_Args61,
+    (UINT)hlsl::IntrinsicOp::IOP_WaveReadLaneAt, false, false, -1, 3, g_Intrinsics_Args62,
+    (UINT)hlsl::IntrinsicOp::IOP_WaveReadLaneFirst, false, false, -1, 2, g_Intrinsics_Args63,
    (UINT)hlsl::IntrinsicOp::IOP_abort, false, false, -1, 1, g_Intrinsics_Args64,
    (UINT)hlsl::IntrinsicOp::IOP_abs, false, true, -1, 2, g_Intrinsics_Args65,
    (UINT)hlsl::IntrinsicOp::IOP_acos, false, true, -1, 2, g_Intrinsics_Args66,
--- a/tools/clang/test/CodeGenHLSL/AddUint64.hlsl
+++ b/tools/clang/test/CodeGenHLSL/AddUint64.hlsl
@ -0,0 +1,10 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+
+// CHECK: binaryWithCarry
+// CHECK: binaryWithCarry
+
+float4 main(uint4 a : A, uint4 b :B) : SV_TARGET {
+  uint2 c2 = AddUint64(a.xy, b.xy); // two-component (uint2) operands
+  uint4 c4 = AddUint64(a, b); // four-component (uint4) operands
+  return c2.xxyy + c4; // combine both results so neither call is unused
+}
--- a/tools/clang/test/CodeGenHLSL/AddUint64Odd.hlsl
+++ b/tools/clang/test/CodeGenHLSL/AddUint64Odd.hlsl
@ -0,0 +1,9 @@
+// RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s
+
+// CHECK: AddUint64 can only be applied to uint2 and uint4 operands
+
+float4 main(uint4 a : A, uint4 b :B) : SV_TARGET {
+  uint c = AddUint64(a.x, b.x); // single-component operands: neither uint2 nor uint4
+  uint3 c3 = AddUint64(a.xyz, b.xyz); // three-component operands: neither uint2 nor uint4
+  return c + c3.xyzz;
+}
--- a/tools/clang/test/CodeGenHLSL/BasicHLSL11_PS.hlsl
+++ b/tools/clang/test/CodeGenHLSL/BasicHLSL11_PS.hlsl
@ -1,9 +1,9 @@
 // RUN: %dxc -E main -T ps_6_0 %s | FileCheck %s

-// CHECK: TEXCOORD
-// CHECK: xy
 // CHECK: NORMAL
 // CHECK: xyz
+// CHECK: TEXCOORD
+// CHECK: xy

 // CHECK: SV_Target
 // CHECK: xyzw
--- a/Показать больше
+++ b/Показать больше