Strip Byte Order Markers (BOMs) and fix some related issues (#3880)

* Stop creating binary blobs with UTF-8 encoding

* Strip BOM and fix some related issues
  - Detect and strip the BOM only when interpreting a buffer as text:
    - DxcCreateBlob: only when the encoding is known and is not CP_ACP
    - DxcGetBlobAsUtf[8|16]: when the encoding is not known or is CP_ACP
  - strncmp in BOM detection would misfire (it should have been memcmp)
  - The ambiguous BOM case was not handled or noted
  - Empty blob case fixes (handle null-terminated empty buffers, don't leak)

  Note: there are still problems in the Unicode handling:
  - No actual support for UTF-32 or big-endian input encodings.
  - For Linux: "UTF-16" isn't really UTF-16, and there are bad assumptions around "wide" strings.

* Add defines for CP_UTF*; improve error for unsupported UTF conversions
2021-07-15 16:41:30 -07:00 · 2021-07-15 16:41:30 -07:00 · 58f0ee2637
--- a/lib/DxcSupport/FileIOHelper.cpp
+++ b/lib/DxcSupport/FileIOHelper.cpp
@ -24,7 +24,18 @@
 #include <intsafe.h>
 #endif

-#define CP_UTF16 1200
+// CP_UTF8 is defined in WinNls.h, but others we use are not defined there.
+// See matching value definitions here:
+// https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
+
+// We detect all of these through their byte order mark (BOM).
+#define CP_UTF16LE 1200 // UTF-16, little endian; BOM bytes: FF FE
+#define CP_UTF16BE 1201 // UTF-16, big endian; BOM bytes: FE FF
+#define CP_UTF32LE 12000 // UTF-32, little endian; BOM bytes: FF FE 00 00
+#define CP_UTF32BE 12001 // UTF-32, big endian; BOM bytes: 00 00 FE FF
+
+// Alias for CP_UTF16LE, which is the only UTF-16 byte order we actually
+// handle.
+#define CP_UTF16 CP_UTF16LE

 struct HeapMalloc : public IMalloc {
 public:
@ -146,20 +157,24 @@ UINT32 DxcCodePageFromBytes(const char *bytes, size_t byteLen) throw() {
    // Now try to use the BOM to check for Unicode encodings
    char bom[4] = { bytes[0], bytes[1], bytes[2], bytes[3] };

-    if (strncmp(bom, "\xef\xbb\xbf", 3) == 0) {
+    if (memcmp(bom, "\xef\xbb\xbf", 3) == 0) {
      codePage = CP_UTF8;
    }
-    else if (strncmp(bom, "\xff\xfe\x00\x00", 4) == 0) {
-      codePage = 12000; //UTF-32 LE
+    else if (byteLen > 4 && memcmp(bom, "\xff\xfe\x00\x00", 4) == 0) {
+      // byteLen > 4 to avoid mistaking empty UTF-16 LE BOM + null-terminator
+      // for UTF-32 LE BOM without null-terminator.
+      // If it's an empty UTF-32 LE with no null-termination,
+      // it's harmless to interpret as empty UTF-16 LE with null-termination.
+      codePage = CP_UTF32LE;
    }
-    else if (strncmp(bom, "\x00\x00\xfe\xff", 4) == 0) {
-      codePage = 12001; //UTF-32 BE
+    else if (memcmp(bom, "\x00\x00\xfe\xff", 4) == 0) {
+      codePage = CP_UTF32BE;
    }
-    else if (strncmp(bom, "\xff\xfe", 2) == 0) {
-      codePage = 1200; //UTF-16 LE
+    else if (memcmp(bom, "\xff\xfe", 2) == 0) {
+      codePage = CP_UTF16LE;
    }
-    else if (strncmp(bom, "\xfe\xff", 2) == 0) {
-      codePage = 1201; //UTF-16 BE
+    else if (memcmp(bom, "\xfe\xff", 2) == 0) {
+      codePage = CP_UTF16BE;
    }
    else {
      codePage = CP_ACP;
@ -171,6 +186,49 @@ UINT32 DxcCodePageFromBytes(const char *bytes, size_t byteLen) throw() {
  return codePage;
 }

+// Returns the size in bytes of the byte order mark associated with codePage:
+// 3 for CP_UTF8, 2 for either UTF-16 code page, 4 for either UTF-32 code
+// page, and 0 for every other code page.
+unsigned GetBomLengthFromCodePage(UINT32 codePage) {
+  if (codePage == CP_UTF8)
+    return 3;
+  if (codePage == CP_UTF16LE || codePage == CP_UTF16BE)
+    return 2;
+  if (codePage == CP_UTF32LE || codePage == CP_UTF32BE)
+    return 4;
+  return 0;
+}
+
+// Returns the size in bytes of one code unit for codePage: 4 for the UTF-32
+// code pages, 2 for the UTF-16 code pages, and 1 for anything else.
+static unsigned CharSizeFromCodePage(UINT32 codePage) {
+  const bool isUtf32 = codePage == CP_UTF32LE || codePage == CP_UTF32BE;
+  const bool isUtf16 = codePage == CP_UTF16LE || codePage == CP_UTF16BE;
+  return isUtf32 ? 4u : (isUtf16 ? 2u : 1u);
+}
+
+// Reports whether codePage is one of the UTF code pages we do not handle
+// translation from: UTF-32 (either byte order) and big-endian UTF-16.
+static bool IsUnsupportedUtfCodePage(UINT32 codePage) {
+  return codePage == CP_UTF32LE || codePage == CP_UTF32BE ||
+         codePage == CP_UTF16BE;
+}
+
+// Detects the code page from the leading bytes of the buffer and returns the
+// length in bytes of the matching BOM (0 when the detected code page has
+// none).
+unsigned GetBomLengthFromBytes(const char *bytes, size_t byteLen) throw() {
+  const UINT32 detectedCodePage = DxcCodePageFromBytes(bytes, byteLen);
+  return GetBomLengthFromCodePage(detectedCodePage);
+}
+
 #define IsSizeWcharAligned(size) (((size) & (sizeof(wchar_t) - 1)) == 0)

 template<typename _char>
@ -364,6 +422,9 @@ static HRESULT CodePageBufferToUtf16(UINT32 codePage, LPCVOID bufferPointer,
    return S_OK;
  }

+  if (IsUnsupportedUtfCodePage(codePage))
+    return DXC_E_STRING_ENCODING_FAILED;
+
  // Calculate the length of the buffer in wchar_t elements.
  int numToConvertUTF16 =
      MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, (LPCSTR)bufferPointer,
@ -409,7 +470,6 @@ static HRESULT CodePageBufferToUtf8(UINT32 codePage, LPCVOID bufferPointer,
                                    CDxcMallocHeapPtr<char> &utf8NewCopy,
                                    _Out_ UINT32 *pConvertedCharCount) {
  *pConvertedCharCount = 0;
-
  CDxcMallocHeapPtr<WCHAR> utf16NewCopy(pMalloc);
  UINT32 utf16CharCount = 0;
  const WCHAR *utf16Chars = nullptr;
@ -509,7 +569,6 @@ static bool TryCreateBlobUtfFromBlob(
  return false;
 }

-
 HRESULT DxcCreateBlob(
    LPCVOID pPtr, SIZE_T size, bool bPinned, bool bCopy,
    bool encodingKnown, UINT32 codePage,
@ -520,17 +579,40 @@ HRESULT DxcCreateBlob(
  *ppBlobEncoding = nullptr;

  bool bNullTerminated = encodingKnown ? IsBufferNullTerminated(pPtr, size, codePage) : false;
+  unsigned bomSize = (encodingKnown && codePage != CP_ACP)
+                         ? GetBomLengthFromBytes((const char *)pPtr, size)
+                         : 0;
+  if (bomSize) {
+    // Adjust pPtr and size to skip BOM.
+    // When !encodingKnown or codePage == CP_ACP, BOM will be skipped when
+    // interpreting as text and translating to unicode, since at this point,
+    // the buffer could be an arbitrary binary blob.
+
+    // There is an odd corner case with BOM detection where an empty
+    // non-null-terminated UTF-32 LE buffer with BOM would be interpreted
+    // as an empty null-terminated UTF-16 LE buffer with a BOM.
+    // This won't matter in the end, since both cases are empty buffers, and
+    // will map to the empty buffer case with the desired codePage setting.
+    pPtr = (const char *)pPtr + bomSize;
+    size -= bomSize;
+  }
+  bool emptyString = !pPtr || !size;
+  if (bNullTerminated) {
+    DXASSERT_NOMSG(pPtr && size && encodingKnown);
+    emptyString = size == CharSizeFromCodePage(codePage);
+  }

  if (!pMalloc)
    pMalloc = DxcGetThreadMallocNoRef();

  // Handle empty blob
-  if (!pPtr || !size) {
+  if (emptyString) {
    if (encodingKnown && TryCreateEmptyBlobUtf(codePage, pMalloc, ppBlobEncoding))
      return S_OK;
    InternalDxcBlobEncoding *pInternalEncoding;
    IFR(InternalDxcBlobEncoding::CreateFromMalloc(nullptr, pMalloc, 0, encodingKnown, codePage, &pInternalEncoding));
    *ppBlobEncoding = pInternalEncoding;
+    return S_OK;
  }

  if (bPinned) {
@ -856,9 +938,17 @@ HRESULT DxcGetBlobAsUtf8(IDxcBlob *pBlob, IMalloc *pMalloc, IDxcBlobUtf8 **pBlob
      return hr;
  }

+  const char *bufferPointer = (const char *)pBlob->GetBufferPointer();
  SIZE_T blobLen = pBlob->GetBufferSize();
-  if (!known) {
-    codePage = DxcCodePageFromBytes((char *)pBlob->GetBufferPointer(), blobLen);
+  unsigned bomSize = 0;
+  if (!known || codePage == CP_ACP) {
+    // Try to determine encoding from BOM.
+    // If encoding was known, any BOM should have been stripped already.
+    codePage = DxcCodePageFromBytes(bufferPointer, blobLen);
+    bomSize = GetBomLengthFromCodePage(codePage);
+    // BOM exists, adjust pointer and size to strip.
+    bufferPointer += bomSize;
+    blobLen -= bomSize;
  }

  if (!pMalloc)
@ -870,11 +960,14 @@ HRESULT DxcGetBlobAsUtf8(IDxcBlob *pBlob, IMalloc *pMalloc, IDxcBlobUtf8 **pBlob
  // Reuse or copy the underlying blob depending on null-termination
  if (codePage == CP_UTF8) {
    utf8CharCount = blobLen;
-    if (IsBufferNullTerminated(pBlob->GetBufferPointer(), blobLen, CP_UTF8)) {
+    if (IsBufferNullTerminated(bufferPointer, blobLen, CP_UTF8)) {
      // Already null-terminated, reference other blob's memory
      InternalDxcBlobUtf8* internalEncoding;
      hr = InternalDxcBlobUtf8::CreateFromBlob(pBlob, pMalloc, true, CP_UTF8, &internalEncoding);
      if (SUCCEEDED(hr)) {
+        // Adjust if buffer has BOM; blobLen is already adjusted.
+        if (bomSize)
+          internalEncoding->AdjustPtrAndSize(bomSize, blobLen);
        *pBlobEncoding = internalEncoding;
      }
      return hr;
@ -882,13 +975,13 @@ HRESULT DxcGetBlobAsUtf8(IDxcBlob *pBlob, IMalloc *pMalloc, IDxcBlobUtf8 **pBlob
      // Copy to new buffer and null-terminate
      if(!utf8NewCopy.Allocate(utf8CharCount + 1))
        return E_OUTOFMEMORY;
-      memcpy(utf8NewCopy.m_pData, pBlob->GetBufferPointer(), blobLen);
+      memcpy(utf8NewCopy.m_pData, bufferPointer, utf8CharCount);
      utf8NewCopy.m_pData[utf8CharCount++] = 0;
    }
  } else {
    // Convert and create a blob that owns the encoding.
    if (FAILED(
-      hr = CodePageBufferToUtf8(codePage, pBlob->GetBufferPointer(), blobLen,
+      hr = CodePageBufferToUtf8(codePage, bufferPointer, blobLen,
                                 pMalloc, utf8NewCopy, &utf8CharCount))) {
      return hr;
    }
@ -942,9 +1035,18 @@ HRESULT DxcGetBlobAsUtf16(IDxcBlob *pBlob, IMalloc *pMalloc, IDxcBlobUtf16 **pBl
      return hr;
  }

+  // Look for BOM and adjust pointer and size to skip if necessary.
+  const char *bufferPointer = (const char *)pBlob->GetBufferPointer();
  SIZE_T blobLen = pBlob->GetBufferSize();
-  if (!known) {
-    codePage = DxcCodePageFromBytes((char *)pBlob->GetBufferPointer(), blobLen);
+  unsigned bomSize = 0;
+  if (!known || codePage == CP_ACP) {
+    // Try to determine encoding from BOM.
+    // If encoding was known, any BOM should have been stripped already.
+    codePage = DxcCodePageFromBytes(bufferPointer, blobLen);
+    bomSize = GetBomLengthFromCodePage(codePage);
+    // BOM exists, adjust pointer and size to strip.
+    bufferPointer += bomSize;
+    blobLen -= bomSize;
  }

  if (!pMalloc)
@ -958,11 +1060,14 @@ HRESULT DxcGetBlobAsUtf16(IDxcBlob *pBlob, IMalloc *pMalloc, IDxcBlobUtf16 **pBl
    DXASSERT(IsSizeWcharAligned(blobLen),
             "otherwise, UTF-16 blob size not evenly divisible by 2");
    utf16CharCount = blobLen / sizeof(wchar_t);
-    if (IsBufferNullTerminated(pBlob->GetBufferPointer(), blobLen, CP_UTF16)) {
+    if (IsBufferNullTerminated(bufferPointer, blobLen, CP_UTF16)) {
      // Already null-terminated, reference other blob's memory
      InternalDxcBlobUtf16* internalEncoding;
      hr = InternalDxcBlobUtf16::CreateFromBlob(pBlob, pMalloc, true, CP_UTF16, &internalEncoding);
      if (SUCCEEDED(hr)) {
+        // Adjust if buffer has BOM; blobLen is already adjusted.
+        if (bomSize)
+          internalEncoding->AdjustPtrAndSize(bomSize, blobLen);
        *pBlobEncoding = internalEncoding;
      }
      return hr;
@ -970,13 +1075,13 @@ HRESULT DxcGetBlobAsUtf16(IDxcBlob *pBlob, IMalloc *pMalloc, IDxcBlobUtf16 **pBl
      // Copy to new buffer and null-terminate
      if(!utf16NewCopy.Allocate(utf16CharCount + 1))
        return E_OUTOFMEMORY;
-      memcpy(utf16NewCopy.m_pData, pBlob->GetBufferPointer(), blobLen);
+      memcpy(utf16NewCopy.m_pData, bufferPointer, blobLen);
      utf16NewCopy.m_pData[utf16CharCount++] = 0;
    }
  } else {
    // Convert and create a blob that owns the encoding.
    if (FAILED(
-      hr = CodePageBufferToUtf16(codePage, pBlob->GetBufferPointer(), blobLen,
+      hr = CodePageBufferToUtf16(codePage, bufferPointer, blobLen,
                                 utf16NewCopy, &utf16CharCount))) {
      return hr;
    }
--- a/lib/DxilContainer/DxcContainerBuilder.cpp
+++ b/lib/DxilContainer/DxcContainerBuilder.cpp
@ -40,8 +40,8 @@ HRESULT STDMETHODCALLTYPE DxcContainerBuilder::Load(_In_ IDxcBlob *pSource) {
    const DxilContainerHeader *pHeader = (DxilContainerHeader *)pSource->GetBufferPointer();
    for (DxilPartIterator it = begin(pHeader), itEnd = end(pHeader); it != itEnd; ++it) {
      const DxilPartHeader *pPartHeader = *it;
-      CComPtr<IDxcBlobEncoding> pBlob;
-      IFT(DxcCreateBlobWithEncodingFromPinned((const void *)(pPartHeader + 1), pPartHeader->PartSize, CP_UTF8, &pBlob));
+      CComPtr<IDxcBlob> pBlob;
+      IFT(DxcCreateBlobFromPinned((const void *)(pPartHeader + 1), pPartHeader->PartSize, &pBlob));
      PartList::iterator itPartList = std::find_if(m_parts.begin(), m_parts.end(), [&](DxilPart part) {
        return part.m_fourCC == pPartHeader->PartFourCC;
      });
--- a/lib/DxilRootSignature/DxilRootSignature.cpp
+++ b/lib/DxilRootSignature/DxilRootSignature.cpp
@ -89,8 +89,8 @@ void RootSignatureHandle::Deserialize() {
 void RootSignatureHandle::LoadSerialized(const uint8_t *pData,
                                         unsigned length) {
  DXASSERT_NOMSG(IsEmpty());
-  IDxcBlobEncoding *pCreated;
-  IFT(DxcCreateBlobWithEncodingOnHeapCopy(pData, length, CP_UTF8, &pCreated));
+  IDxcBlob *pCreated;
+  IFT(DxcCreateBlobOnHeapCopy(pData, length, &pCreated));
  m_pSerialized = pCreated;
 }

--- a/tools/clang/tools/dxa/dxa.cpp
+++ b/tools/clang/tools/dxa/dxa.cpp
@ -115,7 +115,7 @@ void DxaContext::Assemble() {
        }
      }

-      WriteBlobToFile(pContainer, StringRefUtf16(OutputFilename), DXC_CP_UTF8); // TODO: Support DefaultTextCodePage
+      WriteBlobToFile(pContainer, StringRefUtf16(OutputFilename), DXC_CP_ACP);
      printf("Output written to \"%s\"\n", OutputFilename.c_str());
    }
  } else {
--- a/tools/clang/unittests/HLSL/CompilerTest.cpp
+++ b/tools/clang/unittests/HLSL/CompilerTest.cpp
@ -954,7 +954,7 @@ TEST_F(CompilerTest, CompileThenAddCustomDebugName) {

  CComPtr<IDxcBlobEncoding> pDebugName;

-  CreateBlobPinned(pNameBlobContent, allocatedSize, CP_UTF8, &pDebugName);
+  CreateBlobPinned(pNameBlobContent, allocatedSize, DXC_CP_ACP, &pDebugName);


  VERIFY_SUCCEEDED(pBuilder->Load(pProgram));
@ -2939,7 +2939,7 @@ TEST_F(CompilerTest, LibGVStore) {
  unsigned bitcode_size = hlsl::GetDxilBitcodeSize((hlsl::DxilProgramHeader *)pBitcode->GetBufferPointer());

  CComPtr<IDxcBlobEncoding> pBitcodeBlob;
-  CreateBlobPinned(bitcode, bitcode_size, CP_UTF8, &pBitcodeBlob);
+  CreateBlobPinned(bitcode, bitcode_size, DXC_CP_ACP, &pBitcodeBlob);

  CComPtr<IDxcBlob> pReassembled;
  CComPtr<IDxcOperationResult> pReassembleResult;
--- a/tools/clang/unittests/HLSL/ValidationTest.cpp
+++ b/tools/clang/unittests/HLSL/ValidationTest.cpp
@ -336,7 +336,7 @@ public:
    CComPtr<IDxcLibrary> pLibrary;
    CComPtr<IDxcBlobEncoding> pBlobEncoding; // Encoding doesn't actually matter, it's binary.
    VERIFY_SUCCEEDED(m_dllSupport.CreateInstance(CLSID_DxcLibrary, &pLibrary));
-    VERIFY_SUCCEEDED(pLibrary->CreateBlobWithEncodingFromPinned(pBlob, blobSize, CP_UTF8, &pBlobEncoding));
+    VERIFY_SUCCEEDED(pLibrary->CreateBlobWithEncodingFromPinned(pBlob, blobSize, DXC_CP_ACP, &pBlobEncoding));
    CheckValidationMsgs(pBlobEncoding, pErrorMsgs, bRegex, Flags);
  }

--- a/utils/hct/cmdtestfiles/bom-inc-ascii.hlsli
+++ b/utils/hct/cmdtestfiles/bom-inc-ascii.hlsli
@ -0,0 +1 @@
+float4 f_ascii;
--- a/utils/hct/cmdtestfiles/bom-inc-utf16le.hlsli
+++ b/utils/hct/cmdtestfiles/bom-inc-utf16le.hlsli
--- a/utils/hct/cmdtestfiles/bom-inc-utf8.hlsli
+++ b/utils/hct/cmdtestfiles/bom-inc-utf8.hlsli
@ -0,0 +1 @@
+float4 f_utf8;
--- a/utils/hct/cmdtestfiles/bom-main-ascii.hlsl
+++ b/utils/hct/cmdtestfiles/bom-main-ascii.hlsl
@ -0,0 +1,20 @@
+// Tests main and include files with/without BOM to ensure BOM is stripped.
+
+#include "bom-inc-ascii.hlsli"
+#include "bom-inc-utf8.hlsli"
+#include "bom-inc-utf16le.hlsli"
+
+// TODO: Add support for Big Endian and UTF-32
+// #include "bom-inc-utf16be.hlsli"
+// #include "bom-inc-utf32le.hlsli"
+// #include "bom-inc-utf32be.hlsli"
+
+float4 main() : SV_Target {
+  return f_ascii
+        + f_utf8
+        + f_utf16le
+        // + f_utf16be
+        // + f_utf32le
+        // + f_utf32be
+        ;
+}
--- a/utils/hct/cmdtestfiles/bom-main-utf16le.hlsl
+++ b/utils/hct/cmdtestfiles/bom-main-utf16le.hlsl
--- a/utils/hct/cmdtestfiles/bom-main-utf8.hlsl
+++ b/utils/hct/cmdtestfiles/bom-main-utf8.hlsl
@ -0,0 +1,20 @@
+// Tests main and include files with/without BOM to ensure BOM is stripped.
+
+#include "bom-inc-ascii.hlsli"
+#include "bom-inc-utf8.hlsli"
+#include "bom-inc-utf16le.hlsli"
+
+// TODO: Add support for Big Endian and UTF-32
+// #include "bom-inc-utf16be.hlsli"
+// #include "bom-inc-utf32le.hlsli"
+// #include "bom-inc-utf32be.hlsli"
+
+float4 main() : SV_Target {
+  return f_ascii
+        + f_utf8
+        + f_utf16le
+        // + f_utf16be
+        // + f_utf32le
+        // + f_utf32be
+        ;
+}
--- a/utils/hct/hcttestcmds.cmd
+++ b/utils/hct/hcttestcmds.cmd
@ -421,6 +421,12 @@ if %Failed% neq 0 goto :failed
 call :run dxc.exe -P include-main.hlsl.pp -I inc subfolder\include-main.hlsl
 if %Failed% neq 0 goto :failed

+set testname=Byte Order Markers
+rem Compile the same main shader from each source-file encoding variant
+rem (see the file names). Check for failure after every run, matching the
+rem rest of this script, so the first failing encoding stops the test.
+call :run dxc.exe /T ps_6_0 "%testfiles%\bom-main-ascii.hlsl"
+if %Failed% neq 0 goto :failed
+call :run dxc.exe /T ps_6_0 "%testfiles%\bom-main-utf8.hlsl"
+if %Failed% neq 0 goto :failed
+call :run dxc.exe /T ps_6_0 "%testfiles%\bom-main-utf16le.hlsl"
+if %Failed% neq 0 goto :failed
+
 rem SPIR-V Change Starts
 echo Smoke test for SPIR-V CodeGen ...
 set spirv_smoke_success=0
@ -599,5 +605,5 @@ rem ============================================
 rem Cleanup and return failure
 :failed
 call :cleanup 2>nul
-if %Failed%=="0" set Failed=1
+rem Guarantee a non-zero exit code on the failure path. The comparison must
+rem use the numeric operator EQU; "eq" is not a valid IF operator in cmd.exe
+rem and would abort the script with a syntax error.
+if %Failed% equ 0 set Failed=1
 exit /b %Failed%