Strip Byte Order Markers (BOMs) and fix some related issues (#3880)

* Stop creating binary blobs with UTF-8 encoding

* Strip BOM and fix some related issues

- detect and strip BOM only when interpreting as text:
  - DxcCreateBlob: only when encoding is known and not CP_ACP
  - DxcGetBlobAsUtf[8|16]: when encoding not known or CP_ACP
- strncmp in BOM detection would misfire because it stops comparing at a null byte, and several BOM patterns contain null bytes (should have been memcmp)
- ambiguous BOM case not handled/noted
- empty blob case fixes (handle null-term empty, don't leak)

Note, there are still problems in the unicode handling:
- No actual support for UTF-32 or big endian input encodings.
- On Linux, the encoding this code calls "UTF-16" is not actually UTF-16, and the code makes bad assumptions about "wide" strings.

* Add defines for CP_UTF*; improve error for unsupported utf conversions
This commit is contained in:
Tex Riddell 2021-07-15 16:41:30 -07:00 коммит произвёл GitHub
Родитель 6bf92d4f74
Коммит 58f0ee2637
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
13 изменённых файлов: 185 добавлений и 32 удалений

Просмотреть файл

@ -24,7 +24,18 @@
#include <intsafe.h>
#endif
#define CP_UTF16 1200
// Code page identifiers for the Unicode encodings this file recognizes.
// CP_UTF8 is defined in WinNls.h, but others we use are not defined there.
// See matching value definitions here:
// https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
// We detect all these through BOM.
// Note that being detected does not mean being supported: of the four values
// defined below, only CP_UTF16LE is actually handled.
// UTF-16, little endian.
#define CP_UTF16LE 1200
// UTF-16, big endian.
#define CP_UTF16BE 1201
// UTF-32, little endian.
#define CP_UTF32LE 12000
// UTF-32, big endian.
#define CP_UTF32BE 12001
// Alias for CP_UTF16LE, which is the only one we actually handle.
#define CP_UTF16 CP_UTF16LE
struct HeapMalloc : public IMalloc {
public:
@ -146,20 +157,24 @@ UINT32 DxcCodePageFromBytes(const char *bytes, size_t byteLen) throw() {
// Now try to use the BOM to check for Unicode encodings
char bom[4] = { bytes[0], bytes[1], bytes[2], bytes[3] };
if (strncmp(bom, "\xef\xbb\xbf", 3) == 0) {
if (memcmp(bom, "\xef\xbb\xbf", 3) == 0) {
codePage = CP_UTF8;
}
else if (strncmp(bom, "\xff\xfe\x00\x00", 4) == 0) {
codePage = 12000; //UTF-32 LE
else if (byteLen > 4 && memcmp(bom, "\xff\xfe\x00\x00", 4) == 0) {
// byteLen > 4 to avoid mistaking empty UTF-16 LE BOM + null-terminator
// for UTF-32 LE BOM without null-terminator.
// If it's an empty UTF-32 LE with no null-termination,
// it's harmless to interpret as empty UTF-16 LE with null-termination.
codePage = CP_UTF32LE;
}
else if (strncmp(bom, "\x00\x00\xfe\xff", 4) == 0) {
codePage = 12001; //UTF-32 BE
else if (memcmp(bom, "\x00\x00\xfe\xff", 4) == 0) {
codePage = CP_UTF32BE;
}
else if (strncmp(bom, "\xff\xfe", 2) == 0) {
codePage = 1200; //UTF-16 LE
else if (memcmp(bom, "\xff\xfe", 2) == 0) {
codePage = CP_UTF16LE;
}
else if (strncmp(bom, "\xfe\xff", 2) == 0) {
codePage = 1201; //UTF-16 BE
else if (memcmp(bom, "\xfe\xff", 2) == 0) {
codePage = CP_UTF16BE;
}
else {
codePage = CP_ACP;
@ -171,6 +186,49 @@ UINT32 DxcCodePageFromBytes(const char *bytes, size_t byteLen) throw() {
return codePage;
}
// Returns the length in bytes of the byte order mark (BOM) that goes with the
// given code page: 3 for UTF-8, 2 for UTF-16 (LE or BE) and 4 for UTF-32
// (LE or BE). Every other code page yields 0.
unsigned GetBomLengthFromCodePage(UINT32 codePage) {
  if (codePage == CP_UTF8)
    return 3;
  if (codePage == CP_UTF32LE || codePage == CP_UTF32BE)
    return 4;
  if (codePage == CP_UTF16LE || codePage == CP_UTF16BE)
    return 2;
  // Not one of the code pages listed above.
  return 0;
}
// Returns the character size in bytes for the given code page: 4 for UTF-32
// (LE or BE), 2 for UTF-16 (LE or BE), and 1 for any other code page.
static unsigned CharSizeFromCodePage(UINT32 codePage) {
  const bool isUtf32 = codePage == CP_UTF32LE || codePage == CP_UTF32BE;
  const bool isUtf16 = codePage == CP_UTF16LE || codePage == CP_UTF16BE;
  return isUtf32 ? 4u : (isUtf16 ? 2u : 1u);
}
// Returns true for the UTF code pages we do not handle translation from:
// UTF-32 (LE and BE) and UTF-16 BE. Returns false for everything else.
static bool IsUnsupportedUtfCodePage(UINT32 codePage) {
  return codePage == CP_UTF32LE ||
         codePage == CP_UTF32BE ||
         codePage == CP_UTF16BE;
}
unsigned GetBomLengthFromBytes(const char *bytes, size_t byteLen) throw() {
return GetBomLengthFromCodePage(DxcCodePageFromBytes(bytes, byteLen));
}
// True when `size` is a whole multiple of sizeof(wchar_t).
#define IsSizeWcharAligned(size) (((size) % sizeof(wchar_t)) == 0)
template<typename _char>
@ -364,6 +422,9 @@ static HRESULT CodePageBufferToUtf16(UINT32 codePage, LPCVOID bufferPointer,
return S_OK;
}
if (IsUnsupportedUtfCodePage(codePage))
return DXC_E_STRING_ENCODING_FAILED;
// Calculate the length of the buffer in wchar_t elements.
int numToConvertUTF16 =
MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, (LPCSTR)bufferPointer,
@ -409,7 +470,6 @@ static HRESULT CodePageBufferToUtf8(UINT32 codePage, LPCVOID bufferPointer,
CDxcMallocHeapPtr<char> &utf8NewCopy,
_Out_ UINT32 *pConvertedCharCount) {
*pConvertedCharCount = 0;
CDxcMallocHeapPtr<WCHAR> utf16NewCopy(pMalloc);
UINT32 utf16CharCount = 0;
const WCHAR *utf16Chars = nullptr;
@ -509,7 +569,6 @@ static bool TryCreateBlobUtfFromBlob(
return false;
}
HRESULT DxcCreateBlob(
LPCVOID pPtr, SIZE_T size, bool bPinned, bool bCopy,
bool encodingKnown, UINT32 codePage,
@ -520,17 +579,40 @@ HRESULT DxcCreateBlob(
*ppBlobEncoding = nullptr;
bool bNullTerminated = encodingKnown ? IsBufferNullTerminated(pPtr, size, codePage) : false;
unsigned bomSize = (encodingKnown && codePage != CP_ACP)
? GetBomLengthFromBytes((const char *)pPtr, size)
: 0;
if (bomSize) {
// Adjust pPtr and size to skip BOM.
// When !encodingKnown or codePage == CP_ACP, BOM will be skipped when
// interpreting as text and translating to unicode, since at this point,
// the buffer could be an arbitrary binary blob.
// There is an odd corner case with BOM detection where an empty
// non-null-terminated UTF-32 LE buffer with BOM would be interpreted
// as an empty null-terminated UTF-16 LE buffer with a BOM.
// This won't matter in the end, since both cases are empty buffers, and
// will map to the empty buffer case with the desired codePage setting.
pPtr = (const char *)pPtr + bomSize;
size -= bomSize;
}
bool emptyString = !pPtr || !size;
if (bNullTerminated) {
DXASSERT_NOMSG(pPtr && size && encodingKnown);
emptyString = size == CharSizeFromCodePage(codePage);
}
if (!pMalloc)
pMalloc = DxcGetThreadMallocNoRef();
// Handle empty blob
if (!pPtr || !size) {
if (emptyString) {
if (encodingKnown && TryCreateEmptyBlobUtf(codePage, pMalloc, ppBlobEncoding))
return S_OK;
InternalDxcBlobEncoding *pInternalEncoding;
IFR(InternalDxcBlobEncoding::CreateFromMalloc(nullptr, pMalloc, 0, encodingKnown, codePage, &pInternalEncoding));
*ppBlobEncoding = pInternalEncoding;
return S_OK;
}
if (bPinned) {
@ -856,9 +938,17 @@ HRESULT DxcGetBlobAsUtf8(IDxcBlob *pBlob, IMalloc *pMalloc, IDxcBlobUtf8 **pBlob
return hr;
}
const char *bufferPointer = (const char *)pBlob->GetBufferPointer();
SIZE_T blobLen = pBlob->GetBufferSize();
if (!known) {
codePage = DxcCodePageFromBytes((char *)pBlob->GetBufferPointer(), blobLen);
unsigned bomSize = 0;
if (!known || codePage == CP_ACP) {
// Try to determine encoding from BOM.
// If encoding was known, any BOM should have been stripped already.
codePage = DxcCodePageFromBytes(bufferPointer, blobLen);
bomSize = GetBomLengthFromCodePage(codePage);
// BOM exists, adjust pointer and size to strip.
bufferPointer += bomSize;
blobLen -= bomSize;
}
if (!pMalloc)
@ -870,11 +960,14 @@ HRESULT DxcGetBlobAsUtf8(IDxcBlob *pBlob, IMalloc *pMalloc, IDxcBlobUtf8 **pBlob
// Reuse or copy the underlying blob depending on null-termination
if (codePage == CP_UTF8) {
utf8CharCount = blobLen;
if (IsBufferNullTerminated(pBlob->GetBufferPointer(), blobLen, CP_UTF8)) {
if (IsBufferNullTerminated(bufferPointer, blobLen, CP_UTF8)) {
// Already null-terminated, reference other blob's memory
InternalDxcBlobUtf8* internalEncoding;
hr = InternalDxcBlobUtf8::CreateFromBlob(pBlob, pMalloc, true, CP_UTF8, &internalEncoding);
if (SUCCEEDED(hr)) {
// Adjust if buffer has BOM; blobLen is already adjusted.
if (bomSize)
internalEncoding->AdjustPtrAndSize(bomSize, blobLen);
*pBlobEncoding = internalEncoding;
}
return hr;
@ -882,13 +975,13 @@ HRESULT DxcGetBlobAsUtf8(IDxcBlob *pBlob, IMalloc *pMalloc, IDxcBlobUtf8 **pBlob
// Copy to new buffer and null-terminate
if(!utf8NewCopy.Allocate(utf8CharCount + 1))
return E_OUTOFMEMORY;
memcpy(utf8NewCopy.m_pData, pBlob->GetBufferPointer(), blobLen);
memcpy(utf8NewCopy.m_pData, bufferPointer, utf8CharCount);
utf8NewCopy.m_pData[utf8CharCount++] = 0;
}
} else {
// Convert and create a blob that owns the encoding.
if (FAILED(
hr = CodePageBufferToUtf8(codePage, pBlob->GetBufferPointer(), blobLen,
hr = CodePageBufferToUtf8(codePage, bufferPointer, blobLen,
pMalloc, utf8NewCopy, &utf8CharCount))) {
return hr;
}
@ -942,9 +1035,18 @@ HRESULT DxcGetBlobAsUtf16(IDxcBlob *pBlob, IMalloc *pMalloc, IDxcBlobUtf16 **pBl
return hr;
}
// Look for BOM and adjust pointer and size to skip if necessary.
const char *bufferPointer = (const char *)pBlob->GetBufferPointer();
SIZE_T blobLen = pBlob->GetBufferSize();
if (!known) {
codePage = DxcCodePageFromBytes((char *)pBlob->GetBufferPointer(), blobLen);
unsigned bomSize = 0;
if (!known || codePage == CP_ACP) {
// Try to determine encoding from BOM.
// If encoding was known, any BOM should have been stripped already.
codePage = DxcCodePageFromBytes(bufferPointer, blobLen);
bomSize = GetBomLengthFromCodePage(codePage);
// BOM exists, adjust pointer and size to strip.
bufferPointer += bomSize;
blobLen -= bomSize;
}
if (!pMalloc)
@ -958,11 +1060,14 @@ HRESULT DxcGetBlobAsUtf16(IDxcBlob *pBlob, IMalloc *pMalloc, IDxcBlobUtf16 **pBl
DXASSERT(IsSizeWcharAligned(blobLen),
"otherwise, UTF-16 blob size not evenly divisible by 2");
utf16CharCount = blobLen / sizeof(wchar_t);
if (IsBufferNullTerminated(pBlob->GetBufferPointer(), blobLen, CP_UTF16)) {
if (IsBufferNullTerminated(bufferPointer, blobLen, CP_UTF16)) {
// Already null-terminated, reference other blob's memory
InternalDxcBlobUtf16* internalEncoding;
hr = InternalDxcBlobUtf16::CreateFromBlob(pBlob, pMalloc, true, CP_UTF16, &internalEncoding);
if (SUCCEEDED(hr)) {
// Adjust if buffer has BOM; blobLen is already adjusted.
if (bomSize)
internalEncoding->AdjustPtrAndSize(bomSize, blobLen);
*pBlobEncoding = internalEncoding;
}
return hr;
@ -970,13 +1075,13 @@ HRESULT DxcGetBlobAsUtf16(IDxcBlob *pBlob, IMalloc *pMalloc, IDxcBlobUtf16 **pBl
// Copy to new buffer and null-terminate
if(!utf16NewCopy.Allocate(utf16CharCount + 1))
return E_OUTOFMEMORY;
memcpy(utf16NewCopy.m_pData, pBlob->GetBufferPointer(), blobLen);
memcpy(utf16NewCopy.m_pData, bufferPointer, blobLen);
utf16NewCopy.m_pData[utf16CharCount++] = 0;
}
} else {
// Convert and create a blob that owns the encoding.
if (FAILED(
hr = CodePageBufferToUtf16(codePage, pBlob->GetBufferPointer(), blobLen,
hr = CodePageBufferToUtf16(codePage, bufferPointer, blobLen,
utf16NewCopy, &utf16CharCount))) {
return hr;
}

Просмотреть файл

@ -40,8 +40,8 @@ HRESULT STDMETHODCALLTYPE DxcContainerBuilder::Load(_In_ IDxcBlob *pSource) {
const DxilContainerHeader *pHeader = (DxilContainerHeader *)pSource->GetBufferPointer();
for (DxilPartIterator it = begin(pHeader), itEnd = end(pHeader); it != itEnd; ++it) {
const DxilPartHeader *pPartHeader = *it;
CComPtr<IDxcBlobEncoding> pBlob;
IFT(DxcCreateBlobWithEncodingFromPinned((const void *)(pPartHeader + 1), pPartHeader->PartSize, CP_UTF8, &pBlob));
CComPtr<IDxcBlob> pBlob;
IFT(DxcCreateBlobFromPinned((const void *)(pPartHeader + 1), pPartHeader->PartSize, &pBlob));
PartList::iterator itPartList = std::find_if(m_parts.begin(), m_parts.end(), [&](DxilPart part) {
return part.m_fourCC == pPartHeader->PartFourCC;
});

Просмотреть файл

@ -89,8 +89,8 @@ void RootSignatureHandle::Deserialize() {
void RootSignatureHandle::LoadSerialized(const uint8_t *pData,
unsigned length) {
DXASSERT_NOMSG(IsEmpty());
IDxcBlobEncoding *pCreated;
IFT(DxcCreateBlobWithEncodingOnHeapCopy(pData, length, CP_UTF8, &pCreated));
IDxcBlob *pCreated;
IFT(DxcCreateBlobOnHeapCopy(pData, length, &pCreated));
m_pSerialized = pCreated;
}

Просмотреть файл

@ -115,7 +115,7 @@ void DxaContext::Assemble() {
}
}
WriteBlobToFile(pContainer, StringRefUtf16(OutputFilename), DXC_CP_UTF8); // TODO: Support DefaultTextCodePage
WriteBlobToFile(pContainer, StringRefUtf16(OutputFilename), DXC_CP_ACP);
printf("Output written to \"%s\"\n", OutputFilename.c_str());
}
} else {

Просмотреть файл

@ -954,7 +954,7 @@ TEST_F(CompilerTest, CompileThenAddCustomDebugName) {
CComPtr<IDxcBlobEncoding> pDebugName;
CreateBlobPinned(pNameBlobContent, allocatedSize, CP_UTF8, &pDebugName);
CreateBlobPinned(pNameBlobContent, allocatedSize, DXC_CP_ACP, &pDebugName);
VERIFY_SUCCEEDED(pBuilder->Load(pProgram));
@ -2939,7 +2939,7 @@ TEST_F(CompilerTest, LibGVStore) {
unsigned bitcode_size = hlsl::GetDxilBitcodeSize((hlsl::DxilProgramHeader *)pBitcode->GetBufferPointer());
CComPtr<IDxcBlobEncoding> pBitcodeBlob;
CreateBlobPinned(bitcode, bitcode_size, CP_UTF8, &pBitcodeBlob);
CreateBlobPinned(bitcode, bitcode_size, DXC_CP_ACP, &pBitcodeBlob);
CComPtr<IDxcBlob> pReassembled;
CComPtr<IDxcOperationResult> pReassembleResult;

Просмотреть файл

@ -336,7 +336,7 @@ public:
CComPtr<IDxcLibrary> pLibrary;
CComPtr<IDxcBlobEncoding> pBlobEncoding; // Encoding doesn't actually matter, it's binary.
VERIFY_SUCCEEDED(m_dllSupport.CreateInstance(CLSID_DxcLibrary, &pLibrary));
VERIFY_SUCCEEDED(pLibrary->CreateBlobWithEncodingFromPinned(pBlob, blobSize, CP_UTF8, &pBlobEncoding));
VERIFY_SUCCEEDED(pLibrary->CreateBlobWithEncodingFromPinned(pBlob, blobSize, DXC_CP_ACP, &pBlobEncoding));
CheckValidationMsgs(pBlobEncoding, pErrorMsgs, bRegex, Flags);
}

Просмотреть файл

@ -0,0 +1 @@
float4 f_ascii;

Двоичные данные
utils/hct/cmdtestfiles/bom-inc-utf16le.hlsli Normal file

Двоичный файл не отображается.

Просмотреть файл

@ -0,0 +1 @@
float4 f_utf8;

Просмотреть файл

@ -0,0 +1,20 @@
// Tests main and include files with/without BOM to ensure BOM is stripped.
#include "bom-inc-ascii.hlsli"
#include "bom-inc-utf8.hlsli"
#include "bom-inc-utf16le.hlsli"
// TODO: Add support for Big Endian and UTF-32
// #include "bom-inc-utf16be.hlsli"
// #include "bom-inc-utf32le.hlsli"
// #include "bom-inc-utf32be.hlsli"
// Pixel shader entry point for the BOM test. It sums one float4 global per
// enabled include above; presumably each global is declared by the matching
// bom-inc-*.hlsli file, so a mis-decoded include would leave its name
// undeclared (confirm against the include files).
float4 main() : SV_Target {
return f_ascii
+ f_utf8
+ f_utf16le
// The terms below belong to the includes that are still commented out (see
// the TODO next to the #include lines); enable each one together with its
// include.
// + f_utf16be
// + f_utf32le
// + f_utf32be
;
}

Двоичные данные
utils/hct/cmdtestfiles/bom-main-utf16le.hlsl Normal file

Двоичный файл не отображается.

Просмотреть файл

@ -0,0 +1,20 @@
// Tests main and include files with/without BOM to ensure BOM is stripped.
#include "bom-inc-ascii.hlsli"
#include "bom-inc-utf8.hlsli"
#include "bom-inc-utf16le.hlsli"
// TODO: Add support for Big Endian and UTF-32
// #include "bom-inc-utf16be.hlsli"
// #include "bom-inc-utf32le.hlsli"
// #include "bom-inc-utf32be.hlsli"
// Pixel shader entry point for the BOM test. It sums one float4 global per
// enabled include above; presumably each global is declared by the matching
// bom-inc-*.hlsli file, so a mis-decoded include would leave its name
// undeclared (confirm against the include files).
float4 main() : SV_Target {
return f_ascii
+ f_utf8
+ f_utf16le
// The terms below belong to the includes that are still commented out (see
// the TODO next to the #include lines); enable each one together with its
// include.
// + f_utf16be
// + f_utf32le
// + f_utf32be
;
}

Просмотреть файл

@ -421,6 +421,12 @@ if %Failed% neq 0 goto :failed
call :run dxc.exe -P include-main.hlsl.pp -I inc subfolder\include-main.hlsl
if %Failed% neq 0 goto :failed
set testname=Byte Order Markers
call :run dxc.exe /T ps_6_0 "%testfiles%\bom-main-ascii.hlsl"
call :run dxc.exe /T ps_6_0 "%testfiles%\bom-main-utf8.hlsl"
call :run dxc.exe /T ps_6_0 "%testfiles%\bom-main-utf16le.hlsl"
if %Failed% neq 0 goto :failed
rem SPIR-V Change Starts
echo Smoke test for SPIR-V CodeGen ...
set spirv_smoke_success=0
@ -599,5 +605,5 @@ rem ============================================
rem Cleanup and return failure
:failed
call :cleanup 2>nul
if %Failed%=="0" set Failed=1
rem Never report success from the failure path: force a non-zero exit code.
rem The numeric equality operator of "if" is EQU; "eq" is not a valid operator.
if %Failed% equ 0 set Failed=1
exit /b %Failed%