Strip Byte Order Markers (BOMs) and fix some related issues (#3880)
* Stop creating binary blobs with UTF-8 encoding * Strip BOM and fix some related issues - detect and strip BOM only when interpreting as text: - DxcCreateBlob: only when encoding is known and not CP_ACP - DxcGetBlobAsUtf[8|16]: when encoding not known or CP_ACP - strncmp in BOM detection would misfire (should have been memcmp) - ambiguous BOM case not handled/noted - empty blob case fixes (handle null-term empty, don't leak) Note, there are still problems in the unicode handling: - No actual support for UTF-32 or big endian input encodings. - For Linux: "UTF-16" isn't, and bad assumptions around "wide" strings. * Add defines for CP_UTF*; improve error for unsupported utf conversions
This commit is contained in:
Родитель
6bf92d4f74
Коммит
58f0ee2637
|
@ -24,7 +24,18 @@
|
|||
#include <intsafe.h>
|
||||
#endif
|
||||
|
||||
#define CP_UTF16 1200
|
||||
// CP_UTF8 is defined in WinNls.h, but others we use are not defined there.
|
||||
// See matching value definitions here:
|
||||
// https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
|
||||
|
||||
// We detect all these through BOM.
|
||||
#define CP_UTF16LE 1200
|
||||
#define CP_UTF16BE 1201
|
||||
#define CP_UTF32LE 12000
|
||||
#define CP_UTF32BE 12001
|
||||
|
||||
// Alias for CP_UTF16LE, which is the only one we actually handle.
|
||||
#define CP_UTF16 CP_UTF16LE
|
||||
|
||||
struct HeapMalloc : public IMalloc {
|
||||
public:
|
||||
|
@ -146,20 +157,24 @@ UINT32 DxcCodePageFromBytes(const char *bytes, size_t byteLen) throw() {
|
|||
// Now try to use the BOM to check for Unicode encodings
|
||||
char bom[4] = { bytes[0], bytes[1], bytes[2], bytes[3] };
|
||||
|
||||
if (strncmp(bom, "\xef\xbb\xbf", 3) == 0) {
|
||||
if (memcmp(bom, "\xef\xbb\xbf", 3) == 0) {
|
||||
codePage = CP_UTF8;
|
||||
}
|
||||
else if (strncmp(bom, "\xff\xfe\x00\x00", 4) == 0) {
|
||||
codePage = 12000; //UTF-32 LE
|
||||
else if (byteLen > 4 && memcmp(bom, "\xff\xfe\x00\x00", 4) == 0) {
|
||||
// byteLen > 4 to avoid mistaking empty UTF-16 LE BOM + null-terminator
|
||||
// for UTF-32 LE BOM without null-terminator.
|
||||
// If it's an empty UTF-32 LE with no null-termination,
|
||||
// it's harmless to interpret as empty UTF-16 LE with null-termination.
|
||||
codePage = CP_UTF32LE;
|
||||
}
|
||||
else if (strncmp(bom, "\x00\x00\xfe\xff", 4) == 0) {
|
||||
codePage = 12001; //UTF-32 BE
|
||||
else if (memcmp(bom, "\x00\x00\xfe\xff", 4) == 0) {
|
||||
codePage = CP_UTF32BE;
|
||||
}
|
||||
else if (strncmp(bom, "\xff\xfe", 2) == 0) {
|
||||
codePage = 1200; //UTF-16 LE
|
||||
else if (memcmp(bom, "\xff\xfe", 2) == 0) {
|
||||
codePage = CP_UTF16LE;
|
||||
}
|
||||
else if (strncmp(bom, "\xfe\xff", 2) == 0) {
|
||||
codePage = 1201; //UTF-16 BE
|
||||
else if (memcmp(bom, "\xfe\xff", 2) == 0) {
|
||||
codePage = CP_UTF16BE;
|
||||
}
|
||||
else {
|
||||
codePage = CP_ACP;
|
||||
|
@ -171,6 +186,49 @@ UINT32 DxcCodePageFromBytes(const char *bytes, size_t byteLen) throw() {
|
|||
return codePage;
|
||||
}
|
||||
|
||||
unsigned GetBomLengthFromCodePage(UINT32 codePage) {
|
||||
switch(codePage) {
|
||||
case CP_UTF8:
|
||||
return 3;
|
||||
case CP_UTF32LE:
|
||||
case CP_UTF32BE:
|
||||
return 4;
|
||||
case CP_UTF16LE:
|
||||
case CP_UTF16BE:
|
||||
return 2;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned CharSizeFromCodePage(UINT32 codePage) {
|
||||
switch (codePage) {
|
||||
case CP_UTF32LE:
|
||||
case CP_UTF32BE:
|
||||
return 4;
|
||||
case CP_UTF16LE:
|
||||
case CP_UTF16BE:
|
||||
return 2;
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// We do not handle translation from these code page values.
|
||||
static bool IsUnsupportedUtfCodePage(UINT32 codePage) {
|
||||
switch (codePage) {
|
||||
case CP_UTF32LE:
|
||||
case CP_UTF32BE:
|
||||
case CP_UTF16BE:
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
unsigned GetBomLengthFromBytes(const char *bytes, size_t byteLen) throw() {
|
||||
return GetBomLengthFromCodePage(DxcCodePageFromBytes(bytes, byteLen));
|
||||
}
|
||||
|
||||
#define IsSizeWcharAligned(size) (((size) & (sizeof(wchar_t) - 1)) == 0)
|
||||
|
||||
template<typename _char>
|
||||
|
@ -364,6 +422,9 @@ static HRESULT CodePageBufferToUtf16(UINT32 codePage, LPCVOID bufferPointer,
|
|||
return S_OK;
|
||||
}
|
||||
|
||||
if (IsUnsupportedUtfCodePage(codePage))
|
||||
return DXC_E_STRING_ENCODING_FAILED;
|
||||
|
||||
// Calculate the length of the buffer in wchar_t elements.
|
||||
int numToConvertUTF16 =
|
||||
MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, (LPCSTR)bufferPointer,
|
||||
|
@ -409,7 +470,6 @@ static HRESULT CodePageBufferToUtf8(UINT32 codePage, LPCVOID bufferPointer,
|
|||
CDxcMallocHeapPtr<char> &utf8NewCopy,
|
||||
_Out_ UINT32 *pConvertedCharCount) {
|
||||
*pConvertedCharCount = 0;
|
||||
|
||||
CDxcMallocHeapPtr<WCHAR> utf16NewCopy(pMalloc);
|
||||
UINT32 utf16CharCount = 0;
|
||||
const WCHAR *utf16Chars = nullptr;
|
||||
|
@ -509,7 +569,6 @@ static bool TryCreateBlobUtfFromBlob(
|
|||
return false;
|
||||
}
|
||||
|
||||
|
||||
HRESULT DxcCreateBlob(
|
||||
LPCVOID pPtr, SIZE_T size, bool bPinned, bool bCopy,
|
||||
bool encodingKnown, UINT32 codePage,
|
||||
|
@ -520,17 +579,40 @@ HRESULT DxcCreateBlob(
|
|||
*ppBlobEncoding = nullptr;
|
||||
|
||||
bool bNullTerminated = encodingKnown ? IsBufferNullTerminated(pPtr, size, codePage) : false;
|
||||
unsigned bomSize = (encodingKnown && codePage != CP_ACP)
|
||||
? GetBomLengthFromBytes((const char *)pPtr, size)
|
||||
: 0;
|
||||
if (bomSize) {
|
||||
// Adjust pPtr and size to skip BOM.
|
||||
// When !encodingKnown or codePage == CP_ACP, BOM will be skipped when
|
||||
// interpreting as text and translating to unicode, since at this point,
|
||||
// the buffer could be an arbitrary binary blob.
|
||||
|
||||
// There is an odd corner case with BOM detection where an empty
|
||||
// non-null-terminated UTF-32 LE buffer with BOM would be interpreted
|
||||
// as an empty null-terminated UTF-16 LE buffer with a BOM.
|
||||
// This won't matter in the end, since both cases are empty buffers, and
|
||||
// will map to the empty buffer case with the desired codePage setting.
|
||||
pPtr = (const char *)pPtr + bomSize;
|
||||
size -= bomSize;
|
||||
}
|
||||
bool emptyString = !pPtr || !size;
|
||||
if (bNullTerminated) {
|
||||
DXASSERT_NOMSG(pPtr && size && encodingKnown);
|
||||
emptyString = size == CharSizeFromCodePage(codePage);
|
||||
}
|
||||
|
||||
if (!pMalloc)
|
||||
pMalloc = DxcGetThreadMallocNoRef();
|
||||
|
||||
// Handle empty blob
|
||||
if (!pPtr || !size) {
|
||||
if (emptyString) {
|
||||
if (encodingKnown && TryCreateEmptyBlobUtf(codePage, pMalloc, ppBlobEncoding))
|
||||
return S_OK;
|
||||
InternalDxcBlobEncoding *pInternalEncoding;
|
||||
IFR(InternalDxcBlobEncoding::CreateFromMalloc(nullptr, pMalloc, 0, encodingKnown, codePage, &pInternalEncoding));
|
||||
*ppBlobEncoding = pInternalEncoding;
|
||||
return S_OK;
|
||||
}
|
||||
|
||||
if (bPinned) {
|
||||
|
@ -856,9 +938,17 @@ HRESULT DxcGetBlobAsUtf8(IDxcBlob *pBlob, IMalloc *pMalloc, IDxcBlobUtf8 **pBlob
|
|||
return hr;
|
||||
}
|
||||
|
||||
const char *bufferPointer = (const char *)pBlob->GetBufferPointer();
|
||||
SIZE_T blobLen = pBlob->GetBufferSize();
|
||||
if (!known) {
|
||||
codePage = DxcCodePageFromBytes((char *)pBlob->GetBufferPointer(), blobLen);
|
||||
unsigned bomSize = 0;
|
||||
if (!known || codePage == CP_ACP) {
|
||||
// Try to determine encoding from BOM.
|
||||
// If encoding was known, any BOM should have been stripped already.
|
||||
codePage = DxcCodePageFromBytes(bufferPointer, blobLen);
|
||||
bomSize = GetBomLengthFromCodePage(codePage);
|
||||
// BOM exists, adjust pointer and size to strip.
|
||||
bufferPointer += bomSize;
|
||||
blobLen -= bomSize;
|
||||
}
|
||||
|
||||
if (!pMalloc)
|
||||
|
@ -870,11 +960,14 @@ HRESULT DxcGetBlobAsUtf8(IDxcBlob *pBlob, IMalloc *pMalloc, IDxcBlobUtf8 **pBlob
|
|||
// Reuse or copy the underlying blob depending on null-termination
|
||||
if (codePage == CP_UTF8) {
|
||||
utf8CharCount = blobLen;
|
||||
if (IsBufferNullTerminated(pBlob->GetBufferPointer(), blobLen, CP_UTF8)) {
|
||||
if (IsBufferNullTerminated(bufferPointer, blobLen, CP_UTF8)) {
|
||||
// Already null-terminated, reference other blob's memory
|
||||
InternalDxcBlobUtf8* internalEncoding;
|
||||
hr = InternalDxcBlobUtf8::CreateFromBlob(pBlob, pMalloc, true, CP_UTF8, &internalEncoding);
|
||||
if (SUCCEEDED(hr)) {
|
||||
// Adjust if buffer has BOM; blobLen is already adjusted.
|
||||
if (bomSize)
|
||||
internalEncoding->AdjustPtrAndSize(bomSize, blobLen);
|
||||
*pBlobEncoding = internalEncoding;
|
||||
}
|
||||
return hr;
|
||||
|
@ -882,13 +975,13 @@ HRESULT DxcGetBlobAsUtf8(IDxcBlob *pBlob, IMalloc *pMalloc, IDxcBlobUtf8 **pBlob
|
|||
// Copy to new buffer and null-terminate
|
||||
if(!utf8NewCopy.Allocate(utf8CharCount + 1))
|
||||
return E_OUTOFMEMORY;
|
||||
memcpy(utf8NewCopy.m_pData, pBlob->GetBufferPointer(), blobLen);
|
||||
memcpy(utf8NewCopy.m_pData, bufferPointer, utf8CharCount);
|
||||
utf8NewCopy.m_pData[utf8CharCount++] = 0;
|
||||
}
|
||||
} else {
|
||||
// Convert and create a blob that owns the encoding.
|
||||
if (FAILED(
|
||||
hr = CodePageBufferToUtf8(codePage, pBlob->GetBufferPointer(), blobLen,
|
||||
hr = CodePageBufferToUtf8(codePage, bufferPointer, blobLen,
|
||||
pMalloc, utf8NewCopy, &utf8CharCount))) {
|
||||
return hr;
|
||||
}
|
||||
|
@ -942,9 +1035,18 @@ HRESULT DxcGetBlobAsUtf16(IDxcBlob *pBlob, IMalloc *pMalloc, IDxcBlobUtf16 **pBl
|
|||
return hr;
|
||||
}
|
||||
|
||||
// Look for BOM and adjust pointer and size to skip if necessary.
|
||||
const char *bufferPointer = (const char *)pBlob->GetBufferPointer();
|
||||
SIZE_T blobLen = pBlob->GetBufferSize();
|
||||
if (!known) {
|
||||
codePage = DxcCodePageFromBytes((char *)pBlob->GetBufferPointer(), blobLen);
|
||||
unsigned bomSize = 0;
|
||||
if (!known || codePage == CP_ACP) {
|
||||
// Try to determine encoding from BOM.
|
||||
// If encoding was known, any BOM should have been stripped already.
|
||||
codePage = DxcCodePageFromBytes(bufferPointer, blobLen);
|
||||
bomSize = GetBomLengthFromCodePage(codePage);
|
||||
// BOM exists, adjust pointer and size to strip.
|
||||
bufferPointer += bomSize;
|
||||
blobLen -= bomSize;
|
||||
}
|
||||
|
||||
if (!pMalloc)
|
||||
|
@ -958,11 +1060,14 @@ HRESULT DxcGetBlobAsUtf16(IDxcBlob *pBlob, IMalloc *pMalloc, IDxcBlobUtf16 **pBl
|
|||
DXASSERT(IsSizeWcharAligned(blobLen),
|
||||
"otherwise, UTF-16 blob size not evenly divisible by 2");
|
||||
utf16CharCount = blobLen / sizeof(wchar_t);
|
||||
if (IsBufferNullTerminated(pBlob->GetBufferPointer(), blobLen, CP_UTF16)) {
|
||||
if (IsBufferNullTerminated(bufferPointer, blobLen, CP_UTF16)) {
|
||||
// Already null-terminated, reference other blob's memory
|
||||
InternalDxcBlobUtf16* internalEncoding;
|
||||
hr = InternalDxcBlobUtf16::CreateFromBlob(pBlob, pMalloc, true, CP_UTF16, &internalEncoding);
|
||||
if (SUCCEEDED(hr)) {
|
||||
// Adjust if buffer has BOM; blobLen is already adjusted.
|
||||
if (bomSize)
|
||||
internalEncoding->AdjustPtrAndSize(bomSize, blobLen);
|
||||
*pBlobEncoding = internalEncoding;
|
||||
}
|
||||
return hr;
|
||||
|
@ -970,13 +1075,13 @@ HRESULT DxcGetBlobAsUtf16(IDxcBlob *pBlob, IMalloc *pMalloc, IDxcBlobUtf16 **pBl
|
|||
// Copy to new buffer and null-terminate
|
||||
if(!utf16NewCopy.Allocate(utf16CharCount + 1))
|
||||
return E_OUTOFMEMORY;
|
||||
memcpy(utf16NewCopy.m_pData, pBlob->GetBufferPointer(), blobLen);
|
||||
memcpy(utf16NewCopy.m_pData, bufferPointer, blobLen);
|
||||
utf16NewCopy.m_pData[utf16CharCount++] = 0;
|
||||
}
|
||||
} else {
|
||||
// Convert and create a blob that owns the encoding.
|
||||
if (FAILED(
|
||||
hr = CodePageBufferToUtf16(codePage, pBlob->GetBufferPointer(), blobLen,
|
||||
hr = CodePageBufferToUtf16(codePage, bufferPointer, blobLen,
|
||||
utf16NewCopy, &utf16CharCount))) {
|
||||
return hr;
|
||||
}
|
||||
|
|
|
@ -40,8 +40,8 @@ HRESULT STDMETHODCALLTYPE DxcContainerBuilder::Load(_In_ IDxcBlob *pSource) {
|
|||
const DxilContainerHeader *pHeader = (DxilContainerHeader *)pSource->GetBufferPointer();
|
||||
for (DxilPartIterator it = begin(pHeader), itEnd = end(pHeader); it != itEnd; ++it) {
|
||||
const DxilPartHeader *pPartHeader = *it;
|
||||
CComPtr<IDxcBlobEncoding> pBlob;
|
||||
IFT(DxcCreateBlobWithEncodingFromPinned((const void *)(pPartHeader + 1), pPartHeader->PartSize, CP_UTF8, &pBlob));
|
||||
CComPtr<IDxcBlob> pBlob;
|
||||
IFT(DxcCreateBlobFromPinned((const void *)(pPartHeader + 1), pPartHeader->PartSize, &pBlob));
|
||||
PartList::iterator itPartList = std::find_if(m_parts.begin(), m_parts.end(), [&](DxilPart part) {
|
||||
return part.m_fourCC == pPartHeader->PartFourCC;
|
||||
});
|
||||
|
|
|
@ -89,8 +89,8 @@ void RootSignatureHandle::Deserialize() {
|
|||
void RootSignatureHandle::LoadSerialized(const uint8_t *pData,
|
||||
unsigned length) {
|
||||
DXASSERT_NOMSG(IsEmpty());
|
||||
IDxcBlobEncoding *pCreated;
|
||||
IFT(DxcCreateBlobWithEncodingOnHeapCopy(pData, length, CP_UTF8, &pCreated));
|
||||
IDxcBlob *pCreated;
|
||||
IFT(DxcCreateBlobOnHeapCopy(pData, length, &pCreated));
|
||||
m_pSerialized = pCreated;
|
||||
}
|
||||
|
||||
|
|
|
@ -115,7 +115,7 @@ void DxaContext::Assemble() {
|
|||
}
|
||||
}
|
||||
|
||||
WriteBlobToFile(pContainer, StringRefUtf16(OutputFilename), DXC_CP_UTF8); // TODO: Support DefaultTextCodePage
|
||||
WriteBlobToFile(pContainer, StringRefUtf16(OutputFilename), DXC_CP_ACP);
|
||||
printf("Output written to \"%s\"\n", OutputFilename.c_str());
|
||||
}
|
||||
} else {
|
||||
|
|
|
@ -954,7 +954,7 @@ TEST_F(CompilerTest, CompileThenAddCustomDebugName) {
|
|||
|
||||
CComPtr<IDxcBlobEncoding> pDebugName;
|
||||
|
||||
CreateBlobPinned(pNameBlobContent, allocatedSize, CP_UTF8, &pDebugName);
|
||||
CreateBlobPinned(pNameBlobContent, allocatedSize, DXC_CP_ACP, &pDebugName);
|
||||
|
||||
|
||||
VERIFY_SUCCEEDED(pBuilder->Load(pProgram));
|
||||
|
@ -2939,7 +2939,7 @@ TEST_F(CompilerTest, LibGVStore) {
|
|||
unsigned bitcode_size = hlsl::GetDxilBitcodeSize((hlsl::DxilProgramHeader *)pBitcode->GetBufferPointer());
|
||||
|
||||
CComPtr<IDxcBlobEncoding> pBitcodeBlob;
|
||||
CreateBlobPinned(bitcode, bitcode_size, CP_UTF8, &pBitcodeBlob);
|
||||
CreateBlobPinned(bitcode, bitcode_size, DXC_CP_ACP, &pBitcodeBlob);
|
||||
|
||||
CComPtr<IDxcBlob> pReassembled;
|
||||
CComPtr<IDxcOperationResult> pReassembleResult;
|
||||
|
|
|
@ -336,7 +336,7 @@ public:
|
|||
CComPtr<IDxcLibrary> pLibrary;
|
||||
CComPtr<IDxcBlobEncoding> pBlobEncoding; // Encoding doesn't actually matter, it's binary.
|
||||
VERIFY_SUCCEEDED(m_dllSupport.CreateInstance(CLSID_DxcLibrary, &pLibrary));
|
||||
VERIFY_SUCCEEDED(pLibrary->CreateBlobWithEncodingFromPinned(pBlob, blobSize, CP_UTF8, &pBlobEncoding));
|
||||
VERIFY_SUCCEEDED(pLibrary->CreateBlobWithEncodingFromPinned(pBlob, blobSize, DXC_CP_ACP, &pBlobEncoding));
|
||||
CheckValidationMsgs(pBlobEncoding, pErrorMsgs, bRegex, Flags);
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
float4 f_ascii;
|
Двоичный файл не отображается.
|
@ -0,0 +1 @@
|
|||
float4 f_utf8;
|
|
@ -0,0 +1,20 @@
|
|||
// Tests main and include files with/without BOM to ensure BOM is stripped.
|
||||
|
||||
#include "bom-inc-ascii.hlsli"
|
||||
#include "bom-inc-utf8.hlsli"
|
||||
#include "bom-inc-utf16le.hlsli"
|
||||
|
||||
// TODO: Add support for Big Endian and UTF-32
|
||||
// #include "bom-inc-utf16be.hlsli"
|
||||
// #include "bom-inc-utf32le.hlsli"
|
||||
// #include "bom-inc-utf32be.hlsli"
|
||||
|
||||
float4 main() : SV_Target {
|
||||
return f_ascii
|
||||
+ f_utf8
|
||||
+ f_utf16le
|
||||
// + f_utf16be
|
||||
// + f_utf32le
|
||||
// + f_utf32be
|
||||
;
|
||||
}
|
Двоичный файл не отображается.
|
@ -0,0 +1,20 @@
|
|||
// Tests main and include files with/without BOM to ensure BOM is stripped.
|
||||
|
||||
#include "bom-inc-ascii.hlsli"
|
||||
#include "bom-inc-utf8.hlsli"
|
||||
#include "bom-inc-utf16le.hlsli"
|
||||
|
||||
// TODO: Add support for Big Endian and UTF-32
|
||||
// #include "bom-inc-utf16be.hlsli"
|
||||
// #include "bom-inc-utf32le.hlsli"
|
||||
// #include "bom-inc-utf32be.hlsli"
|
||||
|
||||
float4 main() : SV_Target {
|
||||
return f_ascii
|
||||
+ f_utf8
|
||||
+ f_utf16le
|
||||
// + f_utf16be
|
||||
// + f_utf32le
|
||||
// + f_utf32be
|
||||
;
|
||||
}
|
|
@ -421,6 +421,12 @@ if %Failed% neq 0 goto :failed
|
|||
call :run dxc.exe -P include-main.hlsl.pp -I inc subfolder\include-main.hlsl
|
||||
if %Failed% neq 0 goto :failed
|
||||
|
||||
set testname=Byte Order Markers
|
||||
call :run dxc.exe /T ps_6_0 "%testfiles%\bom-main-ascii.hlsl"
|
||||
call :run dxc.exe /T ps_6_0 "%testfiles%\bom-main-utf8.hlsl"
|
||||
call :run dxc.exe /T ps_6_0 "%testfiles%\bom-main-utf16le.hlsl"
|
||||
if %Failed% neq 0 goto :failed
|
||||
|
||||
rem SPIR-V Change Starts
|
||||
echo Smoke test for SPIR-V CodeGen ...
|
||||
set spirv_smoke_success=0
|
||||
|
@ -599,5 +605,5 @@ rem ============================================
|
|||
rem Cleanup and return failure
|
||||
:failed
|
||||
call :cleanup 2>nul
|
||||
if %Failed%=="0" set Failed=1
|
||||
if %Failed% eq 0 set Failed=1
|
||||
exit /b %Failed%
|
||||
|
|
Загрузка…
Ссылка в новой задаче