Merge branch 'tb/utf-16-le-with-explicit-bom'

A new encoding UTF-16LE-BOM has been invented to force encoding to
UTF-16 with BOM in little endian byte order, which cannot be directly
generated by using iconv.

* tb/utf-16-le-with-explicit-bom:
  Support working-tree-encoding "UTF-16LE-BOM"
This commit is contained in:
Junio C Hamano 2019-02-06 22:05:21 -08:00
Родитель cfd9167c15 aab2a1ae48
Коммит 0fa3cc77ee
5 изменённых файлов: 48 добавлений и 14 удалений

Просмотреть файл

@ -344,7 +344,9 @@ automatic line ending conversion based on your platform.
Use the following attributes if your '*.ps1' files are UTF-16 little Use the following attributes if your '*.ps1' files are UTF-16 little
endian encoded without BOM and you want Git to use Windows line endings endian encoded without BOM and you want Git to use Windows line endings
in the working directory. Please note, it is highly recommended to in the working directory (use `UTF-16LE-BOM` instead of `UTF-16LE` if
you want UTF-16 little endian with BOM).
Please note, it is highly recommended to
explicitly define the line endings with `eol` if the `working-tree-encoding` explicitly define the line endings with `eol` if the `working-tree-encoding`
attribute is used to avoid ambiguity. attribute is used to avoid ambiguity.

Просмотреть файл

@ -79,7 +79,7 @@ void precompose_argv(int argc, const char **argv)
size_t namelen; size_t namelen;
oldarg = argv[i]; oldarg = argv[i];
if (has_non_ascii(oldarg, (size_t)-1, &namelen)) { if (has_non_ascii(oldarg, (size_t)-1, &namelen)) {
newarg = reencode_string_iconv(oldarg, namelen, ic_precompose, NULL); newarg = reencode_string_iconv(oldarg, namelen, ic_precompose, 0, NULL);
if (newarg) if (newarg)
argv[i] = newarg; argv[i] = newarg;
} }

Просмотреть файл

@ -11,9 +11,12 @@ test_expect_success 'setup test files' '
text="hallo there!\ncan you read me?" && text="hallo there!\ncan you read me?" &&
echo "*.utf16 text working-tree-encoding=utf-16" >.gitattributes && echo "*.utf16 text working-tree-encoding=utf-16" >.gitattributes &&
echo "*.utf16lebom text working-tree-encoding=UTF-16LE-BOM" >>.gitattributes &&
printf "$text" >test.utf8.raw && printf "$text" >test.utf8.raw &&
printf "$text" | iconv -f UTF-8 -t UTF-16 >test.utf16.raw && printf "$text" | iconv -f UTF-8 -t UTF-16 >test.utf16.raw &&
printf "$text" | iconv -f UTF-8 -t UTF-32 >test.utf32.raw && printf "$text" | iconv -f UTF-8 -t UTF-32 >test.utf32.raw &&
printf "\377\376" >test.utf16lebom.raw &&
printf "$text" | iconv -f UTF-8 -t UTF-32LE >>test.utf16lebom.raw &&
# Line ending tests # Line ending tests
printf "one\ntwo\nthree\n" >lf.utf8.raw && printf "one\ntwo\nthree\n" >lf.utf8.raw &&
@ -32,7 +35,8 @@ test_expect_success 'setup test files' '
# Add only UTF-16 file, we will add the UTF-32 file later # Add only UTF-16 file, we will add the UTF-32 file later
cp test.utf16.raw test.utf16 && cp test.utf16.raw test.utf16 &&
cp test.utf32.raw test.utf32 && cp test.utf32.raw test.utf32 &&
git add .gitattributes test.utf16 && cp test.utf16lebom.raw test.utf16lebom &&
git add .gitattributes test.utf16 test.utf16lebom &&
git commit -m initial git commit -m initial
' '
@ -51,6 +55,12 @@ test_expect_success 're-encode to UTF-16 on checkout' '
test_cmp_bin test.utf16.raw test.utf16 test_cmp_bin test.utf16.raw test.utf16
' '
# Checkout round-trip for the "*.utf16lebom" path, which .gitattributes maps
# to working-tree-encoding=UTF-16LE-BOM: delete the working-tree copy, let
# git checkout recreate it, then compare the result against the raw fixture
# that was originally added.
# NOTE(review): the setup step builds test.utf16lebom.raw from the bytes
# \377\376 followed by `iconv -f UTF-8 -t UTF-32LE` output, which looks
# inconsistent with a UTF-16LE fixture -- confirm whether UTF-16LE was meant.
test_expect_success 're-encode to UTF-16-LE-BOM on checkout' '
rm test.utf16lebom &&
git checkout test.utf16lebom &&
test_cmp_bin test.utf16lebom.raw test.utf16lebom
'
test_expect_success 'check $GIT_DIR/info/attributes support' ' test_expect_success 'check $GIT_DIR/info/attributes support' '
test_when_finished "rm -f test.utf32.git" && test_when_finished "rm -f test.utf32.git" &&
test_when_finished "git reset --hard HEAD" && test_when_finished "git reset --hard HEAD" &&

42
utf8.c
Просмотреть файл

@ -4,6 +4,11 @@
/* This code is originally from http://www.cl.cam.ac.uk/~mgk25/ucs/ */ /* This code is originally from http://www.cl.cam.ac.uk/~mgk25/ucs/ */
static const char utf16_be_bom[] = {'\xFE', '\xFF'};
static const char utf16_le_bom[] = {'\xFF', '\xFE'};
static const char utf32_be_bom[] = {'\0', '\0', '\xFE', '\xFF'};
static const char utf32_le_bom[] = {'\xFF', '\xFE', '\0', '\0'};
struct interval { struct interval {
ucs_char_t first; ucs_char_t first;
ucs_char_t last; ucs_char_t last;
@ -470,16 +475,17 @@ int utf8_fprintf(FILE *stream, const char *format, ...)
#else #else
typedef char * iconv_ibp; typedef char * iconv_ibp;
#endif #endif
char *reencode_string_iconv(const char *in, size_t insz, iconv_t conv, size_t *outsz_p) char *reencode_string_iconv(const char *in, size_t insz, iconv_t conv,
size_t bom_len, size_t *outsz_p)
{ {
size_t outsz, outalloc; size_t outsz, outalloc;
char *out, *outpos; char *out, *outpos;
iconv_ibp cp; iconv_ibp cp;
outsz = insz; outsz = insz;
outalloc = st_add(outsz, 1); /* for terminating NUL */ outalloc = st_add(outsz, 1 + bom_len); /* for terminating NUL */
out = xmalloc(outalloc); out = xmalloc(outalloc);
outpos = out; outpos = out + bom_len;
cp = (iconv_ibp)in; cp = (iconv_ibp)in;
while (1) { while (1) {
@ -540,10 +546,30 @@ char *reencode_string_len(const char *in, size_t insz,
{ {
iconv_t conv; iconv_t conv;
char *out; char *out;
const char *bom_str = NULL;
size_t bom_len = 0;
if (!in_encoding) if (!in_encoding)
return NULL; return NULL;
/* UTF-16LE-BOM is the same as UTF-16 for reading */
if (same_utf_encoding("UTF-16LE-BOM", in_encoding))
in_encoding = "UTF-16";
/*
* For writing, UTF-16 iconv typically creates "UTF-16BE-BOM"
* Some users under Windows want the little endian version
*/
if (same_utf_encoding("UTF-16LE-BOM", out_encoding)) {
bom_str = utf16_le_bom;
bom_len = sizeof(utf16_le_bom);
out_encoding = "UTF-16LE";
} else if (same_utf_encoding("UTF-16BE-BOM", out_encoding)) {
bom_str = utf16_be_bom;
bom_len = sizeof(utf16_be_bom);
out_encoding = "UTF-16BE";
}
conv = iconv_open(out_encoding, in_encoding); conv = iconv_open(out_encoding, in_encoding);
if (conv == (iconv_t) -1) { if (conv == (iconv_t) -1) {
in_encoding = fallback_encoding(in_encoding); in_encoding = fallback_encoding(in_encoding);
@ -553,9 +579,10 @@ char *reencode_string_len(const char *in, size_t insz,
if (conv == (iconv_t) -1) if (conv == (iconv_t) -1)
return NULL; return NULL;
} }
out = reencode_string_iconv(in, insz, conv, bom_len, outsz);
out = reencode_string_iconv(in, insz, conv, outsz);
iconv_close(conv); iconv_close(conv);
if (out && bom_str && bom_len)
memcpy(out, bom_str, bom_len);
return out; return out;
} }
#endif #endif
@ -566,11 +593,6 @@ static int has_bom_prefix(const char *data, size_t len,
return data && bom && (len >= bom_len) && !memcmp(data, bom, bom_len); return data && bom && (len >= bom_len) && !memcmp(data, bom, bom_len);
} }
static const char utf16_be_bom[] = {'\xFE', '\xFF'};
static const char utf16_le_bom[] = {'\xFF', '\xFE'};
static const char utf32_be_bom[] = {'\0', '\0', '\xFE', '\xFF'};
static const char utf32_le_bom[] = {'\xFF', '\xFE', '\0', '\0'};
int has_prohibited_utf_bom(const char *enc, const char *data, size_t len) int has_prohibited_utf_bom(const char *enc, const char *data, size_t len)
{ {
return ( return (

2
utf8.h
Просмотреть файл

@ -27,7 +27,7 @@ void strbuf_utf8_replace(struct strbuf *sb, int pos, int width,
#ifndef NO_ICONV #ifndef NO_ICONV
char *reencode_string_iconv(const char *in, size_t insz, char *reencode_string_iconv(const char *in, size_t insz,
iconv_t conv, size_t *outsz); iconv_t conv, size_t bom_len, size_t *outsz);
char *reencode_string_len(const char *in, size_t insz, char *reencode_string_len(const char *in, size_t insz,
const char *out_encoding, const char *out_encoding,
const char *in_encoding, const char *in_encoding,