gecko-dev/intl/uconv/nsUTF8ConverterService.cpp

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:expandtab:shiftwidth=2:tabstop=4: 
 */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsString.h"
#include "nsUTF8ConverterService.h"
#include "nsEscape.h"
#include "nsIUnicodeDecoder.h"
#include "mozilla/dom/EncodingUtils.h"
#include "mozilla/UniquePtr.h"

using mozilla::dom::EncodingUtils;

// XPCOM boilerplate: declares that nsUTF8ConverterService implements
// nsIUTF8ConverterService (presumably expanding to the AddRef / Release /
// QueryInterface definitions for the class -- see the macro's definition).
NS_IMPL_ISUPPORTS(nsUTF8ConverterService, nsIUTF8ConverterService)

/**
 * Decode aString from the encoding identified by the label aCharset and
 * store the text, re-encoded as UTF-8, in aResult.
 *
 * @param aString            the bytes to decode.
 * @param aCharset           encoding label; must be non-null and non-empty.
 * @param aAllowSubstitution when false, the decoder is switched to
 *                           nsIUnicodeDecoder::kOnError_Signal (presumably so
 *                           that malformed input is reported as an error
 *                           rather than replaced -- confirm against the
 *                           decoder interface).
 * @param aResult            receives the UTF-8 text. It is only written when
 *                           the decoder's Convert() call succeeds.
 *
 * @return NS_ERROR_INVALID_ARG if aCharset is null or empty,
 *         NS_ERROR_UCONV_NOCONV if the label does not resolve to an encoding,
 *         NS_ERROR_OUT_OF_MEMORY if the input or the required output buffer
 *         cannot be described by the decoder's signed 32-bit lengths (or the
 *         buffer is null), otherwise whatever GetMaxLength() / Convert()
 *         returned.
 */
static nsresult
ToUTF8(const nsACString &aString, const char *aCharset,
       bool aAllowSubstitution, nsACString &aResult)
{
  if (!aCharset || !*aCharset)
    return NS_ERROR_INVALID_ARG;

  nsDependentCString label(aCharset);
  nsAutoCString encoding;
  if (!EncodingUtils::FindEncodingForLabelNoReplacement(label, encoding)) {
    return NS_ERROR_UCONV_NOCONV;
  }
  nsCOMPtr<nsIUnicodeDecoder> unicodeDecoder =
    EncodingUtils::DecoderForEncoding(encoding);

  if (!aAllowSubstitution)
    unicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Signal);

  // The decoder takes signed 32-bit lengths while the string length is
  // unsigned; refuse input that would turn into a negative srcLen instead of
  // silently narrowing it.
  if (aString.Length() > static_cast<uint32_t>(INT32_MAX))
    return NS_ERROR_OUT_OF_MEMORY;

  int32_t srcLen = static_cast<int32_t>(aString.Length());
  int32_t dstLen = 0;
  const nsAFlatCString& inStr = PromiseFlatCString(aString);
  nsresult rv = unicodeDecoder->GetMaxLength(inStr.get(), srcLen, &dstLen);
  NS_ENSURE_SUCCESS(rv, rv);

  // dstLen becomes both an allocation size and the end offset of the
  // Substring below, so a negative value must never get that far.
  if (dstLen < 0)
    return NS_ERROR_OUT_OF_MEMORY;

  // Scratch UTF-16 buffer sized to the decoder's worst-case estimate.
  auto ustr = mozilla::MakeUnique<char16_t[]>(dstLen);
  NS_ENSURE_TRUE(ustr, NS_ERROR_OUT_OF_MEMORY);

  // Convert() updates srcLen / dstLen in place; dstLen is then used as the
  // number of char16_t units to copy out.
  rv = unicodeDecoder->Convert(inStr.get(), &srcLen, ustr.get(), &dstLen);
  if (NS_SUCCEEDED(rv)){
    CopyUTF16toUTF8(Substring(ustr.get(), ustr.get() + dstLen), aResult);
  }
  return rv;
}

/**
 * Convert aString from aCharset to UTF-8 and store the result in aUTF8String.
 *
 * Unless aSkipCheck is set, input that is already pure ASCII or valid UTF-8
 * is copied through unchanged. aAllowSubstitution is honoured only when
 * aOptionalArgc == 1; otherwise substitution is allowed.
 */
NS_IMETHODIMP
nsUTF8ConverterService::ConvertStringToUTF8(const nsACString &aString,
                                            const char *aCharset,
                                            bool aSkipCheck,
                                            bool aAllowSubstitution,
                                            uint8_t aOptionalArgc,
                                            nsACString &aUTF8String)
{
  // Fast path: nothing to convert when the caller asked for the ASCII/UTF-8
  // check and the input passes it. Callers skip the check when they suspect
  // a non-ASCII 7bit charset (ISO-2022-xx, HZ) or a charset whose bytes can
  // be mistaken for UTF-8.
  if (!aSkipCheck) {
    if (IsASCII(aString) || IsUTF8(aString)) {
      aUTF8String = aString;
      return NS_OK;
    }
  }

  // The substitution flag is an optional argument; default to allowing it.
  bool substitute = true;
  if (aOptionalArgc == 1) {
    substitute = aAllowSubstitution;
  }

  aUTF8String.Truncate();
  const nsresult status = ToUTF8(aString, aCharset, substitute, aUTF8String);

  // The fallback below only applies when the check was skipped, the
  // conversion failed, and the input turns out to be UTF-8 after all.
  if (NS_SUCCEEDED(status) || !aSkipCheck || !IsUTF8(aString)) {
    return status;
  }

  // The caller's hunch about the charset was wrong: hand the UTF-8 input
  // back as-is. ASCII is not re-checked on the assumption that no supported
  // charset is incompatible with ASCII (EBCDIC is not supported).
  aUTF8String = aString;
  return NS_OK;
}

/**
 * Produce a UTF-8 version of the URI spec aSpec in aUTF8Spec, unescaping
 * %-escaped non-ASCII bytes and, when those bytes are not UTF-8, decoding
 * them from aCharset.
 */
NS_IMETHODIMP
nsUTF8ConverterService::ConvertURISpecToUTF8(const nsACString &aSpec,
                                             const char *aCharset,
                                             nsACString &aUTF8Spec)
{
  // A spec carrying raw (unescaped) non-ASCII bytes is taken to be UTF-8
  // already; no valid spec in Mozilla would break this assumption.
  const bool hasRawNonASCII = !IsASCII(aSpec);
  if (hasRawNonASCII) {
    aUTF8Spec = aSpec;
    return NS_OK;
  }

  aUTF8Spec.Truncate();

  // NS_UnescapeURL leaves the output string untouched (and returns false)
  // unless there is at least one character to unescape.
  nsAutoCString decoded;
  const bool didUnescape =
    NS_UnescapeURL(PromiseFlatCString(aSpec).get(), aSpec.Length(),
                   esc_OnlyNonASCII, decoded);
  if (!didUnescape) {
    aUTF8Spec = aSpec;
    return NS_OK;
  }

  // Anything that is neither ASCII nor UTF-8 after unescaping has to be
  // decoded from aCharset (substitution allowed).
  const bool alreadyUTF8 = IsASCII(decoded) || IsUTF8(decoded);
  if (!alreadyUTF8) {
    return ToUTF8(decoded, aCharset, true, aUTF8Spec);
  }

  // ASCII only, or escaped UTF-8: the unescaped bytes are the answer.
  aUTF8Spec = decoded;
  return NS_OK;
}
bug 167265 : add to necko Content-Disposition header handling per RFC 2231 (with fallbacks to RFC 2047 and raw 8bit chars in \|aOriginCharset\| ) necessary for i18nized filename support (when downloading files via http) : r=cbiesinger, sr=alecf 2003-06-13 01:57:49 +04:00			`/* -- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -- */`
			`/* vim:expandtab:shiftwidth=2:tabstop=4:`
			`*/`
Bug 716478 - update licence to MPL 2. 2012-05-21 15:12:37 +04:00			`/* This Source Code Form is subject to the terms of the Mozilla Public`
			`* License, v. 2.0. If a copy of the MPL was not distributed with this`
			`* file, You can obtain one at http://mozilla.org/MPL/2.0/. */`
bug 167265 : add to necko Content-Disposition header handling per RFC 2231 (with fallbacks to RFC 2047 and raw 8bit chars in \|aOriginCharset\| ) necessary for i18nized filename support (when downloading files via http) : r=cbiesinger, sr=alecf 2003-06-13 01:57:49 +04:00			`#include "nsString.h"`
			`#include "nsUTF8ConverterService.h"`
			`#include "nsEscape.h"`
Bug 943268 - Remove nsCharsetAlias and nsCharsetConverterManager. r=emk. 2014-05-08 13:32:00 +04:00			`#include "nsIUnicodeDecoder.h"`
			`#include "mozilla/dom/EncodingUtils.h"`
Bug 1221550 - use UniquePtr<T[]> instead of nsAutoArrayPtr<T> in intl/; r=smontagu 2015-11-03 23:49:22 +03:00			`#include "mozilla/UniquePtr.h"`
Bug 943268 - Remove nsCharsetAlias and nsCharsetConverterManager. r=emk. 2014-05-08 13:32:00 +04:00
			`using mozilla::dom::EncodingUtils;`
bug 167265 : add to necko Content-Disposition header handling per RFC 2231 (with fallbacks to RFC 2047 and raw 8bit chars in \|aOriginCharset\| ) necessary for i18nized filename support (when downloading files via http) : r=cbiesinger, sr=alecf 2003-06-13 01:57:49 +04:00
Bug 900908 - Part 3: Change uses of numbered macros in nsIClassInfoImpl.h/nsISupportsImpl.h to the variadic variants. r=froydnj 2014-04-27 11:06:00 +04:00			`NS_IMPL_ISUPPORTS(nsUTF8ConverterService, nsIUTF8ConverterService)`
bug 167265 : add to necko Content-Disposition header handling per RFC 2231 (with fallbacks to RFC 2047 and raw 8bit chars in \|aOriginCharset\| ) necessary for i18nized filename support (when downloading files via http) : r=cbiesinger, sr=alecf 2003-06-13 01:57:49 +04:00
			`static nsresult`
Bug 663057 - support RFC2231/5987 encoding for title parameter in HTTP link header fields. r=hsivonen 2012-05-21 17:31:00 +04:00			`ToUTF8(const nsACString &aString, const char *aCharset,`
			`bool aAllowSubstitution, nsACString &aResult)`
bug 167265 : add to necko Content-Disposition header handling per RFC 2231 (with fallbacks to RFC 2047 and raw 8bit chars in \|aOriginCharset\| ) necessary for i18nized filename support (when downloading files via http) : r=cbiesinger, sr=alecf 2003-06-13 01:57:49 +04:00			`{`
			`nsresult rv;`
			`if (!aCharset \|\| !*aCharset)`
			`return NS_ERROR_INVALID_ARG;`

Bug 943268 - Remove nsCharsetAlias and nsCharsetConverterManager. r=emk. 2014-05-08 13:32:00 +04:00			`nsDependentCString label(aCharset);`
			`nsAutoCString encoding;`
			`if (!EncodingUtils::FindEncodingForLabelNoReplacement(label, encoding)) {`
			`return NS_ERROR_UCONV_NOCONV;`
			`}`
			`nsCOMPtr<nsIUnicodeDecoder> unicodeDecoder =`
			`EncodingUtils::DecoderForEncoding(encoding);`
bug 167265 : add to necko Content-Disposition header handling per RFC 2231 (with fallbacks to RFC 2047 and raw 8bit chars in \|aOriginCharset\| ) necessary for i18nized filename support (when downloading files via http) : r=cbiesinger, sr=alecf 2003-06-13 01:57:49 +04:00
Bug 663057 - support RFC2231/5987 encoding for title parameter in HTTP link header fields. r=hsivonen 2012-05-21 17:31:00 +04:00			`if (!aAllowSubstitution)`
			`unicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Signal);`

Bug 579517 - Part 1: Automated conversion of NSPR numeric types to stdint types in Gecko; r=bsmedberg This patch was generated by a script. Here's the source of the script for future reference: function convert() { echo "Converting $1 to $2..." find . ! -wholename "nsprpub" \ ! -wholename "security/nss" \ ! -wholename "/.hg" \ ! -wholename "obj-ff-dbg" \ ! -name nsXPCOMCID.h \ ! -name prtypes.h \ -type f \ \( -iname ".cpp" \ -o -iname ".h" \ -o -iname ".c" \ -o -iname ".cc" \ -o -iname ".idl" \ -o -iname ".ipdl" \ -o -iname ".ipdlh" \ -o -iname "*.mm" \) \| \ xargs -n 1 sed -i -e "s/\b$1\b/$2/g" } convert PRInt8 int8_t convert PRUint8 uint8_t convert PRInt16 int16_t convert PRUint16 uint16_t convert PRInt32 int32_t convert PRUint32 uint32_t convert PRInt64 int64_t convert PRUint64 uint64_t convert PRIntn int convert PRUintn unsigned convert PRSize size_t convert PROffset32 int32_t convert PROffset64 int64_t convert PRPtrdiff ptrdiff_t convert PRFloat64 double 2012-08-22 19:56:38 +04:00			`int32_t srcLen = aString.Length();`
			`int32_t dstLen;`
bug 167265 : add to necko Content-Disposition header handling per RFC 2231 (with fallbacks to RFC 2047 and raw 8bit chars in \|aOriginCharset\| ) necessary for i18nized filename support (when downloading files via http) : r=cbiesinger, sr=alecf 2003-06-13 01:57:49 +04:00			`const nsAFlatCString& inStr = PromiseFlatCString(aString);`
			`rv = unicodeDecoder->GetMaxLength(inStr.get(), srcLen, &dstLen);`
			`NS_ENSURE_SUCCESS(rv, rv);`

Bug 1221550 - use UniquePtr<T[]> instead of nsAutoArrayPtr<T> in intl/; r=smontagu 2015-11-03 23:49:22 +03:00			`auto ustr = mozilla::MakeUnique<char16_t[]>(dstLen);`
bug 167265 : add to necko Content-Disposition header handling per RFC 2231 (with fallbacks to RFC 2047 and raw 8bit chars in \|aOriginCharset\| ) necessary for i18nized filename support (when downloading files via http) : r=cbiesinger, sr=alecf 2003-06-13 01:57:49 +04:00			`NS_ENSURE_TRUE(ustr, NS_ERROR_OUT_OF_MEMORY);`

Bug 1221550 - use UniquePtr<T[]> instead of nsAutoArrayPtr<T> in intl/; r=smontagu 2015-11-03 23:49:22 +03:00			`rv = unicodeDecoder->Convert(inStr.get(), &srcLen, ustr.get(), &dstLen);`
bug 209328 : fix mailto URL regression. (r=nhotta, sr=alecf) In addition, address two other (related) issues. - Fix ConvertURISpecToUTF8 to make it fill up the out parameter - Make Tru64 Cxx happy with NS_STATIC_CAST of nsAutoPtr<PRUnichar> 2003-06-24 18:31:09 +04:00			`if (NS_SUCCEEDED(rv)){`
Bug 1221550 - use UniquePtr<T[]> instead of nsAutoArrayPtr<T> in intl/; r=smontagu 2015-11-03 23:49:22 +03:00			`CopyUTF16toUTF8(Substring(ustr.get(), ustr.get() + dstLen), aResult);`
bug 209328 : fix mailto URL regression. (r=nhotta, sr=alecf) In addition, address two other (related) issues. - Fix ConvertURISpecToUTF8 to make it fill up the out parameter - Make Tru64 Cxx happy with NS_STATIC_CAST of nsAutoPtr<PRUnichar> 2003-06-24 18:31:09 +04:00			`}`
bug 167265 : add to necko Content-Disposition header handling per RFC 2231 (with fallbacks to RFC 2047 and raw 8bit chars in \|aOriginCharset\| ) necessary for i18nized filename support (when downloading files via http) : r=cbiesinger, sr=alecf 2003-06-13 01:57:49 +04:00			`return rv;`
			`}`

			`NS_IMETHODIMP`
			`nsUTF8ConverterService::ConvertStringToUTF8(const nsACString &aString,`
			`const char *aCharset,`
Bug 675553 - Switch from PRBool to bool on a CLOSED TREE , r=bsmedberg,khuey,bz,cjones --HG-- rename : tools/trace-malloc/bloatblame.c => tools/trace-malloc/bloatblame.cpp 2011-09-29 10:19:26 +04:00			`bool aSkipCheck,`
Bug 663057 - support RFC2231/5987 encoding for title parameter in HTTP link header fields. r=hsivonen 2012-05-21 17:31:00 +04:00			`bool aAllowSubstitution,`
Bug 579517 - Part 1: Automated conversion of NSPR numeric types to stdint types in Gecko; r=bsmedberg This patch was generated by a script. Here's the source of the script for future reference: function convert() { echo "Converting $1 to $2..." find . ! -wholename "nsprpub" \ ! -wholename "security/nss" \ ! -wholename "/.hg" \ ! -wholename "obj-ff-dbg" \ ! -name nsXPCOMCID.h \ ! -name prtypes.h \ -type f \ \( -iname ".cpp" \ -o -iname ".h" \ -o -iname ".c" \ -o -iname ".cc" \ -o -iname ".idl" \ -o -iname ".ipdl" \ -o -iname ".ipdlh" \ -o -iname "*.mm" \) \| \ xargs -n 1 sed -i -e "s/\b$1\b/$2/g" } convert PRInt8 int8_t convert PRUint8 uint8_t convert PRInt16 int16_t convert PRUint16 uint16_t convert PRInt32 int32_t convert PRUint32 uint32_t convert PRInt64 int64_t convert PRUint64 uint64_t convert PRIntn int convert PRUintn unsigned convert PRSize size_t convert PROffset32 int32_t convert PROffset64 int64_t convert PRPtrdiff ptrdiff_t convert PRFloat64 double 2012-08-22 19:56:38 +04:00			`uint8_t aOptionalArgc,`
bug 167265 : add to necko Content-Disposition header handling per RFC 2231 (with fallbacks to RFC 2047 and raw 8bit chars in \|aOriginCharset\| ) necessary for i18nized filename support (when downloading files via http) : r=cbiesinger, sr=alecf 2003-06-13 01:57:49 +04:00			`nsACString &aUTF8String)`
			`{`
Bug 776399 - Fix IDL compat problem introduced by changes for bug 663057. r=bz 2012-07-25 05:37:32 +04:00			`bool allowSubstitution = (aOptionalArgc == 1) ? aAllowSubstitution : true;`

bug 167265 : add to necko Content-Disposition header handling per RFC 2231 (with fallbacks to RFC 2047 and raw 8bit chars in \|aOriginCharset\| ) necessary for i18nized filename support (when downloading files via http) : r=cbiesinger, sr=alecf 2003-06-13 01:57:49 +04:00			`// return if ASCII only or valid UTF-8 providing that the ASCII/UTF-8`
			`// check is requested. It may not be asked for if a caller suspects`
			`// that the input is in non-ASCII 7bit charset (ISO-2022-xx, HZ) or`
			`// it's in a charset other than UTF-8 that can be mistaken for UTF-8.`
			`if (!aSkipCheck && (IsASCII(aString) \|\| IsUTF8(aString))) {`
			`aUTF8String = aString;`
			`return NS_OK;`
			`}`

bug 209328 : fix mailto URL regression. (r=nhotta, sr=alecf) In addition, address two other (related) issues. - Fix ConvertURISpecToUTF8 to make it fill up the out parameter - Make Tru64 Cxx happy with NS_STATIC_CAST of nsAutoPtr<PRUnichar> 2003-06-24 18:31:09 +04:00			`aUTF8String.Truncate();`

Bug 776399 - Fix IDL compat problem introduced by changes for bug 663057. r=bz 2012-07-25 05:37:32 +04:00			`nsresult rv = ToUTF8(aString, aCharset, allowSubstitution, aUTF8String);`
bug 167265 : add to necko Content-Disposition header handling per RFC 2231 (with fallbacks to RFC 2047 and raw 8bit chars in \|aOriginCharset\| ) necessary for i18nized filename support (when downloading files via http) : r=cbiesinger, sr=alecf 2003-06-13 01:57:49 +04:00
			`// additional protection for cases where check is skipped and the input`
			`// is actually in UTF-8 as opposed to aCharset. (i.e. caller's hunch`
			`// was wrong.) We don't check ASCIIness assuming there's no charset`
			`// incompatible with ASCII (we don't support EBCDIC).`
			`if (aSkipCheck && NS_FAILED(rv) && IsUTF8(aString)) {`
			`aUTF8String = aString;`
			`return NS_OK;`
			`}`

			`return rv;`
			`}`

			`NS_IMETHODIMP`
			`nsUTF8ConverterService::ConvertURISpecToUTF8(const nsACString &aSpec,`
			`const char *aCharset,`
			`nsACString &aUTF8Spec)`
			`{`
			`// assume UTF-8 if the spec contains unescaped non-ASCII characters.`
			`// No valid spec in Mozilla would break this assumption.`
			`if (!IsASCII(aSpec)) {`
			`aUTF8Spec = aSpec;`
			`return NS_OK;`
			`}`

bug 209328 : fix mailto URL regression. (r=nhotta, sr=alecf) In addition, address two other (related) issues. - Fix ConvertURISpecToUTF8 to make it fill up the out parameter - Make Tru64 Cxx happy with NS_STATIC_CAST of nsAutoPtr<PRUnichar> 2003-06-24 18:31:09 +04:00			`aUTF8Spec.Truncate();`

Bug 773151: Convert nsCAutoString->nsAutoCString CLOSED TREE r=bsmedberg 2012-09-02 06:35:17 +04:00			`nsAutoCString unescapedSpec;`
Backing out patch for bug 274264, since it caused bug 278727, per patch author's request. 2005-01-24 02:13:10 +03:00			`// NS_UnescapeURL does not fill up unescapedSpec unless there's at least`
			`// one character to unescape.`
Bug 675553 - Switch from PRBool to bool on a CLOSED TREE , r=bsmedberg,khuey,bz,cjones --HG-- rename : tools/trace-malloc/bloatblame.c => tools/trace-malloc/bloatblame.cpp 2011-09-29 10:19:26 +04:00			`bool written = NS_UnescapeURL(PromiseFlatCString(aSpec).get(), aSpec.Length(),`
Backing out patch for bug 274264, since it caused bug 278727, per patch author's request. 2005-01-24 02:13:10 +03:00			`esc_OnlyNonASCII, unescapedSpec);`
bug 167265 : add to necko Content-Disposition header handling per RFC 2231 (with fallbacks to RFC 2047 and raw 8bit chars in \|aOriginCharset\| ) necessary for i18nized filename support (when downloading files via http) : r=cbiesinger, sr=alecf 2003-06-13 01:57:49 +04:00
Backing out patch for bug 274264, since it caused bug 278727, per patch author's request. 2005-01-24 02:13:10 +03:00			`if (!written) {`
			`aUTF8Spec = aSpec;`
			`return NS_OK;`
			`}`
			`// return if ASCII only or escaped UTF-8`
			`if (IsASCII(unescapedSpec) \|\| IsUTF8(unescapedSpec)) {`
			`aUTF8Spec = unescapedSpec;`
			`return NS_OK;`
bug 167265 : add to necko Content-Disposition header handling per RFC 2231 (with fallbacks to RFC 2047 and raw 8bit chars in \|aOriginCharset\| ) necessary for i18nized filename support (when downloading files via http) : r=cbiesinger, sr=alecf 2003-06-13 01:57:49 +04:00			`}`

Bug 663057 - support RFC2231/5987 encoding for title parameter in HTTP link header fields. r=hsivonen 2012-05-21 17:31:00 +04:00			`return ToUTF8(unescapedSpec, aCharset, true, aUTF8Spec);`
bug 274264 : Japanese attachment file name garbled : patch by Ahn Dal Soo modified by me and darin (r=me, sr=darin, a=asa) 2005-01-12 02:10:58 +03:00			`}`
Backing out patch for bug 274264, since it caused bug 278727, per patch author's request. 2005-01-24 02:13:10 +03:00