Bug 1170668 - Improve short read handling in nsConverterInputStream, r=hsivonen

This patch changes how nsConverterInputStream handles passing data through to the underlying unicode converter in order to make it more reliably handle propagating errors and deal with short reads from the underlying input stream. This was done by making the code continuously read within the Fill method until at least one character has been decoded from the input stream, so that we don't spuriously communicate an EOF to the caller due to a short read not producing enough bytes for the decoder to produce a UTF-16 character. In addition, while making this change it became easier to signal to the decoder about the final read from the input stream, meaning that partial characters at the end of the stream will now generate a replacement character, rather than being ignored. Differential Revision: https://phabricator.services.mozilla.com/D152682
2022-08-10 19:44:41 +00:00 · 2022-08-10 19:44:41 +00:00 · 1972b3b710
--- a/intl/uconv/nsConverterInputStream.cpp
+++ b/intl/uconv/nsConverterInputStream.cpp
@ -105,39 +105,38 @@ nsConverterInputStream::ReadSegments(nsWriteUnicharSegmentFun aWriter,
                                     void* aClosure, uint32_t aCount,
                                     uint32_t* aReadCount) {
  NS_ASSERTION(mUnicharDataLength >= mUnicharDataOffset, "unsigned madness");
-  uint32_t bytesToWrite = mUnicharDataLength - mUnicharDataOffset;
-  nsresult rv;
-  if (0 == bytesToWrite) {
+  uint32_t codeUnitsToWrite = mUnicharDataLength - mUnicharDataOffset;
+  if (0 == codeUnitsToWrite) {
    // Fill the unichar buffer
-    bytesToWrite = Fill(&rv);
-    if (bytesToWrite <= 0) {
+    codeUnitsToWrite = Fill(&mLastErrorCode);
+    if (codeUnitsToWrite == 0) {
      *aReadCount = 0;
-      return rv;
-    }
-    if (NS_FAILED(rv)) {
-      return rv;
+      return mLastErrorCode;
    }
  }

-  if (bytesToWrite > aCount) bytesToWrite = aCount;
+  if (codeUnitsToWrite > aCount) {
+    codeUnitsToWrite = aCount;
+  }

-  uint32_t bytesWritten;
-  uint32_t totalBytesWritten = 0;
+  uint32_t codeUnitsWritten;
+  uint32_t totalCodeUnitsWritten = 0;

-  while (bytesToWrite) {
-    rv = aWriter(this, aClosure, mUnicharData.Elements() + mUnicharDataOffset,
-                 totalBytesWritten, bytesToWrite, &bytesWritten);
+  while (codeUnitsToWrite) {
+    nsresult rv =
+        aWriter(this, aClosure, mUnicharData.Elements() + mUnicharDataOffset,
+                totalCodeUnitsWritten, codeUnitsToWrite, &codeUnitsWritten);
    if (NS_FAILED(rv)) {
      // don't propagate errors to the caller
      break;
    }

-    bytesToWrite -= bytesWritten;
-    totalBytesWritten += bytesWritten;
-    mUnicharDataOffset += bytesWritten;
+    codeUnitsToWrite -= codeUnitsWritten;
+    totalCodeUnitsWritten += codeUnitsWritten;
+    mUnicharDataOffset += codeUnitsWritten;
  }

-  *aReadCount = totalBytesWritten;
+  *aReadCount = totalCodeUnitsWritten;

  return NS_OK;
 }
@ -166,7 +165,7 @@ nsConverterInputStream::ReadString(uint32_t aCount, nsAString& aString,
 }

 uint32_t nsConverterInputStream::Fill(nsresult* aErrorCode) {
-  if (nullptr == mInput) {
+  if (!mInput) {
    // We already closed the stream!
    *aErrorCode = NS_BASE_STREAM_CLOSED;
    return 0;
@ -179,54 +178,72 @@ uint32_t nsConverterInputStream::Fill(nsresult* aErrorCode) {
    return 0;
  }

-  // We assume a many to one conversion and are using equal sizes for
-  // the two buffers.  However if an error happens at the very start
-  // of a byte buffer we may end up in a situation where n bytes lead
-  // to n+1 unicode chars.  Thus we need to keep track of the leftover
-  // bytes as we convert.
-
-  uint32_t nb;
-  *aErrorCode = NS_FillArray(mByteData, mInput, mLeftOverBytes, &nb);
-  if (nb == 0 && mLeftOverBytes == 0) {
-    // No more data
-    *aErrorCode = NS_OK;
-    return 0;
-  }
-
-  NS_ASSERTION(uint32_t(nb) + mLeftOverBytes == mByteData.Length(),
-               "mByteData is lying to us somewhere");
-
-  // Now convert as much of the byte buffer to unicode as possible
-  auto src = AsBytes(Span(mByteData));
-  auto dst = Span(mUnicharData);
  // mUnicharData.Length() is the buffer length, not the fill status.
  // mUnicharDataLength reflects the current fill status.
  mUnicharDataLength = 0;
  // Whenever we convert, mUnicharData is logically empty.
  mUnicharDataOffset = 0;
-  // Truncation from size_t to uint32_t below is OK, because the sizes
-  // are bounded by the lengths of mByteData and mUnicharData.
-  uint32_t result;
-  size_t read;
-  size_t written;
-  // The design of this class is fundamentally bogus in that trailing
-  // errors are ignored. Always passing false as the last argument to
-  // Decode* calls below.
-  if (mErrorsAreFatal) {
-    std::tie(result, read, written) =
-        mConverter->DecodeToUTF16WithoutReplacement(src, dst, false);
-  } else {
-    std::tie(result, read, written, std::ignore) =
-        mConverter->DecodeToUTF16(src, dst, false);
-  }
-  mLeftOverBytes = mByteData.Length() - read;
-  mUnicharDataLength = written;
-  if (result == kInputEmpty || result == kOutputFull) {
-    *aErrorCode = NS_OK;
-  } else {
-    MOZ_ASSERT(mErrorsAreFatal, "How come DecodeToUTF16() reported error?");
-    *aErrorCode = NS_ERROR_UDEC_ILLEGALINPUT;
+
+  // Continue trying to read from the source stream until we successfully decode
+  // a character or encounter an error, as returning `0` here implies that the
+  // stream is complete.
+  //
+  // If the converter has been cleared, we've fully consumed the stream, and
+  // want to report EOF.
+  while (mUnicharDataLength == 0 && mConverter) {
+    // We assume a many to one conversion and are using equal sizes for
+    // the two buffers.  However if an error happens at the very start
+    // of a byte buffer we may end up in a situation where n bytes lead
+    // to n+1 unicode chars.  Thus we need to keep track of the leftover
+    // bytes as we convert.
+
+    uint32_t nb;
+    *aErrorCode = NS_FillArray(mByteData, mInput, mLeftOverBytes, &nb);
+    if (NS_FAILED(*aErrorCode)) {
+      return 0;
+    }
+
+    NS_ASSERTION(uint32_t(nb) + mLeftOverBytes == mByteData.Length(),
+                 "mByteData is lying to us somewhere");
+
+    // If `NS_FillArray` failed to read any new bytes, this is the last read,
+    // and we're at the end of the stream.
+    bool last = (nb == 0);
+
+    // Now convert as much of the byte buffer to unicode as possible
+    auto src = AsBytes(Span(mByteData));
+    auto dst = Span(mUnicharData);
+
+    // Truncation from size_t to uint32_t below is OK, because the sizes
+    // are bounded by the lengths of mByteData and mUnicharData.
+    uint32_t result;
+    size_t read;
+    size_t written;
+    if (mErrorsAreFatal) {
+      std::tie(result, read, written) =
+          mConverter->DecodeToUTF16WithoutReplacement(src, dst, last);
+    } else {
+      std::tie(result, read, written, std::ignore) =
+          mConverter->DecodeToUTF16(src, dst, last);
+    }
+    mLeftOverBytes = mByteData.Length() - read;
+    mUnicharDataLength = written;
+    // Clear `mConverter` if we reached the end of the stream, as we can't
+    // call methods on it anymore. This will also signal EOF to the caller
+    // through the loop condition.
+    if (last) {
+      MOZ_ASSERT(mLeftOverBytes == 0,
+                 "Failed to read all bytes on the last pass?");
+      mConverter = nullptr;
+    }
+    // If we got a decode error, we're done.
+    if (result != kInputEmpty && result != kOutputFull) {
+      MOZ_ASSERT(mErrorsAreFatal, "How come DecodeToUTF16() reported error?");
+      *aErrorCode = NS_ERROR_UDEC_ILLEGALINPUT;
+      return 0;
+    }
  }
+  *aErrorCode = NS_OK;
  return mUnicharDataLength;
 }

--- a/intl/uconv/tests/gtest/TestShortRead.cpp
+++ b/intl/uconv/tests/gtest/TestShortRead.cpp
@ -0,0 +1,106 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "gtest/gtest.h"
+#include "mozilla/ErrorNames.h"
+#include "nsCOMPtr.h"
+#include "nsConverterInputStream.h"
+#include "nsIInputStream.h"
+#include "nsISupports.h"
+#include "nsStringStream.h"
+
+namespace {
+
+class ShortReadWrapper final : public nsIInputStream {
+ public:
+  NS_DECL_THREADSAFE_ISUPPORTS
+  NS_DECL_NSIINPUTSTREAM
+
+  template <size_t N>
+  ShortReadWrapper(const uint32_t (&aShortReads)[N],
+                   nsIInputStream* aBaseStream)
+      : mShortReadIter(std::begin(aShortReads)),
+        mShortReadEnd(std::end(aShortReads)),
+        mBaseStream(aBaseStream) {}
+
+  ShortReadWrapper(const ShortReadWrapper&) = delete;
+  ShortReadWrapper& operator=(const ShortReadWrapper&) = delete;
+
+ private:
+  ~ShortReadWrapper() = default;
+
+  const uint32_t* mShortReadIter;
+  const uint32_t* mShortReadEnd;
+  nsCOMPtr<nsIInputStream> mBaseStream;
+};
+
+NS_IMPL_ISUPPORTS(ShortReadWrapper, nsIInputStream)
+
+NS_IMETHODIMP
+ShortReadWrapper::Close() { return mBaseStream->Close(); }
+
+NS_IMETHODIMP
+ShortReadWrapper::Available(uint64_t* aAvailable) {
+  nsresult rv = mBaseStream->Available(aAvailable);
+  NS_ENSURE_SUCCESS(rv, rv);
+
+  if (mShortReadIter != mShortReadEnd) {
+    *aAvailable = std::min(uint64_t(*mShortReadIter), *aAvailable);
+  }
+  return NS_OK;
+}
+
+NS_IMETHODIMP
+ShortReadWrapper::Read(char* aBuf, uint32_t aCount, uint32_t* _retval) {
+  if (mShortReadIter != mShortReadEnd) {
+    aCount = std::min(*mShortReadIter, aCount);
+  }
+
+  nsresult rv = mBaseStream->Read(aBuf, aCount, _retval);
+  if (NS_SUCCEEDED(rv) && mShortReadIter != mShortReadEnd) {
+    ++mShortReadIter;
+  }
+  return rv;
+}
+
+NS_IMETHODIMP
+ShortReadWrapper::ReadSegments(nsWriteSegmentFun aWriter, void* aClosure,
+                               uint32_t aCount, uint32_t* _retval) {
+  return NS_ERROR_NOT_IMPLEMENTED;
+}
+
+NS_IMETHODIMP
+ShortReadWrapper::IsNonBlocking(bool* _retval) {
+  return mBaseStream->IsNonBlocking(_retval);
+}
+
+}  // namespace
+
+TEST(ConverterStreamShortRead, ShortRead)
+{
+  uint8_t bytes[] = {0xd8, 0x35, 0xdc, 0x20};
+  nsCOMPtr<nsIInputStream> baseStream;
+  ASSERT_TRUE(NS_SUCCEEDED(NS_NewByteInputStream(getter_AddRefs(baseStream),
+                                                 AsChars(mozilla::Span(bytes)),
+                                                 NS_ASSIGNMENT_COPY)));
+
+  static const uint32_t kShortReads[] = {1, 2, 1};
+  nsCOMPtr<nsIInputStream> shortStream =
+      new ShortReadWrapper(kShortReads, baseStream);
+
+  RefPtr<nsConverterInputStream> unicharStream = new nsConverterInputStream();
+  ASSERT_TRUE(NS_SUCCEEDED(
+      unicharStream->Init(shortStream, "UTF-16BE", 4096,
+                          nsIConverterInputStream::ERRORS_ARE_FATAL)));
+
+  uint32_t read;
+  nsAutoString result;
+  ASSERT_TRUE(
+      NS_SUCCEEDED(unicharStream->ReadString(UINT32_MAX, result, &read)));
+
+  ASSERT_EQ(read, 2u);
+  ASSERT_TRUE(result == u"\u{1d420}");
+}
--- a/intl/uconv/tests/gtest/moz.build
+++ b/intl/uconv/tests/gtest/moz.build
@ -0,0 +1,11 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+UNIFIED_SOURCES += [
+    "TestShortRead.cpp",
+]
+
+FINAL_LIBRARY = "xul-gtest"
--- a/intl/uconv/tests/moz.build
+++ b/intl/uconv/tests/moz.build
@ -4,6 +4,10 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.

+TEST_DIRS += [
+    "gtest",
+]
+
 XPCSHELL_TESTS_MANIFESTS += ["unit/xpcshell.ini"]

 MOCHITEST_MANIFESTS += ["mochitest.ini"]
--- a/intl/uconv/tests/unit/test_bug317216.js
+++ b/intl/uconv/tests/unit/test_bug317216.js
@ -75,14 +75,14 @@ const test = [
  // 10: Lone high surrogate at the end of the input
  [
    "%D8%35%",
-    //    expected: nothing
-    "",
+    //    expected: one replacement char
+    "\uFFFD",
  ],
  // 11: Half code unit at the end of the input
  [
    "%D8",
-    //    expected: nothing
-    "",
+    //    expected: one replacement char
+    "\uFFFD",
  ],
 ];

--- a/xpcom/io/nsIUnicharInputStream.idl
+++ b/xpcom/io/nsIUnicharInputStream.idl
@ -16,9 +16,10 @@ interface nsIInputStream;
 * @param aInStream stream being read
 * @param aClosure opaque parameter passed to ReadSegments
 * @param aFromSegment pointer to memory owned by the input stream
- * @param aToOffset amount already read (since ReadSegments was called)
+ * @param aToOffset number of UTF-16 code units already read
+ *                  (since ReadSegments was called)
 * @param aCount length of fromSegment
- * @param aWriteCount number of bytes read
+ * @param aWriteCount number of UTF-16 code units read
 *
 * Implementers should return the following:
 *
@ -38,19 +39,19 @@ typedef nsresult (*nsWriteUnicharSegmentFun)(nsIUnicharInputStream *aInStream,
 native nsWriteUnicharSegmentFun(nsWriteUnicharSegmentFun);

 /**
- * Abstract unicode character input stream
+ * Abstract UTF-16 input stream
 * @see nsIInputStream
 */
 [scriptable, uuid(d5e3bd80-6723-4b92-b0c9-22f6162fd94f)]
 interface nsIUnicharInputStream : nsISupports {
  /**
-   * Reads into a caller-provided character array.
+   * Reads into a caller-provided array.
   *
-   * @return The number of characters that were successfully read. May be less
-   *         than aCount, even if there is more data in the input stream.
-   *         A return value of 0 means EOF.
+   * @return The number of utf-16 code units that were successfully read.
+   *         May be less than aCount, even if there is more data in the input
+   *         stream. A return value of 0 means EOF.
   *
-   * @note To read more than 2^32 characters, call this method multiple times.
+   * @note To read more than 2^32 code units, call this method multiple times.
   */
  [noscript] unsigned long read([array, size_is(aCount)] in char16_t aBuf,
                                in unsigned long aCount);
@ -60,13 +61,13 @@ interface nsIUnicharInputStream : nsISupports {
   * The writer function may be called multiple times for segmented buffers.
   * ReadSegments is expected to keep calling the writer until either there is
   * nothing left to read or the writer returns an error.  ReadSegments should
-   * not call the writer with zero characters to consume.
+   * not call the writer with zero UTF-16 code units to consume.
   *
   * @param aWriter the "consumer" of the data to be read
   * @param aClosure opaque parameter passed to writer
-   * @param aCount the maximum number of characters to be read
+   * @param aCount the maximum number of UTF-16 code units to be read
   *
-   * @return number of characters read (may be less than aCount)
+   * @return number of UTF-16 code units read (may be less than aCount)
   * @return 0 if reached end of file (or if aWriter refused to consume data)
   *
   * @throws NS_BASE_STREAM_WOULD_BLOCK if reading from the input stream would
@ -82,14 +83,15 @@ interface nsIUnicharInputStream : nsISupports {

  /**
   * Read into a string object.
-   * @param aCount The number of characters that should be read
-   * @return The number of characters that were read.
+   *
+   * @param aCount The number of UTF-16 code units that should be read
+   * @return The number of UTF-16 code units that were read.
   */
  unsigned long readString(in unsigned long aCount, out AString aString);

- /**
-  * Close the stream and free associated resources. This also closes the
-  * underlying stream, if any.
-  */
+  /**
+   * Close the stream and free associated resources. This also closes the
+   * underlying stream, if any.
+   */
  void close();
 };