Bug 1180225. Make convolver more like upstream. r=seth

This fixes uninitialised reads. --HG-- extra : rebase_source : c7f50d9ea7a56d845ccb2f324e3e73fbd2e83003
2015-08-27 16:06:37 -04:00 · 2015-08-27 16:06:37 -04:00 · eb58e4fc03
--- a/gfx/2d/convolver.cpp
+++ b/gfx/2d/convolver.cpp
@ -175,11 +175,11 @@ class CircularRowBuffer {
 // |src_data| and continues for the [begin, end) of the filter.
 template<bool has_alpha>
 void ConvolveHorizontally(const unsigned char* src_data,
-                          int begin, int end,
                          const ConvolutionFilter1D& filter,
                          unsigned char* out_row) {
+  int num_values = filter.num_values();
  // Loop over each pixel on this row in the output image.
-  for (int out_x = begin; out_x < end; out_x++) {
+  for (int out_x = 0; out_x < num_values; out_x++) {
    // Get the filter that determines the current output pixel.
    int filter_offset, filter_length;
    const ConvolutionFilter1D::Fixed* filter_values =
@ -220,17 +220,18 @@ void ConvolveHorizontally(const unsigned char* src_data,
 // Does vertical convolution to produce one output row. The filter values and
 // length are given in the first two parameters. These are applied to each
 // of the rows pointed to in the |source_data_rows| array, with each row
-// being |end - begin| wide.
+// being |pixel_width| wide.
 //
-// The output must have room for |(end - begin) * 4| bytes.
+// The output must have room for |pixel_width * 4| bytes.
 template<bool has_alpha>
 void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
                        int filter_length,
                        unsigned char* const* source_data_rows,
-                        int begin, int end, unsigned char* out_row) {
+                        int pixel_width,
+                        unsigned char* out_row) {
  // We go through each column in the output and do a vertical convolution,
  // generating one output pixel each time.
-  for (int out_x = begin; out_x < end; out_x++) {
+  for (int out_x = 0; out_x < pixel_width; out_x++) {
    // Compute the number of bytes over in each row that the current column
    // we're convolving starts at. The pixel will cover the next 4 bytes.
    int byte_offset = out_x * 4;
@ -288,28 +289,29 @@ void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
 void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
                        int filter_length,
                        unsigned char* const* source_data_rows,
-                        int width, unsigned char* out_row,
+                        int pixel_width, unsigned char* out_row,
                        bool has_alpha, bool use_simd) {
-  int processed = 0;

 #if defined(USE_SSE2) || defined(_MIPS_ARCH_LOONGSON3A)
  // If the binary was not built with SSE2 support, we had to fallback to C version.
-  int simd_width = width & ~3;
-  if (use_simd && simd_width) {
+  if (use_simd) {
    ConvolveVertically_SIMD(filter_values, filter_length,
-                            source_data_rows, 0, simd_width,
+                            source_data_rows,
+                            pixel_width,
                            out_row, has_alpha);
-    processed = simd_width;
-  }
+  } else
 #endif
-    
-  if (width > processed) {
+  {
    if (has_alpha) {
-      ConvolveVertically<true>(filter_values, filter_length, source_data_rows,
-                               processed, width, out_row);
+      ConvolveVertically<true>(filter_values, filter_length,
+                               source_data_rows,
+                               pixel_width,
+                               out_row);
    } else {
-      ConvolveVertically<false>(filter_values, filter_length, source_data_rows,
-                                processed, width, out_row);
+      ConvolveVertically<false>(filter_values, filter_length,
+                                source_data_rows,
+                                pixel_width,
+                                out_row);
    }
  }
 }
@ -326,16 +328,16 @@ void ConvolveHorizontally(const unsigned char* src_data,
    // SIMD implementation works with 4 pixels at a time.
    // Therefore we process as much as we can using SSE and then use
    // C implementation for leftovers
-    ConvolveHorizontally_SSE2(src_data, 0, simd_width, filter, out_row);
+    ConvolveHorizontally_SSE2(src_data, filter, out_row);
    processed = simd_width;
  }
 #endif

  if (width > processed) {
    if (has_alpha) {
-      ConvolveHorizontally<true>(src_data, processed, width, filter, out_row);
+      ConvolveHorizontally<true>(src_data, filter, out_row);
    } else {
-      ConvolveHorizontally<false>(src_data, processed, width, filter, out_row);
+      ConvolveHorizontally<false>(src_data, filter, out_row);
    }
  }
 }
@ -457,9 +459,23 @@ void BGRAConvolve2D(const unsigned char* source_data,
  int num_output_rows = filter_y.num_values();
  int pixel_width = filter_x.num_values();

+
  // We need to check which is the last line to convolve before we advance 4
  // lines in one iteration.
  int last_filter_offset, last_filter_length;
+  // SSE2 can access up to 3 extra pixels past the end of the
+  // buffer. At the bottom of the image, we have to be careful
+  // not to access data past the end of the buffer. Normally
+  // we fall back to the C++ implementation for the last row.
+  // If the last row is less than 3 pixels wide, we may have to fall
+  // back to the C++ version for more rows. Compute how many
+  // rows we need to avoid the SSE implementation for here.
+  filter_x.FilterForValue(filter_x.num_values() - 1, &last_filter_offset,
+                          &last_filter_length);
+#if defined(USE_SSE2) || defined(_MIPS_ARCH_LOONGSON3A)
+  int avoid_simd_rows = 1 + 3 /
+      (last_filter_offset + last_filter_length);
+#endif
  filter_y.FilterForValue(num_output_rows - 1, &last_filter_offset,
                          &last_filter_length);

@ -473,36 +489,32 @@ void BGRAConvolve2D(const unsigned char* source_data,
      // We don't want to process too much rows in batches of 4 because
      // we can go out-of-bounds at the end
      while (next_x_row < filter_offset + filter_length) {
-        if (next_x_row + 3 < last_filter_offset + last_filter_length - 3) {
+        if (next_x_row + 3 < last_filter_offset + last_filter_length -
+            avoid_simd_rows) {
          const unsigned char* src[4];
          unsigned char* out_row[4];
          for (int i = 0; i < 4; ++i) {
            src[i] = &source_data[(next_x_row + i) * source_byte_row_stride];
            out_row[i] = row_buffer.AdvanceRow();
          }
-          ConvolveHorizontally4_SIMD(src, 0, pixel_width, filter_x, out_row);
+          ConvolveHorizontally4_SIMD(src, filter_x, out_row);
          next_x_row += 4;
        } else {
-          unsigned char* buffer = row_buffer.AdvanceRow();
-
-          // For last rows, SSE2 load possibly to access data beyond the
-          // image area. therefore we use cobined C+SSE version here
-          int simd_width = pixel_width & ~3;
-          if (simd_width) {
+          // Check if we need to avoid SSE2 for this row.
+          if (next_x_row < last_filter_offset + last_filter_length -
+              avoid_simd_rows) {
            ConvolveHorizontally_SIMD(
                &source_data[next_x_row * source_byte_row_stride],
-                0, simd_width, filter_x, buffer);
-          }
-
-          if (pixel_width > simd_width) {
+                filter_x, row_buffer.AdvanceRow());
+          } else {
            if (source_has_alpha) {
              ConvolveHorizontally<true>(
                  &source_data[next_x_row * source_byte_row_stride],
-                  simd_width, pixel_width, filter_x, buffer);
+                  filter_x, row_buffer.AdvanceRow());
            } else {
              ConvolveHorizontally<false>(
                  &source_data[next_x_row * source_byte_row_stride],
-                  simd_width, pixel_width, filter_x, buffer);
+                  filter_x, row_buffer.AdvanceRow());
            }
          }
          next_x_row++;
@ -514,11 +526,11 @@ void BGRAConvolve2D(const unsigned char* source_data,
        if (source_has_alpha) {
          ConvolveHorizontally<true>(
                                     &source_data[next_x_row * source_byte_row_stride],
-              0, pixel_width, filter_x, row_buffer.AdvanceRow());
+                                     filter_x, row_buffer.AdvanceRow());
        } else {
          ConvolveHorizontally<false>(
                                      &source_data[next_x_row * source_byte_row_stride],
-              0, pixel_width, filter_x, row_buffer.AdvanceRow());
+                                      filter_x, row_buffer.AdvanceRow());
        }
        next_x_row++;
      }
--- a/gfx/2d/convolverSSE2.cpp
+++ b/gfx/2d/convolverSSE2.cpp
@ -35,26 +35,24 @@
 namespace skia {

 // Convolves horizontally along a single row. The row data is given in
-// |src_data| and continues for the [begin, end) of the filter.
+// |src_data| and continues for the num_values() of the filter.
 void ConvolveHorizontally_SSE2(const unsigned char* src_data,
-                               int begin, int end,
                               const ConvolutionFilter1D& filter,
                               unsigned char* out_row) {
+  int num_values = filter.num_values();

  int filter_offset, filter_length;
  __m128i zero = _mm_setzero_si128();
-  __m128i mask[3];
+  __m128i mask[4];
  // |mask| will be used to decimate all extra filter coefficients that are
  // loaded by SIMD when |filter_length| is not divisible by 4.
-  mask[0] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
-  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
-  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
-
-  // This buffer is used for tails
-  __m128i buffer;
+  // mask[0] is not used in following algorithm.
+  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

  // Output one pixel each iteration, calculating all channels (RGBA) together.
-  for (int out_x = begin; out_x < end; out_x++) {
+  for (int out_x = 0; out_x < num_values; out_x++) {
    const ConvolutionFilter1D::Fixed* filter_values =
        filter.FilterForValue(out_x, &filter_offset, &filter_length);

@ -117,22 +115,21 @@ void ConvolveHorizontally_SSE2(const unsigned char* src_data,

    // When |filter_length| is not divisible by 4, we need to decimate some of
    // the filter coefficient that was loaded incorrectly to zero; Other than
-    // that the algorithm is same with above, except that the 4th pixel will be
+    // that the algorithm is same with above, exceot that the 4th pixel will be
    // always absent.
-    int r = filter_length & 3;
+    int r = filter_length&3;
    if (r) {
-      memcpy(&buffer, row_to_filter, r * 4);
      // Note: filter_values must be padded to align_up(filter_offset, 8).
      __m128i coeff, coeff16;
      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
      // Mask out extra filter taps.
-      coeff = _mm_and_si128(coeff, mask[r-1]);
+      coeff = _mm_and_si128(coeff, mask[r]);
      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

      // Note: line buffer must be padded to align_up(filter_offset, 16).
-      // We resolve this by temporary buffer 
-      __m128i src8 = _mm_loadu_si128(&buffer);
+      // We resolve this by use C-version for the last horizontal line.
+      __m128i src8 = _mm_loadu_si128(row_to_filter);
      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
@ -165,24 +162,26 @@ void ConvolveHorizontally_SSE2(const unsigned char* src_data,
 }

 // Convolves horizontally along four rows. The row data is given in
-// |src_data| and continues for the [begin, end) of the filter.
+// |src_data| and continues for the num_values() of the filter.
 // The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
 // refer to that function for detailed comments.
 void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
-                                int begin, int end,
                                const ConvolutionFilter1D& filter,
                                unsigned char* out_row[4]) {
+  int num_values = filter.num_values();
+
  int filter_offset, filter_length;
  __m128i zero = _mm_setzero_si128();
-  __m128i mask[3];
+  __m128i mask[4];
  // |mask| will be used to decimate all extra filter coefficients that are
  // loaded by SIMD when |filter_length| is not divisible by 4.
-  mask[0] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
-  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
-  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
+  // mask[0] is not used in following algorithm.
+  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
+  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
+  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

  // Output one pixel each iteration, calculating all channels (RGBA) together.
-  for (int out_x = begin; out_x < end; out_x++) {
+  for (int out_x = 0; out_x < num_values; out_x++) {
    const ConvolutionFilter1D::Fixed* filter_values =
        filter.FilterForValue(out_x, &filter_offset, &filter_length);

@ -240,7 +239,7 @@ void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
      __m128i coeff;
      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
      // Mask out extra filter taps.
-      coeff = _mm_and_si128(coeff, mask[r-1]);
+      coeff = _mm_and_si128(coeff, mask[r]);

      __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
      /* c1 c1 c1 c1 c0 c0 c0 c0 */
@ -284,21 +283,22 @@ void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
 // Does vertical convolution to produce one output row. The filter values and
 // length are given in the first two parameters. These are applied to each
 // of the rows pointed to in the |source_data_rows| array, with each row
-// being |end - begin| wide.
+// being |pixel_width| wide.
 //
-// The output must have room for |(end - begin) * 4| bytes.
+// The output must have room for |pixel_width * 4| bytes.
 template<bool has_alpha>
 void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_values,
                                  int filter_length,
                                  unsigned char* const* source_data_rows,
-                                  int begin, int end,
+                                  int pixel_width,
                                  unsigned char* out_row) {
+  int width = pixel_width & ~3;
+
  __m128i zero = _mm_setzero_si128();
  __m128i accum0, accum1, accum2, accum3, coeff16;
  const __m128i* src;
-  int out_x;
  // Output four pixels per iteration (16 bytes).
-  for (out_x = begin; out_x + 3 < end; out_x += 4) {
+  for (int out_x = 0; out_x < width; out_x += 4) {

    // Accumulated result for each pixel. 32 bits per RGBA channel.
    accum0 = _mm_setzero_si128();
@ -391,11 +391,7 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value

  // When the width of the output is not divisible by 4, We need to save one
  // pixel (4 bytes) each time. And also the fourth pixel is always absent.
-  int r = end - out_x;
-  if (r > 0) {
-    // Since accum3 is never used here, we'll use it as a buffer
-    __m128i *buffer = &accum3;
-
+  if (pixel_width & 3) {
    accum0 = _mm_setzero_si128();
    accum1 = _mm_setzero_si128();
    accum2 = _mm_setzero_si128();
@ -403,9 +399,8 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value
      coeff16 = _mm_set1_epi16(filter_values[filter_y]);
      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
      src = reinterpret_cast<const __m128i*>(
-          &source_data_rows[filter_y][out_x * 4]);
-      memcpy(buffer, src, r * 4);
-      __m128i src8 = _mm_loadu_si128(buffer);
+          &source_data_rows[filter_y][width<<2]);
+      __m128i src8 = _mm_loadu_si128(src);
      // [16] a1 b1 g1 r1 a0 b0 g0 r0
      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
@ -451,7 +446,7 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value
      accum0 = _mm_or_si128(accum0, mask);
    }

-    for (; out_x < end; out_x++) {
+    for (int out_x = width; out_x < pixel_width; out_x++) {
      *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
      accum0 = _mm_srli_si128(accum0, 4);
      out_row += 4;
@ -462,14 +457,14 @@ void ConvolveVertically_SSE2_impl(const ConvolutionFilter1D::Fixed* filter_value
 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
                             int filter_length,
                             unsigned char* const* source_data_rows,
-                             int begin, int end,
+                             int pixel_width,
                             unsigned char* out_row, bool has_alpha) {
  if (has_alpha) {
    ConvolveVertically_SSE2_impl<true>(filter_values, filter_length,
-                                       source_data_rows, begin, end, out_row);
+                                       source_data_rows, pixel_width, out_row);
  } else {
    ConvolveVertically_SSE2_impl<false>(filter_values, filter_length,
-                                       source_data_rows, begin, end, out_row);
+                                       source_data_rows, pixel_width, out_row);
  }
 }

--- a/gfx/2d/convolverSSE2.h
+++ b/gfx/2d/convolverSSE2.h
@ -40,7 +40,6 @@ namespace skia {
 // Convolves horizontally along a single row. The row data is given in
 // |src_data| and continues for the [begin, end) of the filter.
 void ConvolveHorizontally_SSE2(const unsigned char* src_data,
-                               int begin, int end,
                               const ConvolutionFilter1D& filter,
                               unsigned char* out_row);

@ -49,7 +48,6 @@ void ConvolveHorizontally_SSE2(const unsigned char* src_data,
 // The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please
 // refer to that function for detailed comments.
 void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
-                                int begin, int end,
                                const ConvolutionFilter1D& filter,
                                unsigned char* out_row[4]);

@ -62,7 +60,7 @@ void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
                             int filter_length,
                             unsigned char* const* source_data_rows,
-                             int begin, int end,
+                             int pixel_width,
                             unsigned char* out_row, bool has_alpha);

 }  // namespace skia
--- a/image/Downscaler.cpp
+++ b/image/Downscaler.cpp
@ -91,7 +91,8 @@ Downscaler::BeginFrame(const nsIntSize& aOriginalSize,
                               mYFilter.get());

  // Allocate the buffer, which contains scanlines of the original image.
-  mRowBuffer = MakeUnique<uint8_t[]>(mOriginalSize.width * sizeof(uint32_t));
+  // pad by 15 to handle overreads by the simd code
+  mRowBuffer = MakeUnique<uint8_t[]>(mOriginalSize.width * sizeof(uint32_t) + 15);
  if (MOZ_UNLIKELY(!mRowBuffer)) {
    return NS_ERROR_OUT_OF_MEMORY;
  }
@ -106,7 +107,8 @@ Downscaler::BeginFrame(const nsIntSize& aOriginalSize,
  }

  bool anyAllocationFailed = false;
-  const int rowSize = mTargetSize.width * sizeof(uint32_t);
+  // pad by 15 to handle overreads by the simd code
+  const int rowSize = mTargetSize.width * sizeof(uint32_t) + 15;
  for (int32_t i = 0; i < mWindowCapacity; ++i) {
    mWindow[i] = new uint8_t[rowSize];
    anyAllocationFailed = anyAllocationFailed || mWindow[i] == nullptr;