From 0e9a6ed72a06dd367049d33ec656f7e3bf2211a2 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Wed, 13 Jul 2011 14:51:02 -0400 Subject: [PATCH 1/4] Add improvements made in good-quality mode to real-time mode Several improvements we made in good-quality mode can be added into real-time mode to speed up encoding in speed 1, 2, and 3 with small quality loss. Tests using tulip clip showed: --rt --cpu-used=-1 (before change) PSNR: 38.028 time: 1m33.195s (after change) PSNR: 38.014 time: 1m20.851s --rt --cpu-used=-2 (before change) PSNR: 37.773 time: 0m57.650s (after change) PSNR: 37.759 time: 0m54.594s --rt --cpu-used=-3 (before change) PSNR: 37.392 time: 0m42.865s (after change) PSNR: 37.375 time: 0m41.949s Change-Id: I76ab2a38d72bc5efc91f6fe20d332c472f6510c9 --- vp8/encoder/onyx_if.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index d2b0bf36a..85d63563a 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -898,6 +898,10 @@ void vp8_set_speed_features(VP8_COMP *cpi) sf->improved_quant = 0; sf->improved_dct = 0; + + sf->use_fastquant_for_pick = 1; + sf->no_skip_block4x4_search = 0; + sf->first_step = 1; } if (Speed > 1) From 7d1b37cdac8ae096e1d33aa7adb2834784cc897e Mon Sep 17 00:00:00 2001 From: "Timothy B. Terriberry" Date: Wed, 20 Jul 2011 10:20:31 -0700 Subject: [PATCH 2/4] Increase chrow row alignment to 16 bytes. This is done by expanding luma row to 32-byte alignment, since there is currently a bunch of code that assumes that uv_stride == y_stride/2 (see, for example, vp8/common/postproc.c, common/reconinter.c, common/arm/neon/recon16x16mb_neon.asm, encoder/temporal_filter.c, and possibly others; I haven't done a full audit). It also uses replaces the hardcoded border of 16 in a number of encoder buffers with VP8BORDERINPIXELS (currently 32), as the chroma rows start at an offset of border/2. Together, these two changes have the nice advantage that simply dumping the frame memory as a contiguous blob produces a valid, if padded, image. Change-Id: Iaf5ea722ae5c82d5daa50f6e2dade9de753f1003 --- vp8/encoder/lookahead.c | 3 ++- vp8/encoder/onyx_if.c | 5 ++-- vp8/vp8_dx_iface.c | 45 +++++++++++++++++++++++----------- vpx_scale/generic/yv12config.c | 28 +++++++++++++-------- 4 files changed, 54 insertions(+), 27 deletions(-) diff --git a/vp8/encoder/lookahead.c b/vp8/encoder/lookahead.c index 3b86d4094..d7f85cba1 100644 --- a/vp8/encoder/lookahead.c +++ b/vp8/encoder/lookahead.c @@ -86,7 +86,8 @@ vp8_lookahead_init(unsigned int width, if(!ctx->buf) goto bail; for(i=0; ibuf[i].img, width, height, 16)) + if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img, + width, height, VP8BORDERINPIXELS)) goto bail; } return ctx; diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index ba8793dc8..35baa4c2b 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1240,7 +1240,7 @@ static void alloc_raw_frame_buffers(VP8_COMP *cpi) #if VP8_TEMPORAL_ALT_REF if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer, - width, height, 16)) + width, height, VP8BORDERINPIXELS)) vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate altref buffer"); @@ -1290,7 +1290,8 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate last frame buffer"); - if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source, width, height, 16)) + if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source, + width, height, VP8BORDERINPIXELS)) vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate scaled source buffer"); diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index 58dc486de..13a072bff 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -301,6 +301,36 @@ update_error_state(vpx_codec_alg_priv_t *ctx, return res; } +static void yuvconfig2image(vpx_image_t *img, + const YV12_BUFFER_CONFIG *yv12, + void *user_priv) +{ + /** vpx_img_wrap() doesn't allow specifying independent strides for + * the Y, U, and V planes, nor other alignment adjustments that + * might be representable by a YV12_BUFFER_CONFIG, so we just + * initialize all the fields.*/ + img->fmt = yv12->clrtype == REG_YUV ? + VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420; + img->w = yv12->y_stride; + img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15; + img->d_w = yv12->y_width; + img->d_h = yv12->y_height; + img->x_chroma_shift = 1; + img->y_chroma_shift = 1; + img->planes[VPX_PLANE_Y] = yv12->y_buffer; + img->planes[VPX_PLANE_U] = yv12->u_buffer; + img->planes[VPX_PLANE_V] = yv12->v_buffer; + img->planes[VPX_PLANE_ALPHA] = NULL; + img->stride[VPX_PLANE_Y] = yv12->y_stride; + img->stride[VPX_PLANE_U] = yv12->uv_stride; + img->stride[VPX_PLANE_V] = yv12->uv_stride; + img->stride[VPX_PLANE_ALPHA] = yv12->y_stride; + img->bps = 12; + img->user_priv = user_priv; + img->img_data = yv12->buffer_alloc; + img->img_data_owner = 0; + img->self_allocd = 0; +} static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, @@ -429,21 +459,8 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, if (!res && 0 == vp8dx_get_raw_frame(ctx->pbi, &sd, &time_stamp, &time_end_stamp, &flags)) { - /* Align width/height */ - unsigned int a_w = (sd.y_width + 15) & ~15; - unsigned int a_h = (sd.y_height + 15) & ~15; - - vpx_img_wrap(&ctx->img, VPX_IMG_FMT_I420, - a_w + 2 * VP8BORDERINPIXELS, - a_h + 2 * VP8BORDERINPIXELS, - 1, - sd.buffer_alloc); - vpx_img_set_rect(&ctx->img, - VP8BORDERINPIXELS, VP8BORDERINPIXELS, - sd.y_width, sd.y_height); - ctx->img.user_priv = user_priv; + yuvconfig2image(&ctx->img, &sd, user_priv); ctx->img_avail = 1; - } } diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c index d02cde28f..eff594e2d 100644 --- a/vpx_scale/generic/yv12config.c +++ b/vpx_scale/generic/yv12config.c @@ -49,25 +49,33 @@ vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int if (ybf) { + int y_stride = ((width + 2 * border) + 31) & ~31; + int yplane_size = (height + 2 * border) * y_stride; int uv_width = width >> 1; int uv_height = height >> 1; - int yplane_size = (height + 2 * border) * (width + 2 * border); - int uvplane_size = (uv_height + border) * (uv_width + border); + /** There is currently a bunch of code which assumes + * uv_stride == y_stride/2, so enforce this here. */ + int uv_stride = y_stride >> 1; + int uvplane_size = (uv_height + border) * uv_stride; vp8_yv12_de_alloc_frame_buffer(ybf); - /* only support allocating buffers that have - a height and width that are multiples of 16 */ - if ((width & 0xf) | (height & 0xf)) + /** Only support allocating buffers that have a height and width that + * are multiples of 16, and a border that's a multiple of 32. + * The border restriction is required to get 16-byte alignment of the + * start of the chroma rows without intoducing an arbitrary gap + * between planes, which would break the semantics of things like + * vpx_img_set_rect(). */ + if ((width & 0xf) | (height & 0xf) | (border & 0x1f)) return -3; ybf->y_width = width; ybf->y_height = height; - ybf->y_stride = width + 2 * border; + ybf->y_stride = y_stride; ybf->uv_width = uv_width; ybf->uv_height = uv_height; - ybf->uv_stride = uv_width + border; + ybf->uv_stride = uv_stride; ybf->border = border; ybf->frame_size = yplane_size + 2 * uvplane_size; @@ -77,9 +85,9 @@ vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int if (ybf->buffer_alloc == NULL) return -1; - ybf->y_buffer = ybf->buffer_alloc + (border * ybf->y_stride) + border; - ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2 * ybf->uv_stride) + border / 2; - ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2 * ybf->uv_stride) + border / 2; + ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border; + ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2 * uv_stride) + border / 2; + ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2 * uv_stride) + border / 2; ybf->corrupted = 0; /* assume not currupted by errors */ } From 0453aca5af7e6d36307f9487e2126076bdbf9bb1 Mon Sep 17 00:00:00 2001 From: "Timothy B. Terriberry" Date: Tue, 19 Jul 2011 13:09:22 -0700 Subject: [PATCH 3/4] Mark ARM asm objects as allowing a non-executable stack. This adds the magic .note.GNU-stack section at the end of each ARM asm file (when built with gas), indicating that a non-executable stack is allowed. Without this section, the linker will assume the object requires an executable stack by default, forcing an executable stack for the entire program. Change-Id: Ie86de6a449b52d392b9e5e0479833ed8c508ee65 --- build/make/ads2gas.pl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl index be4658253..362fcf4e8 100755 --- a/build/make/ads2gas.pl +++ b/build/make/ads2gas.pl @@ -154,3 +154,6 @@ while () next if /^\s*END\s*$/; print; } + +# Mark that this object doesn't need an executable stack. +printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n"); From 1647f00c2941b47b624fcb330aa63a933b3324ed Mon Sep 17 00:00:00 2001 From: "Timothy B. Terriberry" Date: Tue, 19 Jul 2011 12:13:18 -0700 Subject: [PATCH 4/4] Add .size directive to ARM asm functions. This makes them show up properly in debugging tools like gdb and valgrind. Change-Id: I0c72548a1090de88ba226314e5efe63360b7e07f --- build/make/ads2gas.pl | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl index be4658253..a9532d319 100755 --- a/build/make/ads2gas.pl +++ b/build/make/ads2gas.pl @@ -21,6 +21,9 @@ print "@ This file was created from a .asm file\n"; print "@ using the ads2gas.pl script.\n"; print "\t.equ DO1STROUNDING, 0\n"; +# Stack of procedure names. +@proc_stack = (); + while () { # Load and store alignment @@ -133,9 +136,23 @@ while () # Strip PRESERVE8 s/\sPRESERVE8/@ PRESERVE8/g; - # Strip PROC and ENDPROC - s/\sPROC/@/g; - s/\sENDP/@/g; + # Use PROC and ENDP to give the symbols a .size directive. + # This makes them show up properly in debugging tools like gdb and valgrind. + if (/\bPROC\b/) + { + my $proc; + /^_([\.0-9A-Z_a-z]\w+)\b/; + $proc = $1; + push(@proc_stack, $proc) if ($proc); + s/\bPROC\b/@ $&/; + } + if (/\bENDP\b/) + { + my $proc; + s/\bENDP\b/@ $&/; + $proc = pop(@proc_stack); + $_ = "\t.size $proc, .-$proc".$_ if ($proc); + } # EQU directive s/(.*)EQU(.*)/.equ $1, $2/;