diff --git a/build/make/configure.sh b/build/make/configure.sh index 3324be36e..28a0247d8 100755 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -980,6 +980,9 @@ EOF esac fi + # for sysconf(3) and friends. + check_header unistd.h + # glibc needs these if enabled linux; then add_cflags -D_LARGEFILE_SOURCE diff --git a/configure b/configure index 338922bcf..05e4f3974 100755 --- a/configure +++ b/configure @@ -211,6 +211,7 @@ HAVE_LIST=" alt_tree_layout pthread_h sys_mman_h + unistd_h " EXPERIMENT_LIST=" extend_qrange diff --git a/vp8/common/arm/neon/loopfilter_neon.asm b/vp8/common/arm/neon/loopfilter_neon.asm index d3a79f640..e73dd6401 100644 --- a/vp8/common/arm/neon/loopfilter_neon.asm +++ b/vp8/common/arm/neon/loopfilter_neon.asm @@ -308,7 +308,6 @@ ; q9 q2 ; q10 q3 |vp8_loop_filter_neon| PROC - ldr r12, _lf_coeff_ ; vp8_filter_mask vabd.u8 q11, q3, q4 ; abs(p3 - p2) @@ -339,7 +338,7 @@ vqadd.u8 q9, q9, q2 ; a = b + a vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1 - vld1.u8 {q0}, [r12]! + vmov.u8 q0, #0x80 ; 0x80 ; vp8_filter() function ; convert to signed @@ -348,7 +347,7 @@ veor q5, q5, q0 ; ps1 veor q8, q8, q0 ; qs1 - vld1.u8 {q10}, [r12]! + vmov.u8 q10, #3 ; #3 vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) vsubl.s8 q11, d15, d13 @@ -367,7 +366,7 @@ vaddw.s8 q2, q2, d2 vaddw.s8 q11, q11, d3 - vld1.u8 {q9}, [r12]! 
+ vmov.u8 q9, #4 ; #4 ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) vqmovn.s16 d2, q2 @@ -399,12 +398,4 @@ ;----------------- -_lf_coeff_ - DCD lf_coeff -lf_coeff - DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 - DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 - DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 - DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101 - END diff --git a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm index 5fe7e7e6d..7c5ea3644 100644 --- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm +++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm @@ -22,20 +22,19 @@ ; r1 int p, //pitch ; r2 const signed char *flimit, ; r3 const signed char *limit, -; stack(r4) const signed char *thresh, +; stack(r4) const signed char *thresh (unused) ; //stack(r5) int count --unused |vp8_loop_filter_simple_horizontal_edge_neon| PROC sub r0, r0, r1, lsl #1 ; move src pointer down by 2 lines - ldr r12, _lfhy_coeff_ vld1.u8 {q5}, [r0], r1 ; p1 vld1.s8 {d2[], d3[]}, [r2] ; flimit vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13 vld1.u8 {q6}, [r0], r1 ; p0 - vld1.u8 {q0}, [r12]! ; 0x80 + vmov.u8 q0, #0x80 ; 0x80 vld1.u8 {q7}, [r0], r1 ; q0 - vld1.u8 {q10}, [r12]! ; 0x03 + vmov.u8 q10, #0x03 ; 0x03 vld1.u8 {q8}, [r0] ; q1 ;vp8_filter_mask() function @@ -66,7 +65,7 @@ vadd.s16 q11, q2, q2 ; 3 * ( qs0 - ps0) vadd.s16 q12, q3, q3 - vld1.u8 {q9}, [r12]! 
; 0x04 + vmov.u8 q9, #0x04 ; 0x04 vadd.s16 q2, q2, q11 vadd.s16 q3, q3, q12 @@ -105,11 +104,4 @@ ;----------------- -_lfhy_coeff_ - DCD lfhy_coeff -lfhy_coeff - DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 - DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 - DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 - END diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm index c30378b9c..a7f7b690e 100644 --- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm +++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm @@ -22,7 +22,7 @@ ; r1 int p, //pitch ; r2 const signed char *flimit, ; r3 const signed char *limit, -; stack(r4) const signed char *thresh, +; stack(r4) const signed char *thresh (unused) ; //stack(r5) int count --unused |vp8_loop_filter_simple_vertical_edge_neon| PROC @@ -32,7 +32,6 @@ vld1.s8 {d2[], d3[]}, [r2] ; flimit vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13 vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r0], r1 - ldr r12, _vlfy_coeff_ vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r1 vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r0], r1 vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r1 @@ -41,11 +40,11 @@ vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r0], r1 vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vld1.u8 {q0}, [r12]! ; 0x80 + vmov.u8 q0, #0x80 ; 0x80 vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 - vld1.u8 {q11}, [r12]! ; 0x03 + vmov.u8 q11, #0x03 ; 0x03 vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vld1.u8 {q12}, [r12]! 
; 0x04 + vmov.u8 q12, #0x04 ; 0x04 vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 @@ -146,11 +145,4 @@ ;----------------- -_vlfy_coeff_ - DCD vlfy_coeff -vlfy_coeff - DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 - DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 - DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 - END diff --git a/vp8/common/arm/neon/mbloopfilter_neon.asm b/vp8/common/arm/neon/mbloopfilter_neon.asm index 981adffd1..72f0f9271 100644 --- a/vp8/common/arm/neon/mbloopfilter_neon.asm +++ b/vp8/common/arm/neon/mbloopfilter_neon.asm @@ -372,7 +372,6 @@ ; q10 q3 |vp8_mbloop_filter_neon| PROC - ldr r12, _mblf_coeff_ ; vp8_filter_mask vabd.u8 q11, q3, q4 ; abs(p3 - p2) @@ -396,7 +395,7 @@ vld1.s8 {d4[], d5[]}, [r2] ; flimit - vld1.u8 {q0}, [r12]! + vmov.u8 q0, #0x80 ; 0x80 vadd.u8 q2, q2, q2 ; flimit * 2 vadd.u8 q2, q2, q1 ; flimit * 2 + limit @@ -431,12 +430,12 @@ vadd.s16 q2, q2, q10 vadd.s16 q13, q13, q11 - vld1.u8 {q12}, [r12]! ; #3 + vmov.u8 q12, #3 ; #3 vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0) vaddw.s8 q13, q13, d3 - vld1.u8 {q11}, [r12]! ; #4 + vmov.u8 q11, #4 ; #4 ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) vqmovn.s16 d2, q2 @@ -444,16 +443,16 @@ vand q1, q1, q15 ; vp8_filter &= mask - vld1.u8 {q15}, [r12]! ; #63 - ; + vmov.u16 q15, #63 ; #63 + vand q13, q1, q14 ; Filter2 &= hev - vld1.u8 {d7}, [r12]! ; #9 + vmov.u8 d7, #9 ; #9 vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4) vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3) - vld1.u8 {d6}, [r12]! ; #18 + vmov.u8 d6, #18 ; #18 vshr.s8 q2, q2, #3 ; Filter1 >>= 3 vshr.s8 q13, q13, #3 ; Filter2 >>= 3 @@ -463,7 +462,7 @@ vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1) - vld1.u8 {d5}, [r12]! 
; #27 + vmov.u8 d5, #27 ; #27 vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2) @@ -507,14 +506,4 @@ ;----------------- -_mblf_coeff_ - DCD mblf_coeff -mblf_coeff - DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 - DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 - DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 - DCD 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f - DCD 0x09090909, 0x09090909, 0x12121212, 0x12121212 - DCD 0x1b1b1b1b, 0x1b1b1b1b - END diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c index fea6dcd23..c7fbb3e09 100644 --- a/vp8/common/generic/systemdependent.c +++ b/vp8/common/generic/systemdependent.c @@ -17,9 +17,54 @@ #include "vp8/common/idct.h" #include "vp8/common/onyxc_int.h" +#if CONFIG_MULTITHREAD +#if HAVE_UNISTD_H +#include <unistd.h> +#elif defined(_WIN32) +#include <windows.h> +typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO); +#endif +#endif + extern void vp8_arch_x86_common_init(VP8_COMMON *ctx); extern void vp8_arch_arm_common_init(VP8_COMMON *ctx); +#if CONFIG_MULTITHREAD +static int get_cpu_count() +{ + int core_count = 16; + +#if HAVE_UNISTD_H +#if defined(_SC_NPROCESSORS_ONLN) + core_count = sysconf(_SC_NPROCESSORS_ONLN); +#elif defined(_SC_NPROC_ONLN) + core_count = sysconf(_SC_NPROC_ONLN); +#endif +#elif defined(_WIN32) + { + PGNSI pGNSI; + SYSTEM_INFO sysinfo; + + /* Call GetNativeSystemInfo if supported or + * GetSystemInfo otherwise. */ + + pGNSI = (PGNSI) GetProcAddress( + GetModuleHandle(TEXT("kernel32.dll")), "GetNativeSystemInfo"); + if (pGNSI != NULL) + pGNSI(&sysinfo); + else + GetSystemInfo(&sysinfo); + + core_count = sysinfo.dwNumberOfProcessors; + } +#else + /* other platforms */ +#endif + + return core_count > 0 ? 
core_count : 1; +} +#endif + void vp8_machine_specific_config(VP8_COMMON *ctx) { #if CONFIG_RUNTIME_CPU_DETECT @@ -98,4 +143,7 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) #endif +#if CONFIG_MULTITHREAD + ctx->processor_core_count = get_cpu_count(); +#endif /* CONFIG_MULTITHREAD */ } diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h index e67d39cbb..0565127e1 100644 --- a/vp8/common/onyxc_int.h +++ b/vp8/common/onyxc_int.h @@ -195,6 +195,9 @@ typedef struct VP8Common #if CONFIG_RUNTIME_CPU_DETECT VP8_COMMON_RTCD rtcd; +#endif +#if CONFIG_MULTITHREAD + int processor_core_count; #endif struct postproc_state postproc_state; } VP8_COMMON; diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index 56275940e..9ef85e9cd 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -439,12 +439,18 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) pbi->b_multithreaded_rd = 0; pbi->allocated_decoding_thread_count = 0; - core_count = (pbi->max_threads > 16) ? 16 : pbi->max_threads; + + /* limit decoding threads to the max number of token partitions */ + core_count = (pbi->max_threads > 8) ? 
8 : pbi->max_threads; + + /* limit decoding threads to the available cores */ + if (core_count > pbi->common.processor_core_count) + core_count = pbi->common.processor_core_count; if (core_count > 1) { pbi->b_multithreaded_rd = 1; - pbi->decoding_thread_count = core_count -1; + pbi->decoding_thread_count = core_count - 1; CHECK_MEM_ERROR(pbi->h_decoding_thread, vpx_malloc(sizeof(pthread_t) * pbi->decoding_thread_count)); CHECK_MEM_ERROR(pbi->h_event_start_decoding, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count)); diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index 8aef915b8..4c79b1590 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -459,15 +459,15 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi) cpi->b_multi_threaded = 0; cpi->encoding_thread_count = 0; - cpi->processor_core_count = 32; //vp8_get_proc_core_count(); - if (cpi->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) + if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) { int ithread; int th_count = cpi->oxcf.multi_threaded - 1; - if (cpi->oxcf.multi_threaded > cpi->processor_core_count) - th_count = cpi->processor_core_count - 1; + /* don't allocate more threads than cores available */ + if (cpi->oxcf.multi_threaded > cm->processor_core_count) + th_count = cm->processor_core_count - 1; /* we have th_count + 1 (main) threads processing one row each */ /* no point to have more threads than the sync range allows */ @@ -514,6 +514,7 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi) LPFTHREAD_DATA * lpfthd = &cpi->lpf_thread_data; sem_init(&cpi->h_event_start_lpf, 0, 0); + sem_init(&cpi->h_event_end_picklpf, 0, 0); sem_init(&cpi->h_event_end_lpf, 0, 0); lpfthd->ptr1 = (void *)cpi; @@ -547,6 +548,7 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) sem_destroy(&cpi->h_event_end_encoding); sem_destroy(&cpi->h_event_end_lpf); + sem_destroy(&cpi->h_event_end_picklpf); sem_destroy(&cpi->h_event_start_lpf); //free thread related resources 
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index b1e9e25a6..4d0e9fa08 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -3211,7 +3211,7 @@ void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) #if CONFIG_MULTITHREAD if (cpi->b_multi_threaded) - sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */ + sem_post(&cpi->h_event_end_picklpf); /* signal that we have set filter_level */ #endif if (cm->filter_level > 0) @@ -4221,7 +4221,7 @@ static void encode_frame_to_data_rate #if CONFIG_MULTITHREAD /* wait that filter_level is picked so that we can continue with stream packing */ if (cpi->b_multi_threaded) - sem_wait(&cpi->h_event_end_lpf); + sem_wait(&cpi->h_event_end_picklpf); #endif // build the bitstream diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 982b24aae..e009c0812 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -580,7 +580,6 @@ typedef struct // multithread data int * mt_current_mb_col; int mt_sync_range; - int processor_core_count; int b_multi_threaded; int encoding_thread_count; @@ -595,6 +594,7 @@ typedef struct sem_t *h_event_start_encoding; sem_t h_event_end_encoding; sem_t h_event_start_lpf; + sem_t h_event_end_picklpf; sem_t h_event_end_lpf; #endif