servo: Merge #18231 - Bug 1376883 - stylo: Drop thread pool stack size to 100k. r=bholley (from julian-seward1:master); r=bholley

Bug 1376883 - stylo: Drop thread pool stack size to 100k. r=bholley. ---  - [ ] `./mach build -d` does not report any errors - [ ] `./mach test-tidy` does not report any errors - [ ] These changes fix #__ (github issue number if applicable).  - [ ] There are tests for these changes OR - [ ] These changes do not require tests because _____   Source-Repo: https://github.com/servo/servo Source-Revision: 7fa6e355836901726e20af0d5e74cf3d37b944db --HG-- extra : subtree_source : https%3A//hg.mozilla.org/projects/converted-servo-linear extra : subtree_revision : eeac903d3db70a77e940762fa1fc81a60cb07d2c
2017-08-25 10:14:23 -05:00 · 2017-08-25 10:14:23 -05:00 · d25fe41b86
--- a/servo/components/style/context.rs
+++ b/servo/components/style/context.rs
@ -16,6 +16,7 @@ use euclid::Size2D;
 use fnv::FnvHashMap;
 use font_metrics::FontMetricsProvider;
 #[cfg(feature = "gecko")] use gecko_bindings::structs;
 use parallel::STYLE_THREAD_STACK_SIZE_KB;
 #[cfg(feature = "servo")] use parking_lot::RwLock;
 use properties::ComputedValues;
 #[cfg(feature = "servo")] use properties::PropertyId;
@ -605,6 +606,61 @@ where
    }
 }
 /// A helper type for stack limit checking.
 ///
 /// A checker remembers the lowest stack address that the creating thread
 /// may reach before callers should stop recursing.  This assumes that stacks
 /// grow down, which is true for all non-ancient CPU architectures.
 pub struct StackLimitChecker {
    /// Lowest permitted stack address.  Once the stack pointer is at or
    /// below this value, the limit counts as exceeded.
    lower_limit: usize
 }
 impl StackLimitChecker {
    /// Create a new limit checker, for this thread, allowing further use
    /// of up to |stack_size_limit| bytes beyond (below) the current stack
    /// pointer.
    #[inline(never)]
    pub fn new(stack_size_limit: usize) -> Self {
        StackLimitChecker {
            lower_limit: StackLimitChecker::get_sp() - stack_size_limit
        }
    }
    /// Checks whether the previously stored stack limit has now been exceeded.
    ///
    /// Returns `true` when the current stack pointer is at or below the
    /// stored lower limit.  This is expected to be called on the thread that
    /// created the checker; debug builds make a best-effort check of that.
    #[inline(never)]
    pub fn limit_exceeded(&self) -> bool {
        let curr_sp = StackLimitChecker::get_sp();
        // Try to assert if we're called from a different thread than the
        // one that originally created this object.
        //
        // * If we're called from a thread whose stack has a higher address
        //   than the one that created this object, then
        //   |curr_sp - self.lower_limit| will (almost certainly) be larger
        //   than the thread stack size, so the check will fail.
        //
        // * If we're called from a thread whose stack has a lower address
        //   than the one that created this object, then
        //   |self.lower_limit - curr_sp| will (unless the two stacks are
        //   close neighbours) be larger than the thread stack size, so the
        //   check will also fail.
        //
        // The distance is computed without ever subtracting the larger value
        // from the smaller one.  Debug builds trap on integer overflow, so a
        // plain |curr_sp - self.lower_limit| would panic in exactly the case
        // this function exists to report: the stack pointer having dropped
        // below the limit on the owning thread.
        //
        // The correctness of this check depends on the assumption that no
        // stack wraps around the end of the address space.
        if cfg!(debug_assertions) {
            let distance_from_limit = if curr_sp >= self.lower_limit {
                curr_sp - self.lower_limit
            } else {
                self.lower_limit - curr_sp
            };
            debug_assert!(distance_from_limit
                          <= STYLE_THREAD_STACK_SIZE_KB * 1024);
        }
        // The actual bounds check.
        curr_sp <= self.lower_limit
    }
    /// Returns an approximation of the current stack pointer: the address
    /// of a local variable in the (inlined) caller's frame.
    // Technically, rustc can optimize this away, but shouldn't for now.
    // We should fix this once black_box is stable.
    #[inline(always)]
    fn get_sp() -> usize {
        let mut foo: usize = 42;
        (&mut foo as *mut usize) as usize
    }
 }
 /// A thread-local style context.
 ///
 /// This context contains data that needs to be used during restyling, but is
@ -639,6 +695,9 @@ pub struct ThreadLocalStyleContext<E: TElement> {
    /// The struct used to compute and cache font metrics from style
    /// for evaluation of the font-relative em/ch units and font-size
    pub font_metrics_provider: E::FontMetricsProvider,
    /// A checker used to ensure that parallel.rs does not recurse indefinitely
    /// even on arbitrarily deep trees.  See Gecko bug 1376883.
    pub stack_limit_checker: StackLimitChecker,
 }
 impl<E: TElement> ThreadLocalStyleContext<E> {
@ -654,6 +713,8 @@ impl<E: TElement> ThreadLocalStyleContext<E> {
            statistics: TraversalStatistics::default(),
            current_element_info: None,
            font_metrics_provider: E::FontMetricsProvider::create_from(shared),
            stack_limit_checker: StackLimitChecker::new(
                (STYLE_THREAD_STACK_SIZE_KB - 40) * 1024),
        }
    }
@ -668,6 +729,15 @@ impl<E: TElement> ThreadLocalStyleContext<E> {
            statistics: TraversalStatistics::default(),
            current_element_info: None,
            font_metrics_provider: E::FontMetricsProvider::create_from(shared),
            // Threads in the styling pool have small stacks, and we have to
            // be careful not to run out of stack during recursion in
            // parallel.rs.  Therefore set up a stack limit checker, in
            // which we reserve 40KB of stack as a safety buffer.  Currently
            // the stack size is 128KB, so this allows 88KB for recursive
            // DOM traversal, which encompasses 53 levels of recursion before
            // the limiter kicks in, on x86_64-Linux.  See Gecko bug 1376883.
            stack_limit_checker: StackLimitChecker::new(
                (STYLE_THREAD_STACK_SIZE_KB - 40) * 1024),
        }
    }
--- a/servo/components/style/gecko/global_style_data.rs
+++ b/servo/components/style/gecko/global_style_data.rs
@ -9,6 +9,7 @@ use gecko_bindings::bindings;
 use gecko_bindings::bindings::{Gecko_RegisterProfilerThread, Gecko_UnregisterProfilerThread};
 use gecko_bindings::bindings::Gecko_SetJemallocThreadLocalArena;
 use num_cpus;
 use parallel::STYLE_THREAD_STACK_SIZE_KB;
 use rayon;
 use shared_lock::SharedRwLock;
 use std::cmp;
@ -92,7 +93,9 @@ lazy_static! {
                .breadth_first()
                .thread_name(thread_name)
                .start_handler(thread_startup)
-                .exit_handler(thread_shutdown);
+                .exit_handler(thread_shutdown)
                // Set thread stack size to 128KB.  See Gecko bug 1376883.
                .stack_size(STYLE_THREAD_STACK_SIZE_KB * 1024);
            let pool = rayon::ThreadPool::new(configuration).ok();
            pool
        };
--- a/servo/components/style/parallel.rs
+++ b/servo/components/style/parallel.rs
@ -32,6 +32,9 @@ use std::borrow::Borrow;
 use time;
 use traversal::{DomTraversal, PerLevelTraversalData, PreTraverseToken};
 /// The minimum stack size for a thread in the styling pool, in kilobytes.
 ///
 /// The value is expressed in kilobytes; users multiply it by 1024 to obtain
 /// a size in bytes.
 pub const STYLE_THREAD_STACK_SIZE_KB: usize = 128;
 /// The maximum number of child nodes that we will process as a single unit.
 ///
 /// Larger values will increase style sharing cache hits and general DOM
@ -77,7 +80,7 @@ pub fn traverse_dom<E, D>(traversal: &D,
            let root_opaque = root.opaque();
            traverse_nodes(&[root],
                           DispatchMode::TailCall,
-                           0,
+                           true,
                           root_opaque,
                           traversal_data,
                           scope,
@ -132,7 +135,6 @@ fn create_thread_local_context<'scope, E, D>(
 #[inline(always)]
 #[allow(unsafe_code)]
 fn top_down_dom<'a, 'scope, E, D>(nodes: &'a [SendNode<E::ConcreteNode>],
                                  recursion_depth: usize,
                                  root: OpaqueNode,
                                  mut traversal_data: PerLevelTraversalData,
                                  scope: &'a rayon::Scope<'scope>,
@ -144,6 +146,10 @@ fn top_down_dom<'a, 'scope, E, D>(nodes: &'a [SendNode<E::ConcreteNode>],
 {
    debug_assert!(nodes.len() <= WORK_UNIT_MAX);
    // We set this below, when we have a borrow of the thread-local-context
    // available.
    let recursion_ok;
    // Collect all the children of the elements in our work unit. This will
    // contain the combined children of up to WORK_UNIT_MAX nodes, which may
    // be numerous. As such, we store it in a large SmallVec to minimize heap-
@ -154,6 +160,10 @@ fn top_down_dom<'a, 'scope, E, D>(nodes: &'a [SendNode<E::ConcreteNode>],
        // a potential recursive call when we pass TailCall.
        let mut tlc = tls.ensure(
            |slot: &mut Option<ThreadLocalStyleContext<E>>| create_thread_local_context(traversal, slot));
        // Check that we're not in danger of running out of stack.
        recursion_ok = !tlc.stack_limit_checker.limit_exceeded();
        let mut context = StyleContext {
            shared: traversal.shared_context(),
            thread_local: &mut *tlc,
@ -202,7 +212,7 @@ fn top_down_dom<'a, 'scope, E, D>(nodes: &'a [SendNode<E::ConcreteNode>],
                traversal_data_copy.current_dom_depth += 1;
                traverse_nodes(&*discovered_child_nodes,
                               DispatchMode::NotTailCall,
-                               recursion_depth,
+                               recursion_ok,
                               root,
                               traversal_data_copy,
                               scope,
@ -232,7 +242,7 @@ fn top_down_dom<'a, 'scope, E, D>(nodes: &'a [SendNode<E::ConcreteNode>],
        traversal_data.current_dom_depth += 1;
        traverse_nodes(&discovered_child_nodes,
                       DispatchMode::TailCall,
-                       recursion_depth,
+                       recursion_ok,
                       root,
                       traversal_data,
                       scope,
@ -254,16 +264,10 @@ impl DispatchMode {
    /// Reports whether this dispatch mode is `DispatchMode::TailCall`.
    fn is_tail_call(&self) -> bool {
        match *self {
            DispatchMode::TailCall => true,
            _ => false,
        }
    }
 }
 // On x86_64-linux, a recursive cycle requires 3472 bytes of stack.  Limiting
 // the depth to 150 therefore should keep the stack use by the recursion to
 // 520800 bytes, which would give a generously conservative margin should we
 // decide to reduce the thread stack size from its default of 2MB down to 1MB.
 const RECURSION_DEPTH_LIMIT: usize = 150;
 #[inline]
 fn traverse_nodes<'a, 'scope, E, D>(nodes: &[SendNode<E::ConcreteNode>],
                                    mode: DispatchMode,
-                                    recursion_depth: usize,
+                                    recursion_ok: bool,
                                    root: OpaqueNode,
                                    traversal_data: PerLevelTraversalData,
                                    scope: &'a rayon::Scope<'scope>,
@ -279,12 +283,11 @@ fn traverse_nodes<'a, 'scope, E, D>(nodes: &[SendNode<E::ConcreteNode>],
    // want to actually dispatch the job as a tail call if there's nothing left
    // in our local queue. Otherwise we need to return to it to maintain proper
    // breadth-first ordering. We also need to take care to avoid stack
-    // overflow due to excessive tail recursion. The stack overflow isn't
+    // overflow due to excessive tail recursion. The stack overflow avoidance
-    // observable to content -- we're still completely correct, just not
+    // isn't observable to content -- we're still completely correct, just not
-    // using tail recursion any more. See bug 1368302.
+    // using tail recursion any more. See Gecko bugs 1368302 and 1376883.
    debug_assert!(recursion_depth <= RECURSION_DEPTH_LIMIT);
    let may_dispatch_tail = mode.is_tail_call() &&
-        recursion_depth != RECURSION_DEPTH_LIMIT &&
+        recursion_ok &&
        !pool.current_thread_has_pending_tasks().unwrap();
    // In the common case, our children fit within a single work unit, in which
@ -292,12 +295,12 @@ fn traverse_nodes<'a, 'scope, E, D>(nodes: &[SendNode<E::ConcreteNode>],
    if nodes.len() <= WORK_UNIT_MAX {
        let work = nodes.iter().cloned().collect::<WorkUnit<E::ConcreteNode>>();
        if may_dispatch_tail {
-            top_down_dom(&work, recursion_depth + 1, root,
+            top_down_dom(&work, root,
                         traversal_data, scope, pool, traversal, tls);
        } else {
            scope.spawn(move |scope| {
                let work = work;
-                top_down_dom(&work, 0, root,
+                top_down_dom(&work, root,
                             traversal_data, scope, pool, traversal, tls);
            });
        }
@ -307,7 +310,7 @@ fn traverse_nodes<'a, 'scope, E, D>(nodes: &[SendNode<E::ConcreteNode>],
            let traversal_data_copy = traversal_data.clone();
            scope.spawn(move |scope| {
                let n = nodes;
-                top_down_dom(&*n, 0, root,
+                top_down_dom(&*n, root,
                             traversal_data_copy, scope, pool, traversal, tls)
            });
        }