// SPDX-License-Identifier: GPL-2.0 // Copyright (C) 2016, Linaro Ltd - Daniel Lezcano #define pr_fmt(fmt) "irq_timings: " fmt #include #include #include #include #include #include #include #include #include #include #include #include "internals.h" DEFINE_STATIC_KEY_FALSE(irq_timing_enabled); DEFINE_PER_CPU(struct irq_timings, irq_timings); static DEFINE_IDR(irqt_stats); void irq_timings_enable(void) { static_branch_enable(&irq_timing_enabled); } void irq_timings_disable(void) { static_branch_disable(&irq_timing_enabled); } /* * The main goal of this algorithm is to predict the next interrupt * occurrence on the current CPU. * * Currently, the interrupt timings are stored in a circular array * buffer every time there is an interrupt, as a tuple: the interrupt * number and the associated timestamp when the event occurred . * * For every interrupt occurring in a short period of time, we can * measure the elapsed time between the occurrences for the same * interrupt and we end up with a suite of intervals. The experience * showed the interrupts are often coming following a periodic * pattern. * * The objective of the algorithm is to find out this periodic pattern * in a fastest way and use its period to predict the next irq event. * * When the next interrupt event is requested, we are in the situation * where the interrupts are disabled and the circular buffer * containing the timings is filled with the events which happened * after the previous next-interrupt-event request. * * At this point, we read the circular buffer and we fill the irq * related statistics structure. After this step, the circular array * containing the timings is empty because all the values are * dispatched in their corresponding buffers. * * Now for each interrupt, we can predict the next event by using the * suffix array, log interval and exponential moving average * * 1. Suffix array * * Suffix array is an array of all the suffixes of a string. It is * widely used as a data structure for compression, text search, ... * For instance for the word 'banana', the suffixes will be: 'banana' * 'anana' 'nana' 'ana' 'na' 'a' * * Usually, the suffix array is sorted but for our purpose it is * not necessary and won't provide any improvement in the context of * the solved problem where we clearly define the boundaries of the * search by a max period and min period. * * The suffix array will build a suite of intervals of different * length and will look for the repetition of each suite. If the suite * is repeating then we have the period because it is the length of * the suite whatever its position in the buffer. * * 2. Log interval * * We saw the irq timings allow to compute the interval of the * occurrences for a specific interrupt. We can reasonibly assume the * longer is the interval, the higher is the error for the next event * and we can consider storing those interval values into an array * where each slot in the array correspond to an interval at the power * of 2 of the index. For example, index 12 will contain values * between 2^11 and 2^12. * * At the end we have an array of values where at each index defines a * [2^index - 1, 2 ^ index] interval values allowing to store a large * number of values inside a small array. * * For example, if we have the value 1123, then we store it at * ilog2(1123) = 10 index value. * * Storing those value at the specific index is done by computing an * exponential moving average for this specific slot. For instance, * for values 1800, 1123, 1453, ... fall under the same slot (10) and * the exponential moving average is computed every time a new value * is stored at this slot. * * 3. Exponential Moving Average * * The EMA is largely used to track a signal for stocks or as a low * pass filter. The magic of the formula, is it is very simple and the * reactivity of the average can be tuned with the factors called * alpha. * * The higher the alphas are, the faster the average respond to the * signal change. In our case, if a slot in the array is a big * interval, we can have numbers with a big difference between * them. The impact of those differences in the average computation * can be tuned by changing the alpha value. * * * -- The algorithm -- * * We saw the different processing above, now let's see how they are * used together. * * For each interrupt: * For each interval: * Compute the index = ilog2(interval) * Compute a new_ema(buffer[index], interval) * Store the index in a circular buffer * * Compute the suffix array of the indexes * * For each suffix: * If the suffix is reverse-found 3 times * Return suffix * * Return Not found * * However we can not have endless suffix array to be build, it won't * make sense and it will add an extra overhead, so we can restrict * this to a maximum suffix length of 5 and a minimum suffix length of * 2. The experience showed 5 is the majority of the maximum pattern * period found for different devices. * * The result is a pattern finding less than 1us for an interrupt. * * Example based on real values: * * Example 1 : MMC write/read interrupt interval: * * 223947, 1240, 1384, 1386, 1386, * 217416, 1236, 1384, 1386, 1387, * 214719, 1241, 1386, 1387, 1384, * 213696, 1234, 1384, 1386, 1388, * 219904, 1240, 1385, 1389, 1385, * 212240, 1240, 1386, 1386, 1386, * 214415, 1236, 1384, 1386, 1387, * 214276, 1234, 1384, 1388, ? * * For each element, apply ilog2(value) * * 15, 8, 8, 8, 8, * 15, 8, 8, 8, 8, * 15, 8, 8, 8, 8, * 15, 8, 8, 8, 8, * 15, 8, 8, 8, 8, * 15, 8, 8, 8, 8, * 15, 8, 8, 8, 8, * 15, 8, 8, 8, ? * * Max period of 5, we take the last (max_period * 3) 15 elements as * we can be confident if the pattern repeats itself three times it is * a repeating pattern. * * 8, * 15, 8, 8, 8, 8, * 15, 8, 8, 8, 8, * 15, 8, 8, 8, ? * * Suffixes are: * * 1) 8, 15, 8, 8, 8 <- max period * 2) 8, 15, 8, 8 * 3) 8, 15, 8 * 4) 8, 15 <- min period * * From there we search the repeating pattern for each suffix. * * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8 * | | | | | | | | | | | | | | | * 8, 15, 8, 8, 8 | | | | | | | | | | * 8, 15, 8, 8, 8 | | | | | * 8, 15, 8, 8, 8 * * When moving the suffix, we found exactly 3 matches. * * The first suffix with period 5 is repeating. * * The next event is (3 * max_period) % suffix_period * * In this example, the result 0, so the next event is suffix[0] => 8 * * However, 8 is the index in the array of exponential moving average * which was calculated on the fly when storing the values, so the * interval is ema[8] = 1366 * * * Example 2: * * 4, 3, 5, 100, * 3, 3, 5, 117, * 4, 4, 5, 112, * 4, 3, 4, 110, * 3, 5, 3, 117, * 4, 4, 5, 112, * 4, 3, 4, 110, * 3, 4, 5, 112, * 4, 3, 4, 110 * * ilog2 * * 0, 0, 0, 4, * 0, 0, 0, 4, * 0, 0, 0, 4, * 0, 0, 0, 4, * 0, 0, 0, 4, * 0, 0, 0, 4, * 0, 0, 0, 4, * 0, 0, 0, 4, * 0, 0, 0, 4 * * Max period 5: * 0, 0, 4, * 0, 0, 0, 4, * 0, 0, 0, 4, * 0, 0, 0, 4 * * Suffixes: * * 1) 0, 0, 4, 0, 0 * 2) 0, 0, 4, 0 * 3) 0, 0, 4 * 4) 0, 0 * * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 * | | | | | | X * 0, 0, 4, 0, 0, | X * 0, 0 * * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 * | | | | | | | | | | | | | | | * 0, 0, 4, 0, | | | | | | | | | | | * 0, 0, 4, 0, | | | | | | | * 0, 0, 4, 0, | | | * 0 0 4 * * Pattern is found 3 times, the remaining is 1 which results from * (max_period * 3) % suffix_period. This value is the index in the * suffix arrays. The suffix array for a period 4 has the value 4 * at index 1. */ #define EMA_ALPHA_VAL 64 #define EMA_ALPHA_SHIFT 7 #define PREDICTION_PERIOD_MIN 3 #define PREDICTION_PERIOD_MAX 5 #define PREDICTION_FACTOR 4 #define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */ #define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */ /* * Number of elements in the circular buffer: If it happens it was * flushed before, then the number of elements could be smaller than * IRQ_TIMINGS_SIZE, so the count is used, otherwise the array size is * used as we wrapped. The index begins from zero when we did not * wrap. That could be done in a nicer way with the proper circular * array structure type but with the cost of extra computation in the * interrupt handler hot path. We choose efficiency. */ #define for_each_irqts(i, irqts) \ for (i = irqts->count < IRQ_TIMINGS_SIZE ? \ 0 : irqts->count & IRQ_TIMINGS_MASK, \ irqts->count = min(IRQ_TIMINGS_SIZE, \ irqts->count); \ irqts->count > 0; irqts->count--, \ i = (i + 1) & IRQ_TIMINGS_MASK) struct irqt_stat { u64 last_ts; u64 ema_time[PREDICTION_BUFFER_SIZE]; int timings[IRQ_TIMINGS_SIZE]; int circ_timings[IRQ_TIMINGS_SIZE]; int count; }; /* * Exponential moving average computation */ static u64 irq_timings_ema_new(u64 value, u64 ema_old) { s64 diff; if (unlikely(!ema_old)) return value; diff = (value - ema_old) * EMA_ALPHA_VAL; /* * We can use a s64 type variable to be added with the u64 * ema_old variable as this one will never have its topmost * bit set, it will be always smaller than 2^63 nanosec * interrupt interval (292 years). */ return ema_old + (diff >> EMA_ALPHA_SHIFT); } static int irq_timings_next_event_index(int *buffer, size_t len, int period_max) { int period; /* * Move the beginning pointer to the end minus the max period x 3. * We are at the point we can begin searching the pattern */ buffer = &buffer[len - (period_max * 3)]; /* Adjust the length to the maximum allowed period x 3 */ len = period_max * 3; /* * The buffer contains the suite of intervals, in a ilog2 * basis, we are looking for a repetition. We point the * beginning of the search three times the length of the * period beginning at the end of the buffer. We do that for * each suffix. */ for (period = period_max; period >= PREDICTION_PERIOD_MIN; period--) { /* * The first comparison always succeed because the * suffix is deduced from the first n-period bytes of * the buffer and we compare the initial suffix with * itself, so we can skip the first iteration. */ int idx = period; size_t size = period; /* * We look if the suite with period 'i' repeat * itself. If it is truncated at the end, as it * repeats we can use the period to find out the next * element with the modulo. */ while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) { /* * Move the index in a period basis */ idx += size; /* * If this condition is reached, all previous * memcmp were successful, so the period is * found. */ if (idx == len) return buffer[len % period]; /* * If the remaining elements to compare are * smaller than the period, readjust the size * of the comparison for the last iteration. */ if (len - idx < period) size = len - idx; } } return -1; } static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now) { int index, i, period_max, count, start, min = INT_MAX; if ((now - irqs->last_ts) >= NSEC_PER_SEC) { irqs->count = irqs->last_ts = 0; return U64_MAX; } /* * As we want to find three times the repetition, we need a * number of intervals greater or equal to three times the * maximum period, otherwise we truncate the max period. */ period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ? PREDICTION_PERIOD_MAX : irqs->count / 3; /* * If we don't have enough irq timings for this prediction, * just bail out. */ if (period_max <= PREDICTION_PERIOD_MIN) return U64_MAX; /* * 'count' will depends if the circular buffer wrapped or not */ count = irqs->count < IRQ_TIMINGS_SIZE ? irqs->count : IRQ_TIMINGS_SIZE; start = irqs->count < IRQ_TIMINGS_SIZE ? 0 : (irqs->count & IRQ_TIMINGS_MASK); /* * Copy the content of the circular buffer into another buffer * in order to linearize the buffer instead of dealing with * wrapping indexes and shifted array which will be prone to * error and extremelly difficult to debug. */ for (i = 0; i < count; i++) { int index = (start + i) & IRQ_TIMINGS_MASK; irqs->timings[i] = irqs->circ_timings[index]; min = min_t(int, irqs->timings[i], min); } index = irq_timings_next_event_index(irqs->timings, count, period_max); if (index < 0) return irqs->last_ts + irqs->ema_time[min]; return irqs->last_ts + irqs->ema_time[index]; } static __always_inline int irq_timings_interval_index(u64 interval) { /* * The PREDICTION_FACTOR increase the interval size for the * array of exponential average. */ u64 interval_us = (interval >> 10) / PREDICTION_FACTOR; return likely(interval_us) ? ilog2(interval_us) : 0; } static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs, u64 interval) { int index; /* * Get the index in the ema table for this interrupt. */ index = irq_timings_interval_index(interval); /* * Store the index as an element of the pattern in another * circular array. */ irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index; irqs->ema_time[index] = irq_timings_ema_new(interval, irqs->ema_time[index]); irqs->count++; } static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) { u64 old_ts = irqs->last_ts; u64 interval; /* * The timestamps are absolute time values, we need to compute * the timing interval between two interrupts. */ irqs->last_ts = ts; /* * The interval type is u64 in order to deal with the same * type in our computation, that prevent mindfuck issues with * overflow, sign and division. */ interval = ts - old_ts; /* * The interrupt triggered more than one second apart, that * ends the sequence as predictible for our purpose. In this * case, assume we have the beginning of a sequence and the * timestamp is the first value. As it is impossible to * predict anything at this point, return. * * Note the first timestamp of the sequence will always fall * in this test because the old_ts is zero. That is what we * want as we need another timestamp to compute an interval. */ if (interval >= NSEC_PER_SEC) { irqs->count = 0; return; } __irq_timings_store(irq, irqs, interval); } /** * irq_timings_next_event - Return when the next event is supposed to arrive * * During the last busy cycle, the number of interrupts is incremented * and stored in the irq_timings structure. This information is * necessary to: * * - know if the index in the table wrapped up: * * If more than the array size interrupts happened during the * last busy/idle cycle, the index wrapped up and we have to * begin with the next element in the array which is the last one * in the sequence, otherwise it is a the index 0. * * - have an indication of the interrupts activity on this CPU * (eg. irq/sec) * * The values are 'consumed' after inserting in the statistical model, * thus the count is reinitialized. * * The array of values **must** be browsed in the time direction, the * timestamp must increase between an element and the next one. * * Returns a nanosec time based estimation of the earliest interrupt, * U64_MAX otherwise. */ u64 irq_timings_next_event(u64 now) { struct irq_timings *irqts = this_cpu_ptr(&irq_timings); struct irqt_stat *irqs; struct irqt_stat __percpu *s; u64 ts, next_evt = U64_MAX; int i, irq = 0; /* * This function must be called with the local irq disabled in * order to prevent the timings circular buffer to be updated * while we are reading it. */ lockdep_assert_irqs_disabled(); if (!irqts->count) return next_evt; /* * Number of elements in the circular buffer: If it happens it * was flushed before, then the number of elements could be * smaller than IRQ_TIMINGS_SIZE, so the count is used, * otherwise the array size is used as we wrapped. The index * begins from zero when we did not wrap. That could be done * in a nicer way with the proper circular array structure * type but with the cost of extra computation in the * interrupt handler hot path. We choose efficiency. * * Inject measured irq/timestamp to the pattern prediction * model while decrementing the counter because we consume the * data from our circular buffer. */ for_each_irqts(i, irqts) { irq = irq_timing_decode(irqts->values[i], &ts); s = idr_find(&irqt_stats, irq); if (s) irq_timings_store(irq, this_cpu_ptr(s), ts); } /* * Look in the list of interrupts' statistics, the earliest * next event. */ idr_for_each_entry(&irqt_stats, s, i) { irqs = this_cpu_ptr(s); ts = __irq_timings_next_event(irqs, i, now); if (ts <= now) return now; if (ts < next_evt) next_evt = ts; } return next_evt; } void irq_timings_free(int irq) { struct irqt_stat __percpu *s; s = idr_find(&irqt_stats, irq); if (s) { free_percpu(s); idr_remove(&irqt_stats, irq); } } int irq_timings_alloc(int irq) { struct irqt_stat __percpu *s; int id; /* * Some platforms can have the same private interrupt per cpu, * so this function may be be called several times with the * same interrupt number. Just bail out in case the per cpu * stat structure is already allocated. */ s = idr_find(&irqt_stats, irq); if (s) return 0; s = alloc_percpu(*s); if (!s) return -ENOMEM; idr_preload(GFP_KERNEL); id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT); idr_preload_end(); if (id < 0) { free_percpu(s); return id; } return 0; } #ifdef CONFIG_TEST_IRQ_TIMINGS struct timings_intervals { u64 *intervals; size_t count; }; /* * Intervals are given in nanosecond base */ static u64 intervals0[] __initdata = { 10000, 50000, 200000, 500000, 10000, 50000, 200000, 500000, 10000, 50000, 200000, 500000, 10000, 50000, 200000, 500000, 10000, 50000, 200000, 500000, 10000, 50000, 200000, 500000, 10000, 50000, 200000, 500000, 10000, 50000, 200000, 500000, 10000, 50000, 200000, }; static u64 intervals1[] __initdata = { 223947000, 1240000, 1384000, 1386000, 1386000, 217416000, 1236000, 1384000, 1386000, 1387000, 214719000, 1241000, 1386000, 1387000, 1384000, 213696000, 1234000, 1384000, 1386000, 1388000, 219904000, 1240000, 1385000, 1389000, 1385000, 212240000, 1240000, 1386000, 1386000, 1386000, 214415000, 1236000, 1384000, 1386000, 1387000, 214276000, 1234000, }; static u64 intervals2[] __initdata = { 4000, 3000, 5000, 100000, 3000, 3000, 5000, 117000, 4000, 4000, 5000, 112000, 4000, 3000, 4000, 110000, 3000, 5000, 3000, 117000, 4000, 4000, 5000, 112000, 4000, 3000, 4000, 110000, 3000, 4000, 5000, 112000, 4000, }; static u64 intervals3[] __initdata = { 1385000, 212240000, 1240000, 1386000, 214415000, 1236000, 1384000, 214276000, 1234000, 1386000, 214415000, 1236000, 1385000, 212240000, 1240000, 1386000, 214415000, 1236000, 1384000, 214276000, 1234000, 1386000, 214415000, 1236000, 1385000, 212240000, 1240000, }; static u64 intervals4[] __initdata = { 10000, 50000, 10000, 50000, 10000, 50000, 10000, 50000, 10000, 50000, 10000, 50000, 10000, 50000, 10000, 50000, 10000, 50000, 10000, 50000, 10000, 50000, 10000, 50000, 10000, 50000, 10000, 50000, 10000, 50000, 10000, 50000, 10000, }; static struct timings_intervals tis[] __initdata = { { intervals0, ARRAY_SIZE(intervals0) }, { intervals1, ARRAY_SIZE(intervals1) }, { intervals2, ARRAY_SIZE(intervals2) }, { intervals3, ARRAY_SIZE(intervals3) }, { intervals4, ARRAY_SIZE(intervals4) }, }; static int __init irq_timings_test_irqs(struct timings_intervals *ti) { struct irqt_stat __percpu *s; struct irqt_stat *irqs; int i, index, ret, irq = 0xACE5; ret = irq_timings_alloc(irq); if (ret) { pr_err("Failed to allocate irq timings\n"); return ret; } s = idr_find(&irqt_stats, irq); if (!s) { ret = -EIDRM; goto out; } irqs = this_cpu_ptr(s); for (i = 0; i < ti->count; i++) { index = irq_timings_interval_index(ti->intervals[i]); pr_debug("%d: interval=%llu ema_index=%d\n", i, ti->intervals[i], index); __irq_timings_store(irq, irqs, ti->intervals[i]); if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) { pr_err("Failed to store in the circular buffer\n"); goto out; } } if (irqs->count != ti->count) { pr_err("Count differs\n"); goto out; } ret = 0; out: irq_timings_free(irq); return ret; } static int __init irq_timings_irqs_selftest(void) { int i, ret; for (i = 0; i < ARRAY_SIZE(tis); i++) { pr_info("---> Injecting intervals number #%d (count=%zd)\n", i, tis[i].count); ret = irq_timings_test_irqs(&tis[i]); if (ret) break; } return ret; } static int __init irq_timings_test_irqts(struct irq_timings *irqts, unsigned count) { int start = count >= IRQ_TIMINGS_SIZE ? count - IRQ_TIMINGS_SIZE : 0; int i, irq, oirq = 0xBEEF; u64 ots = 0xDEAD, ts; /* * Fill the circular buffer by using the dedicated function. */ for (i = 0; i < count; i++) { pr_debug("%d: index=%d, ts=%llX irq=%X\n", i, i & IRQ_TIMINGS_MASK, ots + i, oirq + i); irq_timings_push(ots + i, oirq + i); } /* * Compute the first elements values after the index wrapped * up or not. */ ots += start; oirq += start; /* * Test the circular buffer count is correct. */ pr_debug("---> Checking timings array count (%d) is right\n", count); if (WARN_ON(irqts->count != count)) return -EINVAL; /* * Test the macro allowing to browse all the irqts. */ pr_debug("---> Checking the for_each_irqts() macro\n"); for_each_irqts(i, irqts) { irq = irq_timing_decode(irqts->values[i], &ts); pr_debug("index=%d, ts=%llX / %llX, irq=%X / %X\n", i, ts, ots, irq, oirq); if (WARN_ON(ts != ots || irq != oirq)) return -EINVAL; ots++; oirq++; } /* * The circular buffer should have be flushed when browsed * with for_each_irqts */ pr_debug("---> Checking timings array is empty after browsing it\n"); if (WARN_ON(irqts->count)) return -EINVAL; return 0; } static int __init irq_timings_irqts_selftest(void) { struct irq_timings *irqts = this_cpu_ptr(&irq_timings); int i, ret; /* * Test the circular buffer with different number of * elements. The purpose is to test at the limits (empty, half * full, full, wrapped with the cursor at the boundaries, * wrapped several times, etc ... */ int count[] = { 0, IRQ_TIMINGS_SIZE >> 1, IRQ_TIMINGS_SIZE, IRQ_TIMINGS_SIZE + (IRQ_TIMINGS_SIZE >> 1), 2 * IRQ_TIMINGS_SIZE, (2 * IRQ_TIMINGS_SIZE) + 3, }; for (i = 0; i < ARRAY_SIZE(count); i++) { pr_info("---> Checking the timings with %d/%d values\n", count[i], IRQ_TIMINGS_SIZE); ret = irq_timings_test_irqts(irqts, count[i]); if (ret) break; } return ret; } static int __init irq_timings_selftest(void) { int ret; pr_info("------------------- selftest start -----------------\n"); /* * At this point, we don't except any subsystem to use the irq * timings but us, so it should not be enabled. */ if (static_branch_unlikely(&irq_timing_enabled)) { pr_warn("irq timings already initialized, skipping selftest\n"); return 0; } ret = irq_timings_irqts_selftest(); if (ret) goto out; ret = irq_timings_irqs_selftest(); out: pr_info("---------- selftest end with %s -----------\n", ret ? "failure" : "success"); return ret; } early_initcall(irq_timings_selftest); #endif