|
|
|
@@ -622,22 +622,48 @@ retry:
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Multi-cpu list version. */
|
|
|
|
|
#define CPU_MONDO_COUNTER(cpuid) (cpu_mondo_counter[cpuid])
|
|
|
|
|
#define MONDO_USEC_WAIT_MIN 2
|
|
|
|
|
#define MONDO_USEC_WAIT_MAX 100
|
|
|
|
|
#define MONDO_RETRY_LIMIT 500000
|
|
|
|
|
|
|
|
|
|
/* Multi-cpu list version.
|
|
|
|
|
*
|
|
|
|
|
* Deliver xcalls to 'cnt' number of cpus in 'cpu_list'.
|
|
|
|
|
* Sometimes not all cpus receive the mondo, requiring us to re-send
|
|
|
|
|
* the mondo until all cpus have received it, or cpus are truly stuck
|
|
|
|
|
* unable to receive the mondo, and we time out.
|
|
|
|
|
* Occasionally a target cpu strand is borrowed briefly by the hypervisor to
|
|
|
|
|
* perform guest service, such as PCIe error handling. Considering the
|
|
|
|
|
* service time, a 1 second overall wait is reasonable for 1 cpu.
|
|
|
|
|
* Here two in-between mondo check wait times are defined: 2 usec for
|
|
|
|
|
* single cpu quick turnaround and up to 100 usec for a large cpu count.
|
|
|
|
|
* Delivering mondos to a large number of cpus could take longer, so we adjust
|
|
|
|
|
* the retry count as long as target cpus are making forward progress.
|
|
|
|
|
*/
|
|
|
|
|
static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt)
|
|
|
|
|
{
|
|
|
|
|
int retries, this_cpu, prev_sent, i, saw_cpu_error;
|
|
|
|
|
int this_cpu, tot_cpus, prev_sent, i, rem;
|
|
|
|
|
int usec_wait, retries, tot_retries;
|
|
|
|
|
u16 first_cpu = 0xffff;
|
|
|
|
|
unsigned long xc_rcvd = 0;
|
|
|
|
|
unsigned long status;
|
|
|
|
|
int ecpuerror_id = 0;
|
|
|
|
|
int enocpu_id = 0;
|
|
|
|
|
u16 *cpu_list;
|
|
|
|
|
u16 cpu;
|
|
|
|
|
|
|
|
|
|
this_cpu = smp_processor_id();
|
|
|
|
|
|
|
|
|
|
cpu_list = __va(tb->cpu_list_pa);
|
|
|
|
|
|
|
|
|
|
saw_cpu_error = 0;
|
|
|
|
|
retries = 0;
|
|
|
|
|
usec_wait = cnt * MONDO_USEC_WAIT_MIN;
|
|
|
|
|
if (usec_wait > MONDO_USEC_WAIT_MAX)
|
|
|
|
|
usec_wait = MONDO_USEC_WAIT_MAX;
|
|
|
|
|
retries = tot_retries = 0;
|
|
|
|
|
tot_cpus = cnt;
|
|
|
|
|
prev_sent = 0;
|
|
|
|
|
|
|
|
|
|
do {
|
|
|
|
|
int forward_progress, n_sent;
|
|
|
|
|
int n_sent, mondo_delivered, target_cpu_busy;
|
|
|
|
|
|
|
|
|
|
status = sun4v_cpu_mondo_send(cnt,
|
|
|
|
|
tb->cpu_list_pa,
|
|
|
|
@@ -645,94 +671,113 @@ static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt)
|
|
|
|
|
|
|
|
|
|
/* HV_EOK means all cpus received the xcall, we're done. */
|
|
|
|
|
if (likely(status == HV_EOK))
|
|
|
|
|
break;
|
|
|
|
|
goto xcall_done;
|
|
|
|
|
|
|
|
|
|
/* Any error other than these non-fatal ones is fatal: panic */
|
|
|
|
|
if (unlikely((status != HV_EWOULDBLOCK) &&
|
|
|
|
|
(status != HV_ECPUERROR) &&
|
|
|
|
|
(status != HV_ENOCPU)))
|
|
|
|
|
goto fatal_errors;
|
|
|
|
|
|
|
|
|
|
/* First, see if we made any forward progress.
|
|
|
|
|
*
|
|
|
|
|
* Go through the cpu_list, count the target cpus that have
|
|
|
|
|
* received our mondo (n_sent), and those that did not (rem).
|
|
|
|
|
* Re-pack cpu_list with the cpus that remain to be retried in the
|
|
|
|
|
* front - this simplifies tracking the truly stalled cpus.
|
|
|
|
|
*
|
|
|
|
|
* The hypervisor indicates successful sends by setting
|
|
|
|
|
* cpu list entries to the value 0xffff.
|
|
|
|
|
*
|
|
|
|
|
* EWOULDBLOCK means some target cpus did not receive the
|
|
|
|
|
* mondo and retry usually helps.
|
|
|
|
|
*
|
|
|
|
|
* ECPUERROR means at least one target cpu is in error state,
|
|
|
|
|
* it's usually safe to skip the faulty cpu and retry.
|
|
|
|
|
*
|
|
|
|
|
* ENOCPU means one of the target cpus doesn't belong to the
|
|
|
|
|
* domain, perhaps offlined which is unexpected, but not
|
|
|
|
|
* fatal and it's okay to skip the offlined cpu.
|
|
|
|
|
*/
|
|
|
|
|
rem = 0;
|
|
|
|
|
n_sent = 0;
|
|
|
|
|
for (i = 0; i < cnt; i++) {
|
|
|
|
|
if (likely(cpu_list[i] == 0xffff))
|
|
|
|
|
cpu = cpu_list[i];
|
|
|
|
|
if (likely(cpu == 0xffff)) {
|
|
|
|
|
n_sent++;
|
|
|
|
|
} else if ((status == HV_ECPUERROR) &&
|
|
|
|
|
(sun4v_cpu_state(cpu) == HV_CPU_STATE_ERROR)) {
|
|
|
|
|
ecpuerror_id = cpu + 1;
|
|
|
|
|
} else if (status == HV_ENOCPU && !cpu_online(cpu)) {
|
|
|
|
|
enocpu_id = cpu + 1;
|
|
|
|
|
} else {
|
|
|
|
|
cpu_list[rem++] = cpu;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
forward_progress = 0;
|
|
|
|
|
if (n_sent > prev_sent)
|
|
|
|
|
forward_progress = 1;
|
|
|
|
|
/* No cpu remained, we're done. */
|
|
|
|
|
if (rem == 0)
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
/* Otherwise, update the cpu count for retry. */
|
|
|
|
|
cnt = rem;
|
|
|
|
|
|
|
|
|
|
/* Record the overall number of mondos received by the
|
|
|
|
|
* first of the remaining cpus.
|
|
|
|
|
*/
|
|
|
|
|
if (first_cpu != cpu_list[0]) {
|
|
|
|
|
first_cpu = cpu_list[0];
|
|
|
|
|
xc_rcvd = CPU_MONDO_COUNTER(first_cpu);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Was any mondo delivered successfully? */
|
|
|
|
|
mondo_delivered = (n_sent > prev_sent);
|
|
|
|
|
prev_sent = n_sent;
|
|
|
|
|
|
|
|
|
|
/* If we get a HV_ECPUERROR, then one or more of the cpus
|
|
|
|
|
* in the list are in error state. Use the cpu_state()
|
|
|
|
|
* hypervisor call to find out which cpus are in error state.
|
|
|
|
|
/* or, was any target cpu busy processing other mondos? */
|
|
|
|
|
target_cpu_busy = (xc_rcvd < CPU_MONDO_COUNTER(first_cpu));
|
|
|
|
|
xc_rcvd = CPU_MONDO_COUNTER(first_cpu);
|
|
|
|
|
|
|
|
|
|
/* Retry count is for no progress. If we're making progress,
|
|
|
|
|
* reset the retry count.
|
|
|
|
|
*/
|
|
|
|
|
if (unlikely(status == HV_ECPUERROR)) {
|
|
|
|
|
for (i = 0; i < cnt; i++) {
|
|
|
|
|
long err;
|
|
|
|
|
u16 cpu;
|
|
|
|
|
|
|
|
|
|
cpu = cpu_list[i];
|
|
|
|
|
if (cpu == 0xffff)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
err = sun4v_cpu_state(cpu);
|
|
|
|
|
if (err == HV_CPU_STATE_ERROR) {
|
|
|
|
|
saw_cpu_error = (cpu + 1);
|
|
|
|
|
cpu_list[i] = 0xffff;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else if (unlikely(status != HV_EWOULDBLOCK))
|
|
|
|
|
goto fatal_mondo_error;
|
|
|
|
|
|
|
|
|
|
/* Don't bother rewriting the CPU list, just leave the
|
|
|
|
|
* 0xffff and non-0xffff entries in there and the
|
|
|
|
|
* hypervisor will do the right thing.
|
|
|
|
|
*
|
|
|
|
|
* Only advance timeout state if we didn't make any
|
|
|
|
|
* forward progress.
|
|
|
|
|
*/
|
|
|
|
|
if (unlikely(!forward_progress)) {
|
|
|
|
|
if (unlikely(++retries > 10000))
|
|
|
|
|
goto fatal_mondo_timeout;
|
|
|
|
|
|
|
|
|
|
/* Delay a little bit to let other cpus catch up
|
|
|
|
|
* on their cpu mondo queue work.
|
|
|
|
|
*/
|
|
|
|
|
udelay(2 * cnt);
|
|
|
|
|
if (likely(mondo_delivered || target_cpu_busy)) {
|
|
|
|
|
tot_retries += retries;
|
|
|
|
|
retries = 0;
|
|
|
|
|
} else if (unlikely(retries > MONDO_RETRY_LIMIT)) {
|
|
|
|
|
goto fatal_mondo_timeout;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Delay a little bit to let other cpus catch up on
|
|
|
|
|
* their cpu mondo queue work.
|
|
|
|
|
*/
|
|
|
|
|
if (!mondo_delivered)
|
|
|
|
|
udelay(usec_wait);
|
|
|
|
|
|
|
|
|
|
retries++;
|
|
|
|
|
} while (1);
|
|
|
|
|
|
|
|
|
|
if (unlikely(saw_cpu_error))
|
|
|
|
|
goto fatal_mondo_cpu_error;
|
|
|
|
|
|
|
|
|
|
xcall_done:
|
|
|
|
|
if (unlikely(ecpuerror_id > 0)) {
|
|
|
|
|
pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) was in error state\n",
|
|
|
|
|
this_cpu, ecpuerror_id - 1);
|
|
|
|
|
} else if (unlikely(enocpu_id > 0)) {
|
|
|
|
|
pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) does not belong to the domain\n",
|
|
|
|
|
this_cpu, enocpu_id - 1);
|
|
|
|
|
}
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
fatal_mondo_cpu_error:
|
|
|
|
|
printk(KERN_CRIT "CPU[%d]: SUN4V mondo cpu error, some target cpus "
|
|
|
|
|
"(including %d) were in error state\n",
|
|
|
|
|
this_cpu, saw_cpu_error - 1);
|
|
|
|
|
return;
|
|
|
|
|
fatal_errors:
|
|
|
|
|
/* fatal errors include bad alignment, etc */
|
|
|
|
|
pr_crit("CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) mondo_block_pa(%lx)\n",
|
|
|
|
|
this_cpu, tot_cpus, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
|
|
|
|
|
panic("Unexpected SUN4V mondo error %lu\n", status);
|
|
|
|
|
|
|
|
|
|
fatal_mondo_timeout:
|
|
|
|
|
printk(KERN_CRIT "CPU[%d]: SUN4V mondo timeout, no forward "
|
|
|
|
|
" progress after %d retries.\n",
|
|
|
|
|
this_cpu, retries);
|
|
|
|
|
goto dump_cpu_list_and_out;
|
|
|
|
|
|
|
|
|
|
fatal_mondo_error:
|
|
|
|
|
printk(KERN_CRIT "CPU[%d]: Unexpected SUN4V mondo error %lu\n",
|
|
|
|
|
this_cpu, status);
|
|
|
|
|
printk(KERN_CRIT "CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) "
|
|
|
|
|
"mondo_block_pa(%lx)\n",
|
|
|
|
|
this_cpu, cnt, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
|
|
|
|
|
|
|
|
|
|
dump_cpu_list_and_out:
|
|
|
|
|
printk(KERN_CRIT "CPU[%d]: CPU list [ ", this_cpu);
|
|
|
|
|
for (i = 0; i < cnt; i++)
|
|
|
|
|
printk("%u ", cpu_list[i]);
|
|
|
|
|
printk("]\n");
|
|
|
|
|
/* some cpus being non-responsive to the cpu mondo */
|
|
|
|
|
pr_crit("CPU[%d]: SUN4V mondo timeout, cpu(%d) made no forward progress after %d retries. Total target cpus(%d).\n",
|
|
|
|
|
this_cpu, first_cpu, (tot_retries + retries), tot_cpus);
|
|
|
|
|
panic("SUN4V mondo timeout panic\n");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void (*xcall_deliver_impl)(struct trap_per_cpu *, int);
|
|
|
|
|