From 7c5c92ed56d932b2c19c3f8aea86369509407d33 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 9 Dec 2014 10:58:19 +1100 Subject: [PATCH 01/15] powerpc: Secondary CPUs must set cpu_callin_map after setting active and online I have a busy ppc64le KVM box where guests sometimes hit the infamous "kernel BUG at kernel/smpboot.c:134!" issue during boot: BUG_ON(td->cpu != smp_processor_id()); Basically a per CPU hotplug thread scheduled on the wrong CPU. The oops output confirms it: CPU: 0 Comm: watchdog/130 The problem is that we aren't ensuring the CPU active and online bits are set before allowing the master to continue on. The master unparks the secondary CPUs kthreads and the scheduler looks for a CPU to run on. It calls select_task_rq and realises the suggested CPU is not in the cpus_allowed mask. It then ends up in select_fallback_rq, and since the active and online bits aren't set we choose some other CPU to run on. Cc: stable@vger.kernel.org Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/smp.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 8b2d2dc8ef10..8ec017cb4446 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -700,7 +700,6 @@ void start_secondary(void *unused) smp_store_cpu_info(cpu); set_dec(tb_ticks_per_jiffy); preempt_disable(); - cpu_callin_map[cpu] = 1; if (smp_ops->setup_cpu) smp_ops->setup_cpu(cpu); @@ -739,6 +738,14 @@ void start_secondary(void *unused) notify_cpu_starting(cpu); set_cpu_online(cpu, true); + /* + * CPU must be marked active and online before we signal back to the + * master, because the scheduler needs to see the cpu_online and + * cpu_active bits set. + */ + smp_wmb(); + cpu_callin_map[cpu] = 1; + local_irq_enable(); cpu_startup_entry(CPUHP_ONLINE); From ee41d11d53c8fc4968f0816504651541d606cf40 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Mon, 8 Dec 2014 19:17:55 +1100 Subject: [PATCH 02/15] cxl: Change contexts_lock to a mutex to fix sleep while atomic bug We had a known sleep while atomic bug if a CXL device was forcefully unbound while it was in use. This could occur as a result of EEH, or manually induced with something like this while the device was in use: echo 0000:01:00.0 > /sys/bus/pci/drivers/cxl-pci/unbind The issue was that in this code path we iterated over each context and forcefully detached it with the contexts_lock spin lock held, however the detach also needed to take the spu_mutex, and call schedule. This patch changes the contexts_lock to a mutex so that we are not in atomic context while doing the detach, thereby avoiding the sleep while atomic. Also delete the related TODO comment, which suggested an alternate solution which turned out to not be workable. Cc: stable@vger.kernel.org Signed-off-by: Ian Munsie Signed-off-by: Michael Ellerman --- drivers/misc/cxl/context.c | 15 ++++++++------- drivers/misc/cxl/cxl.h | 2 +- drivers/misc/cxl/native.c | 7 ------- drivers/misc/cxl/pci.c | 2 +- drivers/misc/cxl/sysfs.c | 10 +++++----- 5 files changed, 15 insertions(+), 21 deletions(-) diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index cca472109135..4aa31a3fb448 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -82,12 +82,12 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master) * Allocating IDR! We better make sure everything's setup that * dereferences from it. */ + mutex_lock(&afu->contexts_lock); idr_preload(GFP_KERNEL); - spin_lock(&afu->contexts_lock); i = idr_alloc(&ctx->afu->contexts_idr, ctx, 0, ctx->afu->num_procs, GFP_NOWAIT); - spin_unlock(&afu->contexts_lock); idr_preload_end(); + mutex_unlock(&afu->contexts_lock); if (i < 0) return i; @@ -168,21 +168,22 @@ void cxl_context_detach_all(struct cxl_afu *afu) struct cxl_context *ctx; int tmp; - rcu_read_lock(); - idr_for_each_entry(&afu->contexts_idr, ctx, tmp) + mutex_lock(&afu->contexts_lock); + idr_for_each_entry(&afu->contexts_idr, ctx, tmp) { /* * Anything done in here needs to be setup before the IDR is * created and torn down after the IDR removed */ __detach_context(ctx); - rcu_read_unlock(); + } + mutex_unlock(&afu->contexts_lock); } void cxl_context_free(struct cxl_context *ctx) { - spin_lock(&ctx->afu->contexts_lock); + mutex_lock(&ctx->afu->contexts_lock); idr_remove(&ctx->afu->contexts_idr, ctx->pe); - spin_unlock(&ctx->afu->contexts_lock); + mutex_unlock(&ctx->afu->contexts_lock); synchronize_rcu(); free_page((u64)ctx->sstp); diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index b5b6bda44a00..7c05239359df 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -351,7 +351,7 @@ struct cxl_afu { struct device *chardev_s, *chardev_m, *chardev_d; struct idr contexts_idr; struct dentry *debugfs; - spinlock_t contexts_lock; + struct mutex contexts_lock; struct mutex spa_mutex; spinlock_t afu_cntl_lock; diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c index 9a5a442269a8..1001cf49af94 100644 --- a/drivers/misc/cxl/native.c +++ b/drivers/misc/cxl/native.c @@ -610,13 +610,6 @@ static inline int detach_process_native_dedicated(struct cxl_context *ctx) return 0; } -/* - * TODO: handle case when this is called inside a rcu_read_lock() which may - * happen when we unbind the driver (ie. cxl_context_detach_all()) . Terminate - * & remove use a mutex lock and schedule which will not good with lock held. - * May need to write do_process_element_cmd() that handles outstanding page - * faults synchronously. - */ static inline int detach_process_native_afu_directed(struct cxl_context *ctx) { if (!ctx->pe_inserted) diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 10c98ab7f46e..0f2cc9f8b4db 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -502,7 +502,7 @@ static struct cxl_afu *cxl_alloc_afu(struct cxl *adapter, int slice) afu->dev.release = cxl_release_afu; afu->slice = slice; idr_init(&afu->contexts_idr); - spin_lock_init(&afu->contexts_lock); + mutex_init(&afu->contexts_lock); spin_lock_init(&afu->afu_cntl_lock); mutex_init(&afu->spa_mutex); diff --git a/drivers/misc/cxl/sysfs.c b/drivers/misc/cxl/sysfs.c index ce7ec06d87d1..461bdbd5d483 100644 --- a/drivers/misc/cxl/sysfs.c +++ b/drivers/misc/cxl/sysfs.c @@ -121,7 +121,7 @@ static ssize_t reset_store_afu(struct device *device, int rc; /* Not safe to reset if it is currently in use */ - spin_lock(&afu->contexts_lock); + mutex_lock(&afu->contexts_lock); if (!idr_is_empty(&afu->contexts_idr)) { rc = -EBUSY; goto err; @@ -132,7 +132,7 @@ static ssize_t reset_store_afu(struct device *device, rc = count; err: - spin_unlock(&afu->contexts_lock); + mutex_unlock(&afu->contexts_lock); return rc; } @@ -247,7 +247,7 @@ static ssize_t mode_store(struct device *device, struct device_attribute *attr, int rc = -EBUSY; /* can't change this if we have a user */ - spin_lock(&afu->contexts_lock); + mutex_lock(&afu->contexts_lock); if (!idr_is_empty(&afu->contexts_idr)) goto err; @@ -271,7 +271,7 @@ static ssize_t mode_store(struct device *device, struct device_attribute *attr, afu->current_mode = 0; afu->num_procs = 0; - spin_unlock(&afu->contexts_lock); + mutex_unlock(&afu->contexts_lock); if ((rc = _cxl_afu_deactivate_mode(afu, old_mode))) return rc; @@ -280,7 +280,7 @@ static ssize_t mode_store(struct device *device, struct device_attribute *attr, return count; err: - spin_unlock(&afu->contexts_lock); + mutex_unlock(&afu->contexts_lock); return rc; } From a98e6e9f4e0224d85b4d951edc44af16dfe6094a Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Mon, 8 Dec 2014 19:17:56 +1100 Subject: [PATCH 03/15] cxl: Add timeout to process element commands In the event that something goes wrong in the hardware and it is unable to complete a process element comment we would end up polling forever, effectively making the associated process unkillable. This patch adds a timeout to the process element command code path, so that we will give up if the hardware does not respond in a reasonable time. Cc: stable@vger.kernel.org Signed-off-by: Ian Munsie Signed-off-by: Michael Ellerman --- drivers/misc/cxl/native.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c index 1001cf49af94..f2b37b41a0da 100644 --- a/drivers/misc/cxl/native.c +++ b/drivers/misc/cxl/native.c @@ -277,6 +277,7 @@ static int do_process_element_cmd(struct cxl_context *ctx, u64 cmd, u64 pe_state) { u64 state; + unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT); WARN_ON(!ctx->afu->enabled); @@ -286,6 +287,10 @@ static int do_process_element_cmd(struct cxl_context *ctx, smp_mb(); cxl_p1n_write(ctx->afu, CXL_PSL_LLCMD_An, cmd | ctx->pe); while (1) { + if (time_after_eq(jiffies, timeout)) { + dev_warn(&ctx->afu->dev, "WARNING: Process Element Command timed out!\n"); + return -EBUSY; + } state = be64_to_cpup(ctx->afu->sw_command_status); if (state == ~0ULL) { pr_err("cxl: Error adding process element to AFU\n"); From b123429e6a9e8d03aacf888d23262835f0081448 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Mon, 8 Dec 2014 19:18:01 +1100 Subject: [PATCH 04/15] cxl: Unmap MMIO regions when detaching a context If we need to force detach a context (e.g. due to EEH or simply force unbinding the driver) we should prevent the userspace contexts from being able to access the Problem State Area MMIO region further, which they may have mapped with mmap(). This patch unmaps any mapped MMIO regions when detaching a userspace context. Cc: stable@vger.kernel.org Signed-off-by: Ian Munsie Signed-off-by: Michael Ellerman --- drivers/misc/cxl/context.c | 11 ++++++++++- drivers/misc/cxl/cxl.h | 7 ++++++- drivers/misc/cxl/file.c | 6 +++++- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index 4aa31a3fb448..51fd6b524371 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -34,7 +34,8 @@ struct cxl_context *cxl_context_alloc(void) /* * Initialises a CXL context. */ -int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master) +int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master, + struct address_space *mapping) { int i; @@ -42,6 +43,8 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master) ctx->afu = afu; ctx->master = master; ctx->pid = NULL; /* Set in start work ioctl */ + mutex_init(&ctx->mapping_lock); + ctx->mapping = mapping; /* * Allocate the segment table before we put it in the IDR so that we @@ -147,6 +150,12 @@ static void __detach_context(struct cxl_context *ctx) afu_release_irqs(ctx); flush_work(&ctx->fault_work); /* Only needed for dedicated process */ wake_up_all(&ctx->wq); + + /* Release Problem State Area mapping */ + mutex_lock(&ctx->mapping_lock); + if (ctx->mapping) + unmap_mapping_range(ctx->mapping, 0, 0, 1); + mutex_unlock(&ctx->mapping_lock); } /* diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index 7c05239359df..28078f8894a5 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -398,6 +398,10 @@ struct cxl_context { phys_addr_t psn_phys; u64 psn_size; + /* Used to unmap any mmaps when force detaching */ + struct address_space *mapping; + struct mutex mapping_lock; + spinlock_t sste_lock; /* Protects segment table entries */ struct cxl_sste *sstp; u64 sstp0, sstp1; @@ -599,7 +603,8 @@ int cxl_alloc_sst(struct cxl_context *ctx); void init_cxl_native(void); struct cxl_context *cxl_context_alloc(void); -int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master); +int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master, + struct address_space *mapping); void cxl_context_free(struct cxl_context *ctx); int cxl_context_iomap(struct cxl_context *ctx, struct vm_area_struct *vma); diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c index 378b099e7c0b..e9f2f10dbb37 100644 --- a/drivers/misc/cxl/file.c +++ b/drivers/misc/cxl/file.c @@ -77,7 +77,7 @@ static int __afu_open(struct inode *inode, struct file *file, bool master) goto err_put_afu; } - if ((rc = cxl_context_init(ctx, afu, master))) + if ((rc = cxl_context_init(ctx, afu, master, inode->i_mapping))) goto err_put_afu; pr_devel("afu_open pe: %i\n", ctx->pe); @@ -113,6 +113,10 @@ static int afu_release(struct inode *inode, struct file *file) __func__, ctx->pe); cxl_context_detach(ctx); + mutex_lock(&ctx->mapping_lock); + ctx->mapping = NULL; + mutex_unlock(&ctx->mapping_lock); + put_device(&ctx->afu->dev); /* From f34b6c72c3ebaa286d3311a825ef79eccbcca82f Mon Sep 17 00:00:00 2001 From: "sukadev@linux.vnet.ibm.com" Date: Wed, 10 Dec 2014 14:29:13 -0800 Subject: [PATCH 05/15] powerpc/perf/hv-24x7: Use per-cpu page buffer The 24x7 counters are continuously running and not updated on an interrupt. So we record the event counts when stopping the event or deleting it. But to "read" a single counter in 24x7, we allocate a page and pass it into the hypervisor (The HV returns the page full of counters from which we extract the specific counter for this event). We allocate a page using GFP_USER and when deleting the event, we end up with the following warning because we are blocking in interrupt context. [ 698.641709] BUG: scheduling while atomic: swapper/0/0/0x10010000 We could use GFP_ATOMIC but that could result in failures. Pre-allocate a buffer so we don't have to allocate in interrupt context. Further as Michael Ellerman suggested, use Per-CPU buffer so we only need to allocate once per CPU. Cc: stable@vger.kernel.org Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/perf/hv-24x7.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index dba34088da28..d073e0679a0c 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -217,11 +217,14 @@ static bool is_physical_domain(int domain) domain == HV_24X7_PERF_DOMAIN_PHYSICAL_CORE; } +DEFINE_PER_CPU(char, hv_24x7_reqb[4096]) __aligned(4096); +DEFINE_PER_CPU(char, hv_24x7_resb[4096]) __aligned(4096); + static unsigned long single_24x7_request(u8 domain, u32 offset, u16 ix, u16 lpar, u64 *res, bool success_expected) { - unsigned long ret = -ENOMEM; + unsigned long ret; /* * request_buffer and result_buffer are not required to be 4k aligned, @@ -243,13 +246,11 @@ static unsigned long single_24x7_request(u8 domain, u32 offset, u16 ix, BUILD_BUG_ON(sizeof(*request_buffer) > 4096); BUILD_BUG_ON(sizeof(*result_buffer) > 4096); - request_buffer = kmem_cache_zalloc(hv_page_cache, GFP_USER); - if (!request_buffer) - goto out; + request_buffer = (void *)get_cpu_var(hv_24x7_reqb); + result_buffer = (void *)get_cpu_var(hv_24x7_resb); - result_buffer = kmem_cache_zalloc(hv_page_cache, GFP_USER); - if (!result_buffer) - goto out_free_request_buffer; + memset(request_buffer, 0, 4096); + memset(result_buffer, 0, 4096); *request_buffer = (struct reqb) { .buf = { @@ -278,15 +279,11 @@ static unsigned long single_24x7_request(u8 domain, u32 offset, u16 ix, domain, offset, ix, lpar, ret, ret, result_buffer->buf.detailed_rc, result_buffer->buf.failing_request_ix); - goto out_free_result_buffer; + goto out; } *res = be64_to_cpu(result_buffer->result); -out_free_result_buffer: - kfree(result_buffer); -out_free_request_buffer: - kfree(request_buffer); out: return ret; } From ec2aef5a8d3c14272f7a2d29b34f1f8e71f2be5b Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 10 Dec 2014 01:43:34 -0500 Subject: [PATCH 06/15] power/perf/hv-24x7: Use kmem_cache_free() instead of kfree Use kmem_cache_free() to free a buffer allocated with kmem_cache_alloc(). Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/perf/hv-24x7.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index d073e0679a0c..f162d0b8eea3 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -177,7 +177,7 @@ static ssize_t _name##_show(struct device *dev, \ } \ ret = sprintf(buf, _fmt, _expr); \ e_free: \ - kfree(page); \ + kmem_cache_free(hv_page_cache, page); \ return ret; \ } \ static DEVICE_ATTR_RO(_name) From 63f13448d81c910a284b096149411a719cbed501 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 9 Dec 2014 15:37:07 -0500 Subject: [PATCH 07/15] powerpc: add little endian flag to syscall_get_arch() Since both ppc and ppc64 have LE variants which are now reported by uname, add that flag (__AUDIT_ARCH_LE) to syscall_get_arch() and add AUDIT_ARCH_PPC64LE variant. Without this, perf trace and auditctl fail. Mainline kernel reports ppc64le (per a058801) but there is no matching AUDIT_ARCH_PPC64LE. Since 32-bit PPC LE is not supported by audit, don't advertise it in AUDIT_ARCH_PPC* variants. See: https://www.redhat.com/archives/linux-audit/2014-August/msg00082.html https://www.redhat.com/archives/linux-audit/2014-December/msg00004.html Signed-off-by: Richard Guy Briggs Acked-by: Paul Moore Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/syscall.h | 6 +++++- include/uapi/linux/audit.h | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index 6240698fee9a..ff21b7a2f0cc 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -90,6 +90,10 @@ static inline void syscall_set_arguments(struct task_struct *task, static inline int syscall_get_arch(void) { - return is_32bit_task() ? AUDIT_ARCH_PPC : AUDIT_ARCH_PPC64; + int arch = is_32bit_task() ? AUDIT_ARCH_PPC : AUDIT_ARCH_PPC64; +#ifdef __LITTLE_ENDIAN__ + arch |= __AUDIT_ARCH_LE; +#endif + return arch; } #endif /* _ASM_SYSCALL_H */ diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index d4dbef14d4df..584bb0113e25 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -365,7 +365,9 @@ enum { #define AUDIT_ARCH_PARISC (EM_PARISC) #define AUDIT_ARCH_PARISC64 (EM_PARISC|__AUDIT_ARCH_64BIT) #define AUDIT_ARCH_PPC (EM_PPC) +/* do not define AUDIT_ARCH_PPCLE since it is not supported by audit */ #define AUDIT_ARCH_PPC64 (EM_PPC64|__AUDIT_ARCH_64BIT) +#define AUDIT_ARCH_PPC64LE (EM_PPC64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) #define AUDIT_ARCH_S390 (EM_S390) #define AUDIT_ARCH_S390X (EM_S390|__AUDIT_ARCH_64BIT) #define AUDIT_ARCH_SH (EM_SH) From 470834508f87877f680738a10a305280582c7aed Mon Sep 17 00:00:00 2001 From: Neelesh Gupta Date: Sat, 13 Dec 2014 23:31:05 +0530 Subject: [PATCH 08/15] i2c: Driver to expose PowerNV platform i2c busses The patch exposes the available i2c busses on the PowerNV platform to the kernel and implements the bus driver to support i2c and smbus commands. The driver uses the platform device infrastructure to probe the busses on the platform and registers them with the i2c driver framework. Signed-off-by: Neelesh Gupta Signed-off-by: Benjamin Herrenschmidt Acked-by: Wolfram Sang (I2C part, excluding the bindings) Signed-off-by: Michael Ellerman --- .../devicetree/bindings/i2c/i2c-opal.txt | 37 +++ arch/powerpc/include/asm/opal.h | 29 ++ .../powerpc/platforms/powernv/opal-wrappers.S | 1 + arch/powerpc/platforms/powernv/opal.c | 12 + drivers/i2c/busses/Kconfig | 11 + drivers/i2c/busses/Makefile | 1 + drivers/i2c/busses/i2c-opal.c | 294 ++++++++++++++++++ 7 files changed, 385 insertions(+) create mode 100644 Documentation/devicetree/bindings/i2c/i2c-opal.txt create mode 100644 drivers/i2c/busses/i2c-opal.c diff --git a/Documentation/devicetree/bindings/i2c/i2c-opal.txt b/Documentation/devicetree/bindings/i2c/i2c-opal.txt new file mode 100644 index 000000000000..12bc61465ee5 --- /dev/null +++ b/Documentation/devicetree/bindings/i2c/i2c-opal.txt @@ -0,0 +1,37 @@ +Device-tree bindings for I2C OPAL driver +---------------------------------------- + +Most of the device node and properties layout is specific to the firmware and +used by the firmware itself for configuring the port. From the linux +perspective, the properties of use are "ibm,port-name" and "ibm,opal-id". + +Required properties: + +- reg: Port-id within a given master +- compatible: must be "ibm,opal-i2c" +- ibm,opal-id: Refers to a specific bus and used to identify it when calling + the relevant OPAL functions. +- bus-frequency: Operating frequency of the i2c bus (in HZ). Informational for + linux, used by the FW though. + +Optional properties: +- ibm,port-name: Firmware provides this name that uniquely identifies the i2c + port. + +The node contains a number of other properties that are used by the FW itself +and depend on the specific hardware implementation. The example below depicts +a P8 on-chip bus. + +Example: + +i2c-bus@0 { + reg = <0x0>; + bus-frequency = <0x61a80>; + compatible = "ibm,power8-i2c-port", "ibm,opal-i2c"; + ibm,opal-id = <0x1>; + ibm,port-name = "p8_00000000_e1p0"; + #address-cells = <0x1>; + phandle = <0x10000006>; + #size-cells = <0x0>; + linux,phandle = <0x10000006>; +}; diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 5cd8d2fddba9..4095749c973f 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -56,6 +56,14 @@ struct opal_sg_list { #define OPAL_HARDWARE_FROZEN -13 #define OPAL_WRONG_STATE -14 #define OPAL_ASYNC_COMPLETION -15 +#define OPAL_I2C_TIMEOUT -17 +#define OPAL_I2C_INVALID_CMD -18 +#define OPAL_I2C_LBUS_PARITY -19 +#define OPAL_I2C_BKEND_OVERRUN -20 +#define OPAL_I2C_BKEND_ACCESS -21 +#define OPAL_I2C_ARBT_LOST -22 +#define OPAL_I2C_NACK_RCVD -23 +#define OPAL_I2C_STOP_ERR -24 /* API Tokens (in r0) */ #define OPAL_INVALID_CALL -1 @@ -158,6 +166,7 @@ struct opal_sg_list { #define OPAL_READ_TPO 104 #define OPAL_IPMI_SEND 107 #define OPAL_IPMI_RECV 108 +#define OPAL_I2C_REQUEST 109 #ifndef __ASSEMBLY__ @@ -712,6 +721,24 @@ typedef struct oppanel_line { uint64_t line_len; } oppanel_line_t; +/* OPAL I2C request */ +struct opal_i2c_request { + uint8_t type; +#define OPAL_I2C_RAW_READ 0 +#define OPAL_I2C_RAW_WRITE 1 +#define OPAL_I2C_SM_READ 2 +#define OPAL_I2C_SM_WRITE 3 + uint8_t flags; +#define OPAL_I2C_ADDR_10 0x01 /* Not supported yet */ + uint8_t subaddr_sz; /* Max 4 */ + uint8_t reserved; + __be16 addr; /* 7 or 10 bit address */ + __be16 reserved2; + __be32 subaddr; /* Sub-address if any */ + __be32 size; /* Data size */ + __be64 buffer_ra; /* Buffer real address */ +}; + /* /sys/firmware/opal */ extern struct kobject *opal_kobj; @@ -881,6 +908,8 @@ int64_t opal_ipmi_send(uint64_t interface, struct opal_ipmi_msg *msg, uint64_t msg_len); int64_t opal_ipmi_recv(uint64_t interface, struct opal_ipmi_msg *msg, uint64_t *msg_len); +int64_t opal_i2c_request(uint64_t async_token, uint32_t bus_id, + struct opal_i2c_request *oreq); /* Internal functions */ extern int early_init_dt_scan_opal(unsigned long node, const char *uname, diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 0a299be588af..2111e08d406b 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -254,3 +254,4 @@ OPAL_CALL(opal_tpo_write, OPAL_WRITE_TPO); OPAL_CALL(opal_tpo_read, OPAL_READ_TPO); OPAL_CALL(opal_ipmi_send, OPAL_IPMI_SEND); OPAL_CALL(opal_ipmi_recv, OPAL_IPMI_RECV); +OPAL_CALL(opal_i2c_request, OPAL_I2C_REQUEST); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index cb0b6de79cd4..aa316d820736 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -653,6 +653,14 @@ static void opal_ipmi_init(struct device_node *opal_node) of_platform_device_create(np, NULL, NULL); } +static void opal_i2c_create_devs(void) +{ + struct device_node *np; + + for_each_compatible_node(np, NULL, "ibm,opal-i2c") + of_platform_device_create(np, NULL, NULL); +} + static int __init opal_init(void) { struct device_node *np, *consoles; @@ -679,6 +687,9 @@ static int __init opal_init(void) of_node_put(consoles); } + /* Create i2c platform devices */ + opal_i2c_create_devs(); + /* Find all OPAL interrupts and request them */ irqs = of_get_property(opal_node, "opal-interrupts", &irqlen); pr_debug("opal: Found %d interrupts reserved for OPAL\n", @@ -824,3 +835,4 @@ EXPORT_SYMBOL_GPL(opal_rtc_read); EXPORT_SYMBOL_GPL(opal_rtc_write); EXPORT_SYMBOL_GPL(opal_tpo_read); EXPORT_SYMBOL_GPL(opal_tpo_write); +EXPORT_SYMBOL_GPL(opal_i2c_request); diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 917c3585f45b..71ad6e11efb7 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -1044,4 +1044,15 @@ config SCx200_ACB This support is also available as a module. If so, the module will be called scx200_acb. +config I2C_OPAL + tristate "IBM OPAL I2C driver" + depends on PPC_POWERNV + default y + help + This exposes the PowerNV platform i2c busses to the linux i2c layer, + the driver is based on the OPAL interfaces. + + This driver can also be built as a module. If so, the module will be + called as i2c-opal. + endmenu diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile index 78d56c54ba2b..e23ec815e21f 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -99,6 +99,7 @@ obj-$(CONFIG_I2C_ACORN) += i2c-acorn.o obj-$(CONFIG_I2C_BCM_KONA) += i2c-bcm-kona.o obj-$(CONFIG_I2C_CROS_EC_TUNNEL) += i2c-cros-ec-tunnel.o obj-$(CONFIG_I2C_ELEKTOR) += i2c-elektor.o +obj-$(CONFIG_I2C_OPAL) += i2c-opal.o obj-$(CONFIG_I2C_PCA_ISA) += i2c-pca-isa.o obj-$(CONFIG_I2C_SIBYTE) += i2c-sibyte.o obj-$(CONFIG_SCx200_ACB) += scx200_acb.o diff --git a/drivers/i2c/busses/i2c-opal.c b/drivers/i2c/busses/i2c-opal.c new file mode 100644 index 000000000000..16f90b1a7508 --- /dev/null +++ b/drivers/i2c/busses/i2c-opal.c @@ -0,0 +1,294 @@ +/* + * IBM OPAL I2C driver + * Copyright (C) 2014 IBM + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static int i2c_opal_translate_error(int rc) +{ + switch (rc) { + case OPAL_NO_MEM: + return -ENOMEM; + case OPAL_PARAMETER: + return -EINVAL; + case OPAL_I2C_ARBT_LOST: + return -EAGAIN; + case OPAL_I2C_TIMEOUT: + return -ETIMEDOUT; + case OPAL_I2C_NACK_RCVD: + return -ENXIO; + case OPAL_I2C_STOP_ERR: + return -EBUSY; + default: + return -EIO; + } +} + +static int i2c_opal_send_request(u32 bus_id, struct opal_i2c_request *req) +{ + struct opal_msg msg; + int token, rc; + + token = opal_async_get_token_interruptible(); + if (token < 0) { + if (token != -ERESTARTSYS) + pr_err("Failed to get the async token\n"); + + return token; + } + + rc = opal_i2c_request(token, bus_id, req); + if (rc != OPAL_ASYNC_COMPLETION) { + rc = i2c_opal_translate_error(rc); + goto exit; + } + + rc = opal_async_wait_response(token, &msg); + if (rc) + goto exit; + + rc = be64_to_cpu(msg.params[1]); + if (rc != OPAL_SUCCESS) { + rc = i2c_opal_translate_error(rc); + goto exit; + } + +exit: + opal_async_release_token(token); + return rc; +} + +static int i2c_opal_master_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, + int num) +{ + unsigned long opal_id = (unsigned long)adap->algo_data; + struct opal_i2c_request req; + int rc, i; + + /* We only support fairly simple combinations here of one + * or two messages + */ + memset(&req, 0, sizeof(req)); + switch(num) { + case 0: + return 0; + case 1: + req.type = (msgs[0].flags & I2C_M_RD) ? + OPAL_I2C_RAW_READ : OPAL_I2C_RAW_WRITE; + req.addr = cpu_to_be16(msgs[0].addr); + req.size = cpu_to_be32(msgs[0].len); + req.buffer_ra = cpu_to_be64(__pa(msgs[0].buf)); + break; + case 2: + /* For two messages, we basically support only simple + * smbus transactions of a write plus a read. We might + * want to allow also two writes but we'd have to bounce + * the data into a single buffer. + */ + if ((msgs[0].flags & I2C_M_RD) || !(msgs[1].flags & I2C_M_RD)) + return -EOPNOTSUPP; + if (msgs[0].len > 4) + return -EOPNOTSUPP; + if (msgs[0].addr != msgs[1].addr) + return -EOPNOTSUPP; + req.type = OPAL_I2C_SM_READ; + req.addr = cpu_to_be16(msgs[0].addr); + req.subaddr_sz = msgs[0].len; + for (i = 0; i < msgs[0].len; i++) + req.subaddr = (req.subaddr << 8) | msgs[0].buf[i]; + req.subaddr = cpu_to_be32(req.subaddr); + req.size = cpu_to_be32(msgs[1].len); + req.buffer_ra = cpu_to_be64(__pa(msgs[1].buf)); + break; + default: + return -EOPNOTSUPP; + } + + rc = i2c_opal_send_request(opal_id, &req); + if (rc) + return rc; + + return num; +} + +static int i2c_opal_smbus_xfer(struct i2c_adapter *adap, u16 addr, + unsigned short flags, char read_write, + u8 command, int size, union i2c_smbus_data *data) +{ + unsigned long opal_id = (unsigned long)adap->algo_data; + struct opal_i2c_request req; + u8 local[2]; + int rc; + + memset(&req, 0, sizeof(req)); + + req.addr = cpu_to_be16(addr); + switch (size) { + case I2C_SMBUS_BYTE: + req.buffer_ra = cpu_to_be64(__pa(&data->byte)); + req.size = cpu_to_be32(1); + /* Fall through */ + case I2C_SMBUS_QUICK: + req.type = (read_write == I2C_SMBUS_READ) ? + OPAL_I2C_RAW_READ : OPAL_I2C_RAW_WRITE; + break; + case I2C_SMBUS_BYTE_DATA: + req.buffer_ra = cpu_to_be64(__pa(&data->byte)); + req.size = cpu_to_be32(1); + req.subaddr = cpu_to_be32(command); + req.subaddr_sz = 1; + req.type = (read_write == I2C_SMBUS_READ) ? + OPAL_I2C_SM_READ : OPAL_I2C_SM_WRITE; + break; + case I2C_SMBUS_WORD_DATA: + if (!read_write) { + local[0] = data->word & 0xff; + local[1] = (data->word >> 8) & 0xff; + } + req.buffer_ra = cpu_to_be64(__pa(local)); + req.size = cpu_to_be32(2); + req.subaddr = cpu_to_be32(command); + req.subaddr_sz = 1; + req.type = (read_write == I2C_SMBUS_READ) ? + OPAL_I2C_SM_READ : OPAL_I2C_SM_WRITE; + break; + case I2C_SMBUS_I2C_BLOCK_DATA: + req.buffer_ra = cpu_to_be64(__pa(&data->block[1])); + req.size = cpu_to_be32(data->block[0]); + req.subaddr = cpu_to_be32(command); + req.subaddr_sz = 1; + req.type = (read_write == I2C_SMBUS_READ) ? + OPAL_I2C_SM_READ : OPAL_I2C_SM_WRITE; + break; + default: + return -EINVAL; + } + + rc = i2c_opal_send_request(opal_id, &req); + if (!rc && read_write && size == I2C_SMBUS_WORD_DATA) { + data->word = ((u16)local[1]) << 8; + data->word |= local[0]; + } + + return rc; +} + +static u32 i2c_opal_func(struct i2c_adapter *adapter) +{ + return I2C_FUNC_I2C | I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE | + I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA | + I2C_FUNC_SMBUS_I2C_BLOCK; +} + +static const struct i2c_algorithm i2c_opal_algo = { + .master_xfer = i2c_opal_master_xfer, + .smbus_xfer = i2c_opal_smbus_xfer, + .functionality = i2c_opal_func, +}; + +static int i2c_opal_probe(struct platform_device *pdev) +{ + struct i2c_adapter *adapter; + const char *pname; + u32 opal_id; + int rc; + + if (!pdev->dev.of_node) + return -ENODEV; + + rc = of_property_read_u32(pdev->dev.of_node, "ibm,opal-id", &opal_id); + if (rc) { + dev_err(&pdev->dev, "Missing ibm,opal-id property !\n"); + return -EIO; + } + + adapter = devm_kzalloc(&pdev->dev, sizeof(*adapter), GFP_KERNEL); + if (!adapter) + return -ENOMEM; + + adapter->algo = &i2c_opal_algo; + adapter->algo_data = (void *)(unsigned long)opal_id; + adapter->dev.parent = &pdev->dev; + adapter->dev.of_node = of_node_get(pdev->dev.of_node); + pname = of_get_property(pdev->dev.of_node, "ibm,port-name", NULL); + if (pname) + strlcpy(adapter->name, pname, sizeof(adapter->name)); + else + strlcpy(adapter->name, "opal", sizeof(adapter->name)); + + platform_set_drvdata(pdev, adapter); + rc = i2c_add_adapter(adapter); + if (rc) + dev_err(&pdev->dev, "Failed to register the i2c adapter\n"); + + return rc; +} + +static int i2c_opal_remove(struct platform_device *pdev) +{ + struct i2c_adapter *adapter = platform_get_drvdata(pdev); + + i2c_del_adapter(adapter); + + return 0; +} + +static const struct of_device_id i2c_opal_of_match[] = { + { + .compatible = "ibm,opal-i2c", + }, + { } +}; +MODULE_DEVICE_TABLE(of, i2c_opal_of_match); + +static struct platform_driver i2c_opal_driver = { + .probe = i2c_opal_probe, + .remove = i2c_opal_remove, + .driver = { + .name = "i2c-opal", + .of_match_table = i2c_opal_of_match, + }, +}; + +static int __init i2c_opal_init(void) +{ + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return -ENODEV; + + return platform_driver_register(&i2c_opal_driver); +} +module_init(i2c_opal_init); + +static void __exit i2c_opal_exit(void) +{ + return platform_driver_unregister(&i2c_opal_driver); +} +module_exit(i2c_opal_exit); + +MODULE_AUTHOR("Neelesh Gupta "); +MODULE_DESCRIPTION("IBM OPAL I2C driver"); +MODULE_LICENSE("GPL"); From 8117ac6a6c2fa0f847ff6a21a1f32c8d2c8501d0 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 10 Dec 2014 00:26:50 +0530 Subject: [PATCH 09/15] powerpc/powernv: Switch off MMU before entering nap/sleep/rvwinkle mode Currently, when going idle, we set the flag indicating that we are in nap mode (paca->kvm_hstate.hwthread_state) and then execute the nap (or sleep or rvwinkle) instruction, all with the MMU on. This is bad for two reasons: (a) the architecture specifies that those instructions must be executed with the MMU off, and in fact with only the SF, HV, ME and possibly RI bits set, and (b) this introduces a race, because as soon as we set the flag, another thread can switch the MMU to a guest context. If the race is lost, this thread will typically start looping on relocation-on ISIs at 0xc...4400. This fixes it by setting the MSR as required by the architecture before setting the flag or executing the nap/sleep/rvwinkle instruction. Cc: stable@vger.kernel.org [ shreyas@linux.vnet.ibm.com: Edited to handle LE ] Signed-off-by: Paul Mackerras Signed-off-by: Shreyas B. Prabhu Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/reg.h | 2 ++ arch/powerpc/kernel/idle_power7.S | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index c998279bd85b..a68ee15964b3 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -118,8 +118,10 @@ #define __MSR (MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_ISF |MSR_HV) #ifdef __BIG_ENDIAN__ #define MSR_ __MSR +#define MSR_IDLE (MSR_ME | MSR_SF | MSR_HV) #else #define MSR_ (__MSR | MSR_LE) +#define MSR_IDLE (MSR_ME | MSR_SF | MSR_HV | MSR_LE) #endif #define MSR_KERNEL (MSR_ | MSR_64BIT) #define MSR_USER32 (MSR_ | MSR_PR | MSR_EE) diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S index 18c0687e5ab3..e5aba6abbe6c 100644 --- a/arch/powerpc/kernel/idle_power7.S +++ b/arch/powerpc/kernel/idle_power7.S @@ -101,7 +101,23 @@ _GLOBAL(power7_powersave_common) std r9,_MSR(r1) std r1,PACAR1(r13) -_GLOBAL(power7_enter_nap_mode) + /* + * Go to real mode to do the nap, as required by the architecture. + * Also, we need to be in real mode before setting hwthread_state, + * because as soon as we do that, another thread can switch + * the MMU context to the guest. + */ + LOAD_REG_IMMEDIATE(r5, MSR_IDLE) + li r6, MSR_RI + andc r6, r9, r6 + LOAD_REG_ADDR(r7, power7_enter_nap_mode) + mtmsrd r6, 1 /* clear RI before setting SRR0/1 */ + mtspr SPRN_SRR0, r7 + mtspr SPRN_SRR1, r5 + rfid + + .globl power7_enter_nap_mode +power7_enter_nap_mode: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE /* Tell KVM we're napping */ li r4,KVM_HWTHREAD_IN_NAP From 8eb8ac89a364305d05ad16be983b7890eb462cc3 Mon Sep 17 00:00:00 2001 From: "Shreyas B. Prabhu" Date: Wed, 10 Dec 2014 00:26:51 +0530 Subject: [PATCH 10/15] powerpc/powernv: Enable Offline CPUs to enter deep idle states The secondary threads should enter deep idle states so as to gain maximum powersavings when the entire core is offline. To do so the offline path must be made aware of the available deepest idle state. Hence probe the device tree for the possible idle states in powernv core code and expose the deepest idle state through flags. Since the device tree is probed by the cpuidle driver as well, move the parameters required to discover the idle states into an appropriate common place to both the driver and the powernv core code. Another point is that fastsleep idle state may require workarounds in the kernel to function properly. This workaround is introduced in the subsequent patches. However neither the cpuidle driver or the hotplug path need be bothered about this workaround. They will be taken care of by the core powernv code. Originally-by: Srivatsa S. Bhat Signed-off-by: Preeti U. Murthy Signed-off-by: Shreyas B. Prabhu Reviewed-by: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Rafael J. Wysocki Cc: linux-pm@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/opal.h | 8 ++++ arch/powerpc/platforms/powernv/powernv.h | 2 + arch/powerpc/platforms/powernv/setup.c | 49 ++++++++++++++++++++++++ arch/powerpc/platforms/powernv/smp.c | 7 +++- drivers/cpuidle/cpuidle-powernv.c | 9 ++--- 5 files changed, 68 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 4095749c973f..6dedf9b05a86 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -168,6 +168,14 @@ struct opal_sg_list { #define OPAL_IPMI_RECV 108 #define OPAL_I2C_REQUEST 109 +/* Device tree flags */ + +/* Flags set in power-mgmt nodes in device tree if + * respective idle states are supported in the platform. + */ +#define OPAL_PM_NAP_ENABLED 0x00010000 +#define OPAL_PM_SLEEP_ENABLED 0x00020000 + #ifndef __ASSEMBLY__ #include diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index 6c8e2d188cd0..604c48e7879a 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -29,6 +29,8 @@ static inline u64 pnv_pci_dma_get_required_mask(struct pci_dev *pdev) } #endif +extern u32 pnv_get_supported_cpuidle_states(void); + extern void pnv_lpc_init(void); bool cpu_core_split_required(void); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 30b1c3e298a6..88e579e62a73 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -288,6 +288,55 @@ static void __init pnv_setup_machdep_rtas(void) } #endif /* CONFIG_PPC_POWERNV_RTAS */ +static u32 supported_cpuidle_states; + +u32 pnv_get_supported_cpuidle_states(void) +{ + return supported_cpuidle_states; +} + +static int __init pnv_init_idle_states(void) +{ + struct device_node *power_mgt; + int dt_idle_states; + const __be32 *idle_state_flags; + u32 len_flags, flags; + int i; + + supported_cpuidle_states = 0; + + if (cpuidle_disable != IDLE_NO_OVERRIDE) + return 0; + + if (!firmware_has_feature(FW_FEATURE_OPALv3)) + return 0; + + power_mgt = of_find_node_by_path("/ibm,opal/power-mgt"); + if (!power_mgt) { + pr_warn("opal: PowerMgmt Node not found\n"); + return 0; + } + + idle_state_flags = of_get_property(power_mgt, + "ibm,cpu-idle-state-flags", &len_flags); + if (!idle_state_flags) { + pr_warn("DT-PowerMgmt: missing ibm,cpu-idle-state-flags\n"); + return 0; + } + + dt_idle_states = len_flags / sizeof(u32); + + for (i = 0; i < dt_idle_states; i++) { + flags = be32_to_cpu(idle_state_flags[i]); + supported_cpuidle_states |= flags; + } + + return 0; +} + +subsys_initcall(pnv_init_idle_states); + + static int __init pnv_probe(void) { unsigned long root = of_get_flat_dt_root(); diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index b716f666e48a..83299ef2dc3d 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -150,6 +150,7 @@ static void pnv_smp_cpu_kill_self(void) { unsigned int cpu; unsigned long srr1; + u32 idle_states; /* Standard hot unplug procedure */ local_irq_disable(); @@ -160,13 +161,17 @@ static void pnv_smp_cpu_kill_self(void) generic_set_cpu_dead(cpu); smp_wmb(); + idle_states = pnv_get_supported_cpuidle_states(); /* We don't want to take decrementer interrupts while we are offline, * so clear LPCR:PECE1. We keep PECE2 enabled. */ mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1); while (!generic_check_cpu_restart(cpu)) { ppc64_runlatch_off(); - srr1 = power7_nap(1); + if (idle_states & OPAL_PM_SLEEP_ENABLED) + srr1 = power7_sleep(); + else + srr1 = power7_nap(1); ppc64_runlatch_on(); /* diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 7d3a3497dd4c..0a7d827897e4 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -16,13 +16,10 @@ #include #include +#include #include -/* Flags and constants used in PowerNV platform */ - #define MAX_POWERNV_IDLE_STATES 8 -#define IDLE_USE_INST_NAP 0x00010000 /* Use nap instruction */ -#define IDLE_USE_INST_SLEEP 0x00020000 /* Use sleep instruction */ struct cpuidle_driver powernv_idle_driver = { .name = "powernv_idle", @@ -198,7 +195,7 @@ static int powernv_add_idle_states(void) * target residency to be 10x exit_latency */ latency_ns = be32_to_cpu(idle_state_latency[i]); - if (flags & IDLE_USE_INST_NAP) { + if (flags & OPAL_PM_NAP_ENABLED) { /* Add NAP state */ strcpy(powernv_states[nr_idle_states].name, "Nap"); strcpy(powernv_states[nr_idle_states].desc, "Nap"); @@ -211,7 +208,7 @@ static int powernv_add_idle_states(void) nr_idle_states++; } - if (flags & IDLE_USE_INST_SLEEP) { + if (flags & OPAL_PM_SLEEP_ENABLED) { /* Add FASTSLEEP state */ strcpy(powernv_states[nr_idle_states].name, "FastSleep"); strcpy(powernv_states[nr_idle_states].desc, "FastSleep"); From 7cba160ad789a3ad7e68b92bf20eaad6ed171f80 Mon Sep 17 00:00:00 2001 From: "Shreyas B. Prabhu" Date: Wed, 10 Dec 2014 00:26:52 +0530 Subject: [PATCH 11/15] powernv/cpuidle: Redesign idle states management Deep idle states like sleep and winkle are per core idle states. A core enters these states only when all the threads enter either the particular idle state or a deeper one. There are tasks like fastsleep hardware bug workaround and hypervisor core state save which have to be done only by the last thread of the core entering deep idle state and similarly tasks like timebase resync, hypervisor core register restore that have to be done only by the first thread waking up from these state. The current idle state management does not have a way to distinguish the first/last thread of the core waking/entering idle states. Tasks like timebase resync are done for all the threads. This is not only is suboptimal, but can cause functionality issues when subcores and kvm is involved. This patch adds the necessary infrastructure to track idle states of threads in a per-core structure. It uses this info to perform tasks like fastsleep workaround and timebase resync only once per core. Signed-off-by: Shreyas B. Prabhu Originally-by: Preeti U. Murthy Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Rafael J. Wysocki Cc: linux-pm@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/cpuidle.h | 20 ++ arch/powerpc/include/asm/opal.h | 2 + arch/powerpc/include/asm/paca.h | 8 + arch/powerpc/include/asm/processor.h | 2 +- arch/powerpc/kernel/asm-offsets.c | 9 + arch/powerpc/kernel/exceptions-64s.S | 24 ++- arch/powerpc/kernel/idle_power7.S | 199 ++++++++++++++---- .../powerpc/platforms/powernv/opal-wrappers.S | 37 ++++ arch/powerpc/platforms/powernv/setup.c | 49 ++++- arch/powerpc/platforms/powernv/smp.c | 3 +- drivers/cpuidle/cpuidle-powernv.c | 3 +- 11 files changed, 297 insertions(+), 59 deletions(-) create mode 100644 arch/powerpc/include/asm/cpuidle.h diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h new file mode 100644 index 000000000000..d2f99ca1e3a6 --- /dev/null +++ b/arch/powerpc/include/asm/cpuidle.h @@ -0,0 +1,20 @@ +#ifndef _ASM_POWERPC_CPUIDLE_H +#define _ASM_POWERPC_CPUIDLE_H + +#ifdef CONFIG_PPC_POWERNV +/* Used in powernv idle state management */ +#define PNV_THREAD_RUNNING 0 +#define PNV_THREAD_NAP 1 +#define PNV_THREAD_SLEEP 2 +#define PNV_THREAD_WINKLE 3 +#define PNV_CORE_IDLE_LOCK_BIT 0x100 +#define PNV_CORE_IDLE_THREAD_BITS 0x0FF + +#ifndef __ASSEMBLY__ +extern u32 pnv_fastsleep_workaround_at_entry[]; +extern u32 pnv_fastsleep_workaround_at_exit[]; +#endif + +#endif + +#endif diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 6dedf9b05a86..3dea31c1080c 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -160,6 +160,7 @@ struct opal_sg_list { #define OPAL_PCI_ERR_INJECT 96 #define OPAL_PCI_EEH_FREEZE_SET 97 #define OPAL_HANDLE_HMI 98 +#define OPAL_CONFIG_CPU_IDLE_STATE 99 #define OPAL_REGISTER_DUMP_REGION 101 #define OPAL_UNREGISTER_DUMP_REGION 102 #define OPAL_WRITE_TPO 103 @@ -175,6 +176,7 @@ struct opal_sg_list { */ #define OPAL_PM_NAP_ENABLED 0x00010000 #define OPAL_PM_SLEEP_ENABLED 0x00020000 +#define OPAL_PM_SLEEP_ENABLED_ER1 0x00080000 #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 24a386cbb928..a0a16847bd40 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -152,6 +152,14 @@ struct paca_struct { u64 tm_scratch; /* TM scratch area for reclaim */ #endif +#ifdef CONFIG_PPC_POWERNV + /* Per-core mask tracking idle threads and a lock bit-[L][TTTTTTTT] */ + u32 *core_idle_state_ptr; + u8 thread_idle_state; /* PNV_THREAD_RUNNING/NAP/SLEEP */ + /* Mask to indicate thread id in core */ + u8 thread_mask; +#endif + #ifdef CONFIG_PPC_BOOK3S_64 /* Exclusive emergency stack pointer for machine check exception. */ void *mc_emergency_sp; diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 29c3798cf800..f5c45b37c0d4 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -452,7 +452,7 @@ enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF}; extern int powersave_nap; /* set if nap mode can be used in idle loop */ extern unsigned long power7_nap(int check_irq); -extern void power7_sleep(void); +extern unsigned long power7_sleep(void); extern void flush_instruction_cache(void); extern void hard_reset_now(void); extern void poweroff_now(void); diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index c161ef3f28a1..bbd27fe0c039 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -726,5 +726,14 @@ int main(void) arch.timing_last_enter.tv32.tbl)); #endif +#ifdef CONFIG_PPC_POWERNV + DEFINE(PACA_CORE_IDLE_STATE_PTR, + offsetof(struct paca_struct, core_idle_state_ptr)); + DEFINE(PACA_THREAD_IDLE_STATE, + offsetof(struct paca_struct, thread_idle_state)); + DEFINE(PACA_THREAD_MASK, + offsetof(struct paca_struct, thread_mask)); +#endif + return 0; } diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index db08382e19f1..289fe718ecd4 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -15,6 +15,7 @@ #include #include #include +#include /* * We layout physical memory as follows: @@ -109,15 +110,19 @@ BEGIN_FTR_SECTION rlwinm. r13,r13,47-31,30,31 beq 9f - /* waking up from powersave (nap) state */ - cmpwi cr1,r13,2 - /* Total loss of HV state is fatal, we could try to use the - * PIR to locate a PACA, then use an emergency stack etc... - * OPAL v3 based powernv platforms have new idle states - * which fall in this catagory. - */ - bgt cr1,8f + cmpwi cr3,r13,2 + GET_PACA(r13) + lbz r0,PACA_THREAD_IDLE_STATE(r13) + cmpwi cr2,r0,PNV_THREAD_NAP + bgt cr2,8f /* Either sleep or Winkle */ + + /* Waking up from nap should not cause hypervisor state loss */ + bgt cr3,. + + /* Waking up from nap */ + li r0,PNV_THREAD_RUNNING + stb r0,PACA_THREAD_IDLE_STATE(r13) /* Clear thread state */ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE li r0,KVM_HWTHREAD_IN_KERNEL @@ -133,7 +138,7 @@ BEGIN_FTR_SECTION /* Return SRR1 from power7_nap() */ mfspr r3,SPRN_SRR1 - beq cr1,2f + beq cr3,2f b power7_wakeup_noloss 2: b power7_wakeup_loss @@ -1382,6 +1387,7 @@ machine_check_handle_early: MACHINE_CHECK_HANDLER_WINDUP GET_PACA(r13) ld r1,PACAR1(r13) + li r3,PNV_THREAD_NAP b power7_enter_nap_mode 4: #endif diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S index e5aba6abbe6c..0f2c113c8ca5 100644 --- a/arch/powerpc/kernel/idle_power7.S +++ b/arch/powerpc/kernel/idle_power7.S @@ -18,6 +18,7 @@ #include #include #include +#include #undef DEBUG @@ -37,8 +38,7 @@ /* * Pass requested state in r3: - * 0 - nap - * 1 - sleep + * r3 - PNV_THREAD_NAP/SLEEP/WINKLE * * To check IRQ_HAPPENED in r4 * 0 - don't check @@ -123,12 +123,58 @@ power7_enter_nap_mode: li r4,KVM_HWTHREAD_IN_NAP stb r4,HSTATE_HWTHREAD_STATE(r13) #endif - cmpwi cr0,r3,1 - beq 2f + stb r3,PACA_THREAD_IDLE_STATE(r13) + cmpwi cr1,r3,PNV_THREAD_SLEEP + bge cr1,2f IDLE_STATE_ENTER_SEQ(PPC_NAP) /* No return */ -2: IDLE_STATE_ENTER_SEQ(PPC_SLEEP) - /* No return */ +2: + /* Sleep or winkle */ + lbz r7,PACA_THREAD_MASK(r13) + ld r14,PACA_CORE_IDLE_STATE_PTR(r13) +lwarx_loop1: + lwarx r15,0,r14 + andc r15,r15,r7 /* Clear thread bit */ + + andi. r15,r15,PNV_CORE_IDLE_THREAD_BITS + +/* + * If cr0 = 0, then current thread is the last thread of the core entering + * sleep. Last thread needs to execute the hardware bug workaround code if + * required by the platform. + * Make the workaround call unconditionally here. The below branch call is + * patched out when the idle states are discovered if the platform does not + * require it. + */ +.global pnv_fastsleep_workaround_at_entry +pnv_fastsleep_workaround_at_entry: + beq fastsleep_workaround_at_entry + + stwcx. r15,0,r14 + bne- lwarx_loop1 + isync + +common_enter: /* common code for all the threads entering sleep */ + IDLE_STATE_ENTER_SEQ(PPC_SLEEP) + +fastsleep_workaround_at_entry: + ori r15,r15,PNV_CORE_IDLE_LOCK_BIT + stwcx. r15,0,r14 + bne- lwarx_loop1 + isync + + /* Fast sleep workaround */ + li r3,1 + li r4,1 + li r0,OPAL_CONFIG_CPU_IDLE_STATE + bl opal_call_realmode + + /* Clear Lock bit */ + li r0,0 + lwsync + stw r0,0(r14) + b common_enter + _GLOBAL(power7_idle) /* Now check if user or arch enabled NAP mode */ @@ -141,49 +187,16 @@ _GLOBAL(power7_idle) _GLOBAL(power7_nap) mr r4,r3 - li r3,0 + li r3,PNV_THREAD_NAP b power7_powersave_common /* No return */ _GLOBAL(power7_sleep) - li r3,1 + li r3,PNV_THREAD_SLEEP li r4,1 b power7_powersave_common /* No return */ -/* - * Make opal call in realmode. This is a generic function to be called - * from realmode from reset vector. It handles endianess. - * - * r13 - paca pointer - * r1 - stack pointer - * r3 - opal token - */ -opal_call_realmode: - mflr r12 - std r12,_LINK(r1) - ld r2,PACATOC(r13) - /* Set opal return address */ - LOAD_REG_ADDR(r0,return_from_opal_call) - mtlr r0 - /* Handle endian-ness */ - li r0,MSR_LE - mfmsr r12 - andc r12,r12,r0 - mtspr SPRN_HSRR1,r12 - mr r0,r3 /* Move opal token to r0 */ - LOAD_REG_ADDR(r11,opal) - ld r12,8(r11) - ld r2,0(r11) - mtspr SPRN_HSRR0,r12 - hrfid - -return_from_opal_call: - FIXUP_ENDIAN - ld r0,_LINK(r1) - mtlr r0 - blr - #define CHECK_HMI_INTERRUPT \ mfspr r0,SPRN_SRR1; \ BEGIN_FTR_SECTION_NESTED(66); \ @@ -197,7 +210,7 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ ld r2,PACATOC(r13); \ ld r1,PACAR1(r13); \ std r3,ORIG_GPR3(r1); /* Save original r3 */ \ - li r3,OPAL_HANDLE_HMI; /* Pass opal token argument*/ \ + li r0,OPAL_HANDLE_HMI; /* Pass opal token argument*/ \ bl opal_call_realmode; \ ld r3,ORIG_GPR3(r1); /* Restore original r3 */ \ 20: nop; @@ -206,16 +219,105 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ _GLOBAL(power7_wakeup_tb_loss) ld r2,PACATOC(r13); ld r1,PACAR1(r13) + /* + * Before entering any idle state, the NVGPRs are saved in the stack + * and they are restored before switching to the process context. Hence + * until they are restored, they are free to be used. + * + * Save SRR1 in a NVGPR as it might be clobbered in opal_call_realmode + * (called in CHECK_HMI_INTERRUPT). SRR1 is required to determine the + * wakeup reason if we branch to kvm_start_guest. + */ + mfspr r16,SPRN_SRR1 BEGIN_FTR_SECTION CHECK_HMI_INTERRUPT END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) - /* Time base re-sync */ - li r3,OPAL_RESYNC_TIMEBASE - bl opal_call_realmode; + lbz r7,PACA_THREAD_MASK(r13) + ld r14,PACA_CORE_IDLE_STATE_PTR(r13) +lwarx_loop2: + lwarx r15,0,r14 + andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT + /* + * Lock bit is set in one of the 2 cases- + * a. In the sleep/winkle enter path, the last thread is executing + * fastsleep workaround code. + * b. In the wake up path, another thread is executing fastsleep + * workaround undo code or resyncing timebase or restoring context + * In either case loop until the lock bit is cleared. + */ + bne core_idle_lock_held + + cmpwi cr2,r15,0 + or r15,r15,r7 /* Set thread bit */ + + beq cr2,first_thread + + /* Not first thread in core to wake up */ + stwcx. r15,0,r14 + bne- lwarx_loop2 + isync + b common_exit + +core_idle_lock_held: + HMT_LOW +core_idle_lock_loop: + lwz r15,0(14) + andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT + bne core_idle_lock_loop + HMT_MEDIUM + b lwarx_loop2 + +first_thread: + /* First thread in core to wakeup */ + ori r15,r15,PNV_CORE_IDLE_LOCK_BIT + stwcx. r15,0,r14 + bne- lwarx_loop2 + isync + + /* + * First thread in the core waking up from fastsleep. It needs to + * call the fastsleep workaround code if the platform requires it. + * Call it unconditionally here. The below branch instruction will + * be patched out when the idle states are discovered if platform + * does not require workaround. + */ +.global pnv_fastsleep_workaround_at_exit +pnv_fastsleep_workaround_at_exit: + b fastsleep_workaround_at_exit + +timebase_resync: + /* Do timebase resync if we are waking up from sleep. Use cr3 value + * set in exceptions-64s.S */ + ble cr3,clear_lock + /* Time base re-sync */ + li r0,OPAL_RESYNC_TIMEBASE + bl opal_call_realmode; /* TODO: Check r3 for failure */ +clear_lock: + andi. r15,r15,PNV_CORE_IDLE_THREAD_BITS + lwsync + stw r15,0(r14) + +common_exit: + li r5,PNV_THREAD_RUNNING + stb r5,PACA_THREAD_IDLE_STATE(r13) + + mtspr SPRN_SRR1,r16 +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + li r0,KVM_HWTHREAD_IN_KERNEL + stb r0,HSTATE_HWTHREAD_STATE(r13) + /* Order setting hwthread_state vs. testing hwthread_req */ + sync + lbz r0,HSTATE_HWTHREAD_REQ(r13) + cmpwi r0,0 + beq 6f + b kvm_start_guest +6: +#endif + REST_NVGPRS(r1) REST_GPR(2, r1) ld r3,_CCR(r1) @@ -228,6 +330,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) mtspr SPRN_SRR0,r5 rfid +fastsleep_workaround_at_exit: + li r3,1 + li r4,0 + li r0,OPAL_CONFIG_CPU_IDLE_STATE + bl opal_call_realmode + b timebase_resync + /* * R3 here contains the value that will be returned to the caller * of power7_nap. diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 2111e08d406b..78289ed7058c 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -158,6 +158,43 @@ opal_tracepoint_return: blr #endif +/* + * Make opal call in realmode. This is a generic function to be called + * from realmode. It handles endianness. + * + * r13 - paca pointer + * r1 - stack pointer + * r0 - opal token + */ +_GLOBAL(opal_call_realmode) + mflr r12 + std r12,PPC_LR_STKOFF(r1) + ld r2,PACATOC(r13) + /* Set opal return address */ + LOAD_REG_ADDR(r12,return_from_opal_call) + mtlr r12 + + mfmsr r12 +#ifdef __LITTLE_ENDIAN__ + /* Handle endian-ness */ + li r11,MSR_LE + andc r12,r12,r11 +#endif + mtspr SPRN_HSRR1,r12 + LOAD_REG_ADDR(r11,opal) + ld r12,8(r11) + ld r2,0(r11) + mtspr SPRN_HSRR0,r12 + hrfid + +return_from_opal_call: +#ifdef __LITTLE_ENDIAN__ + FIXUP_ENDIAN +#endif + ld r12,PPC_LR_STKOFF(r1) + mtlr r12 + blr + OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL); OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 88e579e62a73..2e9b53bb73e2 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -36,6 +36,9 @@ #include #include #include +#include +#include +#include #include "powernv.h" @@ -290,10 +293,45 @@ static void __init pnv_setup_machdep_rtas(void) static u32 supported_cpuidle_states; +static void pnv_alloc_idle_core_states(void) +{ + int i, j; + int nr_cores = cpu_nr_cores(); + u32 *core_idle_state; + + /* + * core_idle_state - First 8 bits track the idle state of each thread + * of the core. The 8th bit is the lock bit. Initially all thread bits + * are set. They are cleared when the thread enters deep idle state + * like sleep and winkle. Initially the lock bit is cleared. + * The lock bit has 2 purposes + * a. While the first thread is restoring core state, it prevents + * other threads in the core from switching to process context. + * b. While the last thread in the core is saving the core state, it + * prevents a different thread from waking up. + */ + for (i = 0; i < nr_cores; i++) { + int first_cpu = i * threads_per_core; + int node = cpu_to_node(first_cpu); + + core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node); + *core_idle_state = PNV_CORE_IDLE_THREAD_BITS; + + for (j = 0; j < threads_per_core; j++) { + int cpu = first_cpu + j; + + paca[cpu].core_idle_state_ptr = core_idle_state; + paca[cpu].thread_idle_state = PNV_THREAD_RUNNING; + paca[cpu].thread_mask = 1 << j; + } + } +} + u32 pnv_get_supported_cpuidle_states(void) { return supported_cpuidle_states; } +EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states); static int __init pnv_init_idle_states(void) { @@ -330,13 +368,20 @@ static int __init pnv_init_idle_states(void) flags = be32_to_cpu(idle_state_flags[i]); supported_cpuidle_states |= flags; } - + if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { + patch_instruction( + (unsigned int *)pnv_fastsleep_workaround_at_entry, + PPC_INST_NOP); + patch_instruction( + (unsigned int *)pnv_fastsleep_workaround_at_exit, + PPC_INST_NOP); + } + pnv_alloc_idle_core_states(); return 0; } subsys_initcall(pnv_init_idle_states); - static int __init pnv_probe(void) { unsigned long root = of_get_flat_dt_root(); diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 83299ef2dc3d..c0691d0fb385 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -168,7 +168,8 @@ static void pnv_smp_cpu_kill_self(void) mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1); while (!generic_check_cpu_restart(cpu)) { ppc64_runlatch_off(); - if (idle_states & OPAL_PM_SLEEP_ENABLED) + if ((idle_states & OPAL_PM_SLEEP_ENABLED) || + (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) srr1 = power7_sleep(); else srr1 = power7_nap(1); diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 0a7d827897e4..a489b56e92df 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -208,7 +208,8 @@ static int powernv_add_idle_states(void) nr_idle_states++; } - if (flags & OPAL_PM_SLEEP_ENABLED) { + if (flags & OPAL_PM_SLEEP_ENABLED || + flags & OPAL_PM_SLEEP_ENABLED_ER1) { /* Add FASTSLEEP state */ strcpy(powernv_states[nr_idle_states].name, "FastSleep"); strcpy(powernv_states[nr_idle_states].desc, "FastSleep"); From 77b54e9f213f76a23736940cf94bcd765fc00f40 Mon Sep 17 00:00:00 2001 From: "Shreyas B. Prabhu" Date: Wed, 10 Dec 2014 00:26:53 +0530 Subject: [PATCH 12/15] powernv/powerpc: Add winkle support for offline cpus Winkle is a deep idle state supported in power8 chips. A core enters winkle when all the threads of the core enter winkle. In this state power supply to the entire chiplet i.e core, private L2 and private L3 is turned off. As a result it gives higher powersavings compared to sleep. But entering winkle results in a total hypervisor state loss. Hence the hypervisor context has to be preserved before entering winkle and restored upon wake up. Power-on Reset Engine (PORE) is a dedicated engine which is responsible for powering on the chiplet during wake up. It can be programmed to restore the register contests of a few specific registers. This patch uses PORE to restore register state wherever possible and uses stack to save and restore rest of the necessary registers. With hypervisor state restore things fall under three categories- per-core state, per-subcore state and per-thread state. To manage this, extend the infrastructure introduced for sleep. Mainly we add a paca variable subcore_sibling_mask. Using this and the core_idle_state we can distingush first thread in core and subcore. Signed-off-by: Shreyas B. Prabhu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/opal.h | 3 + arch/powerpc/include/asm/paca.h | 2 + arch/powerpc/include/asm/ppc-opcode.h | 2 + arch/powerpc/include/asm/processor.h | 1 + arch/powerpc/include/asm/reg.h | 2 + arch/powerpc/kernel/asm-offsets.c | 2 + arch/powerpc/kernel/exceptions-64s.S | 13 +- arch/powerpc/kernel/idle_power7.S | 145 +++++++++++++++++- .../powerpc/platforms/powernv/opal-wrappers.S | 1 + arch/powerpc/platforms/powernv/setup.c | 72 +++++++++ arch/powerpc/platforms/powernv/smp.c | 7 +- arch/powerpc/platforms/powernv/subcore.c | 34 ++++ arch/powerpc/platforms/powernv/subcore.h | 9 +- 13 files changed, 281 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 3dea31c1080c..eb95b675109b 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -161,6 +161,7 @@ struct opal_sg_list { #define OPAL_PCI_EEH_FREEZE_SET 97 #define OPAL_HANDLE_HMI 98 #define OPAL_CONFIG_CPU_IDLE_STATE 99 +#define OPAL_SLW_SET_REG 100 #define OPAL_REGISTER_DUMP_REGION 101 #define OPAL_UNREGISTER_DUMP_REGION 102 #define OPAL_WRITE_TPO 103 @@ -176,6 +177,7 @@ struct opal_sg_list { */ #define OPAL_PM_NAP_ENABLED 0x00010000 #define OPAL_PM_SLEEP_ENABLED 0x00020000 +#define OPAL_PM_WINKLE_ENABLED 0x00040000 #define OPAL_PM_SLEEP_ENABLED_ER1 0x00080000 #ifndef __ASSEMBLY__ @@ -913,6 +915,7 @@ int64_t opal_sensor_read(uint32_t sensor_hndl, int token, __be32 *sensor_data); int64_t opal_handle_hmi(void); int64_t opal_register_dump_region(uint32_t id, uint64_t start, uint64_t end); int64_t opal_unregister_dump_region(uint32_t id); +int64_t opal_slw_set_reg(uint64_t cpu_pir, uint64_t sprn, uint64_t val); int64_t opal_pci_set_phb_cxl_mode(uint64_t phb_id, uint64_t mode, uint64_t pe_number); int64_t opal_ipmi_send(uint64_t interface, struct opal_ipmi_msg *msg, uint64_t msg_len); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index a0a16847bd40..e5f22c6c4bf9 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -158,6 +158,8 @@ struct paca_struct { u8 thread_idle_state; /* PNV_THREAD_RUNNING/NAP/SLEEP */ /* Mask to indicate thread id in core */ u8 thread_mask; + /* Mask to denote subcore sibling threads */ + u8 subcore_sibling_mask; #endif #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 6f8536208049..5155be7c0d48 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -194,6 +194,7 @@ #define PPC_INST_NAP 0x4c000364 #define PPC_INST_SLEEP 0x4c0003a4 +#define PPC_INST_WINKLE 0x4c0003e4 /* A2 specific instructions */ #define PPC_INST_ERATWE 0x7c0001a6 @@ -374,6 +375,7 @@ #define PPC_NAP stringify_in_c(.long PPC_INST_NAP) #define PPC_SLEEP stringify_in_c(.long PPC_INST_SLEEP) +#define PPC_WINKLE stringify_in_c(.long PPC_INST_WINKLE) /* BHRB instructions */ #define PPC_CLRBHRB stringify_in_c(.long PPC_INST_CLRBHRB) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index f5c45b37c0d4..bf117d8fb45f 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -453,6 +453,7 @@ enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF}; extern int powersave_nap; /* set if nap mode can be used in idle loop */ extern unsigned long power7_nap(int check_irq); extern unsigned long power7_sleep(void); +extern unsigned long power7_winkle(void); extern void flush_instruction_cache(void); extern void hard_reset_now(void); extern void poweroff_now(void); diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index a68ee15964b3..1c874fb533bb 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -373,6 +373,7 @@ #define SPRN_DBAT7L 0x23F /* Data BAT 7 Lower Register */ #define SPRN_DBAT7U 0x23E /* Data BAT 7 Upper Register */ #define SPRN_PPR 0x380 /* SMT Thread status Register */ +#define SPRN_TSCR 0x399 /* Thread Switch Control Register */ #define SPRN_DEC 0x016 /* Decrement Register */ #define SPRN_DER 0x095 /* Debug Enable Regsiter */ @@ -730,6 +731,7 @@ #define SPRN_BESCR 806 /* Branch event status and control register */ #define BESCR_GE 0x8000000000000000ULL /* Global Enable */ #define SPRN_WORT 895 /* Workload optimization register - thread */ +#define SPRN_WORC 863 /* Workload optimization register - core */ #define SPRN_PMC1 787 #define SPRN_PMC2 788 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index bbd27fe0c039..f68de7a73faa 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -733,6 +733,8 @@ int main(void) offsetof(struct paca_struct, thread_idle_state)); DEFINE(PACA_THREAD_MASK, offsetof(struct paca_struct, thread_mask)); + DEFINE(PACA_SUBCORE_SIBLING_MASK, + offsetof(struct paca_struct, subcore_sibling_mask)); #endif return 0; diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 289fe718ecd4..c2df8150bd7a 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -102,9 +102,7 @@ system_reset_pSeries: #ifdef CONFIG_PPC_P7_NAP BEGIN_FTR_SECTION /* Running native on arch 2.06 or later, check if we are - * waking up from nap. We only handle no state loss and - * supervisor state loss. We do -not- handle hypervisor - * state loss at this time. + * waking up from nap/sleep/winkle. */ mfspr r13,SPRN_SRR1 rlwinm. r13,r13,47-31,30,31 @@ -112,7 +110,16 @@ BEGIN_FTR_SECTION cmpwi cr3,r13,2 + /* + * Check if last bit of HSPGR0 is set. This indicates whether we are + * waking up from winkle. + */ GET_PACA(r13) + clrldi r5,r13,63 + clrrdi r13,r13,1 + cmpwi cr4,r5,1 + mtspr SPRN_HSPRG0,r13 + lbz r0,PACA_THREAD_IDLE_STATE(r13) cmpwi cr2,r0,PNV_THREAD_NAP bgt cr2,8f /* Either sleep or Winkle */ diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S index 0f2c113c8ca5..05adc8bbdef8 100644 --- a/arch/powerpc/kernel/idle_power7.S +++ b/arch/powerpc/kernel/idle_power7.S @@ -19,9 +19,24 @@ #include #include #include +#include #undef DEBUG +/* + * Use unused space in the interrupt stack to save and restore + * registers for winkle support. + */ +#define _SDR1 GPR3 +#define _RPR GPR4 +#define _SPURR GPR5 +#define _PURR GPR6 +#define _TSCR GPR7 +#define _DSCR GPR8 +#define _AMOR GPR9 +#define _WORT GPR10 +#define _WORC GPR11 + /* Idle state entry routines */ #define IDLE_STATE_ENTER_SEQ(IDLE_INST) \ @@ -124,8 +139,8 @@ power7_enter_nap_mode: stb r4,HSTATE_HWTHREAD_STATE(r13) #endif stb r3,PACA_THREAD_IDLE_STATE(r13) - cmpwi cr1,r3,PNV_THREAD_SLEEP - bge cr1,2f + cmpwi cr3,r3,PNV_THREAD_SLEEP + bge cr3,2f IDLE_STATE_ENTER_SEQ(PPC_NAP) /* No return */ 2: @@ -154,7 +169,8 @@ pnv_fastsleep_workaround_at_entry: bne- lwarx_loop1 isync -common_enter: /* common code for all the threads entering sleep */ +common_enter: /* common code for all the threads entering sleep or winkle */ + bgt cr3,enter_winkle IDLE_STATE_ENTER_SEQ(PPC_SLEEP) fastsleep_workaround_at_entry: @@ -175,6 +191,30 @@ fastsleep_workaround_at_entry: stw r0,0(r14) b common_enter +enter_winkle: + /* + * Note all register i.e per-core, per-subcore or per-thread is saved + * here since any thread in the core might wake up first + */ + mfspr r3,SPRN_SDR1 + std r3,_SDR1(r1) + mfspr r3,SPRN_RPR + std r3,_RPR(r1) + mfspr r3,SPRN_SPURR + std r3,_SPURR(r1) + mfspr r3,SPRN_PURR + std r3,_PURR(r1) + mfspr r3,SPRN_TSCR + std r3,_TSCR(r1) + mfspr r3,SPRN_DSCR + std r3,_DSCR(r1) + mfspr r3,SPRN_AMOR + std r3,_AMOR(r1) + mfspr r3,SPRN_WORT + std r3,_WORT(r1) + mfspr r3,SPRN_WORC + std r3,_WORC(r1) + IDLE_STATE_ENTER_SEQ(PPC_WINKLE) _GLOBAL(power7_idle) /* Now check if user or arch enabled NAP mode */ @@ -197,6 +237,12 @@ _GLOBAL(power7_sleep) b power7_powersave_common /* No return */ +_GLOBAL(power7_winkle) + li r3,3 + li r4,1 + b power7_powersave_common + /* No return */ + #define CHECK_HMI_INTERRUPT \ mfspr r0,SPRN_SRR1; \ BEGIN_FTR_SECTION_NESTED(66); \ @@ -250,11 +296,23 @@ lwarx_loop2: bne core_idle_lock_held cmpwi cr2,r15,0 + lbz r4,PACA_SUBCORE_SIBLING_MASK(r13) + and r4,r4,r15 + cmpwi cr1,r4,0 /* Check if first in subcore */ + + /* + * At this stage + * cr1 - 0b0100 if first thread to wakeup in subcore + * cr2 - 0b0100 if first thread to wakeup in core + * cr3- 0b0010 if waking up from sleep or winkle + * cr4 - 0b0100 if waking up from winkle + */ + or r15,r15,r7 /* Set thread bit */ - beq cr2,first_thread + beq cr1,first_thread_in_subcore - /* Not first thread in core to wake up */ + /* Not first thread in subcore to wake up */ stwcx. r15,0,r14 bne- lwarx_loop2 isync @@ -269,13 +327,36 @@ core_idle_lock_loop: HMT_MEDIUM b lwarx_loop2 -first_thread: - /* First thread in core to wakeup */ +first_thread_in_subcore: + /* First thread in subcore to wakeup */ ori r15,r15,PNV_CORE_IDLE_LOCK_BIT stwcx. r15,0,r14 bne- lwarx_loop2 isync + /* + * If waking up from sleep, subcore state is not lost. Hence + * skip subcore state restore + */ + bne cr4,subcore_state_restored + + /* Restore per-subcore state */ + ld r4,_SDR1(r1) + mtspr SPRN_SDR1,r4 + ld r4,_RPR(r1) + mtspr SPRN_RPR,r4 + ld r4,_AMOR(r1) + mtspr SPRN_AMOR,r4 + +subcore_state_restored: + /* + * Check if the thread is also the first thread in the core. If not, + * skip to clear_lock. + */ + bne cr2,clear_lock + +first_thread_in_core: + /* * First thread in the core waking up from fastsleep. It needs to * call the fastsleep workaround code if the platform requires it. @@ -296,12 +377,62 @@ timebase_resync: bl opal_call_realmode; /* TODO: Check r3 for failure */ + /* + * If waking up from sleep, per core state is not lost, skip to + * clear_lock. + */ + bne cr4,clear_lock + + /* Restore per core state */ + ld r4,_TSCR(r1) + mtspr SPRN_TSCR,r4 + ld r4,_WORC(r1) + mtspr SPRN_WORC,r4 + clear_lock: andi. r15,r15,PNV_CORE_IDLE_THREAD_BITS lwsync stw r15,0(r14) common_exit: + /* + * Common to all threads. + * + * If waking up from sleep, hypervisor state is not lost. Hence + * skip hypervisor state restore. + */ + bne cr4,hypervisor_state_restored + + /* Waking up from winkle */ + + /* Restore per thread state */ + bl __restore_cpu_power8 + + /* Restore SLB from PACA */ + ld r8,PACA_SLBSHADOWPTR(r13) + + .rept SLB_NUM_BOLTED + li r3, SLBSHADOW_SAVEAREA + LDX_BE r5, r8, r3 + addi r3, r3, 8 + LDX_BE r6, r8, r3 + andis. r7,r5,SLB_ESID_V@h + beq 1f + slbmte r6,r5 +1: addi r8,r8,16 + .endr + + ld r4,_SPURR(r1) + mtspr SPRN_SPURR,r4 + ld r4,_PURR(r1) + mtspr SPRN_PURR,r4 + ld r4,_DSCR(r1) + mtspr SPRN_DSCR,r4 + ld r4,_WORT(r1) + mtspr SPRN_WORT,r4 + +hypervisor_state_restored: + li r5,PNV_THREAD_RUNNING stb r5,PACA_THREAD_IDLE_STATE(r13) diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 78289ed7058c..54eca8b3b288 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -284,6 +284,7 @@ OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ); OPAL_CALL(opal_get_param, OPAL_GET_PARAM); OPAL_CALL(opal_set_param, OPAL_SET_PARAM); OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI); +OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG); OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION); OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION); OPAL_CALL(opal_pci_set_phb_cxl_mode, OPAL_PCI_SET_PHB_CXL_MODE); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 2e9b53bb73e2..b700a329c31d 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -41,6 +41,7 @@ #include #include "powernv.h" +#include "subcore.h" static void __init pnv_setup_arch(void) { @@ -293,6 +294,72 @@ static void __init pnv_setup_machdep_rtas(void) static u32 supported_cpuidle_states; +int pnv_save_sprs_for_winkle(void) +{ + int cpu; + int rc; + + /* + * hid0, hid1, hid4, hid5, hmeer and lpcr values are symmetric accross + * all cpus at boot. Get these reg values of current cpu and use the + * same accross all cpus. + */ + uint64_t lpcr_val = mfspr(SPRN_LPCR); + uint64_t hid0_val = mfspr(SPRN_HID0); + uint64_t hid1_val = mfspr(SPRN_HID1); + uint64_t hid4_val = mfspr(SPRN_HID4); + uint64_t hid5_val = mfspr(SPRN_HID5); + uint64_t hmeer_val = mfspr(SPRN_HMEER); + + for_each_possible_cpu(cpu) { + uint64_t pir = get_hard_smp_processor_id(cpu); + uint64_t hsprg0_val = (uint64_t)&paca[cpu]; + + /* + * HSPRG0 is used to store the cpu's pointer to paca. Hence last + * 3 bits are guaranteed to be 0. Program slw to restore HSPRG0 + * with 63rd bit set, so that when a thread wakes up at 0x100 we + * can use this bit to distinguish between fastsleep and + * deep winkle. + */ + hsprg0_val |= 1; + + rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val); + if (rc != 0) + return rc; + + rc = opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val); + if (rc != 0) + return rc; + + /* HIDs are per core registers */ + if (cpu_thread_in_core(cpu) == 0) { + + rc = opal_slw_set_reg(pir, SPRN_HMEER, hmeer_val); + if (rc != 0) + return rc; + + rc = opal_slw_set_reg(pir, SPRN_HID0, hid0_val); + if (rc != 0) + return rc; + + rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val); + if (rc != 0) + return rc; + + rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val); + if (rc != 0) + return rc; + + rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val); + if (rc != 0) + return rc; + } + } + + return 0; +} + static void pnv_alloc_idle_core_states(void) { int i, j; @@ -325,6 +392,11 @@ static void pnv_alloc_idle_core_states(void) paca[cpu].thread_mask = 1 << j; } } + + update_subcore_sibling_mask(); + + if (supported_cpuidle_states & OPAL_PM_WINKLE_ENABLED) + pnv_save_sprs_for_winkle(); } u32 pnv_get_supported_cpuidle_states(void) diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index c0691d0fb385..6c551a28e899 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -167,12 +167,17 @@ static void pnv_smp_cpu_kill_self(void) */ mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1); while (!generic_check_cpu_restart(cpu)) { + ppc64_runlatch_off(); - if ((idle_states & OPAL_PM_SLEEP_ENABLED) || + + if (idle_states & OPAL_PM_WINKLE_ENABLED) + srr1 = power7_winkle(); + else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) srr1 = power7_sleep(); else srr1 = power7_nap(1); + ppc64_runlatch_on(); /* diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c index c87f96b79d1a..f60f80ada903 100644 --- a/arch/powerpc/platforms/powernv/subcore.c +++ b/arch/powerpc/platforms/powernv/subcore.c @@ -160,6 +160,18 @@ static void wait_for_sync_step(int step) mb(); } +static void update_hid_in_slw(u64 hid0) +{ + u64 idle_states = pnv_get_supported_cpuidle_states(); + + if (idle_states & OPAL_PM_WINKLE_ENABLED) { + /* OPAL call to patch slw with the new HID0 value */ + u64 cpu_pir = hard_smp_processor_id(); + + opal_slw_set_reg(cpu_pir, SPRN_HID0, hid0); + } +} + static void unsplit_core(void) { u64 hid0, mask; @@ -179,6 +191,7 @@ static void unsplit_core(void) hid0 = mfspr(SPRN_HID0); hid0 &= ~HID0_POWER8_DYNLPARDIS; mtspr(SPRN_HID0, hid0); + update_hid_in_slw(hid0); while (mfspr(SPRN_HID0) & mask) cpu_relax(); @@ -215,6 +228,7 @@ static void split_core(int new_mode) hid0 = mfspr(SPRN_HID0); hid0 |= HID0_POWER8_DYNLPARDIS | split_parms[i].value; mtspr(SPRN_HID0, hid0); + update_hid_in_slw(hid0); /* Wait for it to happen */ while (!(mfspr(SPRN_HID0) & split_parms[i].mask)) @@ -251,6 +265,25 @@ bool cpu_core_split_required(void) return true; } +void update_subcore_sibling_mask(void) +{ + int cpu; + /* + * sibling mask for the first cpu. Left shift this by required bits + * to get sibling mask for the rest of the cpus. + */ + int sibling_mask_first_cpu = (1 << threads_per_subcore) - 1; + + for_each_possible_cpu(cpu) { + int tid = cpu_thread_in_core(cpu); + int offset = (tid / threads_per_subcore) * threads_per_subcore; + int mask = sibling_mask_first_cpu << offset; + + paca[cpu].subcore_sibling_mask = mask; + + } +} + static int cpu_update_split_mode(void *data) { int cpu, new_mode = *(int *)data; @@ -284,6 +317,7 @@ static int cpu_update_split_mode(void *data) /* Make the new mode public */ subcores_per_core = new_mode; threads_per_subcore = threads_per_core / subcores_per_core; + update_subcore_sibling_mask(); /* Make sure the new mode is written before we exit */ mb(); diff --git a/arch/powerpc/platforms/powernv/subcore.h b/arch/powerpc/platforms/powernv/subcore.h index 148abc91debf..84e02ae52895 100644 --- a/arch/powerpc/platforms/powernv/subcore.h +++ b/arch/powerpc/platforms/powernv/subcore.h @@ -14,5 +14,12 @@ #define SYNC_STEP_FINISHED 3 /* Set by secondary when split/unsplit is done */ #ifndef __ASSEMBLY__ + +#ifdef CONFIG_SMP void split_core_secondary_loop(u8 *state); -#endif +extern void update_subcore_sibling_mask(void); +#else +static inline void update_subcore_sibling_mask(void) { }; +#endif /* CONFIG_SMP */ + +#endif /* __ASSEMBLY__ */ From c8742f85125dcfb27a2daa389283fffebffebb00 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 11 Dec 2014 12:39:49 +1100 Subject: [PATCH 13/15] powerpc/powernv: Expose OPAL firmware symbol map Newer versions of OPAL will provide this, so let's expose it to user space so tools like perf can use it to properly decode samples in firmware space. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal.c | 38 ++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index aa316d820736..f10b9ec8c1f5 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -9,8 +9,9 @@ * 2 of the License, or (at your option) any later version. */ -#undef DEBUG +#define pr_fmt(fmt) "opal: " fmt +#include #include #include #include @@ -625,6 +626,39 @@ static int opal_sysfs_init(void) return 0; } +static ssize_t symbol_map_read(struct file *fp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t count) +{ + return memory_read_from_buffer(buf, count, &off, bin_attr->private, + bin_attr->size); +} + +static BIN_ATTR_RO(symbol_map, 0); + +static void opal_export_symmap(void) +{ + const __be64 *syms; + unsigned int size; + struct device_node *fw; + int rc; + + fw = of_find_node_by_path("/ibm,opal/firmware"); + if (!fw) + return; + syms = of_get_property(fw, "symbol-map", &size); + if (!syms || size != 2 * sizeof(__be64)) + return; + + /* Setup attributes */ + bin_attr_symbol_map.private = __va(be64_to_cpu(syms[0])); + bin_attr_symbol_map.size = be64_to_cpu(syms[1]); + + rc = sysfs_create_bin_file(opal_kobj, &bin_attr_symbol_map); + if (rc) + pr_warn("Error %d creating OPAL symbols file\n", rc); +} + static void __init opal_dump_region_init(void) { void *addr; @@ -713,6 +747,8 @@ static int __init opal_init(void) /* Create "opal" kobject under /sys/firmware */ rc = opal_sysfs_init(); if (rc == 0) { + /* Export symbol map to userspace */ + opal_export_symmap(); /* Setup dump region interface */ opal_dump_region_init(); /* Setup error log interface */ From 505e428374bc17a2c0bd388c2e8d892e9cd8eef2 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 14 Dec 2014 18:52:51 +0200 Subject: [PATCH 14/15] powerpc/uaccess: Allow get_user() with bitwise types At the moment, if p and x are both of the same bitwise type (eg. __le32), get_user(x, p) produces a sparse warning. This is because *p is loaded into a long then cast back to typeof(*p). When typeof(*p) is a bitwise type (which is uncommon), such a cast needs __force, otherwise sparse produces a warning. For non-bitwise types __force should have no effect, and should not hide any legitimate errors. Note that we are casting to typeof(*p) not typeof(x). Even with the cast, if x and *p are of different types we should get the warning, so I think we are not loosing the ability to detect any actual errors. virtio would like to use bitwise types with get_user() so fix these spurious warnings by adding __force. Signed-off-by: Michael S. Tsirkin [mpe: Fill in changelog with more details] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/uaccess.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 9485b43a7c00..a0c071d24e0e 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -284,7 +284,7 @@ do { \ if (!is_kernel_addr((unsigned long)__gu_addr)) \ might_fault(); \ __get_user_size(__gu_val, __gu_addr, (size), __gu_err); \ - (x) = (__typeof__(*(ptr)))__gu_val; \ + (x) = (__force __typeof__(*(ptr)))__gu_val; \ __gu_err; \ }) #endif /* __powerpc64__ */ @@ -297,7 +297,7 @@ do { \ might_fault(); \ if (access_ok(VERIFY_READ, __gu_addr, (size))) \ __get_user_size(__gu_val, __gu_addr, (size), __gu_err); \ - (x) = (__typeof__(*(ptr)))__gu_val; \ + (x) = (__force __typeof__(*(ptr)))__gu_val; \ __gu_err; \ }) @@ -308,7 +308,7 @@ do { \ const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ __chk_user_ptr(ptr); \ __get_user_size(__gu_val, __gu_addr, (size), __gu_err); \ - (x) = (__typeof__(*(ptr)))__gu_val; \ + (x) = (__force __typeof__(*(ptr)))__gu_val; \ __gu_err; \ }) From d70a54e2d08510a99b1f10eceeae6f2f7086e226 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 12 Dec 2014 12:37:40 +0100 Subject: [PATCH 15/15] powerpc/powernv: Ignore smt-enabled on Power8 and later Starting with POWER8, the subcore logic relies on all threads of a core being booted so that they can participate in split mode switches. So on those machines we ignore the smt_enabled_at_boot setting (smt-enabled on the kernel command line). Signed-off-by: Greg Kurz [mpe: Update comment and change log to be more precise] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/smp.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 6c551a28e899..fc34025ef822 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -209,13 +209,27 @@ static void pnv_smp_cpu_kill_self(void) #endif /* CONFIG_HOTPLUG_CPU */ +static int pnv_cpu_bootable(unsigned int nr) +{ + /* + * Starting with POWER8, the subcore logic relies on all threads of a + * core being booted so that they can participate in split mode + * switches. So on those machines we ignore the smt_enabled_at_boot + * setting (smt-enabled on the kernel command line). + */ + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + return 1; + + return smp_generic_cpu_bootable(nr); +} + static struct smp_ops_t pnv_smp_ops = { .message_pass = smp_muxed_ipi_message_pass, .cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */ .probe = xics_smp_probe, .kick_cpu = pnv_smp_kick_cpu, .setup_cpu = pnv_smp_setup_cpu, - .cpu_bootable = smp_generic_cpu_bootable, + .cpu_bootable = pnv_cpu_bootable, #ifdef CONFIG_HOTPLUG_CPU .cpu_disable = pnv_smp_cpu_disable, .cpu_die = generic_cpu_die,