diff --git a/Documentation/driver-api/cxl/memory-devices.rst b/Documentation/driver-api/cxl/memory-devices.rst index c8f7a16cd0e3..3498d38d7cbd 100644 --- a/Documentation/driver-api/cxl/memory-devices.rst +++ b/Documentation/driver-api/cxl/memory-devices.rst @@ -14,6 +14,303 @@ that optionally define a device's contribution to an interleaved address range across multiple devices underneath a host-bridge or interleaved across host-bridges. +CXL Bus: Theory of Operation +============================ +Similar to how a RAID driver takes disk objects and assembles them into a new +logical device, the CXL subsystem is tasked to take PCIe and ACPI objects and +assemble them into a CXL.mem decode topology. The need for runtime configuration +of the CXL.mem topology is also similar to RAID in that different environments +with the same hardware configuration may decide to assemble the topology in +contrasting ways. One may choose performance (RAID0) striping memory across +multiple Host Bridges and endpoints while another may opt for fault tolerance +and disable any striping in the CXL.mem topology. + +Platform firmware enumerates a menu of interleave options at the "CXL root port" +(Linux term for the top of the CXL decode topology). From there, PCIe topology +dictates which endpoints can participate in which Host Bridge decode regimes. +Each PCIe Switch in the path between the root and an endpoint introduces a point +at which the interleave can be split. For example platform firmware may say at a +given range only decodes to 1 one Host Bridge, but that Host Bridge may in turn +interleave cycles across multiple Root Ports. An intervening Switch between a +port and an endpoint may interleave cycles across multiple Downstream Switch +Ports, etc. + +Here is a sample listing of a CXL topology defined by 'cxl_test'. The 'cxl_test' +module generates an emulated CXL topology of 2 Host Bridges each with 2 Root +Ports. Each of those Root Ports are connected to 2-way switches with endpoints +connected to those downstream ports for a total of 8 endpoints:: + + # cxl list -BEMPu -b cxl_test + { + "bus":"root3", + "provider":"cxl_test", + "ports:root3":[ + { + "port":"port5", + "host":"cxl_host_bridge.1", + "ports:port5":[ + { + "port":"port8", + "host":"cxl_switch_uport.1", + "endpoints:port8":[ + { + "endpoint":"endpoint9", + "host":"mem2", + "memdev":{ + "memdev":"mem2", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x1", + "numa_node":1, + "host":"cxl_mem.1" + } + }, + { + "endpoint":"endpoint15", + "host":"mem6", + "memdev":{ + "memdev":"mem6", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x5", + "numa_node":1, + "host":"cxl_mem.5" + } + } + ] + }, + { + "port":"port12", + "host":"cxl_switch_uport.3", + "endpoints:port12":[ + { + "endpoint":"endpoint17", + "host":"mem8", + "memdev":{ + "memdev":"mem8", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x7", + "numa_node":1, + "host":"cxl_mem.7" + } + }, + { + "endpoint":"endpoint13", + "host":"mem4", + "memdev":{ + "memdev":"mem4", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x3", + "numa_node":1, + "host":"cxl_mem.3" + } + } + ] + } + ] + }, + { + "port":"port4", + "host":"cxl_host_bridge.0", + "ports:port4":[ + { + "port":"port6", + "host":"cxl_switch_uport.0", + "endpoints:port6":[ + { + "endpoint":"endpoint7", + "host":"mem1", + "memdev":{ + "memdev":"mem1", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0", + "numa_node":0, + "host":"cxl_mem.0" + } + }, + { + "endpoint":"endpoint14", + "host":"mem5", + "memdev":{ + "memdev":"mem5", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x4", + "numa_node":0, + "host":"cxl_mem.4" + } + } + ] + }, + { + "port":"port10", + "host":"cxl_switch_uport.2", + "endpoints:port10":[ + { + "endpoint":"endpoint16", + "host":"mem7", + "memdev":{ + "memdev":"mem7", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x6", + "numa_node":0, + "host":"cxl_mem.6" + } + }, + { + "endpoint":"endpoint11", + "host":"mem3", + "memdev":{ + "memdev":"mem3", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x2", + "numa_node":0, + "host":"cxl_mem.2" + } + } + ] + } + ] + } + ] + } + +In that listing each "root", "port", and "endpoint" object correspond a kernel +'struct cxl_port' object. A 'cxl_port' is a device that can decode CXL.mem to +its descendants. So "root" claims non-PCIe enumerable platform decode ranges and +decodes them to "ports", "ports" decode to "endpoints", and "endpoints" +represent the decode from SPA (System Physical Address) to DPA (Device Physical +Address). + +Continuing the RAID analogy, disks have both topology metadata and on device +metadata that determine RAID set assembly. CXL Port topology and CXL Port link +status is metadata for CXL.mem set assembly. The CXL Port topology is enumerated +by the arrival of a CXL.mem device. I.e. unless and until the PCIe core attaches +the cxl_pci driver to a CXL Memory Expander there is no role for CXL Port +objects. Conversely for hot-unplug / removal scenarios, there is no need for +the Linux PCI core to tear down switch-level CXL resources because the endpoint +->remove() event cleans up the port data that was established to support that +Memory Expander. + +The port metadata and potential decode schemes that a give memory device may +participate can be determined via a command like:: + + # cxl list -BDMu -d root -m mem3 + { + "bus":"root3", + "provider":"cxl_test", + "decoders:root3":[ + { + "decoder":"decoder3.1", + "resource":"0x8030000000", + "size":"512.00 MiB (536.87 MB)", + "volatile_capable":true, + "nr_targets":2 + }, + { + "decoder":"decoder3.3", + "resource":"0x8060000000", + "size":"512.00 MiB (536.87 MB)", + "pmem_capable":true, + "nr_targets":2 + }, + { + "decoder":"decoder3.0", + "resource":"0x8020000000", + "size":"256.00 MiB (268.44 MB)", + "volatile_capable":true, + "nr_targets":1 + }, + { + "decoder":"decoder3.2", + "resource":"0x8050000000", + "size":"256.00 MiB (268.44 MB)", + "pmem_capable":true, + "nr_targets":1 + } + ], + "memdevs:root3":[ + { + "memdev":"mem3", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x2", + "numa_node":0, + "host":"cxl_mem.2" + } + ] + } + +...which queries the CXL topology to ask "given CXL Memory Expander with a kernel +device name of 'mem3' which platform level decode ranges may this device +participate". A given expander can participate in multiple CXL.mem interleave +sets simultaneously depending on how many decoder resource it has. In this +example mem3 can participate in one or more of a PMEM interleave that spans to +Host Bridges, a PMEM interleave that targets a single Host Bridge, a Volatile +memory interleave that spans 2 Host Bridges, and a Volatile memory interleave +that only targets a single Host Bridge. + +Conversely the memory devices that can participate in a given platform level +decode scheme can be determined via a command like the following:: + + # cxl list -MDu -d 3.2 + [ + { + "memdevs":[ + { + "memdev":"mem1", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0", + "numa_node":0, + "host":"cxl_mem.0" + }, + { + "memdev":"mem5", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x4", + "numa_node":0, + "host":"cxl_mem.4" + }, + { + "memdev":"mem7", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x6", + "numa_node":0, + "host":"cxl_mem.6" + }, + { + "memdev":"mem3", + "pmem_size":"256.00 MiB (268.44 MB)", + "ram_size":"256.00 MiB (268.44 MB)", + "serial":"0x2", + "numa_node":0, + "host":"cxl_mem.2" + } + ] + }, + { + "root decoders":[ + { + "decoder":"decoder3.2", + "resource":"0x8050000000", + "size":"256.00 MiB (268.44 MB)", + "pmem_capable":true, + "nr_targets":1 + } + ] + } + ] + +...where the naming scheme for decoders is "decoder.". + Driver Infrastructure ===================== @@ -28,6 +325,11 @@ CXL Memory Device .. kernel-doc:: drivers/cxl/pci.c :internal: +CXL Port +-------- +.. kernel-doc:: drivers/cxl/port.c + :doc: cxl port + CXL Core -------- .. kernel-doc:: drivers/cxl/cxl.h diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index ef05e96f8f97..4f4f7587f6ca 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -77,4 +77,9 @@ config CXL_PMEM provisioning the persistent memory capacity of CXL memory expanders. If unsure say 'm'. + +config CXL_PORT + default CXL_BUS + tristate + endif diff --git a/drivers/cxl/Makefile b/drivers/cxl/Makefile index cf07ae6cea17..56fcac2323cb 100644 --- a/drivers/cxl/Makefile +++ b/drivers/cxl/Makefile @@ -3,7 +3,9 @@ obj-$(CONFIG_CXL_BUS) += core/ obj-$(CONFIG_CXL_PCI) += cxl_pci.o obj-$(CONFIG_CXL_ACPI) += cxl_acpi.o obj-$(CONFIG_CXL_PMEM) += cxl_pmem.o +obj-$(CONFIG_CXL_PORT) += cxl_port.o cxl_pci-y := pci.o cxl_acpi-y := acpi.o cxl_pmem-y := pmem.o +cxl_port-y := port.o diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 8c2ced91518b..82591642ea90 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -169,7 +169,6 @@ static int add_host_bridge_uport(struct device *match, void *arg) struct acpi_device *bridge = to_cxl_host_bridge(host, match); struct acpi_pci_root *pci_root; struct cxl_dport *dport; - struct cxl_hdm *cxlhdm; struct cxl_port *port; int rc; @@ -197,28 +196,7 @@ static int add_host_bridge_uport(struct device *match, void *arg) return PTR_ERR(port); dev_dbg(host, "%s: add: %s\n", dev_name(match), dev_name(&port->dev)); - rc = devm_cxl_port_enumerate_dports(host, port); - if (rc < 0) - return rc; - cxl_device_lock(&port->dev); - if (rc == 1) { - rc = devm_cxl_add_passthrough_decoder(host, port); - goto out; - } - - cxlhdm = devm_cxl_setup_hdm(host, port); - if (IS_ERR(cxlhdm)) { - rc = PTR_ERR(cxlhdm); - goto out; - } - - rc = devm_cxl_enumerate_decoders(host, cxlhdm); - if (rc) - dev_err(&port->dev, "Couldn't enumerate decoders (%d)\n", rc); - -out: - cxl_device_unlock(&port->dev); - return rc; + return 0; } struct cxl_chbs_context { @@ -278,9 +256,7 @@ static int add_host_bridge_dport(struct device *match, void *arg) return 0; } - cxl_device_lock(&root_port->dev); dport = devm_cxl_add_dport(host, root_port, match, uid, ctx.chbcr); - cxl_device_unlock(&root_port->dev); if (IS_ERR(dport)) { dev_err(host, "failed to add downstream port: %s\n", dev_name(match)); diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index c5a9e03ed477..8ec5f74da679 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -47,10 +47,8 @@ static int match_add_dports(struct pci_dev *pdev, void *data) dev_dbg(&port->dev, "failed to find component registers\n"); port_num = FIELD_GET(PCI_EXP_LNKCAP_PN, lnkcap); - cxl_device_lock(&port->dev); dport = devm_cxl_add_dport(ctx->host, port, &pdev->dev, port_num, cxl_regmap_to_base(pdev, &map)); - cxl_device_unlock(&port->dev); if (IS_ERR(dport)) { ctx->error = PTR_ERR(dport); return PTR_ERR(dport); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 0bbd8fb8f35d..a66284b7eb1b 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -40,6 +40,11 @@ static int cxl_device_id(struct device *dev) return CXL_DEVICE_NVDIMM_BRIDGE; if (dev->type == &cxl_nvdimm_type) return CXL_DEVICE_NVDIMM; + if (is_cxl_port(dev)) { + if (is_cxl_root(to_cxl_port(dev))) + return CXL_DEVICE_ROOT; + return CXL_DEVICE_PORT; + } return 0; } @@ -298,6 +303,9 @@ static void unregister_port(void *_port) { struct cxl_port *port = _port; + if (!is_cxl_root(port)) + device_lock_assert(port->dev.parent); + device_unregister(&port->dev); } @@ -526,15 +534,34 @@ static int add_dport(struct cxl_port *port, struct cxl_dport *new) return dup ? -EEXIST : 0; } +/* + * Since root-level CXL dports cannot be enumerated by PCI they are not + * enumerated by the common port driver that acquires the port lock over + * dport add/remove. Instead, root dports are manually added by a + * platform driver and cond_cxl_root_lock() is used to take the missing + * port lock in that case. + */ +static void cond_cxl_root_lock(struct cxl_port *port) +{ + if (is_cxl_root(port)) + cxl_device_lock(&port->dev); +} + +static void cond_cxl_root_unlock(struct cxl_port *port) +{ + if (is_cxl_root(port)) + cxl_device_unlock(&port->dev); +} + static void cxl_dport_remove(void *data) { struct cxl_dport *dport = data; struct cxl_port *port = dport->port; put_device(dport->dport); - cxl_device_lock(&port->dev); + cond_cxl_root_lock(port); list_del(&dport->list); - cxl_device_unlock(&port->dev); + cond_cxl_root_unlock(port); } static void cxl_dport_unlink(void *data) @@ -587,7 +614,9 @@ struct cxl_dport *devm_cxl_add_dport(struct device *host, struct cxl_port *port, dport->component_reg_phys = component_reg_phys; dport->port = port; + cond_cxl_root_lock(port); rc = add_dport(port, dport); + cond_cxl_root_unlock(port); if (rc) return ERR_PTR(rc); @@ -895,6 +924,7 @@ static int cxl_bus_probe(struct device *dev) rc = to_cxl_drv(dev->driver)->probe(dev); cxl_nested_unlock(dev); + dev_dbg(dev, "probe: %d\n", rc); return rc; } diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index c127d5c0ac96..2b24eb56618f 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -163,6 +163,8 @@ int cxl_map_device_regs(struct pci_dev *pdev, enum cxl_regloc_type; int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type, struct cxl_register_map *map); +void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, + resource_size_t length); #define CXL_RESOURCE_NONE ((resource_size_t) -1) #define CXL_TARGET_STRLEN 20 @@ -354,6 +356,8 @@ void cxl_driver_unregister(struct cxl_driver *cxl_drv); #define CXL_DEVICE_NVDIMM_BRIDGE 1 #define CXL_DEVICE_NVDIMM 2 +#define CXL_DEVICE_PORT 3 +#define CXL_DEVICE_ROOT 4 #define MODULE_ALIAS_CXL(type) MODULE_ALIAS("cxl:t" __stringify(type) "*") #define CXL_MODALIAS_FMT "cxl:t%d" diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 103636fda198..47640f19e899 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -2,6 +2,7 @@ /* Copyright(c) 2020 Intel Corporation. All rights reserved. */ #ifndef __CXL_PCI_H__ #define __CXL_PCI_H__ +#include #include "cxl.h" #define CXL_MEMORY_PROGIF 0x10 diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c new file mode 100644 index 000000000000..daa4c3c33aed --- /dev/null +++ b/drivers/cxl/port.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#include +#include +#include + +#include "cxlmem.h" +#include "cxlpci.h" + +/** + * DOC: cxl port + * + * The port driver enumerates dport via PCI and scans for HDM + * (Host-managed-Device-Memory) decoder resources via the + * @component_reg_phys value passed in by the agent that registered the + * port. All descendant ports of a CXL root port (described by platform + * firmware) are managed in this drivers context. Each driver instance + * is responsible for tearing down the driver context of immediate + * descendant ports. The locking for this is validated by + * CONFIG_PROVE_CXL_LOCKING. + * + * The primary service this driver provides is presenting APIs to other + * drivers to utilize the decoders, and indicating to userspace (via bind + * status) the connectivity of the CXL.mem protocol throughout the + * PCIe topology. + */ + +static int cxl_port_probe(struct device *dev) +{ + struct cxl_port *port = to_cxl_port(dev); + struct cxl_hdm *cxlhdm; + int rc; + + rc = devm_cxl_port_enumerate_dports(dev, port); + if (rc < 0) + return rc; + + if (rc == 1) + return devm_cxl_add_passthrough_decoder(dev, port); + + cxlhdm = devm_cxl_setup_hdm(dev, port); + if (IS_ERR(cxlhdm)) + return PTR_ERR(cxlhdm); + + rc = devm_cxl_enumerate_decoders(dev, cxlhdm); + if (rc) { + dev_err(dev, "Couldn't enumerate decoders (%d)\n", rc); + return rc; + } + + return 0; +} + +static struct cxl_driver cxl_port_driver = { + .name = "cxl_port", + .probe = cxl_port_probe, + .id = CXL_DEVICE_PORT, +}; + +module_cxl_driver(cxl_port_driver); +MODULE_LICENSE("GPL v2"); +MODULE_IMPORT_NS(CXL); +MODULE_ALIAS_CXL(CXL_DEVICE_PORT); diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 3045d7cba0db..27ae13e23e79 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -26,6 +26,11 @@ obj-m += cxl_pmem.o cxl_pmem-y := $(CXL_SRC)/pmem.o cxl_pmem-y += config_check.o +obj-m += cxl_port.o + +cxl_port-y := $(CXL_SRC)/port.o +cxl_port-y += config_check.o + obj-m += cxl_core.o cxl_core-y := $(CXL_CORE_SRC)/port.o diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 81c09380c537..ce6ace286fc7 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -437,10 +437,8 @@ static int mock_cxl_port_enumerate_dports(struct device *host, if (pdev->dev.parent != port->uport) continue; - cxl_device_lock(&port->dev); dport = devm_cxl_add_dport(host, port, &pdev->dev, pdev->id, CXL_RESOURCE_NONE); - cxl_device_unlock(&port->dev); if (IS_ERR(dport)) { dev_err(dev, "failed to add dport: %s (%ld)\n",