From 5a5e02a614e59db7536cd11029e6674adc41b191 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 4 Jul 2009 09:35:44 +0100 Subject: [PATCH 1/8] intel-iommu: Fix dma vs. mm page confusion with aligned_nrpages() The aligned_nrpages() function rounds up to the next VM page, but returns its result as a number of DMA pages. Purely theoretical except on IA64, which doesn't boot with VT-d right now anyway. Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 53075424a434..ad85e95d2dcc 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2368,15 +2368,15 @@ error: return ret; } +/* Returns a number of VTD pages, but aligned to MM page size */ static inline unsigned long aligned_nrpages(unsigned long host_addr, size_t size) { host_addr &= ~PAGE_MASK; - host_addr += size + PAGE_SIZE - 1; - - return host_addr >> VTD_PAGE_SHIFT; + return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; } +/* This takes a number of _MM_ pages, not VTD pages */ static struct iova *intel_alloc_iova(struct device *dev, struct dmar_domain *domain, unsigned long nrpages, uint64_t dma_mask) @@ -2506,7 +2506,8 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, iommu = domain_get_iommu(domain); size = aligned_nrpages(paddr, size); - iova = intel_alloc_iova(hwdev, domain, size, pdev->dma_mask); + iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), + pdev->dma_mask); if (!iova) goto error; @@ -2797,7 +2798,8 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne for_each_sg(sglist, sg, nelems, i) size += aligned_nrpages(sg->offset, sg->length); - iova = intel_alloc_iova(hwdev, domain, size, pdev->dma_mask); + iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), + pdev->dma_mask); if (!iova) { sglist->dma_length = 0; return 0; From 1e4c64c46d413de84cc0b786bd6a9b555ba7d111 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 4 Jul 2009 10:40:38 +0100 Subject: [PATCH 2/8] intel-iommu: Don't set identity mapping for bypassed graphics devices We should check iommu_dummy() _first_, because that means it's attached to an iommu that we've just disabled completely. At the moment, we might try to put the device into the identity mapping domain. Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index ad85e95d2dcc..3e3910127fc1 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2447,8 +2447,11 @@ static int iommu_no_mapping(struct pci_dev *pdev) { int found; + if (iommu_dummy(pdev)) + return 1; + if (!iommu_identity_mapping) - return iommu_dummy(pdev); + return 0; found = identity_mapping(pdev); if (found) { @@ -2480,7 +2483,7 @@ static int iommu_no_mapping(struct pci_dev *pdev) } } - return iommu_dummy(pdev); + return 0; } static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, From 1b7bc0a1618b4de1e6f55c6d95b790f4ab6fcd9e Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 4 Jul 2009 10:49:46 +0100 Subject: [PATCH 3/8] intel-iommu: Fix reattaching of devices to identity mapping domain When we reattach a device to the si_domain (because it's been removed from a VM), we weren't calling domain_context_mapping() to actually tell the hardware about that. We should really put the call to domain_context_mapping() into domain_add_dev_info() -- we never call the latter without also doing the former, and we can keep the error paths simple that way. But that's a cleanup which can wait for 2.6.32 now. Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 3e3910127fc1..73a5c71dd37d 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2475,6 +2475,9 @@ static int iommu_no_mapping(struct pci_dev *pdev) if (pdev->dma_mask > DMA_BIT_MASK(32)) { int ret; ret = domain_add_dev_info(si_domain, pdev); + if (ret) + return 0; + ret = domain_context_mapping(si_domain, pdev, CONTEXT_TT_MULTI_LEVEL); if (!ret) { printk(KERN_INFO "64bit %s uses identity mapping\n", pci_name(pdev)); From 40e4aa34324bff3938a900014254f88943d05e15 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 4 Jul 2009 10:55:41 +0100 Subject: [PATCH 4/8] intel-iommu: Add iommu_should_identity_map() function We do this twice, and it's about to get more complicated. This makes the code slightly clearer about what it's doing, too. Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 73a5c71dd37d..ae5ccdf8b19f 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2442,6 +2442,11 @@ static int iommu_dummy(struct pci_dev *pdev) return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO; } +static int iommu_should_identity_map(struct pci_dev *pdev) +{ + return pdev->dma_mask > DMA_BIT_MASK(32); +} + /* Check if the pdev needs to go through non-identity map and unmap process.*/ static int iommu_no_mapping(struct pci_dev *pdev) { @@ -2455,7 +2460,7 @@ static int iommu_no_mapping(struct pci_dev *pdev) found = identity_mapping(pdev); if (found) { - if (pdev->dma_mask > DMA_BIT_MASK(32)) + if (iommu_should_identity_map(pdev)) return 1; else { /* @@ -2472,7 +2477,7 @@ static int iommu_no_mapping(struct pci_dev *pdev) * In case of a detached 64 bit DMA device from vm, the device * is put into si_domain for identity mapping. */ - if (pdev->dma_mask > DMA_BIT_MASK(32)) { + if (iommu_should_identity_map(pdev)) { int ret; ret = domain_add_dev_info(si_domain, pdev); if (ret) From 62edf5dc4a524e4cb525e6020b955a1ad593d9ba Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 4 Jul 2009 10:59:46 +0100 Subject: [PATCH 5/8] intel-iommu: Restore DMAR_BROKEN_GFX_WA option for broken graphics drivers We need to give people a little more time to fix the broken drivers. Re-introduce this, but tied in properly with the 'iommu=pt' support this time. Change the config option name and make it default to 'no' too. Signed-off-by: David Woodhouse --- arch/x86/Kconfig | 12 ++++++++++++ drivers/pci/intel-iommu.c | 29 +++++++++++++++++++---------- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c07f72205909..738bdc6b0f8b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1913,6 +1913,18 @@ config DMAR_DEFAULT_ON recommended you say N here while the DMAR code remains experimental. +config DMAR_BROKEN_GFX_WA + def_bool n + prompt "Workaround broken graphics drivers (going away soon)" + depends on DMAR + ---help--- + Current Graphics drivers tend to use physical address + for DMA and avoid using DMA APIs. Setting this config + option permits the IOMMU driver to set a unity map for + all the OS-visible memory. Hence the driver can continue + to use physical addresses for DMA, at least until this + option is removed in the 2.6.32 kernel. + config DMAR_FLOPPY_WA def_bool y depends on DMAR diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index ae5ccdf8b19f..5ee8305257ea 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2127,16 +2127,18 @@ static int iommu_prepare_static_identity_mapping(void) return -EFAULT; for_each_pci_dev(pdev) { - printk(KERN_INFO "IOMMU: identity mapping for device %s\n", - pci_name(pdev)); + if (iommu_identity_mapping == 1 || IS_GFX_DEVICE(pdev)) { + printk(KERN_INFO "IOMMU: identity mapping for device %s\n", + pci_name(pdev)); - ret = domain_context_mapping(si_domain, pdev, - CONTEXT_TT_MULTI_LEVEL); - if (ret) - return ret; - ret = domain_add_dev_info(si_domain, pdev); - if (ret) - return ret; + ret = domain_context_mapping(si_domain, pdev, + CONTEXT_TT_MULTI_LEVEL); + if (ret) + return ret; + ret = domain_add_dev_info(si_domain, pdev); + if (ret) + return ret; + } } return 0; @@ -2291,6 +2293,10 @@ int __init init_dmars(void) * identity mapping if iommu_identity_mapping is set. */ if (!iommu_pass_through) { +#ifdef CONFIG_DMAR_BROKEN_GFX_WA + if (!iommu_identity_mapping) + iommu_identity_mapping = 2; +#endif if (iommu_identity_mapping) iommu_prepare_static_identity_mapping(); /* @@ -2444,7 +2450,10 @@ static int iommu_dummy(struct pci_dev *pdev) static int iommu_should_identity_map(struct pci_dev *pdev) { - return pdev->dma_mask > DMA_BIT_MASK(32); + if (iommu_identity_mapping == 2) + return IS_GFX_DEVICE(pdev); + else + return pdev->dma_mask > DMA_BIT_MASK(32); } /* Check if the pdev needs to go through non-identity map and unmap process.*/ From 736768325efcbee7b0861d62670d01a54c2d158b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 4 Jul 2009 14:08:36 +0100 Subject: [PATCH 6/8] intel-iommu: No mapping for non-PCI devices This should fix kernel.org bug #11821, where the dcdbas driver makes up a platform device and then uses dma_alloc_coherent() on it, in an attempt to get memory < 4GiB. Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 5ee8305257ea..fa052dd89913 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2457,10 +2457,15 @@ static int iommu_should_identity_map(struct pci_dev *pdev) } /* Check if the pdev needs to go through non-identity map and unmap process.*/ -static int iommu_no_mapping(struct pci_dev *pdev) +static int iommu_no_mapping(struct device *dev) { + struct pci_dev *pdev; int found; + if (unlikely(dev->bus != &pci_bus_type)) + return 1; + + pdev = to_pci_dev(dev); if (iommu_dummy(pdev)) return 1; @@ -2516,7 +2521,7 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, BUG_ON(dir == DMA_NONE); - if (iommu_no_mapping(pdev)) + if (iommu_no_mapping(hwdev)) return paddr; domain = get_valid_domain_for_dev(pdev); @@ -2656,7 +2661,7 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, struct iova *iova; struct intel_iommu *iommu; - if (iommu_no_mapping(pdev)) + if (iommu_no_mapping(dev)) return; domain = find_domain(pdev); @@ -2747,7 +2752,7 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, struct iova *iova; struct intel_iommu *iommu; - if (iommu_no_mapping(pdev)) + if (iommu_no_mapping(hwdev)) return; domain = find_domain(pdev); @@ -2806,7 +2811,7 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne struct intel_iommu *iommu; BUG_ON(dir == DMA_NONE); - if (iommu_no_mapping(pdev)) + if (iommu_no_mapping(hwdev)) return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir); domain = get_valid_domain_for_dev(pdev); From 6941af2810c6fc970b88f7c0d52ba4e286acbee5 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 4 Jul 2009 18:24:27 +0100 Subject: [PATCH 7/8] intel-iommu: Use iommu_should_identity_map() at startup time too. At boot time, the dma_mask won't have been set on any devices, so we assume that all devices will be 64-bit capable (and thus get a 1:1 map). Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index fa052dd89913..f9fc4f3bfa3f 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2117,6 +2117,17 @@ static int domain_add_dev_info(struct dmar_domain *domain, return 0; } +static int iommu_should_identity_map(struct pci_dev *pdev, int startup) +{ + if (iommu_identity_mapping == 2) + return IS_GFX_DEVICE(pdev); + + if (!startup) + return pdev->dma_mask > DMA_BIT_MASK(32); + + return 1; +} + static int iommu_prepare_static_identity_mapping(void) { struct pci_dev *pdev = NULL; @@ -2127,7 +2138,7 @@ static int iommu_prepare_static_identity_mapping(void) return -EFAULT; for_each_pci_dev(pdev) { - if (iommu_identity_mapping == 1 || IS_GFX_DEVICE(pdev)) { + if (iommu_should_identity_map(pdev, 1)) { printk(KERN_INFO "IOMMU: identity mapping for device %s\n", pci_name(pdev)); @@ -2448,14 +2459,6 @@ static int iommu_dummy(struct pci_dev *pdev) return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO; } -static int iommu_should_identity_map(struct pci_dev *pdev) -{ - if (iommu_identity_mapping == 2) - return IS_GFX_DEVICE(pdev); - else - return pdev->dma_mask > DMA_BIT_MASK(32); -} - /* Check if the pdev needs to go through non-identity map and unmap process.*/ static int iommu_no_mapping(struct device *dev) { @@ -2474,7 +2477,7 @@ static int iommu_no_mapping(struct device *dev) found = identity_mapping(pdev); if (found) { - if (iommu_should_identity_map(pdev)) + if (iommu_should_identity_map(pdev, 0)) return 1; else { /* @@ -2491,7 +2494,7 @@ static int iommu_no_mapping(struct device *dev) * In case of a detached 64 bit DMA device from vm, the device * is put into si_domain for identity mapping. */ - if (iommu_should_identity_map(pdev)) { + if (iommu_should_identity_map(pdev, 0)) { int ret; ret = domain_add_dev_info(si_domain, pdev); if (ret) From 3dfc813d94bba2046c6aed216e0fd69ac93a8e03 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 4 Jul 2009 19:11:08 +0100 Subject: [PATCH 8/8] intel-iommu: Don't use identity mapping for PCI devices behind bridges Our current strategy for pass-through mode is to put all devices into the 1:1 domain at startup (which is before we know what their dma_mask will be), and only _later_ take them out of that domain, if it turns out that they really can't address all of memory. However, when there are a bunch of PCI devices behind a bridge, they all end up with the same source-id on their DMA transactions, and hence in the same IOMMU domain. This means that we _can't_ easily move them from the 1:1 domain into their own domain at runtime, because there might be DMA in-flight from their siblings. So we have to adjust our pass-through strategy: For PCI devices not on the root bus, and for the bridges which will take responsibility for their transactions, we have to start up _out_ of the 1:1 domain, just in case. This fixes the BUG() we see when we have 32-bit-capable devices behind a PCI-PCI bridge, and use the software identity mapping. It does mean that we might end up using 'normal' mapping mode for some devices which could actually live with the faster 1:1 mapping -- but this is only for PCI devices behind bridges, which presumably aren't the devices for which people are most concerned about performance. Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index f9fc4f3bfa3f..360fb67a30d7 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2122,6 +2122,36 @@ static int iommu_should_identity_map(struct pci_dev *pdev, int startup) if (iommu_identity_mapping == 2) return IS_GFX_DEVICE(pdev); + /* + * We want to start off with all devices in the 1:1 domain, and + * take them out later if we find they can't access all of memory. + * + * However, we can't do this for PCI devices behind bridges, + * because all PCI devices behind the same bridge will end up + * with the same source-id on their transactions. + * + * Practically speaking, we can't change things around for these + * devices at run-time, because we can't be sure there'll be no + * DMA transactions in flight for any of their siblings. + * + * So PCI devices (unless they're on the root bus) as well as + * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of + * the 1:1 domain, just in _case_ one of their siblings turns out + * not to be able to map all of memory. + */ + if (!pdev->is_pcie) { + if (!pci_is_root_bus(pdev->bus)) + return 0; + if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI) + return 0; + } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE) + return 0; + + /* + * At boot time, we don't yet know if devices will be 64-bit capable. + * Assume that they will -- if they turn out not to be, then we can + * take them out of the 1:1 domain later. + */ if (!startup) return pdev->dma_mask > DMA_BIT_MASK(32);